In [1]:
from config import *
import json
import re
import string
import nltk
from nltk.corpus import stopwords

# Download the NLTK "stopwords" corpus (NLTK logs when the package is already
# up to date, so re-running this cell is harmless).
nltk.download('stopwords')

# English stopwords kept in a set; preprocess() tests terms for membership in it.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/azureuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Echo the dataset locations used below (presumably defined in `config`, which
# is star-imported in the first cell — confirm there).
print(DATASETS_PATH, QUERY_TRAIN_PATH, QUERY_TEST_PATH, SHORT_ABSTRACT_PATH, INSTANCE_TYPES_EN_PATH)

/mntnvme/datasets /mntnvme/datasets/DBpedia/smarttask_dbpedia_train.json /mntnvme/datasets/DBpedia/smarttask_dbpedia_test_questions.json /mntnvme/datasets/DBpedia/short_abstracts_en.ttl /mntnvme/datasets/DBpedia/instance_types_en.ttl


In [3]:
def head(path, n=5, encoding="utf-8", preview=5):
    """Return the first ``n`` lines of a text file, printing the first few.

    Args:
        path: File to read.
        n: Maximum number of lines to return. ``n <= 0`` returns an empty
            list without opening the file.
        encoding: Text encoding passed to ``open``.
        preview: How many of the leading lines are also printed. Defaults to
            5, the value that used to be hard-coded; pass 0 to print nothing.

    Returns:
        A list of the raw lines, line terminators included. It is shorter
        than ``n`` when the file has fewer lines.
    """
    res = []
    if n <= 0:
        return res
    with open(path, encoding=encoding) as f:
        # range(n) comes first in zip() so iteration stops as soon as n lines
        # have been taken, without pulling one extra line from the file.
        for i, line in zip(range(n), f):
            res.append(line)
            if i < preview:
                # The line still carries its own newline, so print() leaves a
                # blank line between previewed entries.
                print(line)
    return res

In [11]:
# Keep the first 100 lines of each dump as a small working sample; head() also
# prints the first few lines of each file as a preview.
instance_types = head(INSTANCE_TYPES_EN_PATH, 100)
print("--" * 20)  # visual separator between the two previews
short_abstracts = head(SHORT_ABSTRACT_PATH, 100)

# started 2017-03-31T16:36:17Z

<http://dbpedia.org/resource/Anarchism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .

<http://dbpedia.org/resource/Achilles> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://www.w3.org/2002/07/owl#Thing> .

<http://dbpedia.org/resource/Autism> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/Disease> .

<http://dbpedia.org/resource/Alabama> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://dbpedia.org/ontology/AdministrativeRegion> .

----------------------------------------
# started 2017-02-07T14:14:05Z

<http://dbpedia.org/resource/Animalia_(book)> <http://www.w3.org/2000/01/rdf-schema#comment> "Animalia is an illustrated children's book by Graeme Base. It was originally published in 1986, followed by a tenth anniversary edition in 1996, and a 25th anniversary edition in 2012. Over three million copies have been sold. A special numbered and signed anniversary e

In [6]:
def preprocess(line, remove_stopwords=False):
    """Normalise a piece of text for indexing.

    The text is stripped, lower-cased, has underscores turned into spaces and
    every character in ``string.punctuation`` removed.

    Args:
        line: Text to normalise.
        remove_stopwords: When true, runs of whitespace are additionally
            collapsed, terms found in the module-level ``stop_words`` set are
            dropped, and the result is stripped again.

    Returns:
        The normalised string. With ``remove_stopwords=False`` whitespace is
        NOT collapsed or re-stripped, so removed punctuation or underscores
        can leave double, leading or trailing spaces in the result.
    """
    punctuation_table = str.maketrans('', '', string.punctuation)
    cleaned = line.strip().lower().replace("_", " ").translate(punctuation_table)

    if not remove_stopwords:
        return cleaned

    kept_terms = []
    for term in re.sub(r"\s+", " ", cleaned).split(" "):
        if term not in stop_words:
            kept_terms.append(term)
    return " ".join(kept_terms).strip()

In [7]:
def parse_instance_types(line):
    """Convert one line of the instance-types dump into an ``update`` action.

    The line is expected to look like ``<subject> <predicate> <object> .``.
    The last path segment of the subject becomes the document id and the last
    path segment of the object becomes the instance type; both go through
    ``preprocess``.

    Args:
        line: One raw line from the dump, or ``None``.

    Returns:
        ``{"_id": ..., "doc": {"instance_type": ...}, "_op_type": "update"}``
        (shaped like a bulk-helper action — presumably Elasticsearch, confirm
        against the indexing code), or ``None`` for ``None``/empty input,
        ``#`` comment lines and lines with fewer than three space-separated
        fields.
    """
    # None, "" and "#" comment lines carry no triple. The truthiness check
    # also avoids an IndexError on an empty string.
    if not line or line.startswith("#"):
        return None

    parts = line.strip().replace('/>', '>').split(' ')
    if len(parts) < 3:
        return None

    # Subject IRI: drop the surrounding <>, keep the last path segment.
    entity = preprocess(parts[0][1:-1].split("/")[-1])

    # Object IRI: last path segment, then only what follows a "#" fragment
    # marker when there is one ("owl#Thing" -> "Thing"). Segments without a
    # fragment ("Disease") are kept whole rather than losing their first
    # four characters.
    type_name = parts[2][1:-1].split("/")[-1].split("#")[-1]
    instance_type = preprocess(type_name)

    return {
        "_id": entity,
        "doc": {"instance_type": instance_type},
        "_op_type": "update"
    }

In [8]:
# Index 0 of the sample is the "# started ..." header line of the dump, so
# index 1 is the first real triple.
instance_types_test = parse_instance_types(instance_types[1])
print(instance_types_test)

{'_id': 'anarchism', 'doc': {'instance_type': 'thing'}, '_op_type': 'update'}


In [9]:
def parse_abstracts(line):
    """Convert one line of the short-abstracts dump into an index document.

    The line is expected to look like ``<subject> <predicate> "text"@en .``.
    The last path segment of the subject becomes the document id. Everything
    from the third space-separated field onward is treated as the abstract
    and passed through ``preprocess`` with stopword removal enabled.

    Args:
        line: One raw line from the dump, or ``None``.

    Returns:
        ``{"_id": ..., "_source": {"abstract": ..., "instance_type": "_"}}``
        where ``"_"`` is a placeholder instance type, or ``None`` for
        ``None``/empty input, ``#`` comment lines and lines with fewer than
        three space-separated fields.
    """
    # None, "" and "#" comment lines carry no triple. The truthiness check
    # also avoids an IndexError on an empty string.
    if not line or line.startswith("#"):
        return None

    # Strip the language tag/terminator, quotes and backslashes before
    # splitting into fields.
    # NOTE(review): removing only the backslash leaves the payload of any
    # escape sequence in the text (e.g. "\u00e9" becomes "u00e9") — confirm
    # whether escapes should be decoded instead.
    parts = line.strip().replace('@en .', '') \
        .replace('"', '').replace('\\', '') \
        .replace('\'', '').replace('/>', '>').split(' ')

    if len(parts) < 3:
        return None

    # Subject IRI: drop the surrounding <>, keep the last path segment.
    entity = preprocess(parts[0][1:-1].split("/")[-1])
    # Fields 2.. are the literal text, which was split on spaces above.
    abstract = preprocess(' '.join(parts[2:]), True)
    return {
        "_id": entity,
        "_source": {"abstract": abstract, "instance_type": "_"}
    }

In [10]:
# Index 0 of the sample is the "# started ..." header line of the dump, so
# index 1 is the first real abstract.
abstract_test = parse_abstracts(short_abstracts[1])
print(abstract_test)

{'_id': 'animalia book', '_source': {'abstract': 'animalia illustrated childrens book graeme base originally published 1986 followed tenth anniversary edition 1996 25th anniversary edition 2012 three million copies sold special numbered signed anniversary edition also published 1996 embossed gold jacket', 'instance_type': '_'}}
