In [10]:
import os
import re
import time
import json
import copy
import random
import pandas as pd

# Directory layout used by every loader/saver below.
# The `__file__`-based variant is kept commented out; the current working
# directory is used instead (presumably because `__file__` is not available
# when this runs as a notebook -- confirm before converting to a script).
# FILE_DIR = os.path.dirname(__file__)
FILE_DIR = os.path.abspath(".")
# The project root is the parent of the working directory.
PROJECT_DIR = os.path.abspath(f"{FILE_DIR}/..")
DATA_PATH = f"{PROJECT_DIR}/data"
RAW_DATA_PATH = f"{DATA_PATH}/raw"
PREPROCESSED_DATA_PATH = f"{DATA_PATH}/preprocessed"
# Side effect at cell execution: the output directory is created if missing.
os.makedirs(PREPROCESSED_DATA_PATH, exist_ok = True)

# Put the project root on the module search path so the project-local
# `data` and `utils` packages imported below can be resolved.
import sys
sys.path.append(PROJECT_DIR)

from data import settings
from utils import utils

In [11]:
def splitIntoFolds(modes, facts, pos, neg, nFolds = 5, seed = None):
    '''Split a single mega-example dataset (e.g. NELL) into equally sized folds.

    The positive and negative examples are shuffled and cut into ``nFolds``
    consecutive slices of ``len(examples) // nFolds`` items each; any remainder
    that does not fill a whole slice is left out. Every fold shares the same
    ``modes`` and ``facts`` objects.

    Args:
        modes: mode declarations, copied by reference into every fold.
        facts: background facts, copied by reference into every fold.
        pos: positive examples. The caller's list is not modified.
        neg: negative examples. The caller's list is not modified.
        nFolds: number of folds to produce.
        seed: seed for the shuffle; ``None`` gives a non-reproducible shuffle.

    Returns:
        A list of ``nFolds`` dicts with keys "modes", "facts", "pos", "neg".
    '''
    # A private generator yields the same permutation as seeding the module
    # level generator, without touching global random state.
    rng = random.Random(seed)
    # Shuffle copies so the caller's lists keep their original order.
    shuffledPos = list(pos)
    shuffledNeg = list(neg)
    rng.shuffle(shuffledPos)
    rng.shuffle(shuffledNeg)

    posPerFold = len(shuffledPos) // nFolds
    negPerFold = len(shuffledNeg) // nFolds

    folds = []
    for i in range(nFolds):
        folds.append({
            "modes": modes,
            "facts": facts,
            "pos": shuffledPos[i*posPerFold:(i+1)*posPerFold],
            "neg": shuffledNeg[i*negPerFold:(i+1)*negPerFold]
        })
    return folds

In [12]:
def balanceNeg(neg, totalPos, negPosRatio = 1, seed=None):
    '''Sample negative examples so that there are ``negPosRatio`` per positive.

    Args:
        neg: list of negative examples; it is deep-copied, never modified.
        totalPos: number of positive examples.
        negPosRatio: how many negatives to keep per positive (must be an int).
        seed: seed for the shuffle; ``None`` gives a non-reproducible sample.

    Returns:
        A new list holding the first ``negPosRatio * totalPos`` examples of a
        shuffled copy of ``neg`` (fewer if ``neg`` is shorter than that).
    '''
    assert type(negPosRatio) is int
    sampled = copy.deepcopy(neg)
    # A private generator produces the same permutation as seeding the module
    # level generator, but leaves the caller's global random state untouched.
    random.Random(seed).shuffle(sampled)
    return sampled[:negPosRatio * totalPos]

def generateAllNeg(pos):
    '''Generate every negative example implied by the positive examples.

    Only the first two terms of each positive example are used: the first is
    treated as the subject, the second as the object. The candidate objects
    are all objects plus all subjects seen in ``pos`` (the original comment
    says "for same type", presumably because both argument positions share a
    type -- confirm against the mode declarations). For each subject, every
    candidate it is not already paired with becomes a negative example.

    Each subject is processed once, in order of first appearance, so a subject
    with several positive examples does not get its negatives repeated. The
    candidates are sorted so the result does not depend on set iteration
    order; this assumes the terms are mutually comparable (e.g. all strings).

    Args:
        pos: list of positive examples, each a sequence of at least two terms.

    Returns:
        A list of ``[subject, object]`` pairs.
    '''
    candidates = set()
    knownObjects = {}
    for entities in pos:
        subject, objc = entities[0], entities[1]
        knownObjects.setdefault(subject, set()).add(objc)
        candidates.add(objc)
        # for same type
        candidates.add(subject)

    neg = []
    for subject, seen in knownObjects.items():
        for objc in sorted(candidates - seen):
            neg.append([subject, objc])
    return neg

In [13]:
def prepareFold(
    positiveLiterals: dict,
    negativeLiterals: dict,
    modes: list,
    targetPredicate: str,
    balanced: int = 1,
    seed = None
) -> dict:
    '''Turn the parsed literals of one fold into facts and labelled examples.

    Args:
        positiveLiterals: maps a relation name to a list of term lists.
        negativeLiterals: same layout, holding explicitly negated literals.
        modes: mode declarations, returned unchanged.
        targetPredicate: relation whose literals become the examples; all
            other relations in ``positiveLiterals`` become background facts.
        balanced: number of negatives to keep per positive example, or
            ``None`` to keep every negative.
        seed: seed forwarded to the negative sampling.

    Returns:
        A dict with keys "modes", "facts", "pos" and "neg"; the last three are
        lists of strings of the form ``relation(term1,term2).``.
    '''
    facts = {k:v for k,v in positiveLiterals.items() if k != targetPredicate}
    # A fold may contain no literal of the target predicate at all; treat that
    # as "no positive examples" instead of failing on None further down.
    pos = positiveLiterals.get(targetPredicate) or []
    neg = negativeLiterals.get(targetPredicate, None)

    # No explicit negatives in the data: derive them from the positives.
    if neg is None:
        neg = generateAllNeg(pos)

    if balanced is not None:
        totalPositiveExamples = len(pos)
        neg = balanceNeg(neg, totalPositiveExamples, balanced, seed = seed)

    # Converting facts, positive examples, and negative examples to literals
    facts = [
        f"{relation}({','.join(terms)})." for relation, termsList in facts.items() for terms in termsList
    ]

    pos = [f"{targetPredicate}({','.join(terms)})." for terms in pos]
    neg = [f"{targetPredicate}({','.join(terms)})." for terms in neg]

    return {
        "modes": modes,
        "facts": facts,
        "pos": pos,
        "neg": neg
    }

In [14]:
def getCoraDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, **kwargs):
    '''Parse ``coralearn.pl`` and build one fold per ``begin(model(...))`` block.

    Lines of the form ``relation(terms).`` are collected as positive literals
    and lines of the form ``neg(relation(terms)).`` as negative literals of
    the most recently opened model block. Spaces and underscores are stripped
    from relation names and terms. When ``modes`` yields a non-empty list of
    predicate names, only those relations are kept.

    Args:
        modes: mode declarations; also used to derive the accepted relations.
        targetPredicate: relation whose literals become the examples.
        balanced: negatives kept per positive (forwarded to ``prepareFold``).
        seed: seed forwarded to ``prepareFold``.
        **kwargs: accepted and ignored.

    Returns:
        A list with one ``prepareFold`` result per model block.
    '''
    cleanModes = utils.cleanPreds(modes)
    acceptedPredicates = [utils.extractRelationNameFromPred(pred) for pred in cleanModes]

    # Raw strings keep the backslashes literal; the non-raw originals relied on
    # invalid escape sequences, which newer Python versions warn about.
    beginPattern = re.compile(r'^begin\(model\([0-9\w]*\)\).$')
    negPattern = re.compile(r'^neg\((\w+)\(([\w, ]+)*\)\).$')
    posPattern = re.compile(r'^(\w+)\(([\w, ]+)*\).$')

    def addLiteral(match, literalsPerModel):
        # Normalise the matched literal and file it under the current model.
        relation = re.sub('[ _]', '', match.group(1))
        entities = re.sub('[ _]', '', match.group(2)).split(',')
        if not acceptedPredicates or relation in acceptedPredicates:
            literalsPerModel[-1].setdefault(relation, []).append(entities)

    allPositiveLiterals = []
    allNegativeLiterals = []
    with open(f'{RAW_DATA_PATH}/coralearn.pl') as f:
        for line in f:
            if beginPattern.search(line):
                # A new model block starts a new fold.
                allPositiveLiterals.append({})
                allNegativeLiterals.append({})
            positiveMatch = posPattern.search(line)
            if positiveMatch:
                addLiteral(positiveMatch, allPositiveLiterals)
            negativeMatch = negPattern.search(line)
            if negativeMatch:
                addLiteral(negativeMatch, allNegativeLiterals)

    return [
        prepareFold(foldPositiveLiterals, foldNegativeLiterals, modes, targetPredicate, balanced, seed)
        for foldPositiveLiterals, foldNegativeLiterals in zip(allPositiveLiterals, allNegativeLiterals)
    ]

In [15]:
def getUWCSEDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, **kwargs):
    '''Parse ``uwcselearn.pl`` and build one fold per distinct first term.

    Lines of the form ``relation(terms).`` are positive literals and lines of
    the form ``neg(relation(terms)).`` are negative literals. The first term
    of every literal selects the fold it belongs to (folds are numbered in
    order of first appearance) and is dropped from the stored terms. Spaces
    and underscores are stripped from relation names and terms. When ``modes``
    yields a non-empty list of predicate names, only those relations are kept;
    a fold is still created for the first term of a rejected literal.

    Args:
        modes: mode declarations; also used to derive the accepted relations.
        targetPredicate: relation whose literals become the examples.
        balanced: negatives kept per positive (forwarded to ``prepareFold``).
        seed: seed forwarded to ``prepareFold``.
        **kwargs: accepted and ignored.

    Returns:
        A list with one ``prepareFold`` result per fold.
    '''
    cleanModes = utils.cleanPreds(modes)
    acceptedPredicates = [utils.extractRelationNameFromPred(pred) for pred in cleanModes]

    # Raw strings keep the backslashes literal; the non-raw originals relied on
    # invalid escape sequences, which newer Python versions warn about.
    negPattern = re.compile(r'^neg\((\w+)\(([\w, ]+)*\)\).$')
    posPattern = re.compile(r'^(\w+)\(([\w, ]+)*\).$')

    allPositiveLiterals = []
    allNegativeLiterals = []
    foldIndexByKey = {}

    def addLiteral(match, literalsPerFold):
        # Normalise the matched literal and file it under the fold named by
        # its first term, opening that fold on first sight.
        relation = re.sub('[ _]', '', match.group(1))
        entities = re.sub('[ _]', '', match.group(2)).split(',')
        foldKey = entities[0]
        if foldKey not in foldIndexByKey:
            foldIndexByKey[foldKey] = len(allPositiveLiterals)
            allPositiveLiterals.append({})
            allNegativeLiterals.append({})
        foldIndex = foldIndexByKey[foldKey]
        if not acceptedPredicates or relation in acceptedPredicates:
            literalsPerFold[foldIndex].setdefault(relation, []).append(entities[1:])

    with open(f"{RAW_DATA_PATH}/uwcselearn.pl") as f:
        for line in f:
            positiveMatch = posPattern.search(line)
            if positiveMatch:
                addLiteral(positiveMatch, allPositiveLiterals)
            negativeMatch = negPattern.search(line)
            if negativeMatch:
                addLiteral(negativeMatch, allNegativeLiterals)

    return [
        prepareFold(foldPositiveLiterals, foldNegativeLiterals, modes, targetPredicate, balanced, seed)
        for foldPositiveLiterals, foldNegativeLiterals in zip(allPositiveLiterals, allNegativeLiterals)
    ]

In [16]:
def getIMDBDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, **kwargs):
    '''Parse ``imdb.pl`` and build one fold per ``begin(model(...))`` block.

    Lines of the form ``relation(terms).`` are collected as positive literals
    and lines of the form ``neg(relation(terms)).`` as negative literals of
    the most recently opened model block. Spaces and underscores are stripped
    from relation names and terms. When ``modes`` yields a non-empty list of
    predicate names, only those relations are kept.

    Args:
        modes: mode declarations; also used to derive the accepted relations.
        targetPredicate: relation whose literals become the examples.
        balanced: negatives kept per positive (forwarded to ``prepareFold``).
        seed: seed forwarded to ``prepareFold``.
        **kwargs: accepted and ignored.

    Returns:
        A list with one ``prepareFold`` result per model block.
    '''
    cleanModes = utils.cleanPreds(modes)
    acceptedPredicates = [utils.extractRelationNameFromPred(pred) for pred in cleanModes]

    # Raw strings keep the backslashes literal; the non-raw originals relied on
    # invalid escape sequences, which newer Python versions warn about.
    beginPattern = re.compile(r'^begin\(model\([0-9\w]*\)\).$')
    posPattern = re.compile(r'^(\w+)\(([\w, ]+)*\).$')
    negPattern = re.compile(r'^neg\((\w+)\(([\w, ]+)*\)\).$')

    def addLiteral(match, literalsPerModel):
        # Normalise the matched literal and file it under the current model.
        relation = re.sub('[ _]', '', match.group(1))
        entities = re.sub('[ _]', '', match.group(2)).split(',')
        if not acceptedPredicates or relation in acceptedPredicates:
            literalsPerModel[-1].setdefault(relation, []).append(entities)

    allPositiveLiterals = []
    allNegativeLiterals = []
    with open(f'{RAW_DATA_PATH}/imdb.pl') as f:
        for line in f:
            if beginPattern.search(line):
                # A new model block starts a new fold.
                allPositiveLiterals.append({})
                allNegativeLiterals.append({})
            positiveMatch = posPattern.search(line)
            if positiveMatch:
                addLiteral(positiveMatch, allPositiveLiterals)
            negativeMatch = negPattern.search(line)
            if negativeMatch:
                addLiteral(negativeMatch, allNegativeLiterals)

    return [
        prepareFold(foldPositiveLiterals, foldNegativeLiterals, modes, targetPredicate, balanced, seed)
        for foldPositiveLiterals, foldNegativeLiterals in zip(allPositiveLiterals, allNegativeLiterals)
    ]

In [17]:
def getYeastDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, **kwargs):
    '''Parse the four ``yeast/yeast-fold<N>.db`` files, one fold per file.

    Each line is lower-cased and, when it has the form ``relation(terms)``,
    stored as a positive literal after stripping underscores from the relation
    name and spaces, underscores, double quotes and hyphens from the terms.
    When ``modes`` yields a non-empty list of predicate names, only those
    relations are kept. Every kept ``proteinclass`` literal is additionally
    stored with its terms reversed under ``classprotein``. The files provide
    no negative literals, so every fold gets an empty negative dict.

    Args:
        modes: mode declarations; also used to derive the accepted relations.
        targetPredicate: relation whose literals become the examples.
        balanced: negatives kept per positive (forwarded to ``prepareFold``).
        seed: seed forwarded to ``prepareFold``.
        **kwargs: accepted and ignored.

    Returns:
        A list with one ``prepareFold`` result per fold file.
    '''
    cleanModes = utils.cleanPreds(modes)
    acceptedPredicates = [utils.extractRelationNameFromPred(pred) for pred in cleanModes]

    # Raw string keeps the backslashes literal; the non-raw original relied on
    # invalid escape sequences, which newer Python versions warn about.
    literalPattern = re.compile(r'^([\w_]+)\(([\w, "_-]+)*\)$')

    allPositiveLiterals = [{} for _ in range(4)]
    for i, foldLiterals in enumerate(allPositiveLiterals):
        with open(f'{RAW_DATA_PATH}/yeast/yeast-fold{i+1}.db') as f:
            for line in f:
                m = literalPattern.search(line.lower())
                if not m:
                    continue
                relation = re.sub('[_]', '', m.group(1))
                entities = re.sub('[ _"-]', '', m.group(2)).split(',')
                if acceptedPredicates and relation not in acceptedPredicates:
                    continue
                # Register the relation before the derived one so the key
                # order of the dict (and thus of the emitted facts) is kept.
                foldLiterals.setdefault(relation, [])
                if relation == 'proteinclass':
                    foldLiterals.setdefault('classprotein', []).append(entities[::-1])
                foldLiterals[relation].append(entities)
    allNegativeLiterals = [{} for _ in range(4)]

    return [
        prepareFold(foldPositiveLiterals, foldNegativeLiterals, modes, targetPredicate, balanced, seed)
        for foldPositiveLiterals, foldNegativeLiterals in zip(allPositiveLiterals, allNegativeLiterals)
    ]

In [18]:
def getTwitterDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, **kwargs):
    '''Parse the two ``twitter/twitter-fold<N>.db`` files, one fold per file.

    Each line is lower-cased and, when it has the form ``relation(terms)``,
    stored as a positive literal after stripping spaces, underscores, double
    quotes and hyphens from the terms (the relation name is kept as matched).
    When ``modes`` yields a non-empty list of predicate names, only those
    relations are kept. The files provide no negative literals, so every fold
    gets an empty negative dict.

    Args:
        modes: mode declarations; also used to derive the accepted relations.
        targetPredicate: relation whose literals become the examples.
        balanced: negatives kept per positive (forwarded to ``prepareFold``).
        seed: seed forwarded to ``prepareFold``.
        **kwargs: accepted and ignored.

    Returns:
        A list with one ``prepareFold`` result per fold file.
    '''
    cleanModes = utils.cleanPreds(modes)
    acceptedPredicates = [utils.extractRelationNameFromPred(pred) for pred in cleanModes]

    # Raw string keeps the backslashes literal; the non-raw original relied on
    # invalid escape sequences, which newer Python versions warn about.
    literalPattern = re.compile(r'^([\w_]+)\(([\w, "_-]+)*\)$')

    allPositiveLiterals = [{} for _ in range(2)]
    for i, foldLiterals in enumerate(allPositiveLiterals):
        with open(f'{RAW_DATA_PATH}/twitter/twitter-fold{i+1}.db') as f:
            for line in f:
                m = literalPattern.search(line.lower())
                if not m:
                    continue
                relation = m.group(1)
                entities = re.sub('[ _"-]', '', m.group(2)).split(',')
                if not acceptedPredicates or relation in acceptedPredicates:
                    foldLiterals.setdefault(relation, []).append(entities)
    allNegativeLiterals = [{} for _ in range(2)]

    return [
        prepareFold(foldPositiveLiterals, foldNegativeLiterals, modes, targetPredicate, balanced, seed)
        for foldPositiveLiterals, foldNegativeLiterals in zip(allPositiveLiterals, allNegativeLiterals)
    ]

In [19]:
def getNellSportsDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, nFolds: int = 3, **kwargs):
    '''Build ``nFolds`` folds from the NELL sports CSV.

    For every row, the third ``:``-separated piece of columns 1 and 5 and the
    second piece of column 4 are lower-cased and reduced to the letters a-z,
    giving entity, value and relation respectively. Rows where any of the
    three ends up empty are skipped, as are relations not accepted by
    ``modes`` (when ``modes`` yields any predicate names). The single resulting
    mega-example goes through ``prepareFold`` and is then cut into folds by
    ``splitIntoFolds``.
    '''
    acceptedPredicates = [
        utils.extractRelationNameFromPred(pred) for pred in utils.cleanPreds(modes)
    ]

    def clearCharacters(value):
        # Lower-case, then drop everything that is not a letter a-z.
        return re.sub('[^a-z]', '', value.lower())

    positiveLiterals = {}
    table = pd.read_csv(f'{RAW_DATA_PATH}/nell/NELL.sports.08m.1070.small.csv')
    for row in table.values:
        entity = clearCharacters(row[1].split(':')[2])
        relation = clearCharacters(row[4].split(':')[1])
        value = clearCharacters(row[5].split(':')[2])

        if not (entity and relation and value):
            continue
        if acceptedPredicates and relation not in acceptedPredicates:
            continue
        positiveLiterals.setdefault(relation, []).append([entity, value])

    # The CSV carries no negative literals, hence the empty dict.
    megaExample = prepareFold(positiveLiterals, {}, modes, targetPredicate, balanced, seed)
    return splitIntoFolds(
        modes, megaExample["facts"], megaExample["pos"], megaExample["neg"], nFolds, seed
    )

In [20]:
def getNellFinancesDataset(modes: list, targetPredicate: str, balanced: int = 1, seed: int = None, nFolds = 3, **kwargs):
    '''Build ``nFolds`` folds from the NELL finances CSV.

    Rows are normalised as in the sports loader: the third ``:``-separated
    piece of columns 1 and 5 and the second piece of column 4 are lower-cased
    and reduced to the letters a-z, giving entity, value and relation. In
    addition to the literals read from the file, a derived relation
    ``ceoeconomicsector`` pairs each CEO (from ``companyceo`` and then from
    ``bankchiefexecutiveceo``) with the economic sector recorded for the same
    entity. The derived relation is added regardless of the accepted
    predicates.
    '''
    acceptedPredicates = [
        utils.extractRelationNameFromPred(pred) for pred in utils.cleanPreds(modes)
    ]

    def clearCharacters(value):
        # Lower-case, then drop everything that is not a letter a-z.
        return re.sub('[^a-z]', '', value.lower())

    # Last value seen per entity for the three relations feeding the derived one.
    companyceo = {}
    companyeconomicsector = {}
    bankchiefexecutiveceo = {}
    lookupByRelation = {
        'companyceo': companyceo,
        'companyeconomicsector': companyeconomicsector,
        'bankchiefexecutiveceo': bankchiefexecutiveceo,
    }

    positiveLiterals = {}
    table = pd.read_csv(f'{RAW_DATA_PATH}/nell/NELL.finances.08m.1115.small.csv')
    for row in table.values:
        entity = clearCharacters(row[1].split(':')[2])
        relation = clearCharacters(row[4].split(':')[1])
        value = clearCharacters(row[5].split(':')[2])

        if not (entity and value):
            continue
        if relation in lookupByRelation:
            lookupByRelation[relation][entity] = value

        if relation and (not acceptedPredicates or relation in acceptedPredicates):
            positiveLiterals.setdefault(relation, []).append([entity, value])

    # Derive ceoeconomicsector: company CEOs first, then bank chief executives.
    for ceoByEntity in (companyceo, bankchiefexecutiveceo):
        for key, ceo in ceoByEntity.items():
            if key in companyeconomicsector:
                positiveLiterals.setdefault('ceoeconomicsector', []).append(
                    [ceo, companyeconomicsector[key]]
                )

    # The CSV carries no negative literals, hence the empty dict.
    megaExample = prepareFold(positiveLiterals, {}, modes, targetPredicate, balanced, seed)
    return splitIntoFolds(
        modes, megaExample["facts"], megaExample["pos"], megaExample["neg"], nFolds, seed
    )

In [21]:
def saveDataset(data, datasetName):
    '''Write every fold of a dataset below the preprocessed-data directory.

    Fold ``k`` is stored in ``<PREPROCESSED_DATA_PATH>/<datasetName>/fold<kk>``
    (zero-padded to two digits) as four files -- ``facts.pl``, ``pos.pl``,
    ``neg.pl`` and ``modes.pl`` -- each holding the matching list of the fold
    joined with newlines.
    '''
    datasetDir = f'{PREPROCESSED_DATA_PATH}/{datasetName}'
    for foldIdx, fold in enumerate(data):
        foldDir = f"{datasetDir}/fold{foldIdx:02}"
        os.makedirs(foldDir, exist_ok = True)

        # Same file order as the fold keys: facts, pos, neg, modes.
        for section in ("facts", "pos", "neg", "modes"):
            with open(f"{foldDir}/{section}.pl", 'w') as f:
                f.write("\n".join(fold[section]))

In [22]:
def saveAllDatasets(balanced = 2, seed = None, defaultNFolds = 3):
    '''Generate and save every dataset, printing the generation time of each.

    Datasets are processed in a fixed order: cora, uwcse, imdb, yeast, twitter,
    nell_sports, nell_finances. Modes and target predicate come from
    ``settings``; the two NELL loaders also receive ``defaultNFolds``.
    '''
    # (dataset name, loader, extra positional arguments after `seed`)
    pipeline = (
        ("cora", getCoraDataset, ()),
        ("uwcse", getUWCSEDataset, ()),
        ("imdb", getIMDBDataset, ()),
        ("yeast", getYeastDataset, ()),
        ("twitter", getTwitterDataset, ()),
        ("nell_sports", getNellSportsDataset, (defaultNFolds,)),
        ("nell_finances", getNellFinancesDataset, (defaultNFolds,)),
    )

    for datasetName, loader, extraArgs in pipeline:
        start = time.time()
        data = loader(
            settings.modes[datasetName], settings.target[datasetName], balanced, seed, *extraArgs
        )
        # Only generation is timed; saving happens after the report.
        print('%s seconds generating %s' % (time.time() - start, datasetName))
        saveDataset(data, datasetName)

In [23]:
# Run the whole preprocessing pipeline with the negative/positive ratio, seed
# and fold count taken from the project settings module.
saveAllDatasets(balanced = settings.negPosRatio, seed = settings.seed, defaultNFolds = settings.defaultNFolds)

4.094753742218018 seconds generating cora
14.54465103149414 seconds generating uwcse
3.324641466140747 seconds generating imdb
0.6563987731933594 seconds generating yeast
0.3515353202819824 seconds generating twitter
1.389390468597412 seconds generating nell_sports
4.176867723464966 seconds generating nell_finances
