In [2]:
import os
import re
import pandas
import numpy as np
from itertools import combinations, permutations

In [3]:
# Build data frame
# columns: category, relation, relation_ID, word_1, word_2

def row(elements):
    """Turn an ordered sequence of values into one data-frame row (a dict).

    Values are matched positionally against the column names ``category``,
    ``relation``, ``relation_ID``, ``word_1`` and ``word_2``. Pairing stops at
    the shorter of the two sequences, so extra values are dropped and missing
    ones simply leave their columns out of the result.
    """
    column_names = ("category", "relation", "relation_ID", "word_1", "word_2")
    return dict(zip(column_names, elements))


# go through all directories and files and create rows 
def read_files(verbose=False, data_folder="datasets/BATS_3.0/"):
    """Read every relation file under ``data_folder`` into a list of row dicts.

    Expected layout: ``data_folder/<category dir>/<relation file>.txt`` where
    each relation file holds one tab-separated line per source word,
    ``word1<TAB>answer1/answer2/...``. Every answer produces its own row.

    Parameters
    ----------
    verbose : bool
        When true, print each category directory and relation file name as it
        is visited.
    data_folder : str
        Root folder of the data set. A trailing slash is optional.

    Returns
    -------
    list of dict
        One dict per (word_1, word_2) pair with the keys ``category``,
        ``relation``, ``relation_ID``, ``word_1`` and ``word_2`` (in that
        order), ready to be passed to ``pandas.DataFrame``.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two tab-separated
        fields.
    """
    rows = []

    # 4 categories
    for category_dir in sorted(os.listdir(data_folder)):
        category_path = os.path.join(data_folder, category_dir)
        if not os.path.isdir(category_path):
            continue
        if verbose:
            print(category_dir)
        # Underscores become spaces and the two-character prefix is dropped,
        # e.g. "2_Derivational_morphology" -> "Derivational morphology".
        category = category_dir.replace('_', ' ')[2:]

        # 10 relations per category
        for subcategory_dir in sorted(os.listdir(category_path)):
            relation_path = os.path.join(category_path, subcategory_dir)
            # Skip anything that is not a relation file (hidden files, nested
            # folders, ...) instead of failing while parsing it.
            if not (subcategory_dir.endswith('.txt') and os.path.isfile(relation_path)):
                continue
            if verbose:
                print(subcategory_dir)

            # File names look like "D05 [adj+ness_reg].txt": the ID comes
            # before the first space, the bracketed relation name after it.
            subcategory_ID, _, bracketed_name = subcategory_dir.partition(' ')
            subcategory = re.sub(r'\]\.txt$', '', bracketed_name.replace('[', ''))

            # 50 pairs per relation
            with open(relation_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # tolerate blank / trailing empty lines
                        continue
                    word1, word2_list = line.split('\t')
                    # break up multiple answers
                    for word2 in word2_list.split('/'):
                        rows.append({
                            "category": category,
                            "relation": subcategory,
                            "relation_ID": subcategory_ID,
                            "word_1": word1,
                            "word_2": word2,
                        })
    return rows

def get_relations(df):
    """Return the distinct values of the ``relation`` column.

    The result is a Series that keeps the first occurrence of each relation,
    together with its original index label.
    """
    relation_column = df["relation"]
    return relation_column.drop_duplicates()

In [4]:
# Parse all relation files into a list of row dicts, then load them into a DataFrame
# (one row per word_1 / word_2 pair).
rows = read_files()
df = pandas.DataFrame(rows)

In [6]:
# Preview 15 randomly chosen rows; the fixed seed keeps the preview identical
# across "Restart & Run All" executions.
df.sample(15, random_state=0)

Unnamed: 0,category,relation,relation_ID,word_1,word_2
792,Derivational morphology,adj+ness_reg,D05,related,relatedness
858,Derivational morphology,verb+able_reg,D07,adjust,adjustable
4965,Lexicographic semantics,hyponyms - misc,L03,tool,abrader
1223,Encyclopedic semantics,UK_city - county,E03,ely,cambridgeshire
4157,Lexicographic semantics,hyponyms - misc,L03,cutlery,carving_fork
1634,Encyclopedic semantics,animal - shelter,E08,beaver,pen
2170,Lexicographic semantics,hypernyms - animals,L01,coyote,canid
2732,Lexicographic semantics,hypernyms - animals,L01,viper,snake
1123,Encyclopedic semantics,country - capital,E01,tbilisi,georgia
5542,Lexicographic semantics,meronyms - part,L06,bird,uropygial_gland


In [7]:
def create_questions(data_folder="datasets/BATS_3.0/"):
    """Build all ordered pairings of word pairs within each relation file.

    Expected layout: ``data_folder/<category dir>/<relation file>.txt`` where
    each line is ``word1<TAB>answers``. The answers field is kept as one
    string (alternatives separated by ``/`` are NOT split). For every relation
    file, each ordered combination of two different lines becomes one
    question; combinations whose two lines share the same ``word1`` are
    skipped.

    Parameters
    ----------
    data_folder : str
        Root folder of the data set. A trailing slash is optional.

    Returns
    -------
    numpy.ndarray
        String array of shape ``(n_questions, 2, 2)``; ``result[i, j]`` is the
        ``(word1, answers)`` pair number ``j`` of question ``i``. When no
        question can be formed the shape is ``(0, 2, 2)``.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two tab-separated
        fields.
    """
    question_list = []

    # 4 categories
    for category_dir in sorted(os.listdir(data_folder)):
        category_path = os.path.join(data_folder, category_dir)
        if not os.path.isdir(category_path):
            continue

        # 10 relations per category
        for subcategory_dir in sorted(os.listdir(category_path)):
            relation_path = os.path.join(category_path, subcategory_dir)
            # Skip anything that is not a relation file (hidden files, nested
            # folders, ...) instead of failing while parsing it.
            if not (subcategory_dir.endswith('.txt') and os.path.isfile(relation_path)):
                continue

            # 50 pairs per relation
            pairs = []
            with open(relation_path, encoding="utf-8") as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # tolerate blank / trailing empty lines
                        continue
                    word1, word2_list = line.split('\t')
                    pairs.append((word1, word2_list))

            # Ordered pairings of two distinct lines, never pairing a source
            # word with itself.
            question_list.extend(
                (first, second)
                for first, second in permutations(pairs, 2)
                if first[0] != second[0]
            )

    # The explicit reshape keeps the (n, 2, 2) layout even when n == 0.
    return np.array(question_list, dtype=str).reshape(len(question_list), 2, 2)
    

In [8]:
# Build the question array: every ordered pairing of two word pairs taken from
# the same relation file.
q = create_questions()

In [9]:
# Axes: (number of questions, 2 word pairs per question, 2 entries per pair)
q.shape

(98000, 2, 2)