In [7]:
import os
import warnings
import random
import pandas as pd
from itertools import combinations
import sys
import networkx as nx
import spacy
import traceback
import pickle

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder


pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

random.seed(100)
warnings.filterwarnings('ignore')




In [8]:
# !python -m spacy download en_core_web_lg
# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("en_core_web_lg")

In [9]:
def generateNetwork(sentence_list, ent1_word_idx, ent2_word_idx):
    try:
        sentence = ' '.join(sentence_list)
        doc = nlp(sentence)
        edges = []
        for word_idx, token in enumerate(doc):
            for child in token.children:
                edges.append((token.i,
                          child.i))

        graph = nx.Graph(edges)
        ## If shortest path not found
        try:
            shortest_path_length = nx.shortest_path_length(graph, source=ent1_word_idx, target=ent2_word_idx)
            shortest_path = nx.shortest_path(graph, source=ent1_word_idx, target=ent2_word_idx)
        except: 
            shortest_path_length = -1
            shortest_path = "no_path_found"
        return shortest_path_length, shortest_path
    except:
        print("NETWORK - NO PATH FOUND: ", sentence_list, ent1_word_idx, ent2_word_idx)
        print(traceback.format_exc())
        return -1, "no_path_found"

In [10]:
ENT_TYPES = ["PER", "GPE", "LOC", "ORG"]
def recode_entity_types(value):
    if value.lower() in ["person", "per"]:
        return "PER"
    elif value.lower() in ["organisation", "org"]:
        return "ORG"
    elif value.lower() in ["gpe"]:
        return "GPE"
    elif value.lower() in ["location", "loc"]:
        return "LOC"
    else:
        return "OTHER"

In [11]:
def getIndex(row, sentence, entity):
    try:
        return sentence.index(entity.split(" ")[0]) + 1
    except:
        print("INDEX NOT FOUND - ", sentence, entity, entity.split(" ")[0])
        print(traceback.format_exc())
        return -1

In [12]:
def getFeatures(file, dataset):
    try:
        if dataset not in  ["ace2004", "ace2005", "tacred"]:
            df = pd.read_csv(file, sep="\t", index_col=False, dtype={'text': str, 'entity type': str, 'pos':str , 'tag': str, 'dep': str, 'sent_index': int, 'entity mention ID': str})
        else:    
            df = pd.read_csv(file, index_col=False)  ## For ACE2004 and ACE2005 as sep is different. 
        df = df.rename(columns={"entity_id": "entity mention ID", "sent_idx": "sent_index", "entity_type" : "entity type"})
        df.astype({'text': str, 'entity type': str, 'pos':str , 'dep': str, 'sent_index': int, 'entity mention ID': str})
        df = df.fillna("")
        # filtered_df = df.where(df["entity mention ID"]!="").groupby(["sent_index", "entity mention ID"], as_index=False).agg({'text': ' '.join, 'entity type': '-'.join, 'pos' : '-'.join, 'tag' : '-'.join,'dep': '-'.join})
         ## For ACE2004, ACE2005, tacred, kbp37 as no tag exists
        filtered_df = df.where(df["entity mention ID"]!="").groupby(["sent_index", "entity mention ID"], as_index=False).agg({'text': ' '.join, 'entity type': '-'.join, 'pos' : '-'.join, 'dep': '-'.join}) 
        display(filtered_df)
        ## Calculating entity-entity distances per sentence
        # sent_df = filtered_df.groupby(["sent_index"], as_index=False).agg({'text': list, 'entity type': list, 'pos' : list, 'tag' : list, 'dep': list})
         ##  For ACE2004, ACE2005, tacred, kbp37 as no tag exists
        sent_df = filtered_df.groupby(["sent_index"], as_index=False).agg({'text': list, 'entity type': list, 'pos' : list, 'dep': list})

        text_pairs = []
        ent_type_pairs = []
        pos_pairs = []
        tag_pairs = []
        dep_pairs = []
        
        pos_ignore_list = ["PRON", "PROPN-PART", "DET", "AUX", "CONJ", "CCONJ", "PART","PUNCT", "SYM"]

        for index, row in sent_df.iterrows():
            text_pairs.append(list(combinations(row["text"], r=2)))
            ent_type_pairs.append(list(combinations(row["entity type"], r=2)))
            pos_pairs.append(list(combinations(row["pos"], r=2)))
            dep_pairs.append(list(combinations(row["dep"], r=2)))
            # tag_pairs.append(list(combinations(row["tag"], r=2)))  ## For  ACE2004, ACE2005, tacred, kbp37 as no tag exists

        sent_entity_df = pd.DataFrame()
        sent_entity_df["sent_index"] = sent_df["sent_index"]
        sent_entity_df["entity_pairs"] = text_pairs
        sent_entity_df["ent_type_pairs"] = ent_type_pairs
        sent_entity_df["pos_pairs"] = pos_pairs
        sent_entity_df["dep_pairs"] = dep_pairs
        # sent_entity_df["tag_pairs"] = tag_pairs  ## For ACE2004, ACE2005, tacred, kbp37 as no tag exists 

        final_df = sent_entity_df.set_index('sent_index').apply(lambda x: x.apply(pd.Series).stack()).reset_index().drop('level_1', 1)

        sentences = df.groupby(['sent_index'], as_index=False).agg({'text': list, 'dep': list})
        sentences["sentence_length"] = sentences.apply(lambda x : len(x["text"]), axis=1)
        sentences["root_index"] = sentences.apply(lambda x : int(x["dep"].index("ROOT")), axis=1)
        sentences["root_word"] = sentences.apply(lambda x : x["text"][x["root_index"]], axis=1)
        sentences = sentences.rename(columns={"text": "sentence"})
        final_df = pd.merge(final_df, sentences, on='sent_index', sort=False)
        final_df["entity_1_index"] = final_df.apply(lambda x : getIndex(x, x['sentence'], x['entity_pairs'][0]), axis=1)
        final_df["entity_2_index"] = final_df.apply(lambda x : getIndex(x, x['sentence'], x['entity_pairs'][1]), axis=1)
        final_df["entity_distance"] = final_df.apply(lambda x : x['entity_2_index'] - x['entity_1_index'], axis=1)
        final_df["no_words_before_entity_1"] = final_df["entity_1_index"] - 1 
        final_df["no_words_after_entity_2"] = final_df["sentence_length"] - final_df["entity_2_index"] - 1

        ## Calculating root-entity distances for each sentence
        final_df["entity_1_root_distance"] = final_df.apply(lambda x: x['root_index'] - x['entity_1_index'], axis=1)
        final_df["entity_2_root_distance"] = final_df.apply(lambda x: x['entity_2_index'] - x['root_index'], axis=1)

        # Expand tuples
        final_df[['entity 1 name', 'entity 2 name']] = final_df['entity_pairs'].apply(pd.Series)

        final_df[['entity_type_1', 'entity_type_2']] = final_df['ent_type_pairs'].apply(pd.Series)
        final_df[['entity_pos_1', 'entity_pos_2']] = final_df['pos_pairs'].apply(pd.Series)
        final_df[['entity_dep_1', 'entity_dep_2']] = final_df['dep_pairs'].apply(pd.Series)
        # final_df[['entity_tag_1', 'entity_tag_2']] = final_df['tag_pairs'].apply(pd.Series)  ## For  ACE2004, ACE2005, tacred, kbp37 as no tag exists 
        final_df[["shortest_distance", "shortest_path"]] = final_df.apply(lambda x : generateNetwork(x["sentence"], x["entity_1_index"], x["entity_2_index"]), axis = 1, result_type="expand")

        final_df["entity_type_1"] = final_df["entity_type_1"].apply(lambda x : recode_entity_types(x.split("-")[0]))
        final_df["entity_type_2"] = final_df["entity_type_2"].apply(lambda x : recode_entity_types(x.split("-")[0]))
        final_df["et-label"] = final_df.apply(lambda x : f"{x['entity_type_1']}-{x['entity_type_2']}", axis=1)
        display(final_df)
        final_df = final_df[final_df["entity_type_1"]!= "OTHER"]
        final_df = final_df[final_df["entity_type_2"]!="OTHER"]
        print("After ET filtering", len(final_df))
        final_df = final_df[~final_df['entity_pos_1'].isin(pos_ignore_list)]
        final_df = final_df[~final_df['entity_pos_2'].isin(pos_ignore_list)]
        
        # final_df = final_df.drop([ "entity_pairs", "ent_type_pairs", "pos_pairs", "dep_pairs", "tag_pairs"], axis = 1)  ## For ACE2004, ACE2005, tacred, kbp37 as no tag exists 
        final_df = final_df.drop([ "entity_pairs", "ent_type_pairs", "pos_pairs", "dep_pairs"], axis = 1)
        return final_df
    except:
        print("Failed to execute file : ", file)
        print("Error : ", sys.exc_info())
        print(traceback.format_exc())
        return pd.DataFrame()

In [13]:
def getFeaturesWithlabels(relation_file, tagged_tokens_file, dataset):
    try:
        final_df = getFeatures(tagged_tokens_file, dataset)
        if dataset in ["ace2004", "ace2005", "tacred", "hlt"]:
            relations = pd.read_csv(relation_file)
        else:
            relations = pd.read_csv(relation_file, sep = "\t")
        relations = relations.rename(columns={"ent1_text" : "entity 1 name", "ent2_text" : "entity 2 name", "rel type": "rel_type"})
        result = pd.merge(final_df, relations, on=["entity 1 name", "entity 2 name"], how = "left", sort=False)
        # result.drop_duplicates(subset=["sent_index", "entity 1 name", "entity 2 name"], keep='first', inplace=True, ignore_index=True
        ### Only used for hlt : Remove this codeblock from getFeatures function before uncommenting this. 
        display(final_df)
        display(relations)
#         result["entity_type_1"] = result["entity 1 type"].apply(lambda x : recode_entity_types(x.split("-")[0]))
#         result["entity_type_2"] = result["entity 2 type"].apply(lambda x : recode_entity_types(x.split("-")[0]))
#         result["et-label"] = result.apply(lambda x : f"{x['entity_type_1']}-{x['entity_type_2']}", axis=1)

#         result = result[result["entity_type_1"]!= "OTHER"]
#         result = result[result["entity_type_2"]!="OTHER"]
         
        result["label"] = result["rel_type"].apply(lambda x : 1 if isinstance(x, str) else 0)
        result["label"] = result.apply(lambda x : 0 if x["rel_type"] == "no_relation" else x["label"], axis = 1)
        result = result[result['rel_type'].notna()]
        return result
    except:
        print("Skipping run for : ", relation_file, tagged_tokens_file)
        print(sys.exc_info())
        return pd.DataFrame()
    

In [14]:
# datasets = ["re3d", "ace2004", "ace2005", "tacred", "docred","kbp37", "hlt"]
datasets = ["kbp37"]

ground_truth_path = f"/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/"
tagged_token_path = f"/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/"

In [None]:
dataset_features = {}
for dataset in datasets:
    print(dataset)
    os.chdir(f"{tagged_token_path}/{dataset}")
    features_with_labels = pd.DataFrame()
    if dataset in ["ace2004", "ace2005"]:
            os.chdir(f"{tagged_token_path}/{dataset}")
    else:
            os.chdir(f"{tagged_token_path}/{dataset}/tagged_tokens")
    for file in os.listdir():
        if file.endswith(".csv"):
            if dataset in ["ace2004", "ace2005"]:
                file = file.split(".csv_")[0]
                tagged_tokens_file = f"{tagged_token_path}/{dataset}/{file}.csv_tagged_tokens.csv"
                relation_file = f"{ground_truth_path}/{dataset}/{file}.csv_gt_relations.csv"
            else:
                tagged_tokens_file = f"{tagged_token_path}/{dataset}/tagged_tokens/{file}"
                relation_file = f"{ground_truth_path}/{dataset}/ground_truth/{file}"
            temp = getFeaturesWithlabels(relation_file, tagged_tokens_file, dataset)
            if len(features_with_labels) == 0:
                features_with_labels = temp
            else:
                features_with_labels = pd.concat([features_with_labels, temp], axis=0)
    ## Keeping relations with greater than 30 support.
    grouped = features_with_labels.groupby(['rel_type'])
    values = grouped["rel_type"].value_counts()
    features_with_labels["support"] = features_with_labels["rel_type"].apply(lambda x : values[x].values[0])
    features_with_labels = features_with_labels[features_with_labels["support"]>30]
    dataset_features[dataset] = features_with_labels
    display(features_with_labels.head(2))

kbp37


Unnamed: 0,sent_index,entity mention ID,text,entity type,pos,dep
0,0.0,e1_17641,Isle of Wight,--,PROPN-ADP-PROPN,pobj-prep-pobj
1,0.0,e2_17641,Ryde School with Upper Chine,org-org-org-org-org,PROPN-PROPN-ADP-PROPN-PROPN,compound-oprd-prep-compound-pobj
2,1.0,e1_17642,Mars,org,PROPN,nsubj
3,1.0,e2_17642,Maryland,,PROPN,compound
4,2.0,e1_17643,California,,PROPN,compound
...,...,...,...,...,...,...
5929,2964.0,e2_20605,Edward Waters College,per-per-per,PROPN-PROPN-PROPN,compound-compound-pobj
5930,2965.0,e1_20606,NAZA,org,PROPN,compound
5931,2965.0,e2_20606,Malaysia,,PROPN,compound
5932,2966.0,e1_20607,Centrale de Lille,org-org-org,PROPN-ADP-PROPN,nmod-compound-appos


Unnamed: 0,sent_index,entity_pairs,ent_type_pairs,pos_pairs,dep_pairs,sentence,dep,sentence_length,root_index,root_word,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,entity_2_root_distance,entity 1 name,entity 2 name,entity_type_1,entity_type_2,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,shortest_distance,shortest_path,et-label
0,0.0,"(Isle of Wight, Ryde School with Upper Chine)","(--, org-org-org-org-org)","(PROPN-ADP-PROPN, PROPN-PROPN-ADP-PROPN-PROPN)","(pobj-prep-pobj, compound-oprd-prep-compound-pobj)","[After, returning, to, the, U.K., she, attended, the, independent, Upper, Chine, School, in, Shanklin, on, the, Isle, of, Wight, which, subsequently, merged, with, the, independent, Ryde, School, and, was, renamed, Ryde, School, with, Upper, Chine, ), .]","[prep, pcomp, prep, det, pobj, nsubj, ROOT, det, amod, compound, compound, dobj, prep, pobj, prep, det, pobj, prep, pobj, nsubj, advmod, relcl, prep, det, amod, compound, pobj, cc, auxpass, conj, compound, oprd, prep, compound, pobj, punct, punct]",37,6,attended,17,26,9,16,10,-11,20,Isle of Wight,Ryde School with Upper Chine,OTHER,ORG,PROPN-ADP-PROPN,PROPN-PROPN-ADP-PROPN-PROPN,pobj-prep-pobj,compound-oprd-prep-compound-pobj,6,"[17, 16, 14, 6, 21, 22, 26]",OTHER-ORG
1,1.0,"(Mars, Maryland)","(org, )","(PROPN, PROPN)","(nsubj, compound)","[Supported, by, their, own, buying, staff, Mars, purchases, produce, and, seafood, daily, from, local, markets, and, always, includes, local, produce, when, available, from, Maryland, growers, .]","[ROOT, agent, poss, amod, compound, pobj, nsubj, nsubj, ROOT, cc, conj, advmod, prep, amod, pobj, cc, advmod, conj, amod, dobj, advmod, advcl, prep, compound, pobj, punct]",26,0,Supported,7,24,17,6,1,-7,24,Mars,Maryland,ORG,OTHER,PROPN,PROPN,nsubj,compound,7,"[7, 8, 0, 17, 19, 21, 22, 24]",ORG-OTHER
2,2.0,"(California, substance abuse)","(, org-org)","(PROPN, NOUN-NOUN)","(compound, compound-compound)","[The, California, Department, of, Alcohol, and, Drug, Programs, (, ADP, ), is, a, California, state, agency, concerned, with, substance, abuse, prevention, and, treatment, .]","[det, compound, nsubj, prep, pobj, cc, compound, conj, punct, appos, punct, ROOT, det, compound, compound, attr, acl, prep, compound, compound, pobj, cc, conj, punct]",24,11,is,2,19,17,1,4,9,8,California,substance abuse,OTHER,ORG,PROPN,NOUN-NOUN,compound,compound-compound,6,"[2, 11, 15, 16, 17, 20, 19]",OTHER-ORG
3,3.0,"(Aetna, Connecticut)","(org, )","(PROPN, PROPN)","(poss, pobj)","[But, but, Aetna, 's, headquarters, are, in, Connecticut, with, that, great, Senator, Joe, Lieberman, .]","[cc, cc, poss, case, nsubj, ROOT, prep, pobj, prep, det, amod, compound, compound, pobj, punct]",15,5,are,3,8,5,2,6,2,3,Aetna,Connecticut,ORG,OTHER,PROPN,PROPN,poss,pobj,4,"[3, 2, 4, 5, 8]",ORG-OTHER
4,4.0,"(Singapore Airlines, SIA)","(org-org, )","(PROPN-PROPN, PROPN)","(compound-nsubj, appos)","[Singapore, Airlines, (, SIA, ), said, Wednesday, it, would, begin, flying, the, world's, biggest, passenger, plane, the, Airbus, A380, on, its, Melbourne, route, in, late, September, .]","[compound, nsubj, punct, appos, punct, ROOT, npadvmod, nsubj, aux, ccomp, xcomp, det, nmod, amod, compound, dobj, det, compound, appos, prep, poss, compound, pobj, prep, amod, pobj, punct]",27,5,said,1,4,3,0,22,4,-1,Singapore Airlines,SIA,ORG,OTHER,PROPN-PROPN,PROPN,compound-nsubj,appos,1,"[1, 4]",ORG-OTHER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,2962.0,"(East Carolina University, Greenville)","(--, org)","(PROPN-PROPN-PROPN, PROPN)","(compound-compound-attr, compound)","[Leo, Warren, Jenkins, (, May, 28, 1913, January, 14, 1989, ), was, the, sixth, president, and, chancellor, of, what, is, now, East, Carolina, University, in, Greenville, North, Carolina, USA, .]","[compound, compound, nsubj, punct, appos, nummod, nummod, npadvmod, nummod, nummod, punct, ROOT, det, amod, attr, cc, conj, prep, nsubj, pcomp, advmod, compound, compound, attr, prep, compound, compound, compound, pobj, punct]",30,11,was,22,26,4,21,3,-11,15,East Carolina University,Greenville,OTHER,ORG,PROPN-PROPN-PROPN,PROPN,compound-compound-attr,compound,5,"[22, 23, 24, 28, 27, 26]",OTHER-ORG
2963,2963.0,"(Matt Ellis, USA)","(-, per)","(PROPN-PROPN, PROPN)","(compound-nsubj, conj)","[In, 2005, Matt, Ellis, left, Australia, with, his, latest, album, in, hand, and, embarked, upon, a, tour, across, Canada, and, the, USA, which, included, shows, in, and, around, Vancouver, Hornby, Denman, Islands, and, at, the, North, by, Northeast, (, NxNE, ), Festival, in, Toronto, .]","[prep, pobj, compound, nsubj, ROOT, dobj, prep, poss, amod, pobj, prep, pobj, cc, conj, prep, det, pobj, prep, pobj, cc, det, conj, nsubj, relcl, dobj, prep, cc, conj, compound, compound, compound, pobj, cc, conj, det, pobj, agent, nmod, punct, nmod, punct, pobj, prep, pobj, punct]",45,4,left,3,22,19,2,22,1,18,Matt Ellis,USA,OTHER,PER,PROPN-PROPN,PROPN,compound-nsubj,conj,9,"[3, 4, 13, 14, 16, 17, 18, 21, 23, 22]",OTHER-PER
2964,2964.0,"(Jacksonville, Edward Waters College)","(, per-per-per)","(PROPN, PROPN-PROPN-PROPN)","(poss, compound-compound-pobj)","[He, was, a, heavy, contributor, to, black, colleges, such, as, Jacksonville, 's, Edward, Waters, College, as, well, as, Bethune-Cookman, College, .]","[nsubj, ROOT, det, amod, attr, prep, amod, pobj, amod, prep, poss, case, compound, compound, pobj, advmod, advmod, cc, compound, conj, punct]",21,1,was,11,13,2,10,7,-10,12,Jacksonville,Edward Waters College,OTHER,PER,PROPN,PROPN-PROPN-PROPN,poss,compound-compound-pobj,3,"[11, 10, 14, 13]",OTHER-PER
2965,2965.0,"(NAZA, Malaysia)","(org, )","(PROPN, PROPN)","(compound, compound)","[With, NAZA, FC, he, won, the, Malaysia, Premier, League, 2007-08, championship, .]","[prep, compound, pobj, nsubj, ROOT, det, compound, compound, nmod, punct, dobj, punct]",12,4,won,2,7,5,1,4,2,3,NAZA,Malaysia,ORG,OTHER,PROPN,PROPN,compound,compound,4,"[2, 0, 4, 8, 7]",ORG-OTHER


After ET filtering 0


Unnamed: 0,sent_index,sentence,dep,sentence_length,root_index,root_word,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,entity_2_root_distance,entity 1 name,entity 2 name,entity_type_1,entity_type_2,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,shortest_distance,shortest_path,et-label


Unnamed: 0,rel_type,rel subtype,entity 1 ID,entity 2 ID,entity 1 mention ID,entity 2 mention ID,entity 1 name,entity 2 name,entity 1 type,entity 2 type,same_sent,distance,pos_pattern,shortest_dep_path,span
0,stateorprovince_of_headquarters,,e1_17641,e2_17641,e1_17641,e2_17641,Isle of Wight,Ryde School with Upper Chine,,org,True,12.0,DET-ADV-VERB-ADP-DET-ADJ-PROPN-PROPN-CCONJ-VERB-VERB,3.0,Isle of Wight which subsequently merged with the independent Ryde School and was renamed Ryde School with Upper Chine
1,stateorprovince_of_headquarters,,e1_17642,e2_17642,e1_17642,e2_17642,Mars,Maryland,org,,True,17.0,NOUN-VERB-CCONJ-NOUN-ADV-ADP-ADJ-NOUN-CCONJ-ADV-VERB-ADJ-NOUN-ADV-ADJ-ADP,7.0,Mars purchases produce and seafood daily from local markets and always includes local produce when available from Maryland
2,stateorprovince_of_headquarters,,e1_17643,e2_17643,e1_17643,e2_17643,California,substance abuse,,org,True,5.0,NOUN-NOUN-VERB-ADP,5.0,California state agency concerned with substance abuse
3,stateorprovince_of_headquarters,,e1_17644,e2_17644,e1_17644,e2_17644,Aetna,Connecticut,org,,True,5.0,PART-NOUN-VERB-ADP,4.0,Aetna 's headquarters are in Connecticut
4,alternate_names,,e1_17645,e2_17645,e1_17645,e2_17645,Singapore Airlines,SIA,org,,True,1.0,,1.0,Singapore Airlines ( SIA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2962,city_of_headquarters,,e1_21039,e2_21039,e1_21039,e2_21039,East Carolina University,Greenville,org,,,,,,
2963,countries_of_residence,,e1_21040,e2_21040,e1_21040,e2_21040,Matt Ellis,USA,per,,,,,,
2964,city_of_headquarters,,e1_21042,e2_21042,e1_21042,e2_21042,Jacksonville,Edward Waters College,,org,,,,,
2965,members,,e1_21043,e2_21043,e1_21043,e2_21043,NAZA,Malaysia,,org,,,,,


Skipping run for :  /Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset//kbp37/ground_truth/test.csv /Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset//kbp37/tagged_tokens/test.csv
(<class 'ValueError'>, ValueError('Columns must be same length as key'), <traceback object at 0x7fc84b8bb8c0>)


Unnamed: 0,sent_index,entity mention ID,text,entity type,pos,dep
0,0.0,e1_15917,Mississippi,,PROPN,compound
1,0.0,e2_15917,MS ANG,org-org,PROPN-PROPN,compound-appos
2,1.0,e1_15918,PSOP,,PROPN,compound
3,1.0,e2_15918,Socialist Party,org-org,PROPN-PROPN,compound-conj
4,2.0,e1_15919,Integra Bank,org-org,PROPN-PROPN,compound-compound
...,...,...,...,...,...,...
3007,1503.0,e2_17420,Camden School for Girls,---,PROPN-PROPN-ADP-PROPN,compound-dobj-prep-pobj
3008,1504.0,e1_17421,Oneworld,per,PROPN,compound
3009,1504.0,e2_17421,Japan Airlines,-,PROPN-PROPN,compound-ROOT
3010,1505.0,e1_17422,NCAA,org,PROPN,compound


Unnamed: 0,sent_index,entity_pairs,ent_type_pairs,pos_pairs,dep_pairs,sentence,dep,sentence_length,root_index,root_word,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,entity_2_root_distance,entity 1 name,entity 2 name,entity_type_1,entity_type_2,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,shortest_distance,shortest_path,et-label
0,0.0,"(Mississippi, MS ANG)","(, org-org)","(PROPN, PROPN-PROPN)","(compound, compound-appos)","[The, Mississippi, Air, National, Guard, (, MS, ANG, ), is, the, air, force, militia, of, the, State, of, Mississippi, United, States, of, America, .]","[det, compound, compound, compound, nsubj, punct, compound, appos, punct, ROOT, det, compound, compound, attr, prep, det, pobj, prep, compound, compound, pobj, prep, pobj, punct]",24,9,is,2,7,5,1,16,7,-2,Mississippi,MS ANG,OTHER,ORG,PROPN,PROPN-PROPN,compound,compound-appos,2,"[2, 4, 7]",OTHER-ORG
1,1.0,"(PSOP, Socialist Party)","(, org-org)","(PROPN, PROPN-PROPN)","(compound, compound-conj)","[When, the, Gauche, was, excluded, from, the, SFIO, he, became, one, of, the, leaders, of, the, new, Parti, Socialiste, Ouvrier, et, Paysan, (, PSOP, -, Workers, and, Peasants, Socialist, Party, ), and, was, at, that, time, quite, close, to, Leon, Trotsky, .]","[advmod, det, nsubjpass, auxpass, advcl, prep, det, pobj, nsubj, ROOT, attr, prep, det, pobj, prep, det, amod, compound, compound, compound, compound, pobj, punct, compound, punct, appos, cc, compound, compound, conj, punct, cc, conj, prep, det, pobj, advmod, acomp, prep, compound, pobj, punct]",42,9,became,24,29,5,23,12,-15,20,PSOP,Socialist Party,OTHER,ORG,PROPN,PROPN-PROPN,compound,compound-conj,2,"[24, 25, 29]",OTHER-ORG
2,2.0,"(Integra Bank, Indiana)","(org-org, )","(PROPN-PROPN, PROPN)","(compound-compound, pobj)","[Integra, Bank, Corporation, was, the, parent, of, Integra, Bank, National, Association, a, retail, bank, headquartered, in, Evansville, Indiana, that, failed, on, July, 29, 2011, .]","[compound, compound, nsubj, ROOT, det, attr, prep, compound, compound, compound, pobj, det, amod, appos, acl, prep, compound, pobj, nsubj, relcl, prep, pobj, nummod, nummod, punct]",25,3,was,1,18,17,0,6,2,15,Integra Bank,Indiana,ORG,OTHER,PROPN-PROPN,PROPN,compound-compound,pobj,6,"[1, 2, 3, 5, 13, 19, 18]",ORG-OTHER
3,3.0,"(SA Ambulance Service, SAAS)","(--, org)","(PROPN-PROPN-PROPN, PROPN)","(compound-compound-compound, nsubj)","[The, name, of, the, association, was, amended, to, SA, Ambulance, Service, Inc, in, 2005, and, in, 2008, SAAS, became, an, agency, under, the, Minister, of, Health, .]","[det, nsubjpass, prep, det, pobj, auxpass, ROOT, prep, compound, compound, compound, pobj, prep, pobj, cc, prep, pobj, nsubj, conj, det, attr, prep, det, pobj, prep, pobj, punct]",27,6,amended,9,18,9,8,8,-3,12,SA Ambulance Service,SAAS,OTHER,ORG,PROPN-PROPN-PROPN,PROPN,compound-compound-compound,nsubj,4,"[9, 11, 7, 6, 18]",OTHER-ORG
4,4.0,"(college, Kansas)","(org, )","(NOUN, PROPN)","(nsubjpass, pobj)","[The, college, is, located, on, the, edge, of, the, Flint, Hills, and, the, vast, wheat, fields, of, south, central, Kansas, in, the, town, of, North, Newton, .]","[det, nsubjpass, auxpass, ROOT, prep, det, pobj, prep, det, compound, pobj, cc, det, amod, compound, conj, prep, amod, amod, pobj, prep, det, pobj, prep, compound, pobj, punct]",27,3,located,2,20,18,1,6,1,17,college,Kansas,ORG,OTHER,NOUN,PROPN,nsubjpass,pobj,7,"[2, 3, 4, 6, 7, 10, 15, 20]",ORG-OTHER
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1501,1501.0,"(William Gladstone, Prime Minister)","(org-org, -)","(PROPN-PROPN, PROPN-PROPN)","(compound-nsubj, compound-attr)","[William, Gladstone, began, his, political, career, as, a, right-wing, Tory, like, his, father, before, becoming, a, Liberal, Prime, Minister, .]","[compound, nsubj, ROOT, poss, amod, dobj, prep, det, amod, pobj, prep, poss, pobj, prep, pcomp, det, amod, compound, attr, punct]",20,2,began,1,18,17,0,1,1,16,William Gladstone,Prime Minister,ORG,OTHER,PROPN-PROPN,PROPN-PROPN,compound-nsubj,compound-attr,5,"[1, 2, 15, 16, 20, 18]",ORG-OTHER
1502,1502.0,"(Intelius, Naveen Jain)","(per, -)","(PROPN, PROPN-PROPN)","(nsubjpass, compound-compound)","[Intelius, was, founded, in, 2003, by, six, former, Infospace, executives, :, Naveen, Jain, Kevin, Marcus, Niraj, Shah, Ed, Petersen, Chandan, Chauhan, and, John, Arnold, .]","[nsubjpass, auxpass, ROOT, prep, pobj, agent, nummod, amod, compound, pobj, punct, compound, compound, ROOT, ROOT, compound, compound, compound, compound, compound, ROOT, cc, compound, conj, punct]",25,2,founded,1,12,11,0,12,1,10,Intelius,Naveen Jain,PER,OTHER,PROPN,PROPN-PROPN,nsubjpass,compound-compound,4,"[1, 2, 0, 18, 12]",PER-OTHER
1503,1503.0,"(London, Camden School for Girls)","(org, ---)","(PROPN, PROPN-PROPN-ADP-PROPN)","(pobj, compound-dobj-prep-pobj)","[Wigglesworth, grew, up, in, north, London, attending, Camden, School, for, Girls, .]","[nsubj, ROOT, prt, prep, compound, pobj, advcl, compound, dobj, prep, pobj, punct]",12,1,grew,6,8,2,5,3,-5,7,London,Camden School for Girls,ORG,OTHER,PROPN,PROPN-PROPN-ADP-PROPN,pobj,compound-dobj-prep-pobj,1,"[6, 8]",ORG-OTHER
1504,1504.0,"(Oneworld, Japan Airlines)","(per, -)","(PROPN, PROPN-PROPN)","(compound, compound-ROOT)","[However, you, can, redeem, Delta, Air, Lines, SkyMiles, on, any, of, the, 10, SkyTeam, alliance, partners, which, include, Air, France, KLM, Alitalia, Continental, Airlines, and, Northwest, Airlines, ;, and, you, can, redeem, American, AAdvantage, miles, on, any, of, the, 10, Oneworld, alliance, carriers, including, British, Airways, (, though, not, on, trans-Atlantic, routes, to, London, ), Cathay, Pacific, Finnair, Iberia, Japan, Airlines, and, Qantas, .]","[advmod, nsubj, aux, ROOT, compound, compound, compound, dobj, prep, pobj, prep, det, nummod, compound, compound, pobj, nsubj, relcl, compound, compound, compound, compound, compound, dobj, cc, compound, conj, punct, cc, nsubj, aux, conj, compound, compound, dobj, prep, pobj, prep, det, nummod, compound, compound, pobj, prep, compound, pobj, punct, mark, neg, prep, amod, pobj, prep, pobj, punct, compound, compound, ROOT, compound, compound, ROOT, cc, conj, punct]",64,3,redeem,41,60,19,40,3,-38,57,Oneworld,Japan Airlines,PER,OTHER,PROPN,PROPN-PROPN,compound,compound-ROOT,7,"[41, 42, 37, 36, 35, 31, 62, 60]",PER-OTHER


After ET filtering 0


Unnamed: 0,sent_index,sentence,dep,sentence_length,root_index,root_word,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,entity_2_root_distance,entity 1 name,entity 2 name,entity_type_1,entity_type_2,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,shortest_distance,shortest_path,et-label


Unnamed: 0,rel_type,rel subtype,entity 1 ID,entity 2 ID,entity 1 mention ID,entity 2 mention ID,entity 1 name,entity 2 name,entity 1 type,entity 2 type,same_sent,distance,pos_pattern,shortest_dep_path,span
0,alternate_names,,e1_15917,e2_15917,e1_15917,e2_15917,Mississippi,MS ANG,,org,True,4.0,PROPN-PROPN-PROPN,2.0,Mississippi Air National Guard ( MS ANG
1,alternate_names,,e1_15918,e2_15918,e1_15918,e2_15918,PSOP,Socialist Party,,org,True,4.0,PROPN-CCONJ-PROPN,2.0,PSOP - Workers and Peasants Socialist Party
2,stateorprovince_of_headquarters,,e1_15919,e2_15919,e1_15919,e2_15919,Integra Bank,Indiana,org,,True,16.0,PROPN-VERB-DET-NOUN-ADP-PROPN-PROPN-PROPN-PROPN-DET-ADJ-NOUN-VERB-ADP-PROPN,7.0,Integra Bank Corporation was the parent of Integra Bank National Association a retail bank headquartered in Evansville Indiana
3,alternate_names,,e1_15920,e2_15920,e1_15920,e2_15920,SA Ambulance Service,SAAS,,org,True,7.0,PROPN-ADP-NUM-CCONJ-ADP-NUM,5.0,SA Ambulance Service Inc in 2005 and in 2008 SAAS
4,stateorprovince_of_headquarters,,e1_15921,e2_15921,e1_15921,e2_15921,college,Kansas,org,,True,18.0,VERB-VERB-ADP-DET-NOUN-ADP-DET-PROPN-PROPN-CCONJ-DET-ADJ-NOUN-NOUN-ADP-ADJ-ADJ,6.0,college is located on the edge of the Flint Hills and the vast wheat fields of south central Kansas
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1501,title,,e1_17636,e2_17636,e1_17636,e2_17636,William Gladstone,Prime Minister,per,,,,,,
1502,founded_by,,e1_17637,e2_17637,e1_17637,e2_17637,Intelius,Naveen Jain,org,,,,,,
1503,city_of_headquarters,,e1_17638,e2_17638,e1_17638,e2_17638,London,Camden School for Girls,,org,,,,,
1504,members,,e1_17639,e2_17639,e1_17639,e2_17639,Oneworld,Japan Airlines,org,,,,,,


Skipping run for :  /Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset//kbp37/ground_truth/dev.csv /Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset//kbp37/tagged_tokens/dev.csv
(<class 'ValueError'>, ValueError('Columns must be same length as key'), <traceback object at 0x7fc85c8abd70>)


Unnamed: 0,sent_index,entity mention ID,text,entity type,pos,dep
0,0.0,e1_0,Thom Yorke,per-per,PROPN-PROPN,compound-nsubj
1,0.0,e2_0,Radiohead,,PROPN,pobj
2,1.0,e1_1,Leland High School,org-org-org,PROPN-PROPN-PROPN,compound-compound-nsubj
3,1.0,e2_1,San Jose,-,PROPN-PROPN,compound-compound
4,2.0,e1_2,Ohio University,-,PROPN-PROPN,compound-dobj
...,...,...,...,...,...,...
28545,14272.0,e2_14272,Sheffield Tigers,-,PROPN-PROPN,compound-compound
28546,14273.0,e1_14273,NCAA,,PROPN,compound
28547,14273.0,e2_14273,Virginia Wesleyan College,per-per-per,PROPN-PROPN-PROPN,compound-compound-poss
28548,14274.0,e1_14274,Eredivisie,per,PROPN,nmod


In [None]:
for data in dataset_features.keys():
    dataset_features[data].to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/Latest/features_"+data+".csv")

## Examples

In [None]:
def print_full():
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')
print_full()

In [None]:
# get complete sentence
features_with_labels["sentence"] = features_with_labels["sentence"].apply(lambda x : " ".join(x))

In [None]:
## For Binary labels
features_with_labels[features_with_labels["label"].apply(lambda x : x==0)][["sentence", "entity 1 name", "entity 2 name", "entity_type_1", "entity_type_2", "label"]]

In [None]:
features = ["dep", "sentence_length", "root_index", "root_word", "entity_1_index", "entity_2_index", 
            "entity_distance", "no_words_before_entity_1", "no_words_after_entity_2", 
            "entity_1_root_distance", "entity_2_root_distance", 'entity 1 name', 'entity 2 name',
       'entity_type_1', 'entity_type_2', 'entity_pos_1', 'entity_pos_2',
       'entity_dep_1', 'entity_dep_2', 'shortest_distance', 'shortest_path']
examples = ["sentence", "entity 1 name", "entity 2 name", "entity_type_1", "entity_type_2", "et-label", "rel_type", "label", "support"]

In [None]:
import ast
datasets = ["re3d", "ace2004", "ace2005", "tacred", "docred","kbp37", "hlt"]
path = "/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/Latest/"
os.chdir(path)
for file in os.listdir():
    if file.endswith(".csv"):
        dataset = file.split("_")[1][:-4]
        print(file)
        data = pd.read_csv(path + file)
        data["sentence"] = data["sentence"].apply(lambda x : " ".join(ast.literal_eval(x)))
        # data = data[data["label"].apply(lambda x : x==0)]
        data = data[features]
        data.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/Features/"+dataset+"_features.csv", index=False)
        
        
        

In [None]:
df.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /Examples/"+dataset+"_examples_relations.csv", index=False)