In [2]:
import os
import warnings
import random
import pandas as pd
from itertools import combinations
import sys
import networkx as nx
import spacy
import traceback
import pickle

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder


pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', -1)

random.seed(100)
warnings.filterwarnings('ignore')




In [3]:
# !python -m spacy download en_core_web_lg
nlp = spacy.load("en_core_web_lg")
# nlp = spacy.load("en_core_web_sm")

In [4]:
def generateNetwork(sentence_list, ent1_word_idx, ent2_word_idx):
    try:
        sentence = ' '.join(sentence_list)
        doc = nlp(sentence)
        edges = []
        for word_idx, token in enumerate(doc):
            for child in token.children:
                edges.append((token.i,
                          child.i))

        graph = nx.Graph(edges)
        ## If shortest path not found
        try:
            shortest_path_length = nx.shortest_path_length(graph, source=ent1_word_idx, target=ent2_word_idx)
            shortest_path = nx.shortest_path(graph, source=ent1_word_idx, target=ent2_word_idx)
        except: 
            shortest_path_length = -1
            shortest_path = "no_path_found"
        return shortest_path_length, shortest_path
    except:
        print("NETWORK - NO PATH FOUND: ", sentence_list, ent1_word_idx, ent2_word_idx)
        print(traceback.format_exc())
        return -1, "no_path_found"

In [5]:
def getIndex(row, sentence, entity):
    try:
        return sentence.index(entity.split(" ")[0]) + 1
    except:
        print("INDEX NOT FOUND - ", sentence, entity, entity.split(" ")[0])
        print(traceback.format_exc())
        return -1

In [26]:
def getFeatures(file):
    try:
        # df = pd.read_csv(file, sep="\t", index_col=False, dtype={'text': str, 'entity type': str, 'pos':str , 'tag': str, 'dep': str, 'sent_index': int, 'entity mention ID': str})                           
        df = pd.read_csv(file, index_col=False)  ## For ACE2004 and ACE2005 as sep is different. 
        df = df.rename(columns={"entity_id": "entity mention ID", "sent_idx": "sent_index", "entity_type" : "entity type"})
        df.astype({'text': str, 'entity type': str, 'pos':str , 'dep': str, 'sent_index': int, 'entity mention ID': str})
        df = df.fillna("")
        # filtered_df = df.where(df["entity mention ID"]!="").groupby(["sent_index", "entity mention ID"], as_index=False).agg({'text': ' '.join, 'entity type': '-'.join, 'pos' : '-'.join, 'tag' : '-'.join,'dep': '-'.join})
         ## For ACE2004, ACE2005, tacred, kbp37 as no tag exists
        filtered_df = df.where(df["entity mention ID"]!="").groupby(["sent_index", "entity mention ID"], as_index=False).agg({'text': ' '.join, 'entity type': '-'.join, 'pos' : '-'.join, 'dep': '-'.join}) 

        ## Calculating entity-entity distances per sentence
        # sent_df = filtered_df.groupby(["sent_index"], as_index=False).agg({'text': list, 'entity type': list, 'pos' : list, 'tag' : list, 'dep': list})
         ##  For ACE2004, ACE2005, tacred, kbp37 as no tag exists
        sent_df = filtered_df.groupby(["sent_index"], as_index=False).agg({'text': list, 'entity type': list, 'pos' : list, 'dep': list})

        text_pairs = []
        ent_type_pairs = []
        pos_pairs = []
        tag_pairs = []
        dep_pairs = []
        
        pos_ignore_list = ["PRON", "PROPN-PART", "DET", "AUX", "CONJ", "CCONJ", "PART","PUNCT", "SYM"]

        for index, row in sent_df.iterrows():
            text_pairs.append(list(combinations(row["text"], r=2)))
            ent_type_pairs.append(list(combinations(row["entity type"], r=2)))
            pos_pairs.append(list(combinations(row["pos"], r=2)))
            dep_pairs.append(list(combinations(row["dep"], r=2)))
            # tag_pairs.append(list(combinations(row["tag"], r=2)))  ## For  ACE2004, ACE2005, tacred, kbp37 as no tag exists

        sent_entity_df = pd.DataFrame()
        sent_entity_df["sent_index"] = sent_df["sent_index"]
        sent_entity_df["entity_pairs"] = text_pairs
        sent_entity_df["ent_type_pairs"] = ent_type_pairs
        sent_entity_df["pos_pairs"] = pos_pairs
        sent_entity_df["dep_pairs"] = dep_pairs
        # sent_entity_df["tag_pairs"] = tag_pairs  ## For ACE2004, ACE2005, tacred, kbp37 as no tag exists 


        final_df = sent_entity_df.set_index('sent_index').apply(lambda x: x.apply(pd.Series).stack()).reset_index().drop('level_1', 1)

        sentences = df.groupby(['sent_index'], as_index=False).agg({'text': list, 'dep': list})
        sentences["sentence_length"] = sentences.apply(lambda x : len(x["text"]), axis=1)
        sentences["root_index"] = sentences.apply(lambda x : int(x["dep"].index("ROOT")), axis=1)
        sentences["root_word"] = sentences.apply(lambda x : x["text"][x["root_index"]], axis=1)
        sentences = sentences.rename(columns={"text": "sentence"})
        final_df = pd.merge(final_df, sentences, on='sent_index', sort=False)
        final_df["entity_1_index"] = final_df.apply(lambda x : getIndex(x, x['sentence'], x['entity_pairs'][0]), axis=1)
        final_df["entity_2_index"] = final_df.apply(lambda x : getIndex(x, x['sentence'], x['entity_pairs'][1]), axis=1)
        final_df["entity_distance"] = final_df.apply(lambda x : x['entity_2_index'] - x['entity_1_index'], axis=1)
        final_df["no_words_before_entity_1"] = final_df["entity_1_index"] - 1 
        final_df["no_words_after_entity_2"] = final_df["sentence_length"] - final_df["entity_2_index"] - 1

        ## Calculating root-entity distances for each sentence
        final_df["entity_1_root_distance"] = final_df.apply(lambda x: x['root_index'] - x['entity_1_index'], axis=1)
        final_df["entity_2_root_distance"] = final_df.apply(lambda x: x['entity_2_index'] - x['root_index'], axis=1)

        # Expand tuples
        final_df[['entity 1 name', 'entity 2 name']] = final_df['entity_pairs'].apply(pd.Series)

        final_df[['entity_type_1', 'entity_type_2']] = final_df['ent_type_pairs'].apply(pd.Series)
        final_df[['entity_pos_1', 'entity_pos_2']] = final_df['pos_pairs'].apply(pd.Series)
        final_df[['entity_dep_1', 'entity_dep_2']] = final_df['dep_pairs'].apply(pd.Series)
        # final_df[['entity_tag_1', 'entity_tag_2']] = final_df['tag_pairs'].apply(pd.Series)  ## For  ACE2004, ACE2005, tacred, kbp37 as no tag exists 
        final_df[["shortest_distance", "shortest_path"]] = final_df.apply(lambda x : generateNetwork(x["sentence"], x["entity_1_index"], x["entity_2_index"]), axis = 1, result_type="expand")
        final_df["entity_type_1"] = final_df["entity_type_1"].apply(lambda x : x.split("-")[0])
        final_df["entity_type_2"] = final_df["entity_type_2"].apply(lambda x : x.split("-")[0])

        final_df = final_df[~final_df['entity_pos_1'].isin(pos_ignore_list)]
        final_df = final_df[~final_df['entity_pos_2'].isin(pos_ignore_list)]

        display(final_df[["entity 1 name","entity 2 name", "entity_pos_1", "entity_pos_2", "entity_type_1", "entity_type_2"]])
        # final_df = final_df.drop([ "entity_pairs", "ent_type_pairs", "pos_pairs", "dep_pairs", "tag_pairs"], axis = 1)  ## For ACE2004, ACE2005, tacred, kbp37 as no tag exists 
        final_df = final_df.drop([ "entity_pairs", "ent_type_pairs", "pos_pairs", "dep_pairs"], axis = 1)
        return final_df
    except :
        print("Failed to execute file : ", file)
        print("Error : ", sys.exc_info())
        print(traceback.format_exc())
        return pd.DataFrame()

In [27]:
def getFeaturesWithlabels(relation_file, tagged_tokens_file):
    try:
        final_df = getFeatures(tagged_tokens_file)
        relations = pd.read_csv(relation_file)
        relations = relations.rename(columns={"ent1_text" : "entity 1 name", "ent2_text" : "entity 2 name"})
        result = final_df.merge(relations, on=["entity 1 name", "entity 2 name"], sort=False)
        result.drop_duplicates(subset=["sent_index", "entity 1 name", "entity 2 name"], keep='first', inplace=True, ignore_index=True)
        # if "span" not in result.keys():
        result["label"] = result["sent_index"].apply(lambda x : 1 if isinstance(x, str) else 0)
        # result["label"] = result.apply(lambda x : 0 if x["rel_type"] == "no_relation" else x["label"], axis = 1)
        return result
    except:
        print("Skipping run for : ", relation_file, tagged_tokens_file)
        print(sys.exc_info())
        # display(final_df)
#         display(relations)
        # print(traceback.format_exc())
        return pd.DataFrame()
    

In [28]:
dataset = "ace2005"
# ground_truth_path = f"/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/{dataset}/ground_truth"
ground_truth_path = f"/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/{dataset}"
# tagged_token_path = f"/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/{dataset}/tagged_tokens"
tagged_token_path = f"/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/{dataset}"

In [29]:
os.chdir(tagged_token_path)
features_with_labels = pd.DataFrame()
for file in os.listdir():
    if file.endswith(".csv"):
        if dataset in ["ace2004", "ace2005"]:
            file = file.split(".csv_")[0]
            tagged_tokens_file = f"{tagged_token_path}/{file}.csv_tagged_tokens.csv"
            relation_file = f"{ground_truth_path}/{file}.csv_gt_relations.csv"
        else:
            tagged_tokens_file = f"{tagged_token_path}/{file}"
            relation_file = f"{ground_truth_path}/{file}"
        temp = getFeaturesWithlabels(relation_file, tagged_tokens_file)
        features_with_labels = pd.concat([features_with_labels, temp])
        
        break
features_with_labels[["sentence", "entity_type_1", "entity_type_2", "rel_type"]]


Unnamed: 0,entity 1 name,entity 2 name,entity_pos_1,entity_pos_2,entity_type_1,entity_type_2
6,wife,kids,NOUN,NOUN,PER,PER
34,group,hubbies,NOUN,NOUN,PER,PER
36,group,wife,NOUN,NOUN,PER,PER
50,hubbies,wife,NOUN,NOUN,PER,PER
54,ex,boyfriend,PROPN,NOUN,PER,PER
...,...,...,...,...,...,...
509,parents,church,NOUN,NOUN,PER,ORG
511,members,pond,NOUN,ADJ,PER,LOC
512,members,church,NOUN,NOUN,PER,ORG
515,pond,church,ADJ,NOUN,LOC,ORG


Unnamed: 0,sentence,entity_type_1,entity_type_2,rel_type
0,"[I, love, my, wife, ,, and, the, kids, get, along, just, fine, .]",PER,PER,no_relation
1,"[My, wife, belongs, to, a, support, group, ,, and, just, for, fun, ,, they, all, googled, their, hubbies, ex, 's, ., ]",PER,PER,no_relation
2,"[Somehow, ,, this, lead, me, to, googling, my, ex, 's, boyfriend, ,, call, him, B.]",PER,PER,no_relation
3,"[And, there, he, was, ,, talking, about, plans, with, his, wife, and, kids, .]",PER,PER,no_relation
4,"[To, me, ,, B, was, just, this, one, dimensional, guy, that, was, being, dragged, along, by, my, exwife, .]",PER,PER,no_relation
...,...,...,...,...
57,"[My, family, thinks, my, ex, is, pond, scum, but, because, my, parents, are, still, members, at, the, same, church, ,, they, have, to, see, my, ex, and, his, new, wife, every, week, .]",PER,ORG,no_relation
58,"[My, family, thinks, my, ex, is, pond, scum, but, because, my, parents, are, still, members, at, the, same, church, ,, they, have, to, see, my, ex, and, his, new, wife, every, week, .]",PER,PER,no_relation
59,"[My, family, thinks, my, ex, is, pond, scum, but, because, my, parents, are, still, members, at, the, same, church, ,, they, have, to, see, my, ex, and, his, new, wife, every, week, .]",PER,ORG,no_relation
60,"[My, family, thinks, my, ex, is, pond, scum, but, because, my, parents, are, still, members, at, the, same, church, ,, they, have, to, see, my, ex, and, his, new, wife, every, week, .]",PER,ORG,Membership


In [76]:
features_with_labels.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/features_"+dataset+"-binary.csv")

## Examples

In [77]:
def print_full():
    pd.set_option('display.max_rows', None)
    pd.set_option('display.max_columns', None)
    pd.set_option('display.width', 2000)
    pd.set_option('display.float_format', '{:20,.2f}'.format)
    pd.set_option('display.max_colwidth', None)
    pd.reset_option('display.max_rows')
    pd.reset_option('display.max_columns')
    pd.reset_option('display.width')
    pd.reset_option('display.float_format')
    pd.reset_option('display.max_colwidth')
print_full()

In [78]:
# get complete sentence
features_with_labels["sentence"] = features_with_labels["sentence"].apply(lambda x : " ".join(x))

In [79]:
## For Binary labels
features_with_labels[features_with_labels["label"].apply(lambda x : x==0)][["sentence", "entity 1 name", "entity 2 name", "entity_type_1", "entity_type_2", "label"]]

Unnamed: 0,sentence,entity 1 name,entity 2 name,entity_type_1,entity_type_2,label
0,It 'll take me awhile to get to thought stream...,me,me,PER,PER,0
1,"I love my wife , and the kids get along just f...",I,my,PER,PER,0
2,I do n't like my exwife .,I,my,PER,PER,0
3,I 'll have to ask about B 's kid when my own k...,I,my,PER,PER,0
4,I know my first marriage had a lot of good adv...,I,my,PER,PER,0
...,...,...,...,...,...,...
117,Kiichiro Toyoda founded the automaker in 1937 ...,manufacturer,automaker,ORG,ORG,0
118,Kiichiro Toyoda founded the automaker in 1937 ...,manufacturer,father,ORG,PER,0
119,Kiichiro Toyoda founded the automaker in 1937 ...,manufacturer,his,ORG,PER,0
120,Kiichiro Toyoda founded the automaker in 1937 ...,automaker,automaker,ORG,ORG,0


In [80]:
## For Entity Type Labels
ENT_TYPES = ["PER", "GPE", "LOC", "ORG"]
def recode_entity_types(value):
    if value.lower() in ["person", "per"]:
        return "PER"
    elif value.lower() in ["organisation", "org"]:
        return "ORG"
    elif value.lower() in ["gpe"]:
        return "GPE"
    elif value.lower() in ["location", "loc"]:
        return "LOC"
    else:
        return "OTHER"

    
features_with_labels["entity_type_1"] = features_with_labels["entity_type_1"].apply(lambda x : recode_entity_types(x.split("-")[0]))
features_with_labels["entity_type_2"] = features_with_labels["entity_type_2"].apply(lambda x : recode_entity_types(x.split("-")[0]))
features_with_labels["en-label"] = features_with_labels.apply(lambda x : f"{x['entity_type_1']}-{x['entity_type_2']}", axis=1)
# features_with_labels["entity 1 type"] = features_with_labels["entity 1 type"].apply(lambda x : recode_entity_types(x.split("-")[0]))
# features_with_labels["entity 2 type"] = features_with_labels["entity 2 type"].apply(lambda x : recode_entity_types(x.split("-")[0]))
# features_with_labels["en-label"] = features_with_labels.apply(lambda x : f"{x['entity 1 type']}-{x['entity 2 type']}", axis=1)

In [83]:
## For support 
features_with_labels = features_with_labels[features_with_labels['rel_type'].notna()]
grouped = features_with_labels.groupby(['rel_type'])
values = grouped["rel_type"].value_counts()
# grouped["support"] = grouped["en-label"].apply(lambda x : values[x].values[0])
df = grouped.head(3)[["rel_type", "sentence", "entity 1 name", "entity 2 name", "entity_type_1", "entity_type_2"]].sort_values(by = "rel_type")
## Calculate support of each type
# df["support"] = df["label"].apply(lambda x : values[x].values[0])
# df["support"] = df["en-label"].apply(lambda x : values[x].values[0])
df["support"] = df["rel_type"].apply(lambda x : values[x].values[0])
df["dataset"] = dataset

['no_relation' 'Family' 'Located' 'User-Owner-Inventor-Manufacturer'
 'Lasting-Personal' 'Membership' 'Employment' 'Business'
 'Citizen-Resident-Religion-Ethnicity' 'Student-Alum' 'Geographical'
 'Investor-Shareholder' 'Sports-Affiliation' 'Org-Location' 'Subsidiary'
 'Near' 'Artifact' 'Founder' 'Ownership']


In [86]:
df.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /Examples/"+dataset+"_examples_relations.csv", index=False)