In [None]:
import os
import warnings
import random
import pandas as pd
from itertools import combinations
import sys
import networkx as nx
import spacy
import traceback
import pickle

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder


# Notebook display options: cap printed frames at 10 rows, but never truncate
# the number of columns, the line width, or the contents of a cell.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no limit". The former -1 sentinel has been deprecated since
# pandas 1.0 and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# Seed Python's built-in RNG so any use of `random` is repeatable.
random.seed(100)
# NOTE(review): this silences *all* warnings, including deprecation notices
# from pandas/spaCy; consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')


In [None]:
# One-time setup: the large English model has to be installed before it can be
# loaded, e.g. by running the following once in a notebook cell:
# !python -m spacy download en_core_web_lg
# Shared spaCy pipeline, loaded once at module level; generateNetwork() uses it
# to dependency-parse every sentence.
nlp = spacy.load("en_core_web_lg")
# Alternative: the small English model.
# nlp = spacy.load("en_core_web_sm")

In [None]:
def generateNetwork(sentence_list, ent1_word_idx, ent2_word_idx):
    """Find the shortest dependency-tree path between two tokens of a sentence.

    The tokens are joined with single spaces and parsed with the module-level
    spaCy pipeline ``nlp``. Every head -> child dependency arc becomes an
    undirected edge whose endpoints are the tokens' ``token.i`` positions in
    the parsed doc.

    Args:
        sentence_list: Sequence of token strings making up one sentence.
        ent1_word_idx: Graph node of the first entity. Nodes are spaCy
            ``token.i`` values (counted from 0), so this must be a position in
            the parsed doc.
        ent2_word_idx: Graph node of the second entity (same convention).

    Returns:
        A ``(shortest_path_length, shortest_path)`` tuple, where the length is
        the number of edges on the path and the path is the list of token
        positions from the first entity to the second. ``(-1, "no_path_found")``
        is returned when either node is missing from the graph, when the two
        nodes are not connected, or when parsing fails for any other reason.
    """
    try:
        doc = nlp(' '.join(sentence_list))
        edges = [(token.i, child.i) for token in doc for child in token.children]
        graph = nx.Graph(edges)

        try:
            # One search is enough: on an unweighted graph the path length is
            # simply the number of edges along the shortest path.
            shortest_path = nx.shortest_path(graph, source=ent1_word_idx, target=ent2_word_idx)
        except nx.NetworkXException:
            # Covers both "node not in graph" and "no path between nodes".
            return -1, "no_path_found"
        return len(shortest_path) - 1, shortest_path
    except Exception:
        # Best effort: log and fall back to the sentinel. `Exception` (not a
        # bare except) so Ctrl-C / kernel interrupt still stops a long run.
        print("NETWORK - NO PATH FOUND: ", sentence_list, ent1_word_idx, ent2_word_idx)
        print(traceback.format_exc())
        return -1, "no_path_found"

In [None]:
def getIndex(row, sentence, entity):
    """Return the 1-based position of an entity's first word in a sentence.

    Only the first space-separated word of ``entity`` is looked up, and the
    first occurrence of that word in ``sentence`` wins.

    Args:
        row: Unused; kept so existing callers keep working.
        sentence: List of token strings.
        entity: Entity text, possibly several words separated by spaces.

    Returns:
        The 1-based index of the first matching token, or -1 when the word is
        not in the sentence or ``entity`` is not a string (e.g. NaN).
    """
    # Resolve the lookup word before the try block: the error message needs it,
    # and computing it inside the handler would raise again for a non-string
    # entity instead of returning -1.
    first_word = entity.split(" ")[0] if isinstance(entity, str) else None
    if first_word is None:
        print("INDEX NOT FOUND - ", sentence, entity, first_word)
        return -1
    try:
        return sentence.index(first_word) + 1
    except Exception:
        print("INDEX NOT FOUND - ", sentence, entity, first_word)
        print(traceback.format_exc())
        return -1

In [None]:
def getFeatures(file):
    """Build one feature row for every pair of entity mentions in a sentence.

    The input is a tab-separated token table with the columns ``text``,
    ``entity type``, ``pos``, ``tag``, ``dep``, ``sent_index`` and
    ``entity mention ID``. Tokens sharing a sentence and mention ID are merged
    into one mention; all 2-combinations of the mentions of a sentence are then
    turned into rows carrying positional, root-relative, type/POS/dep/tag and
    dependency-path features.

    Args:
        file: Path of the tagged-token file to read.

    Returns:
        A DataFrame with one row per entity pair, or an empty DataFrame when
        anything goes wrong (the error is printed, not raised).
    """
    try:
        df = pd.read_csv(
            file,
            sep="\t",
            index_col=False,
            dtype={'text': str, 'entity type': str, 'pos': str, 'tag': str,
                   'dep': str, 'sent_index': int, 'entity mention ID': str},
        )
        df = df.fillna("")

        # Merge the tokens of each mention. Rows without a mention ID are
        # blanked to NaN by `where`, so they drop out of the groupby keys.
        filtered_df = (
            df.where(df["entity mention ID"] != "")
            .groupby(["sent_index", "entity mention ID"], as_index=False)
            .agg({'text': ' '.join, 'entity type': '-'.join, 'pos': '-'.join,
                  'tag': '-'.join, 'dep': '-'.join})
        )

        # Collect the mentions of every sentence, then pair them up.
        sent_df = filtered_df.groupby(["sent_index"], as_index=False).agg(
            {'text': list, 'entity type': list, 'pos': list, 'tag': list, 'dep': list}
        )
        pair_sources = (
            ("entity_pairs", "text"),
            ("ent_type_pairs", "entity type"),
            ("pos_pairs", "pos"),
            ("dep_pairs", "dep"),
            ("tag_pairs", "tag"),
        )
        sent_entity_df = pd.DataFrame()
        sent_entity_df["sent_index"] = sent_df["sent_index"]
        for pair_column, source_column in pair_sources:
            sent_entity_df[pair_column] = [
                list(combinations(values, r=2)) for values in sent_df[source_column]
            ]

        # One row per pair: expand each list of pairs and stack it under its
        # sentence. `columns=` replaces the positional axis argument, which
        # pandas 2.x no longer accepts.
        final_df = (
            sent_entity_df.set_index('sent_index')
            .apply(lambda x: x.apply(pd.Series).stack())
            .reset_index()
            .drop(columns='level_1')
        )

        # Sentence-level context: full token list, length and ROOT position.
        sentences = df.groupby(['sent_index'], as_index=False).agg({'text': list, 'dep': list})
        sentences["sentence_length"] = sentences["text"].apply(len)
        sentences["root_index"] = sentences["dep"].apply(lambda deps: int(deps.index("ROOT")))
        sentences["root_word"] = [
            tokens[root] for tokens, root in zip(sentences["text"], sentences["root_index"])
        ]
        sentences = sentences.rename(columns={"text": "sentence"})
        final_df = pd.merge(final_df, sentences, on='sent_index', sort=False)

        # Entity positions are 1-based (see getIndex); -1 means "not found".
        final_df["entity_1_index"] = final_df.apply(lambda x: getIndex(x, x['sentence'], x['entity_pairs'][0]), axis=1)
        final_df["entity_2_index"] = final_df.apply(lambda x: getIndex(x, x['sentence'], x['entity_pairs'][1]), axis=1)
        final_df["entity_distance"] = final_df["entity_2_index"] - final_df["entity_1_index"]
        final_df["no_words_before_entity_1"] = final_df["entity_1_index"] - 1
        # NOTE(review): with a 1-based index the words after a single-word
        # entity would be `sentence_length - entity_2_index`; the extra -1 is
        # kept so values match previously generated feature files - confirm.
        final_df["no_words_after_entity_2"] = final_df["sentence_length"] - final_df["entity_2_index"] - 1

        # Root-entity distances for each sentence.
        # NOTE(review): root_index is 0-based while the entity indices are
        # 1-based, so both distances carry a constant offset of one; left
        # unchanged for compatibility with existing feature files - confirm.
        final_df["entity_1_root_distance"] = final_df["root_index"] - final_df["entity_1_index"]
        final_df["entity_2_root_distance"] = final_df["entity_2_index"] - final_df["root_index"]

        # Expand the (first, second) tuples into separate columns.
        final_df[['entity 1 name', 'entity 2 name']] = final_df['entity_pairs'].apply(pd.Series)
        final_df[['entity_type_1', 'entity_type_2']] = final_df['ent_type_pairs'].apply(pd.Series)
        final_df[['entity_pos_1', 'entity_pos_2']] = final_df['pos_pairs'].apply(pd.Series)
        final_df[['entity_dep_1', 'entity_dep_2']] = final_df['dep_pairs'].apply(pd.Series)
        final_df[['entity_tag_1', 'entity_tag_2']] = final_df['tag_pairs'].apply(pd.Series)

        # The dependency graph is keyed by spaCy's 0-based token.i, so the
        # 1-based entity positions are shifted down by one. Passing them
        # unshifted measured the path between the tokens *after* the entities.
        # Unknown positions (-1) stay outside the graph and yield the sentinel.
        def _graph_node(position):
            return position - 1 if position > 0 else -1

        final_df[["shortest_distance", "shortest_path"]] = final_df.apply(
            lambda x: generateNetwork(
                x["sentence"],
                _graph_node(x["entity_1_index"]),
                _graph_node(x["entity_2_index"]),
            ),
            axis=1,
            result_type="expand",
        )

        # Multi-token mentions carry "type-type-..."; keep the first token's type.
        final_df["entity_type_1"] = final_df["entity_type_1"].apply(lambda x: x.split("-")[0])
        final_df["entity_type_2"] = final_df["entity_type_2"].apply(lambda x: x.split("-")[0])

        final_df = final_df.drop(
            columns=["entity_pairs", "ent_type_pairs", "pos_pairs", "dep_pairs", "sentence", "tag_pairs"]
        )
        return final_df
    except Exception:
        # Best effort per file: report and return an empty frame so the caller
        # can continue. `Exception` (not a bare except) keeps Ctrl-C working.
        print("Failed to execute file : ", file)
        print("Error : ", sys.exc_info())
        print(traceback.format_exc())
        return pd.DataFrame()

In [None]:
# Names of the per-pair feature columns, followed by the 'label' target.
# NOTE(review): no cell visible here reads this list; presumably it is meant
# for selecting columns further down the notebook - confirm.
columns = [
    'sent_index',
    'sentence_length',
    'root_word',
    'root_index',
    'entity_1_index',
    'entity_2_index',
    'entity_distance',
    'no_words_before_entity_1',
    'no_words_after_entity_2',
    'entity_1_root_distance',
    'entity_2_root_distance',
    'entity 1 name',
    'entity 2 name',
    'entity_type_1',
    'entity_type_2',
    'entity_pos_1',
    'entity_pos_2',
    'entity_dep_1',
    'entity_dep_2',
    'entity_tag_1',
    'entity_tag_2',
    'shortest_distance',
    'label',
]

def getFeaturesWithlabels(relation_file, tagged_tokens_file):
    """Join the entity-pair features of one document with its relation labels.

    Features come from ``getFeatures(tagged_tokens_file)``; the relations are
    read from ``relation_file`` with ``pd.read_table`` and joined on the
    ``entity 1 name`` / ``entity 2 name`` columns. Duplicate pairs within a
    sentence are reduced to their first occurrence, and ``label`` is 1 where
    the relation's ``span`` value is a string and 0 otherwise.

    Args:
        relation_file: Path of the ground-truth relation table.
        tagged_tokens_file: Path of the tagged-token table for the same document.

    Returns:
        The labelled feature DataFrame, or an empty DataFrame when any step
        fails (the error is printed, not raised).
    """
    # Bound before the try so the error handler can always display it.
    final_df = pd.DataFrame()
    try:
        final_df = getFeatures(tagged_tokens_file)
        relations = pd.read_table(relation_file)
        # NOTE(review): this is an inner join, so entity pairs that do not
        # appear in the relation file are dropped rather than labelled 0.
        # If negatives are expected from unmatched pairs, how="left" is
        # probably intended - confirm against the ground-truth format.
        result = final_df.merge(relations, on=["entity 1 name", "entity 2 name"], sort=False)
        result = result.drop_duplicates(
            subset=["sent_index", "entity 1 name", "entity 2 name"],
            keep='first',
            ignore_index=True,
        )
        if "span" not in result.keys():
            # Show what was joined before the lookup below fails.
            display(result)
        result["label"] = result["span"].apply(lambda x: 1 if isinstance(x, str) else 0)
        return result
    except Exception:
        # Best effort per document; `Exception` (not a bare except) keeps
        # Ctrl-C / kernel interrupt working during long runs.
        print("Skipping run for : ", relation_file, tagged_tokens_file)
        print(sys.exc_info())
        display(final_df)
        print(traceback.format_exc())
        return pd.DataFrame()
    

In [None]:
# Dataset to process.
dataset = "ACE2005"
# Folder that holds one sub-folder per dataset. Set the RELEX_DATA_ROOT
# environment variable to run on another machine; the default is the
# original location, so existing runs behave exactly as before.
relex_data_root = os.environ.get(
    "RELEX_DATA_ROOT",
    "/Users/anishajauhari/Desktop/Sem 4/Independent Study /Dataset/relex",
)
# Relation tables and tagged-token tables; files are matched by name.
ground_truth_path = f"{relex_data_root}/{dataset}/ground_truth"
tagged_token_path = f"{relex_data_root}/{dataset}/tagged_tokens"

In [None]:
# Build the labelled feature table for every tagged-token CSV of the dataset.
# The working directory is still switched, as before, in case later cells
# rely on it; the listing itself no longer depends on it.
os.chdir(tagged_token_path)
per_file_features = []
# sorted(): os.listdir() returns names in arbitrary order, so sorting makes
# the row order of the result reproducible between runs.
for file in sorted(os.listdir(tagged_token_path)):
    if file.endswith(".csv"):
        print("File : ", file)
        # The relation file carries the same name as the token file.
        tagged_tokens_file = f"{tagged_token_path}/{file}"
        relation_file = f"{ground_truth_path}/{file}"
        per_file_features.append(getFeaturesWithlabels(relation_file, tagged_tokens_file))

# Concatenate once at the end; growing a frame inside the loop copies all
# previously collected rows on every iteration.
features_with_labels = pd.concat(per_file_features) if per_file_features else pd.DataFrame()

display(features_with_labels)

In [None]:
# Write the labelled feature table for the selected dataset to a CSV file.
features_csv_path = (
    "/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/n_features_"
    + dataset
    + ".csv"
)
features_with_labels.to_csv(features_csv_path)