In [154]:
import os
import warnings

import pandas as pd
from itertools import combinations

import networkx as nx
import spacy

from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')
pd.set_option('max_colwidth', 400)


In [155]:
nlp = spacy.load("en_core_web_sm")

In [177]:
def generateNetwork(sentence, ent1_word_idx, ent2_word_idx):
    doc = nlp(sentence)
    edges = []
    for word_idx, token in enumerate(doc):
        for child in token.children:
            edges.append((token.i,
                      child.i))
            
    graph = nx.Graph(edges)
    return nx.shortest_path_length(graph, source=ent1_word_idx, target=ent2_word_idx), nx.shortest_path(graph, source=ent1_word_idx, target=ent2_word_idx)

In [181]:
def getFeatures(file):
#     tagged_tokens_header = ["text", "pos", "dep", "head", "begin", "end", "word_index", "sent_index", "entity_id", "ent_type"]
    df = pd.read_table(file)
    
    ## Calculating entity-entity distances per sentence
    sent_df = df.where(~df["entity type"].isna()).groupby(["sent_index"], as_index = False).agg({'text': list, 'entity type': list, 'pos' : list, 'tag' : list, 'dep': list})
    text_pairs = []
    ent_type_pairs = []
    pos_pairs = []
    tag_pairs = []
    dep_pairs = []
    for index, row in sent_df.iterrows():
        text_pairs.append(list(combinations(row["text"], r=2)))
        ent_type_pairs.append(list(combinations(row["entity type"], r=2)))
        pos_pairs.append(list(combinations(row["pos"], r=2)))
        dep_pairs.append(list(combinations(row["dep"], r=2)))
        tag_pairs.append(list(combinations(row["tag"], r=2)))
        
    sent_entity_df = pd.DataFrame()
    sent_entity_df["sent_index"] = sent_df["sent_index"]
    sent_entity_df["entity_pairs"] = text_pairs
    sent_entity_df["ent_type_pairs"] = ent_type_pairs
    sent_entity_df["pos_pairs"] = pos_pairs
    sent_entity_df["dep_pairs"] = dep_pairs
    sent_entity_df["tag_pairs"] = tag_pairs


    final_df = sent_entity_df.set_index('sent_index').apply(lambda x: x.apply(pd.Series).stack()).reset_index().drop('level_1', 1)
    
    sentences = pd.DataFrame()
    text = []
    root_word = ""
    root_word_index = -1
    curr_sent = 0
    for index, row in df.iterrows():
        if str(row["text"]).strip() == ".":
            text.append(row["text"])
            df_temp = {"sent_index": curr_sent, "sentence": text, "sentence_length": len(text), "root_word": root_word, "root_index": root_word_index}
            sentences = sentences.append(df_temp, ignore_index=True)
            text = []
            root_word = ""
            root_word_index = -1
            curr_sent = curr_sent + 1
            continue
        elif str(row["dep"]).strip() == "ROOT":
            root_word = row["text"]
            root_word_index = len(text) + 1
            text.append(row["text"])
        else:
            text.append(row["text"])

    final_df = final_df.join(sentences, on="sent_index", lsuffix="final", rsuffix="root")
    final_df = final_df.rename({"sent_indexfinal": "sent_index"}, axis='columns')
    final_df["entity_1_index"] = final_df.apply(lambda x : x['sentence'].index(x['entity_pairs'][0]) + 1, axis=1)
    final_df["entity_2_index"] = final_df.apply(lambda x : x['sentence'].index(x['entity_pairs'][1]) + 1, axis=1)
    final_df["entity_distance"] = final_df.apply(lambda x : x['entity_2_index'] - x['entity_1_index'], axis=1)
    final_df["no_words_before_entity_1"] = final_df["entity_1_index"] - 1 
    final_df["no_words_after_entity_2"] = final_df["sentence_length"] - final_df["entity_2_index"] - 1
    ## Calculating root-entity distances for each sentence
    final_df["entity_1_root_distance"] = final_df.apply(lambda x: x['root_index'] - x['entity_1_index'], axis=1)
    final_df["entity_2_root_distance"] = final_df.apply(lambda x: x['entity_2_index'] - x['root_index'], axis=1)
    
    # Expand tuples
    final_df[['entity_1', 'entity_2']] = final_df['entity_pairs'].apply(pd.Series)
    
    final_df[['entity_type_1', 'entity_type_2']] = final_df['ent_type_pairs'].apply(pd.Series)
    final_df[['entity_pos_1', 'entity_pos_2']] = final_df['pos_pairs'].apply(pd.Series)
    final_df[['entity_dep_1', 'entity_dep_2']] = final_df['dep_pairs'].apply(pd.Series)
    final_df[['entity_tag_1', 'entity_tag_2']] = final_df['tag_pairs'].apply(pd.Series)

    final_df[["shortest_distance", "shortest_path"]] = final_df.apply(lambda x : generateNetwork(' '.join(x["sentence"]), x["entity_1_index"], x["entity_2_index"]), axis = 1, result_type="expand")
        
    final_df = final_df.drop(["sent_indexroot", "entity_pairs", "ent_type_pairs", "pos_pairs", "dep_pairs", "sentence", "tag_pairs"], axis = 1)
    return final_df

In [190]:
## TODO : Ask about this
def getlabels(label_file, df):
    relations = pd.read_table(label_file)
    display(relations, 4)
    #entity 1 name entity 2 name
#     relations = df.apply(lambda x : )
    

In [191]:
## For testing
file = "/Users/anishajauhari/Desktop/Independent Study /Dataset/relex/re3d/tagged_tokens/001C9C3F3DFE16B4921B1E906F66E161.csv"
final_df = getFeatures(file)
label_file = "/Users/anishajauhari/Desktop/Independent Study /Dataset/relex/re3d/ground_truth/001C9C3F3DFE16B4921B1E906F66E161.csv"
labels = getlabels(label_file, final_df)
display(final_df)

## TODO : get the relation exists or not column

## Entire Dataset
# ground_truth_path = "/Users/anishajauhari/Desktop/Independent Study /Dataset/relex/re3d/ground_truth"
# tagged_token_path = "/Users/anishajauhari/Desktop/Independent Study /Dataset/relex/re3d/tagged_tokens"

# os.chdir(tagged_token_path)
# for file in os.listdir():
#     if file.endswith(".csv"):
#         file_path = f"{tagged_token_path}/{file}"
#         print("File : ", file_path)
#         display(calculate_distance(file_path))

Unnamed: 0,rel type,rel subtype,entity 1 ID,entity 2 ID,entity 1 mention ID,entity 2 mention ID,entity 1 name,entity 2 name,entity 1 type,entity 2 type,same_sent,same_sent.1,distance,pos_pattern,shortest_dep_path,span
0,CoLocated,CoLocated,001C9C3F3DFE16B4921B1E906F66E161-0-15-35-Organisation,001C9C3F3DFE16B4921B1E906F66E161-0-39-44-Location,001C9C3F3DFE16B4921B1E906F66E161-0-15-35-Organisation,001C9C3F3DFE16B4921B1E906F66E161-0-39-44-Location,U.S. service members,Syria,Organisation,Location,True,True,2.0,ADP,2.0,U.S. service members in Syria
1,CommWith,CommWith,001C9C3F3DFE16B4921B1E906F66E161-1-12-23-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-12-23-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,U.S. forces,Syrian Democratic Forces,Organisation,Organisation,True,True,5.0,AUX-VERB-CCONJ-VERB,3.0,U.S. forces are advising and assisting Syrian Democratic Forces
2,CommWith,CommWith,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-12-23-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-12-23-Organisation,Syrian Democratic Forces,U.S. forces,Organisation,Organisation,True,True,5.0,AUX-VERB-CCONJ-VERB,3.0,U.S. forces are advising and assisting Syrian Democratic Forces
3,IsSynonymOf,IsSynonymOf,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-77-80-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-77-80-Organisation,Syrian Democratic Forces,SDF,Organisation,Organisation,True,True,1.0,,1.0,Syrian Democratic Forces ( SDF
4,IsSynonymOf,IsSynonymOf,001C9C3F3DFE16B4921B1E906F66E161-1-77-80-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-77-80-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-51-75-Organisation,SDF,Syrian Democratic Forces,Organisation,Organisation,True,True,1.0,,1.0,Syrian Democratic Forces ( SDF
5,CoLocated,CoLocated,001C9C3F3DFE16B4921B1E906F66E161-1-93-111-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-125-157-Location,001C9C3F3DFE16B4921B1E906F66E161-1-93-111-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-125-157-Location,Syrian Arab forces,the northern part of the country,Organisation,Location,True,True,3.0,VERB-ADP,3.0,Syrian Arab forces operating in the northern part of the country
6,FightingAgainst,FightingAgainst,001C9C3F3DFE16B4921B1E906F66E161-1-93-111-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-208-213-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-93-111-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-208-213-Organisation,Syrian Arab forces,Daesh,Organisation,Organisation,True,True,13.0,VERB-ADP-DET-PRON-AUX-VERB-NOUN-ADP-DET-NOUN-PART-VERB,4.0,Syrian Arab forces operating in the northern part of the country who are leading efforts in the campaign to defeat Daesh
7,BelongsTo,BelongsTo,001C9C3F3DFE16B4921B1E906F66E161-1-93-111-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-77-80-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-93-111-Organisation,001C9C3F3DFE16B4921B1E906F66E161-1-77-80-Organisation,Syrian Arab forces,SDF,Organisation,Organisation,True,True,2.0,ADV,4.0,"SDF ) , primarily Syrian Arab forces"
8,IsSynonymOf,IsSynonymOf,001C9C3F3DFE16B4921B1E906F66E161-3-0-12-Person,001C9C3F3DFE16B4921B1E906F66E161-3-14-47-Person,001C9C3F3DFE16B4921B1E906F66E161-3-0-12-Person,001C9C3F3DFE16B4921B1E906F66E161-3-14-47-Person,Joseph Votel,commander of U.S. Central Command,Person,Person,True,True,1.0,,1.0,"Joseph Votel , commander of U.S. Central Command"
9,FightingAgainst,FightingAgainst,001C9C3F3DFE16B4921B1E906F66E161-3-131-133-Organisation,001C9C3F3DFE16B4921B1E906F66E161-3-125-130-Organisation,001C9C3F3DFE16B4921B1E906F66E161-3-131-133-Organisation,001C9C3F3DFE16B4921B1E906F66E161-3-125-130-Organisation,we,Daesh,Organisation,Organisation,True,True,1.0,,3.0,Daesh we


4

Unnamed: 0,sent_index,sentence_length,root_word,root_index,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,...,entity_type_1,entity_type_2,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,entity_tag_1,entity_tag_2,shortest_distance,shortest_path
0,0.0,24.0,shown,8.0,3,4,1,2,19.0,5.0,...,Organisation,Organisation,PROPN,NOUN,compound,compound,NNP,NN,1,"[3, 4]"
1,0.0,24.0,shown,8.0,3,5,2,2,18.0,5.0,...,Organisation,Organisation,PROPN,NOUN,compound,pobj,NNP,NNS,2,"[3, 4, 5]"
2,0.0,24.0,shown,8.0,3,7,4,2,16.0,5.0,...,Organisation,Location,PROPN,PROPN,compound,pobj,NNP,NNP,4,"[3, 4, 1, 0, 7]"
3,0.0,24.0,shown,8.0,3,13,10,2,10.0,5.0,...,Organisation,Organisation,PROPN,DET,compound,det,NNP,DT,10,"[3, 4, 1, 0, 7, 8, 9, 10, 11, 14, 13]"
4,0.0,24.0,shown,8.0,3,14,11,2,9.0,5.0,...,Organisation,Organisation,PROPN,ADJ,compound,compound,NNP,JJ,9,"[3, 4, 1, 0, 7, 8, 9, 10, 11, 14]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530,6.0,22.0,remain,2.0,8,19,11,7,2.0,-6.0,...,Organisation,Location,DET,PROPN,poss,pobj,PRP$,NNP,10,"[8, 9, 6, 5, 11, 14, 15, 16, 17, 18, 19]"
531,6.0,22.0,remain,2.0,8,21,13,7,0.0,-6.0,...,Organisation,Location,DET,PROPN,poss,conj,PRP$,NNP,7,"[8, 9, 6, 5, 3, 2, 1, 21]"
532,6.0,22.0,remain,2.0,17,19,2,16,2.0,-15.0,...,Organisation,Location,PROPN,PROPN,pobj,pobj,NNP,NNP,2,"[17, 18, 19]"
533,6.0,22.0,remain,2.0,17,21,4,16,0.0,-15.0,...,Organisation,Location,PROPN,PROPN,pobj,conj,NNP,NNP,9,"[17, 16, 15, 14, 11, 5, 3, 2, 1, 21]"


In [189]:

def logistic_regression(x_train, x_test, y_train, y_test):
    print("--------Logistic Regression-----------")
    lr =  LogisticRegression(n_jobs=4)
    lr.fit(x_train, y_train)
    y_pred=lr.predict(x_test)
    print("Coefficients: ", lr.coef_)
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    print("Precision: ", metrics.precision_score(y_test, y_pred, average="binary", pos_label=1))
    print("Recall: ", metrics.recall_score(y_test, y_pred, average="binary", pos_label=1))
    print("F-Score: ", metrics.f1_score(y_test, y_pred, average="binary", pos_label=1))
    print(classification_report(y_test, y_pred))


In [186]:
x_train, x_test, y_train, y_test = train_test_split(final_df.iloc[:,:-1], final_df.iloc[:,-1], test_size=0.2, stratify=final_df.iloc[:,-1])
logistic_regression(x_train, x_test, y_train, y_test)

ValueError: The least populated class in y has only 1 member, which is too few. The minimum number of groups for any class cannot be less than 2.