In [79]:
import pandas as pd
import numpy as np
import pickle
import warnings
from sklearn.preprocessing import LabelEncoder

%run models.ipynb
# Notebook display settings: show at most 10 rows, but never truncate the
# number of columns, the line width, or the contents of a cell.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None is the supported "no limit" value; passing -1 is deprecated since pandas 1.0.
pd.set_option('display.max_colwidth', None)

# NOTE(review): this silences every warning category for the whole session,
# including deprecation and pandas copy warnings — consider narrowing it.
warnings.filterwarnings("ignore")

In [80]:
dataset = "re3d"
# Columns kept from the features CSV. 'label' is listed last; later cells
# split inputs from target by column position.
columns = ['sent_index', 'sentence_length', 'root_word', 'root_index',
       'entity_1_index', 'entity_2_index', 'entity_distance',
       'no_words_before_entity_1', 'no_words_after_entity_2',
       'entity_1_root_distance', 'entity_2_root_distance', 'entity 1 name',
       'entity 2 name', 'entity 1 type', 'entity 2 type', 'entity_pos_1',
       'entity_pos_2', 'entity_dep_1', 'entity_dep_2', 'entity_tag_1',
       'entity_tag_2', 'shortest_distance', 'label']

# NOTE(review): absolute, machine-specific location — make this configurable
# (e.g. relative to the repository) before sharing the notebook.
FEATURES_DIR = "/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/"
features = pd.read_csv(FEATURES_DIR + "features_" + dataset + ".csv", index_col=0)

# Take an explicit copy of the column subset so the new columns below are
# added to an independent frame rather than to a slice of the loaded one.
features = features[columns].copy()

# Working copies of the entity types with missing values replaced by "MISC".
# The raw 'entity 1 type' / 'entity 2 type' columns are kept in the frame.
features['entity_type_1'] = features['entity 1 type'].fillna("MISC")
features['entity_type_2'] = features['entity 2 type'].fillna("MISC")

## Feature Encoding

In [81]:
# NOTE(review): not referenced by recode_entity_types, and its abbreviations
# ("PER", "LOC", ...) differ from the categories that function returns.
ENT_TYPES = ["PER", "GPE", "LOC", "ORG", "TIME", "NUM", "MISC"]

# Lower-cased raw spelling -> canonical entity category.
_ENTITY_TYPE_ALIASES = {
    "person": "PERSON",
    "per": "PERSON",
    "organisation": "ORGANISATION",
    "org": "ORGANISATION",
    "time": "TIME",
    "number": "NUMBER",
    "num": "NUMBER",
    "gpe": "GPE",
    "location": "LOCATION",
    "loc": "LOCATION",
}


def recode_entity_types(value):
    """Map a raw entity-type string onto one of the coarse categories.

    Matching is case-insensitive and ignores surrounding whitespace.

    Parameters
    ----------
    value : str
        Raw entity type, e.g. "Person", "org", "Location".

    Returns
    -------
    str
        One of "PERSON", "ORGANISATION", "TIME", "NUMBER", "GPE", "LOCATION",
        or "MISC" for any unrecognised spelling. Non-string input (None, NaN)
        is also mapped to "MISC" instead of raising.
    """
    if not isinstance(value, str):
        # Missing values arrive as None / float NaN; treat them as unknown.
        return "MISC"
    return _ENTITY_TYPE_ALIASES.get(value.strip().lower(), "MISC")

In [82]:
# Normalise both entity-type columns: keep the text before the first "-" and
# map it onto a coarse category with recode_entity_types.
for type_column in ("entity_type_1", "entity_type_2"):
    features[type_column] = features[type_column].apply(
        lambda raw_type: recode_entity_types(raw_type.split("-")[0])
    )

In [83]:
# Build the "<type 1>-<type 2>" target with vectorised string concatenation
# instead of a row-by-row apply. This replaces the 'label' column that was
# loaded from the CSV.
features["label"] = features["entity_type_1"] + "-" + features["entity_type_2"]

In [84]:
# Compute the distinct labels once, then report how many there are and which.
unique_labels = features['label'].unique()
print(f"Number of unique entity-type pairs : {len(unique_labels)}")
print(f"Unique entity-type pairs : {unique_labels}")

Number of unique entity-type-paris : 14
Number of unique entity-type-paris : ['MISC-ORGANISATION' 'PERSON-PERSON' 'MISC-LOCATION'
 'ORGANISATION-LOCATION' 'LOCATION-LOCATION' 'ORGANISATION-ORGANISATION'
 'ORGANISATION-PERSON' 'PERSON-ORGANISATION' 'ORGANISATION-MISC'
 'MISC-MISC' 'PERSON-LOCATION' 'LOCATION-ORGANISATION' 'PERSON-MISC'
 'LOCATION-MISC']


In [85]:
# Remove the helper columns now that 'label' has been built from them.
# Rebinding (rather than inplace=True) plus errors="ignore" keeps this cell
# safe to run more than once.
features = features.drop(columns=["entity_type_1", "entity_type_2"], errors="ignore")

In [86]:
# Inspect the relabelled frame (display is truncated by the max_rows option).
features

Unnamed: 0,sent_index,sentence_length,root_word,root_index,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,entity_2_root_distance,entity 1 name,entity 2 name,entity 1 type,entity 2 type,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,entity_tag_1,entity_tag_2,shortest_distance,label
0,0.0,53,said,16,6,45,39,5,7,10,29,chemical weapons,Daesh,Weapon,Organisation,ADJ-NOUN,PROPN,amod-dobj,nsubj,JJ-NNS,NNP,8,MISC-ORGANISATION
1,2.0,23,shows,9,4,7,3,3,15,5,-2,chemical weapons,Daesh,Weapon,Organisation,ADJ-NOUN,PROPN,amod-pobj,pobj,JJ-NNS,NNP,3,MISC-ORGANISATION
2,0.0,53,said,16,10,14,4,9,38,6,-2,The Foreign Secretary,Boris Johnson,Person,Person,DET-PROPN-PROPN,PROPN-PROPN,det-compound-nsubj,compound-appos,DT-NNP-NNP,NNP-NNP,2,PERSON-PERSON
3,2.0,23,shows,9,4,9,5,3,13,5,0,chemical weapons,Marea,Weapon,Location,ADJ-NOUN,PROPN,amod-pobj,pobj,JJ-NNS,NNP,3,MISC-LOCATION
4,2.0,23,shows,9,7,9,2,6,13,2,0,Daesh,Marea,Organisation,Location,PROPN,PROPN,pobj,pobj,NNP,NNP,2,ORGANISATION-LOCATION
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,49.0,24,symbolises,1,1,8,7,0,15,0,7,Karim,the region,Location,Location,PROPN,DET-NOUN,nsubj,det-pobj,NNP,DT-NN,4,LOCATION-LOCATION
0,1.0,16,said,13,5,3,-2,4,12,8,-10,Gareth Bayley,the UK Special Representative for Syria,Person,Person,PROPN-PROPN,DET-PROPN-PROPN-PROPN-ADP-PROPN,compound-dobj,det-compound-compound-appos-prep-pobj,NNP-NNP,DT-NNP-NNP-NNP-IN-NNP,1,PERSON-PERSON
0,0.0,54,provide,18,2,11,9,1,42,16,-7,Deputy Secretary of State Antony Blinken,the Manama Dialogue,Person,Organisation,PROPN-PROPN-ADP-PROPN-PROPN-PROPN,DET-PROPN-PROPN,nmod-nmod-prep-pobj-compound-nsubj,det-compound-pobj,NNP-NNP-IN-NNP-NNP-NNP,DT-NNP-NNP,5,PERSON-ORGANISATION
1,3.0,38,includes,5,20,14,-6,19,23,-15,9,SRTF,the Syria Recovery Trust Fund,Organisation,Organisation,PROPN,DET-PROPN-PROPN-PROPN-PROPN,appos,det-compound-compound-compound-pobj,NNP,DT-NNP-NNP-NNP-NNP,3,ORGANISATION-ORGANISATION


## Multiclass Classification

In [87]:
# Keep only labels with at least this many rows.
# NOTE(review): presumably chosen so the stratified train/test split further
# down has enough rows per class — confirm.
MIN_ROWS_PER_LABEL = 3
features = features.groupby('label').filter(lambda group: len(group) >= MIN_ROWS_PER_LABEL)

In [88]:
# Number of rows per entity-type-pair label.
features["label"].value_counts()

ORGANISATION-ORGANISATION    167
ORGANISATION-LOCATION        153
LOCATION-LOCATION            53 
PERSON-ORGANISATION          53 
PERSON-LOCATION              47 
                             .. 
ORGANISATION-MISC            27 
MISC-MISC                    12 
MISC-ORGANISATION            11 
ORGANISATION-PERSON          3  
PERSON-MISC                  3  
Name: label, Length: 12, dtype: int64

In [91]:
# Inputs are every column except the last; the target is the last column
# (expected to be 'label'). Missing feature values become empty strings.
# NOTE(review): X still contains 'entity 1 type' / 'entity 2 type', the columns
# the label was derived from — confirm this is intended and not target leakage.
X = features.iloc[:, :-1].fillna("")
y_original = features.iloc[:, -1]

# Fixed seed so the stratified 80/20 split is reproducible across runs.
RANDOM_STATE = 42
# train_test_split and OneHotEncoder are not imported in this notebook;
# presumably they are brought in by `%run models.ipynb` — TODO confirm.
x_train_original, x_test_original, y_train_original, y_test_original = train_test_split(
    X, y_original, test_size=0.2, stratify=y_original, random_state=RANDOM_STATE
)

# One-hot encode every feature column; categories unseen during fit are
# encoded as all zeros at transform time (handle_unknown="ignore").
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword.
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
x_train = onehot_encoder.fit_transform(x_train_original)
x_test = onehot_encoder.transform(x_test_original)

# Encode the string labels as integers, fitting on the training labels only.
le = LabelEncoder()
y_train = le.fit_transform(y_train_original)
y_test = le.transform(y_test_original)
# Label string -> integer code, for decoding predictions later.
le_name_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
le_name_mapping

{'LOCATION-LOCATION': 0,
 'MISC-LOCATION': 1,
 'MISC-MISC': 2,
 'MISC-ORGANISATION': 3,
 'ORGANISATION-LOCATION': 4,
 'ORGANISATION-MISC': 5,
 'ORGANISATION-ORGANISATION': 6,
 'ORGANISATION-PERSON': 7,
 'PERSON-LOCATION': 8,
 'PERSON-MISC': 9,
 'PERSON-ORGANISATION': 10,
 'PERSON-PERSON': 11}

In [92]:
# Tally how many rows carry each label in the full (pre-split) target.
unique, count = np.unique(y_original, return_counts=True)
{label_name: n_rows for label_name, n_rows in zip(unique, count)}

{'LOCATION-LOCATION': 53,
 'MISC-LOCATION': 32,
 'MISC-MISC': 12,
 'MISC-ORGANISATION': 11,
 'ORGANISATION-LOCATION': 153,
 'ORGANISATION-MISC': 27,
 'ORGANISATION-ORGANISATION': 167,
 'ORGANISATION-PERSON': 3,
 'PERSON-LOCATION': 47,
 'PERSON-MISC': 3,
 'PERSON-ORGANISATION': 53,
 'PERSON-PERSON': 41}

In [67]:
pred = logistic_regression_multiple_classes(x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_log_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_log_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_logisitic_regression_prediction.csv")

--------Logistic Regression-----------


In [68]:
pred = random_forest_multiple(x_train, x_test, y_train, y_test,"et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_rf_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_rf_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_random_forest_prediction.csv")


--------Random Forest-----------


In [69]:
pred = svm_multiple("linear", x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_svm_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_svm_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_svm_prediction.csv")

--------Support Vector Classifier linear-----------


In [70]:
pred = svm_multiple("rbf", x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_svmrbf_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_svmrbf_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_svm_rbf_prediction.csv")

--------Support Vector Classifier rbf-----------


In [71]:
pred = svm_multiple("poly", x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_svmpoly_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_svmpoly_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_svm_poly_prediction.csv")


--------Support Vector Classifier poly-----------


In [72]:
pred = xgboost_multiple_class(x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_xg_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_xg_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_xgboost_prediction.csv")


In [73]:
pred = adaboost_multiple(x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so x_test_original is not mutated and this
# result is not shared with the other models' prediction frames.
# 'label' holds the integer-encoded true class; 'prediction' is whatever the
# model helper returned (presumably one predicted class per test row).
x_test_ada_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" but this is the multi-class "et"
# run — confirm it does not overwrite another experiment's output.
x_test_ada_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_adaboost_prediction.csv")
