In [1]:
import pandas as pd
import numpy as np
import pickle
import warnings
from sklearn.preprocessing import LabelEncoder

# Run the companion notebook so that its definitions are available here.
# NOTE(review): the model helpers called further down (e.g.
# logistic_regression_multiple_classes, svm_multiple) are not defined in this
# notebook, and train_test_split / OneHotEncoder are used below without being
# imported here - presumably they all come from this %run; verify.
%run models.ipynb
# Notebook display settings: show at most 10 rows of a frame, but never hide
# columns, let pandas pick the output width, and never clip long cell text.
pd.set_option('display.max_rows', 10)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None means "no truncation". The old sentinel -1 was deprecated in pandas 1.0
# and is rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

# NOTE(review): this silences every warning for the whole session, including
# deprecation notices from pandas / scikit-learn; consider narrowing the filter.
warnings.filterwarnings("ignore")

In [37]:
dataset = "hlt"
# Columns kept from the feature file; 'label' is the last one.
columns = ['sent_index', 'sentence_length', 'root_word', 'root_index',
       'entity_1_index', 'entity_2_index', 'entity_distance',
       'no_words_before_entity_1', 'no_words_after_entity_2',
       'entity_1_root_distance', 'entity_2_root_distance', 'entity 1 name',
       'entity 2 name', 'entity_type_1', 'entity_type_2', 'entity_pos_1',
       'entity_pos_2', 'entity_dep_1', 'entity_dep_2', 'entity_tag_1',
       'entity_tag_2', 'shortest_distance', 'label']

# NOTE(review): absolute, machine-specific path - consider a configurable base directory.
features_path = "/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Features/features_" + dataset + ".csv"

# .copy() makes the column subset an independent frame, so the assignment
# below is not a chained assignment on a slice of the loaded frame.
features = pd.read_csv(features_path, index_col=0)[columns].copy()

# Rows without an entity type are put into the "MISC" category.
entity_type_columns = ['entity_type_1', 'entity_type_2']
features[entity_type_columns] = features[entity_type_columns].fillna("MISC")

# Preview goes last: only a cell's final expression is rendered.
features.head(10)

## Feature Encoding

In [38]:
# NOTE(review): not referenced anywhere in the visible notebook, and it lists
# short codes ("PER", "ORG", ...) while recode_entity_types returns long names.
ENT_TYPES = ["PER", "GPE", "LOC", "ORG", "TIME", "NUM", "MISC"]

# Lower-cased raw tag -> canonical entity-type name.
_ENTITY_TYPE_ALIASES = {
    "person": "PERSON",
    "per": "PERSON",
    "organisation": "ORGANISATION",
    "org": "ORGANISATION",
    "time": "TIME",
    "number": "NUMBER",
    "num": "NUMBER",
    "gpe": "GPE",
    "location": "LOCATION",
    "loc": "LOCATION",
}


def recode_entity_types(value):
    """Map a raw entity-type tag onto a canonical entity-type name.

    Matching is case-insensitive. Recognised tags map to one of "PERSON",
    "ORGANISATION", "TIME", "NUMBER", "GPE" or "LOCATION".

    Parameters
    ----------
    value : str
        Raw tag, e.g. "PER", "org", "Location".

    Returns
    -------
    str
        The canonical name, or "MISC" for any unrecognised tag and for
        non-string input such as None or NaN (which previously raised
        AttributeError on ``.lower()``).
    """
    if not isinstance(value, str):
        return "MISC"
    return _ENTITY_TYPE_ALIASES.get(value.lower(), "MISC")

In [39]:
# Normalise both entity-type columns: only the part before the first "-" of
# each value is passed to recode_entity_types.
for type_column in ("entity_type_1", "entity_type_2"):
    features[type_column] = features[type_column].apply(
        lambda raw_type: recode_entity_types(raw_type.split("-")[0])
    )

In [40]:
# Replace the 'label' column loaded from the CSV with the entity-type pair
# "<type 1>-<type 2>" of each row.
features["label"] = [
    f"{type_1}-{type_2}"
    for type_1, type_2 in zip(features["entity_type_1"], features["entity_type_2"])
]

In [41]:
# Report how many distinct entity-type pairs exist and which ones they are.
entity_type_pairs = features['label'].unique()
print(f"Number of unique entity-type-pairs : {len(entity_type_pairs)}")
print(f"Unique entity-type-pairs : {entity_type_pairs}")

Number of unique entity-type-paris : 1
Number of unique entity-type-paris : ['MISC-MISC']


In [42]:
# The two type columns are now folded into 'label'; remove them from the frame.
features = features.drop(columns=["entity_type_1", "entity_type_2"])

In [43]:
# Show the frame after the entity-type columns were dropped
# (rendering is limited by the display.max_rows option set at the top).
features

Unnamed: 0,sent_index,sentence_length,root_word,root_index,entity_1_index,entity_2_index,entity_distance,no_words_before_entity_1,no_words_after_entity_2,entity_1_root_distance,entity_2_root_distance,entity 1 name,entity 2 name,entity_pos_1,entity_pos_2,entity_dep_1,entity_dep_2,entity_tag_1,entity_tag_2,shortest_distance,label
0,0.0,32,acquire,14,10,16,6,9,15,4,2,Viacom,DreamWorks,PROPN,PROPN,compound,compound,NNP,NNP,3,MISC-MISC
1,1.0,23,established,3,6,19,13,5,3,-3,16,Andre Agassi,Las Vegas,PROPN-PROPN,PROPN-PROPN,compound-compound,compound-appos,NNP-NNP,NNP-NNP,6,MISC-MISC
2,5.0,24,started,1,7,20,13,6,3,-6,19,Andre Agassi,Las Vegas,PROPN-PROPN,PROPN-PROPN,compound-compound,compound-pobj,NNP-NNP,NNP-NNP,8,MISC-MISC
3,14.0,15,Agassi,1,1,13,12,0,1,0,12,Andre Agassi,Las Vegas,PROPN-PROPN,PROPN-PROPN,compound-ROOT,compound-pobj,NNP-NNP,NNP-NNP,3,MISC-MISC
4,17.0,16,born,3,1,12,11,0,3,2,9,Andre Agassi,Las Vegas,PROPN-PROPN,PROPN-PROPN,compound-nsubjpass,compound-pobj,NNP-NNP,NNP-NNP,3,MISC-MISC
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246,242.0,31,Watson,0,7,18,11,6,12,-7,18,Francis Crick,Nobel Prize,PROPN-PROPN,PROPN-PROPN,compound-ROOT,compound-dobj,NNP-NNP,NNP-NNP,-1,MISC-MISC
247,245.0,12,received,6,5,8,3,4,3,1,2,Francis Crick,Nobel Prize,PROPN-PROPN,PROPN-PROPN,compound-nsubj,compound-dobj,NNP-NNP,NNP-NNP,2,MISC-MISC
248,247.0,19,describe,5,4,14,10,3,4,1,9,Francis Crick,Nobel Prize,PROPN-PROPN,PROPN-PROPN,compound-conj,compound-dobj,NNP-NNP,NNP-NNP,4,MISC-MISC
249,248.0,24,discovered,11,10,22,12,9,1,1,11,Francis Crick,Nobel Prize,PROPN-PROPN,PROPN-PROPN,compound-conj,compound-dobj,NNP-NNP,NNP-NNP,5,MISC-MISC


## Multiclass Classification

In [44]:
# Keep only labels that occur at least MIN_ROWS_PER_LABEL times - presumably so
# every class has enough rows for the stratified train/test split below.
# (The bare value_counts() that used to precede this line was never rendered,
# because it was not the cell's last expression; the next cell shows the counts.)
MIN_ROWS_PER_LABEL = 3
features = features.groupby('label').filter(lambda group: len(group) >= MIN_ROWS_PER_LABEL)

In [45]:
# Rows per label after the rare labels were filtered out.
features["label"].value_counts()

MISC-MISC    251
Name: label, dtype: int64

In [46]:
# Everything except the target is a feature. Select by name rather than by
# position so this does not depend on 'label' being the last column.
X = features.drop(columns="label").fillna("")
y_original = features["label"]

# Fixed seed so the train/test partition, and every score computed on it,
# is reproducible across kernel restarts.
SPLIT_SEED = 42
x_train_original, x_test_original, y_train_original, y_test_original = train_test_split(
    X, y_original, test_size=0.2, stratify=y_original, random_state=SPLIT_SEED
)

# Every feature column is one-hot encoded. The encoder is fitted on the
# training split only; handle_unknown="ignore" lets transform() accept
# categories that appear only in the test split.
try:
    onehot_encoder = OneHotEncoder(sparse_output=False, handle_unknown="ignore")
except TypeError:
    # scikit-learn < 1.2 only knows the older `sparse` keyword.
    onehot_encoder = OneHotEncoder(sparse=False, handle_unknown="ignore")
x_train = onehot_encoder.fit_transform(x_train_original)
x_test = onehot_encoder.transform(x_test_original)

# Encode the string labels as integers, again fitting on the training split only.
le = LabelEncoder()
y_train = le.fit_transform(y_train_original)
y_test = le.transform(y_test_original)


In [47]:
# Tally how many rows each label has, shown as a {label: count} mapping.
unique, count = np.unique(y_original, return_counts=True)
{label: n_rows for label, n_rows in zip(unique, count)}

{'MISC-MISC': 251}

In [48]:
pred = logistic_regression_multiple_classes(x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_log_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple_classes
# helper produced these predictions - confirm this is the intended target file.
x_test_log_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_logisitic_regression_prediction.csv")

--------Logistic Regression-----------


ValueError: This solver needs samples of at least 2 classes in the data, but the data contains only one class: 0

In [49]:
pred = random_forest_multiple(x_train, x_test, y_train, y_test,"et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_rf_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple helper
# produced these predictions - confirm this is the intended target file.
x_test_rf_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_random_forest_prediction.csv")


--------Random Forest-----------


In [18]:
pred = svm_multiple("linear", x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_svm_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple helper
# produced these predictions - confirm this is the intended target file.
x_test_svm_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_svm_prediction.csv")

--------Support Vector Classifier linear-----------


In [19]:
pred = svm_multiple("rbf", x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_svmrbf_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple helper
# produced these predictions - confirm this is the intended target file.
x_test_svmrbf_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_svm_rbf_prediction.csv")

--------Support Vector Classifier rbf-----------


In [34]:
pred = svm_multiple("poly", x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_svmpoly_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple helper
# produced these predictions - confirm this is the intended target file.
x_test_svmpoly_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_svm_poly_prediction.csv")


--------Support Vector Classifier poly-----------


In [35]:
pred = xgboost_multiple_class(x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_xg_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple_class helper
# produced these predictions - confirm this is the intended target file.
x_test_xg_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_xgboost_prediction.csv")


In [36]:
pred = adaboost_multiple(x_train, x_test, y_train, y_test, "et")
# assign() returns a new frame, so the encoded label and the prediction are
# added to the export only; x_test_original itself is no longer mutated
# (plain `=` merely aliased it).
x_test_ada_pred = x_test_original.assign(label=y_test, prediction=pred)
# NOTE(review): the file name says "binary" although a *_multiple helper
# produced these predictions - confirm this is the intended target file.
x_test_ada_pred.to_csv("/Users/anishajauhari/Desktop/Sem 4/Independent Study /ResponsibleRelationExtraction/Predictions/binary_adaboost_prediction.csv")
