# Assignment 2

In [90]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report

# Seed NumPy's global random number generator before any stochastic step runs.
np.random.seed(42)

# The corpus table is tab-separated. quoting=3 tells the parser to treat quote
# characters as ordinary text instead of as field delimiters.
data_path = "gum_refexp_data.tab"
facts = pd.read_csv(data_path, sep="\t", quoting=3)

# Quick sanity check: (rows, columns), followed by the first few records.
print(facts.shape)
facts.head()

(5168, 15)


Unnamed: 0,entity,head,pos,lemma,func,mentioned,position,s_type1,distance,next_func,s_type2,genre,doc,partition,label
0,person,your,PP$,your,nmod:poss,yes,7,imp,12,nmod:poss,imp,whow,GUM_whow_languages,train,pronoun
1,event,development,NN,development,obl,no,42,decl,42,nsubj:pass,decl,news,GUM_news_iodine,train,lexical
2,person,he,PP,he,nsubj,yes,5,decl,11,nsubj,decl,bio,GUM_bio_jerome,train,pronoun
3,person,Theodorus,NP,Theodorus,nsubj,yes,10,decl,11,nmod:poss,decl,bio,GUM_bio_theodorus,train,pronoun
4,person,his,PP$,his,nmod:poss,yes,10,decl,3,nmod:poss,decl,fiction,GUM_fiction_pag,train,pronoun


In [91]:
# prepare data

# Binary encodings: 1 when the entity was already mentioned / when the gold
# label is "pronoun", 0 otherwise.
facts["label_bin"] = np.where(facts["label"] == "pronoun", 1, 0)
facts["mentioned_bin"] = np.where(facts["mentioned"] == "yes", 1, 0)

# Model inputs used by the first round of classifiers.
features = ["mentioned_bin", "position", "distance"]

# NOTE: `df` is an alias of `facts`, not a copy -- both names refer to the same frame.
df = facts

# Row masks for the two partitions used in this notebook.
is_train = facts["partition"] == "train"
is_dev = facts["partition"] == "dev"

# Get train and dev
train, dev = df.loc[is_train], df.loc[is_dev]

X_train, y_train = train[features], train["label_bin"]
X_dev, y_dev = dev[features], dev["label_bin"]

In [53]:
# baseline: share of the more frequent class in the training labels

# Fraction of training rows labelled 1.
positive_share = sum(y_train) / len(y_train)

# predict 1
y_pred_1 = positive_share

# predict 0
y_pred_2 = 1 - positive_share

# Report whichever constant prediction covers more of the training labels.
print(max(y_pred_2, y_pred_1))

0.5807543520309477


In [54]:
# LR: logistic regression on the current feature set

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# fit() returns the fitted estimator, so construction and training can be chained.
reg = LogisticRegression().fit(X_train, y_train)

# Score on the held-out dev partition.
lr_preds = reg.predict(X_dev)

print(accuracy_score(y_dev, lr_preds))
print(classification_report(y_dev, lr_preds))

0.6802325581395349
              precision    recall  f1-score   support

           0       0.74      0.71      0.72       307
           1       0.60      0.64      0.62       209

   micro avg       0.68      0.68      0.68       516
   macro avg       0.67      0.67      0.67       516
weighted avg       0.68      0.68      0.68       516





In [55]:
# RF with the same features

from sklearn.ensemble import RandomForestClassifier

# A random forest is a stochastic learner. Pinning random_state makes the score
# below reproducible on its own, instead of depending on how much of NumPy's
# global random stream earlier cells happened to consume before this one ran.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Evaluate on the dev partition.
rf_preds = rf.predict(X_dev)

print(accuracy_score(y_dev, rf_preds))
print(classification_report(y_dev, rf_preds))

0.6124031007751938
              precision    recall  f1-score   support

           0       0.67      0.68      0.68       307
           1       0.52      0.51      0.52       209

   micro avg       0.61      0.61      0.61       516
   macro avg       0.60      0.60      0.60       516
weighted avg       0.61      0.61      0.61       516





In [56]:
# GB with the same features

from sklearn.ensemble import GradientBoostingClassifier

# random_state is pinned so that repeated runs of this cell give the same model
# regardless of the state of NumPy's global random generator.
gb = GradientBoostingClassifier(random_state=42)
gb.fit(X_train, y_train)

# Evaluate on the dev partition.
gb_preds = gb.predict(X_dev)

print(accuracy_score(y_dev, gb_preds))
print(classification_report(y_dev, gb_preds))

0.6841085271317829
              precision    recall  f1-score   support

           0       0.73      0.74      0.74       307
           1       0.61      0.60      0.61       209

   micro avg       0.68      0.68      0.68       516
   macro avg       0.67      0.67      0.67       516
weighted avg       0.68      0.68      0.68       516



In [57]:
# SVM with the three simple numerical features

from sklearn.svm import SVC

# Support vector classifier with library-default settings; fit() returns the estimator.
svm = SVC().fit(X_train, y_train)
svm_preds = svm.predict(X_dev)

# Overall accuracy first, then the per-class precision/recall/F1 table.
dev_accuracy = accuracy_score(y_dev, svm_preds)
dev_report = classification_report(y_dev, svm_preds)
print(dev_accuracy)
print(dev_report)



0.6531007751937985
              precision    recall  f1-score   support

           0       0.69      0.75      0.72       307
           1       0.58      0.51      0.54       209

   micro avg       0.65      0.65      0.65       516
   macro avg       0.64      0.63      0.63       516
weighted avg       0.65      0.65      0.65       516



Without changing any features in the training set, **Gradient Boosting** achieves an F1 similar to that of **Logistic Regression** (a weighted-average F1 of 0.68 on the dev set for both).

In [92]:
# simplify 'func' features, remove subtypes
# Only the part of each label before the first ":" is kept (e.g. "nmod:poss" -> "nmod").
# The columns are rewritten on `facts` by name: `df` is rebound to the one-hot
# frame further down, so reading `df['func']` here made a second run of this
# cell fail with a KeyError. The .str accessor also passes missing values
# through instead of raising on them.
for col in ("func", "next_func"):
    facts[col] = facts[col].str.split(":").str[0]

new_features = ["mentioned_bin","position","distance", "pos", "func", "next_func", "entity"]

# One-hot encode the string-valued columns among the selected features.
df = pd.get_dummies(facts[new_features])

# Re-attach the partition marker and the binary target next to the encoded features.
df_with_label = pd.concat([df, facts['partition'], facts['label_bin']], axis=1)

In [93]:
# Preview only the first rows; the full encoded frame is too large to display usefully.
df_with_label.head(10)

Unnamed: 0,mentioned_bin,position,distance,pos_'',pos_(,"pos_,",pos_:,pos_CD,pos_DT,pos_FW,...,entity_object,entity_organization,entity_person,entity_place,entity_plant,entity_quantity,entity_substance,entity_time,partition,label_bin
0,1,7,12,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,1
1,0,42,42,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,train,0
2,1,5,11,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,1
3,1,10,11,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,1
4,1,10,3,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,1
5,0,1,2,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,0
6,1,7,21,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,1
7,1,7,37,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,train,1
8,1,16,35,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,dev,1
9,1,8,21,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,test,0


In [94]:
# Get train and dev
train = df_with_label.loc[df_with_label["partition"] == "train"]
dev = df_with_label.loc[df_with_label["partition"] == "dev"]

# Every column except the partition marker and the target is a model input.
# The columns are dropped via the `columns=` keyword: passing the axis as a
# second positional argument (`drop(labels, 1)`) is rejected by pandas 2.x.
non_feature_cols = ['partition', 'label_bin']

X_train = train.drop(columns=non_feature_cols)
y_train = train["label_bin"]

X_dev = dev.drop(columns=non_feature_cols)
y_dev = dev["label_bin"]

In [95]:
# LR on the expanded, one-hot encoded feature set

# fit() returns the fitted estimator, so construction and training can be chained.
reg_1 = LogisticRegression().fit(X_train, y_train)

# Score on the dev partition.
lr_preds_1 = reg_1.predict(X_dev)

print(accuracy_score(y_dev, lr_preds_1))
print(classification_report(y_dev, lr_preds_1))

0.8372093023255814
              precision    recall  f1-score   support

           0       0.87      0.85      0.86       307
           1       0.79      0.81      0.80       209

   micro avg       0.84      0.84      0.84       516
   macro avg       0.83      0.83      0.83       516
weighted avg       0.84      0.84      0.84       516





In [96]:
# GB on the expanded, one-hot encoded feature set

# random_state is pinned so that repeated runs of this cell give the same model
# regardless of the state of NumPy's global random generator.
gb_1 = GradientBoostingClassifier(random_state=42)
gb_1.fit(X_train, y_train)

# Evaluate on the dev partition.
gb_preds_1 = gb_1.predict(X_dev)

print(accuracy_score(y_dev, gb_preds_1))
print(classification_report(y_dev, gb_preds_1))

0.8430232558139535
              precision    recall  f1-score   support

           0       0.87      0.87      0.87       307
           1       0.80      0.81      0.81       209

   micro avg       0.84      0.84      0.84       516
   macro avg       0.84      0.84      0.84       516
weighted avg       0.84      0.84      0.84       516



After grouping the `func` features and adding further features such as POS tags and entity information, the F1 increases to **0.84** with both LR and GB.