In [1]:
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import classification_report, confusion_matrix

## Accuracy of 28 best features on Baby BNC Corpus

In [2]:
# Read the Baby BNC corpus feature CSV; its first column is used as the index.
bnc_features_csv = '../data/baby_bnc_corpus_features.csv'
df_bnc = pd.read_csv(bnc_features_csv, index_col=0)

In [3]:
# Summary statistics with one row per column of df_bnc (transposed for readability).
df_bnc.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
avg_sen_len,2026.0,13.566206,7.248457,2.400000,8.000000,11.800000,17.400000,44.600000
std_sen_len,2026.0,6.744896,4.231069,0.489898,3.773592,5.710662,8.699885,37.626586
TTR,2026.0,0.728749,1.504812,-1.502650,-0.473126,0.375181,1.557037,9.340900
Root TTR,2026.0,0.662709,1.329799,-2.040984,-0.404161,0.516446,1.552887,5.470900
Log TTR,2026.0,0.650362,1.299088,-2.187525,-0.392280,0.536187,1.545283,5.563742
...,...,...,...,...,...,...,...,...
"('AUX', 'INTJ', 'before')",2026.0,0.000494,0.022217,0.000000,0.000000,0.000000,0.000000,1.000000
"('SYM', 'ADP', 'after')",2026.0,0.000494,0.022217,0.000000,0.000000,0.000000,0.000000,1.000000
"('ADJ', 'SYM', 'before')",2026.0,0.000494,0.022217,0.000000,0.000000,0.000000,0.000000,1.000000
"('SYM', 'NUM', 'before')",2026.0,0.000987,0.031411,0.000000,0.000000,0.000000,0.000000,1.000000


In [4]:
# Number of rows per value of the `label` column.
df_bnc['label'].value_counts()

label
1    1783
0     243
Name: count, dtype: int64

#### Since there are more fiction paragraphs than non-fiction paragraphs, we randomly sample 250 fiction paragraphs to test the best model's accuracy

In [8]:
# Load the pickled model from disk; `pickle` is imported in the notebook's
# top import cell so all dependencies are visible in one place.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file, so only load pickles that come from a trusted source.
model_path = "../resources/full_trained_final_model.pickle"
with open(model_path, 'rb') as fp:
    model = pickle.load(fp)

In [9]:
## Best feature set
# The order of the names is significant: it fixes the column order of the
# feature matrix built from this list further down.
# NOTE(review): the group comments below are a reading aid inferred from the
# names only -- confirm against the feature-extraction code.
feat = (
    # presumably lexical-diversity measures
    ['TTR', 'Maas TTR', 'VocD']
    # ratio features
    + ['adverb/pronoun', 'noun/verb']
    # presumably dependency-relation labels
    + [
        'mark', 'nsubj', 'nummod', 'acl:relcl', 'nmod:poss',
        'flat', 'fixed', 'aux:pass', 'obl:npmod', 'discourse',
    ]
    # stringified (tag, tag, 'before'/'after') triples
    + [
        "('VERB', 'ADV', 'before')",
        "('VERB', 'PROPN', 'after')",
        "('VERB', 'ADP', 'before')",
        "('ADJ', 'SCONJ', 'after')",
        "('VERB', 'PRON', 'before')",
        "('VERB', 'SCONJ', 'after')",
        "('PRON', 'VERB', 'before')",
        "('PRON', 'NOUN', 'before')",
        "('PROPN', 'NUM', 'before')",
        "('PROPN', 'PROPN', 'after')",
        "('VERB', 'NUM', 'before')",
    ]
    # remaining features
    + ['std_sen_len', 'content/function']
)

In [10]:
# Sanity check: number of entries in the selected feature list.
len(feat)

28

In [11]:
# Keep only the `id` and `label` columns plus the selected features,
# in the order given by `feat`.
selected_columns = ['id', 'label', *feat]
df_bnc_best_feat = df_bnc.loc[:, selected_columns]
df_bnc_best_feat.columns

Index(['id', 'label', 'TTR', 'Maas TTR', 'VocD', 'adverb/pronoun', 'noun/verb',
       'mark', 'nsubj', 'nummod', 'acl:relcl', 'nmod:poss', 'flat', 'fixed',
       'aux:pass', 'obl:npmod', 'discourse', '('VERB', 'ADV', 'before')',
       '('VERB', 'PROPN', 'after')', '('VERB', 'ADP', 'before')',
       '('ADJ', 'SCONJ', 'after')', '('VERB', 'PRON', 'before')',
       '('VERB', 'SCONJ', 'after')', '('PRON', 'VERB', 'before')',
       '('PRON', 'NOUN', 'before')', '('PROPN', 'NUM', 'before')',
       '('PROPN', 'PROPN', 'after')', '('VERB', 'NUM', 'before')',
       'std_sen_len', 'content/function'],
      dtype='object')

In [12]:
# Rows with label == 0 (the non-fiction class), renumbered from 0.
# reset_index(drop=True) discards the old index in one step: it avoids the
# temporary 'index' column, the in-place drop, and the KeyError the
# two-step form would raise if the original index ever carried a name.
df_non_fict = df_bnc_best_feat[df_bnc_best_feat.label == 0].reset_index(drop=True)

In [13]:
# Repeated evaluation of `model` on all label-0 rows plus a random subsample
# of label-1 rows, one run per random seed.
N_RUNS = 10               # number of random subsamples; seeds are 0 .. N_RUNS-1
N_FICTION_SAMPLES = 250   # label-1 (fiction) rows drawn per run

X_preds = {}         # seed -> predicted labels (despite the "X" in the name)
Y_org = {}           # seed -> true labels of the evaluated rows
scores = []          # model.score(...) per run
f1_fiction = []      # F1 of label 1 per run
f1_non_fiction = []  # F1 of label 0 per run

# Loop-invariant: the pool of label-1 rows does not depend on the seed,
# so filter it once instead of on every iteration.
df_fict_all = df_bnc_best_feat[df_bnc_best_feat.label == 1]

for i in range(N_RUNS):
    # random_state=i makes each draw reproducible and different per run.
    df_fict = df_fict_all.sample(N_FICTION_SAMPLES, random_state=i).reset_index(drop=True)
    df_final = pd.concat([df_non_fict, df_fict], join='outer', ignore_index=True)

    # Feature matrix = everything except the identifier and the target.
    X_test = df_final.drop(columns=['id', 'label'])
    print(X_test.shape)
    Y_test = df_final.label

    X_pred = model.predict(X_test)
    # NOTE(review): presumably mean accuracy (the scikit-learn classifier
    # default for .score) -- confirm against the pickled model's type.
    score_ = model.score(X_test, Y_test)

    # With output_dict=True the per-class entries are looked up by the
    # stringified label, hence '1' and '0'.
    report = classification_report(Y_test, X_pred, output_dict=True)
    f1_fiction.append(report['1']['f1-score'])
    f1_non_fiction.append(report['0']['f1-score'])
    scores.append(score_)
    X_preds[i] = X_pred
    Y_org[i] = Y_test

(493, 28)
(493, 28)
(493, 28)
(493, 28)
(493, 28)
(493, 28)
(493, 28)
(493, 28)
(493, 28)
(493, 28)


In [14]:
# Mean and standard deviation of the per-run scores, scaled to percent.
np.asarray(scores).mean() * 100, np.asarray(scores).std() * 100

(94.01622718052737, 1.035277903056781)

In [15]:
# Mean and standard deviation of the per-run F1 for label 1 (fiction).
np.asarray(f1_fiction).mean(), np.asarray(f1_fiction).std()

(0.9389943788878551, 0.011212039143020776)

In [16]:
# Mean and standard deviation of the per-run F1 for label 0 (non-fiction).
np.asarray(f1_non_fiction).mean(), np.asarray(f1_non_fiction).std()

(0.941273688019133, 0.009552670433019989)

In [29]:
# Per-run F1 scores for label 1 (fiction), one entry per random seed.
f1_fiction

[0.9531568228105907,
 0.9423868312757201,
 0.9423868312757201,
 0.9531568228105907,
 0.9467213114754099,
 0.9201680672268907,
 0.9467213114754099,
 0.9291666666666667,
 0.9313929313929314,
 0.9246861924686193]

In [31]:
# Per-run F1 scores for label 0 (non-fiction), one entry per random seed.
f1_non_fiction

[0.9535353535353536,
 0.9440000000000001,
 0.9440000000000001,
 0.9535353535353536,
 0.9477911646586347,
 0.9254901960784314,
 0.9477911646586347,
 0.932806324110672,
 0.9346534653465346,
 0.9291338582677167]