**Load Packages**

In [1]:
import glob
import os
import re

import numpy as np
import pandas as pd
from nltk import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.utils import shuffle
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import recall_score,precision_score,f1_score

**Read Files**

In [2]:
# List the corpus files with glob rather than capturing `ls` output: the shell
# capture also stores the "No such file or directory" message as if it were a
# file name, which only fails later when that "file" is opened.
# Sorting gives the seeded train/test split a stable input order.
filenames = sorted(glob.glob('../data/*.txt'))

*Split datasets*

In [3]:
# Hold out 20% of the files for testing; the fixed seed makes the split repeatable.
X_train, X_test = train_test_split(filenames, test_size=0.2, random_state=123)

In [4]:
# Read every training file, keeping the raw lines of each file in `docs`.
docs = []
for path in X_train:
    with open(path, 'r') as handle:
        docs.append(handle.readlines())

# Flatten each file into one string: newlines become spaces and every
# apostrophe gets a space in front of it.
documents = [
    ''.join(line.replace('\n', ' ').replace('\'', ' \'') for line in lines)
    for lines in docs
]

In [5]:
# Read every held-out file, keeping the raw lines of each file in `test_docs`.
test_docs = []
for path in X_test:
    with open(path, 'r') as handle:
        test_docs.append(handle.readlines())

# Flatten each file into one string: newlines become spaces and every
# apostrophe gets a space in front of it.
test_documents = [
    ''.join(line.replace('\n', ' ').replace('\'', ' \'') for line in lines)
    for lines in test_docs
]

FileNotFoundError: [Errno 2] No such file or directory: 'ls: ./data/*.txt: No such file or directory'

*Add stopwords*

In [6]:
# Capitalised NLTK English stopwords (entries containing an apostrophe are
# skipped), followed by hand-picked capitalised interjections and fillers.
stop = [word.capitalize() for word in stopwords.words('english') if '\'' not in word]
stop.extend(['Never', 'Aha', 'Ah', 'Oh', 'Eh', 'Really', 'Well', 'Yes', 'No', 'Please', 'Ha-ha-ha', 'Alas', 'A-a-ah', 'Next', 'After', 'Hm'])

In [9]:
# Capitalised titles and forms of address; createlist lower-cases any token
# found in this list.
prefix = [
    'Mr', 'Miss', 'Madame', 'Madam', 'Doctor', 'Father', 'Dr', 'Ser', 'Lord',
    'Prince', 'Princess', 'King', 'Queen', 'Dear', 'Mother', 'Uncle', 'Mrs',
    'Sir', 'Sister', 'Brother', 'Bishop', 'Lieutenant-General', 'Captain',
    'Mayor', 'General', 'Son', 'Children', 'Granny',
]
# Lower-cased variants first, then the capitalised originals.
s_prefix = [title.lower() for title in prefix] + prefix

**Construct predictors**

*Define functions for pre-processing*

In [12]:
def createlist(docs):
    """Turn raw documents into a table of tokens with their neighbours.

    For each document the characters ``. ; ! ? : ( ) + ,`` are removed, the
    text is tokenised, single-character tokens and tokens found in the
    module-level ``stop`` list are dropped, and tokens found in the
    module-level ``prefix`` list are lower-cased. Runs of two or three
    capitalised words are then joined with underscores and the text is
    tokenised again.

    Parameters
    ----------
    docs : iterable of str
        One string per document.

    Returns
    -------
    pandas.DataFrame
        One row per token with columns ``prev``, ``word``, ``next`` and ``y``.
        ``prev``/``next`` are the neighbouring tokens inside the same document
        (``None`` at a document boundary); ``y`` is True when the token
        contains a ``*`` character. Rows carry a fresh 0..n-1 index.
    """
    # Rows are collected in a plain list and turned into a frame once:
    # DataFrame.append copied the whole frame per token and no longer exists
    # in pandas 2.x.
    records = []
    for doc in docs:
        doc = re.sub(r'[\.;!\?:\(\)\+,]', '', doc)
        kept = []
        for t in word_tokenize(doc):
            if len(t) <= 1 or t in stop:
                continue
            kept.append(t.lower() if t in prefix else t)
        do = ''.join(t + ' ' for t in kept)

        # Join three-word runs first, then two-word runs.
        do = re.sub(r'([A-Z][A-Za-z]+) ([A-Z][A-Za-z]+) ([A-Z][A-Za-z]+)', r'\1_\2_\3', do)
        do = re.sub(r'([A-Z][A-Za-z]+) ([A-Z][A-Za-z]+)', r'\1_\2', do)
        tokens = word_tokenize(do)
        last = len(tokens) - 1
        for i, x in enumerate(tokens):
            # Both neighbours are looked up defensively so a document with a
            # single token no longer raises IndexError on tokens[1].
            records.append({
                'prev': tokens[i-1] if i > 0 else None,
                'word': x,
                'next': tokens[i+1] if i < last else None,
                'y': '*' in x,
            })
    return pd.DataFrame(records, columns=['prev', 'word', 'next', 'y'])

*For training set I*

In [48]:
# Build the training feature table: one row per capitalised token, with
# boolean context features derived from its neighbouring tokens.
linkedlist = createlist(documents)

# Strip the '*' marker and turn the `` token into a plain double quote.
# prev/next go through str() first, so a missing neighbour (None) becomes the
# string 'None'.
linkedlist['word'] = linkedlist['word'].apply(lambda w: w.replace('*', '').replace('``', '"'))
linkedlist['prev'] = linkedlist['prev'].apply(lambda w: str(w).replace('*', '').replace('``', '"'))
linkedlist['next'] = linkedlist['next'].apply(lambda w: str(w).replace('*', '').replace('``', '"'))

# Keep only tokens that start with an upper-case letter. w[:1] (rather than
# w[0]) stops a token that consisted only of '*' -- now an empty string --
# from raising IndexError.
yes = [i for i, w in enumerate(linkedlist['word'].values) if w[:1].isupper()]
newlist = linkedlist.iloc[yes]
newlist = newlist[~newlist.word.isin(['\'','"','\'\''])]
newlist = newlist.reset_index(drop=True)

# POS-tag each (prev, word, next) triple and record whether either neighbour
# is tagged as a verb. The flags are collected in a list and assigned once
# instead of growing the column cell by cell.
verb_like_next = ['can', 'could', 'may', 'might', 'must', 'will', 'would', 'should', 'shall',
                  'herself', 'himself', 'themself','lies', 'rushes', 'rode', 'lays']
has_verb = []
for triple in newlist[['prev','word','next']].values:
    tags = pos_tag(list(triple))
    # A double quote or 'that' on either side gets the custom tag 'SB',
    # which never counts as a verb below.
    for pos in (0, 2):
        if tags[pos][0] in ('"', 'that'):
            tags[pos] = (tags[pos][0], 'SB')
    # These following words are always counted as verbs.
    if tags[2][0] in verb_like_next:
        tags[2] = (tags[2][0], 'VB')
    has_verb.append(tags[0][1].startswith('V') or tags[2][1].startswith('V'))
newlist['has_verb'] = has_verb

newlist['next_\'s'] = newlist['next'] == '\'s'
newlist['prefix'] = newlist['prev'].isin(s_prefix)
# Despite its name, prev_the is True when the previous token is NOT 'the'.
newlist['prev_the'] = newlist['prev'] != 'the'
# NOTE(review): next_of is computed from `prev`, not `next`. The test-set cell
# does the same, so the features agree; if `next` was intended, change both
# cells together.
newlist['next_of'] = newlist['prev'] == 'of'
newlist['next_who'] = newlist['next'].isin(['who', 'whose', 'whom'])

*For testing set J*

In [49]:
# Build the test feature table: one row per capitalised token, with boolean
# context features derived from its neighbouring tokens.
testlist = createlist(test_documents)

# Strip the '*' marker and turn the `` token into a plain double quote.
# prev/next go through str() first, so a missing neighbour (None) becomes the
# string 'None'.
testlist['word'] = testlist['word'].apply(lambda w: w.replace('*', '').replace('``', '"'))
testlist['prev'] = testlist['prev'].apply(lambda w: str(w).replace('*', '').replace('``', '"'))
testlist['next'] = testlist['next'].apply(lambda w: str(w).replace('*', '').replace('``', '"'))

# Keep only tokens that start with an upper-case letter. w[:1] (rather than
# w[0]) stops a token that consisted only of '*' -- now an empty string --
# from raising IndexError.
yes = [i for i, w in enumerate(testlist['word'].values) if w[:1].isupper()]
newtestlist = testlist.iloc[yes]
newtestlist = newtestlist[~newtestlist.word.isin(['\'','"','\'\''])]
newtestlist = newtestlist.reset_index(drop=True)

# POS-tag each (prev, word, next) triple and record whether either neighbour
# is tagged as a verb. The flags are collected in a list and assigned once
# instead of growing the column cell by cell.
verb_like_next = ['can', 'could', 'may', 'might', 'must', 'will', 'would', 'should', 'shall',
                  'herself', 'himself', 'themself',
                  'lies', 'rushes', 'rode', 'lays']
has_verb = []
for triple in newtestlist[['prev','word','next']].values:
    tags = pos_tag(list(triple))
    # A double quote or 'that' on either side gets the custom tag 'SB',
    # which never counts as a verb below.
    for pos in (0, 2):
        if tags[pos][0] in ('"', 'that'):
            tags[pos] = (tags[pos][0], 'SB')
    # These following words are always counted as verbs.
    if tags[2][0] in verb_like_next:
        tags[2] = (tags[2][0], 'VB')
    has_verb.append(tags[0][1].startswith('V') or tags[2][1].startswith('V'))
newtestlist['has_verb'] = has_verb

newtestlist['next_\'s'] = newtestlist['next'] == '\'s'
newtestlist['prefix'] = newtestlist['prev'].isin(s_prefix)
# Despite its name, prev_the is True when the previous token is NOT 'the'.
newtestlist['prev_the'] = newtestlist['prev'] != 'the'
# NOTE(review): next_of is computed from `prev`, not `next`. The training-set
# cell does the same, so the features agree; if `next` was intended, change
# both cells together.
newtestlist['next_of'] = newtestlist['prev'] == 'of'
newtestlist['next_who'] = newtestlist['next'].isin(['who', 'whom', 'whose'])

*Finalize datasets*

In [50]:
# Model inputs are the six context-feature columns; the label is column `y`.
feature_columns = ['has_verb','prefix','next_\'s','prev_the','next_of','next_who']
X_train, Y_train = newlist[feature_columns], newlist['y']
X_test, Y_test = newtestlist[feature_columns], newtestlist['y']

**Finding optimal models**

*Cross-Validation Results*

In [51]:
# Shuffle the training rows once, with a fixed seed so the manual folds built
# from this ordering are the same on every run.
X4linear, Y4linear = shuffle(X_train, Y_train, random_state=123)
# Regression outputs above this value are counted as the positive class.
threshold = 0.8
train_size = len(X4linear)
# Fold size for 5-fold CV. Integer division replaces np.int(...), an alias
# that has been removed from NumPy.
subset_size = train_size // 5
p_list = []
r_list = []
f1_list = []

In [57]:
# Manual 5-fold cross-validation of a linear regression used as a classifier:
# predictions above `threshold` count as the positive class.
# The score lists are reset here so that re-running this cell does not keep
# appending to the scores of an earlier run.
p_list, r_list, f1_list = [], [], []
for i in range(0, 5):
    lo, hi = i*subset_size, (i+1)*subset_size
    regr = LinearRegression()
    # Train on everything outside [lo, hi); pd.concat replaces the
    # DataFrame/Series .append method, which pandas 2.x no longer provides.
    cv_train_X = pd.concat([X4linear.iloc[:lo], X4linear.iloc[hi:]])
    cv_train_Y = pd.concat([Y4linear.iloc[:lo], Y4linear.iloc[hi:]])
    cv_test_X = X4linear.iloc[lo:hi]
    cv_test_Y = Y4linear.iloc[lo:hi]
    regr.fit(cv_train_X, cv_train_Y)
    cv_predict = regr.predict(cv_test_X)
    cv_predict = [int(y>threshold) for y in cv_predict]
    p_list.append(precision_score(cv_test_Y, cv_predict))
    r_list.append(recall_score(cv_test_Y, cv_predict))
    f1_list.append(f1_score(cv_test_Y, cv_predict))
print('regr_precission: %f' %np.average(p_list))
print('regr_recall: %f' %np.average(r_list))
print('regr_f1: %f' %np.average(f1_list))

lr = LogisticRegression(max_iter = 200,solver = 'lbfgs')
rf = RandomForestClassifier(criterion = 'entropy',max_depth=20,n_estimators=5)
svm = SVC(gamma='scale')
tree = DecisionTreeClassifier()

# 5-fold cross-validated macro precision / recall / F1 for each classifier.
# NOTE(review): the regression scores above are computed with the metric
# functions' default averaging, while these use the *_macro scorers, so the
# two groups of numbers are not directly comparable.
for label, model in [('lr', lr), ('rf', rf), ('svm', svm), ('tree', tree)]:
    for metric in ('precision', 'recall', 'f1'):
        scores = cross_val_score(model, X_train, Y_train, scoring=metric + '_macro', cv=5)
        print('%s_%s: %f' % (label, metric, scores.mean()))

regr_precission: 0.854413
regr_recall: 0.401592
regr_f1: 0.483291
lr_precision: 0.738015
lr_recall: 0.746216
lr_f1: 0.733354
rf_precision: 0.743470
rf_recall: 0.751781
rf_f1: 0.737265
svm_precision: 0.743693
svm_recall: 0.751781
svm_f1: 0.737585
tree_precision: 0.743470
tree_recall: 0.751509
tree_f1: 0.737265


**Postprocess with SVM model**

In [58]:
# Fit the SVM on the full training set and score it on the held-out set.
svm = SVC(gamma='scale')
svm.fit(X_train, Y_train)

Y_pred = svm.predict(X_test)
print(precision_score(Y_test, Y_pred))
print(recall_score(Y_test, Y_pred))
print(f1_score(Y_test, Y_pred))

# Misclassified test rows, kept for manual inspection. The two names were
# previously swapped: cfn held the rows predicted 1 and cfp the rows labelled 1.
wrong = Y_pred != Y_test
# False positives: predicted 1, labelled otherwise.
cfp = newtestlist.loc[wrong & (Y_pred == 1)]
# False negatives: labelled 1, predicted otherwise.
cfn = newtestlist.loc[wrong & (Y_test == 1)]

0.8535211267605634
0.6718403547671841
0.7518610421836228


In [33]:
# Capitalised words whose prediction is forced to 0 in post-processing:
# months, places and nationalities, pronoun-like words, numbers, days and
# seasons.
# Fixes: the comma after 'Nine' was missing, so 'Nine' and 'Monday' were fused
# into the single entry 'NineMonday'; 'January' and 'February' were misspelled
# and could never match.
blacklist=['January','February','March','April','May','June','July','August','September','October','November','December',
           'God','Gods','Granny', 'Near', 'Beyond', 'Next', 'Below','Come','Go',
           'Persia','Persian','Russia','France','Russians','Viennese','French','Russian','Chinese','German','Latin','United_States',
           'London','America','Winterfell','California',
           'Children','People','Every','Everyone','Everything','Someone','Something','Anything','Others','Nothing','Many','House',
           'One','Two','Three','Four','Five','Six','Seven','Eight','Nine',
           'Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday','Yesterday','Tomorrow','Today','Tonight',
           'Spring','Summer', 'Autumn', 'Winter','Men', 'Women']


In [35]:
# Post-processing masks over the test rows:
#   white - the previous token is a title from s_prefix
#   black - the word itself is on the blacklist
white = newtestlist.prev.isin(s_prefix)
black = newtestlist.word.isin(blacklist)

# Same masks over the training rows. whitetrain now tests the previous token,
# matching `white`; it used to test the word itself.
whitetrain = newlist.prev.isin(s_prefix)
blacktrain = newlist.word.isin(blacklist)

**Final results**

In [56]:
# Re-predict, then override the model with the rule masks: whitelisted rows
# are set to 1 first, blacklisted rows to 0 afterwards (so the blacklist wins
# when both apply).
Y_pred = svm.predict(X_test)
for mask, forced_label in ((white, 1), (black, 0)):
    Y_pred[mask] = forced_label
for metric in (precision_score, recall_score, f1_score):
    print(metric(Y_test, Y_pred))

0.9181818181818182
0.6718403547671841
0.7759282970550576
