In [26]:
import pandas as pd
import numpy as np
import spacy
from sklearn import svm
from sklearn import metrics
from sklearn.model_selection import train_test_split

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [27]:
with open('quick-task-data/train.txt') as training_f:
        train = training_f.read().split('\n')

In [29]:
df = pd.DataFrame()

In [30]:
df['human'] = train[1::6]
df['candidate'] = train[2::6]
df['score'] = train[3::6]
df['label'] = train[4::6]

In [31]:
df

Unnamed: 0,human,candidate,score,label
0,"bahraini princess marries us soldier , astonis...",bahraini princess marries a u.s. soldier ; ast...,0.3125,H
1,the star-crossed marriage between bahraini pri...,u.s. television stations had once feted the ma...,0.6531,H
2,meriam is a member of the gulf country bahrain...,"meri gulf state of bahrain , the royal family ...",0.3784,M
3,but the las vegas review-journal reported that...,"however , according to the las vegas , comment...",0.3646,M
4,the pair met in 1999 when career military man ...,"the two met in 1999 , when johnson was still a...",0.7778,H
...,...,...,...,...
579,"the new technique is simple , convenient , and...","new technologies with simple , convenient , fa...",0.3705,M
580,upon the invitation of the china disease preve...,commissioned by the chinese center for disease...,0.6944,H
581,spokesman for the ministry of foreign affairs ...,the mfa office spokesman told the house intern...,0.3267,M
582,xinhua news agency report of december 3rd from...,"beijing , december 3 ( xinhua ) -- the mfa off...",0.3858,M


In [23]:
# nlp = spacy.load("en_core_web_sm")
# doc = nlp("Apple is looking at buying U.K. startup for $1 billion")


In [24]:
# len(doc)

In [35]:
def walk_tree(node, depth):
    '''
    A helper function that calculates the max depth of a Spacy parse tree in a recursive way.
    
    '''
    if node.n_lefts + node.n_rights > 0:
        return max(walk_tree(child, depth + 1) for child in node.children)
    else:
        return depth

def extract_features(df):
    rst = pd.DataFrame(columns=['bleu', 'similarity', 'tree_depth', 'func_density', 'pron_density'])
    sw = stopwords.words('english')
    nlp = spacy.load("en_core_web_sm")
    for i, row in df.iterrows():
        rst.at[i, 'bleu'] = row['score']
        # calculate Jaccard similarity between human and candidate
        x = {w for w in word_tokenize(row['human']) if not w in sw} 
        y = {w for w in word_tokenize(row['candidate']) if not w in sw} 
        rst.at[i, 'similarity'] = len(x.intersection(y))/ len(x.union(y))
        # Extract linguistic features from candidate sentence
        # normalized parse tree depth
        doc = nlp(row['candidate'])
        rst.at[i, 'tree_depth'] = [walk_tree(sent.root, 0) for sent in doc.sents][0]/ len(doc)
        # density of function words and pronouns (based on POS tags)
        count_f = 0
        count_p = 0
        for token in doc:
            if token.pos_ in ['ADP', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'PART', 'SCONJ']:
                count_f += 1
            elif token.pos_ == 'PRON':
                count_p += 1
        rst.at[i, 'func_density'] = count_f/ len(doc)
        rst.at[i, 'pron_density'] = count_p/ len(doc)
    return rst
    

In [36]:
X = extract_features(df)
y = df['label']

In [37]:
X

Unnamed: 0,bleu,similarity,tree_depth,func_density,pron_density
0,0.3125,0.277778,0.214286,0.142857,0
1,0.6531,0.689655,0.102041,0.204082,0
2,0.3784,0.216216,0.189189,0.297297,0.027027
3,0.3646,0.258065,0.181818,0.272727,0
4,0.7778,0.692308,0.388889,0.222222,0
...,...,...,...,...,...
579,0.3705,0.285714,0.152174,0.152174,0
580,0.6944,0.555556,0.0972222,0.263889,0
581,0.3267,0.307692,0.352941,0.117647,0
582,0.3858,0.333333,0.189189,0.243243,0


In [44]:
X.describe()

Unnamed: 0,bleu,similarity,tree_depth,func_density,pron_density
count,584.0,584.0,584.0,584.0,584.0
unique,441.0,224.0,196.0,180.0,71.0
top,0.5,0.5,0.333333,0.333333,0.0
freq,10.0,34.0,28.0,39.0,408.0


In [48]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [61]:
# Create a svm classifier
clf = svm.SVC(kernel='linear') # Linear Kernel

# Train the model using the training sets
clf.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = clf.predict(X_test)

# Training accuracy
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("F1 Score:",metrics.f1_score(y_test, y_pred, average='weighted'))

Accuracy: 0.7008547008547008
F1 Score: 0.6995040561147868


In [50]:
# Train on entire training set, then apply the model to test set
model = svm.SVC(kernel='linear')
model.fit(X, y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=None,
    shrinking=True, tol=0.001, verbose=False)

In [52]:
# Test
with open('quick-task-data/test.txt') as testing_f:
        test = testing_f.read().split('\n')
test_df = pd.DataFrame()
test_df['human'] = test[1::6]
test_df['candidate'] = test[2::6]
test_df['score'] = test[3::6]
test_df['label'] = test[4::6]
test_X = extract_features(test_df)

In [53]:
test_y = test_df['label']

In [54]:
predictions = model.predict(test_X)

In [57]:
# Evaluate using the average of F1 score for the human and machine classes
print(metrics.f1_score(test_y, predictions, average='weighted'))

0.7128717387338076


In [58]:
predictions

array(['M', 'H', 'H', 'M', 'H', 'M', 'H', 'M', 'M', 'H', 'H', 'M', 'H',
       'M', 'M', 'H', 'M', 'H', 'M', 'M', 'M', 'M', 'H', 'H', 'M', 'M',
       'H', 'H', 'M', 'H', 'M', 'M', 'H', 'H', 'H', 'H', 'H', 'H', 'M',
       'H', 'H', 'H', 'M', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H', 'H',
       'H', 'M', 'M', 'M', 'H', 'H', 'M', 'M', 'H', 'M', 'M', 'M', 'M',
       'M', 'H', 'M', 'M', 'H', 'H', 'H', 'H', 'M', 'M', 'M', 'M', 'M',
       'M', 'M', 'H', 'H', 'M', 'H', 'H', 'H', 'M', 'M', 'M', 'H', 'H',
       'M', 'M', 'H', 'H', 'H', 'M', 'H', 'H', 'H', 'H', 'H', 'M', 'M',
       'M', 'H', 'M', 'M', 'M', 'M', 'H', 'H', 'H', 'M', 'M', 'H', 'M',
       'M', 'M', 'M', 'M', 'H', 'H', 'H', 'M', 'H', 'H', 'M', 'H', 'M',
       'H', 'M', 'M', 'H', 'M', 'H', 'M', 'M', 'H', 'H', 'H', 'M', 'H',
       'M', 'M', 'M', 'M', 'H', 'M', 'M', 'H', 'H', 'M', 'H', 'H', 'H',
       'H', 'H', 'H', 'H', 'M', 'H', 'M', 'H', 'M', 'M', 'M', 'H', 'M',
       'M', 'H', 'M', 'M', 'M'], dtype=object)