# Extract names from doc - Syntaxnet + Sklearn GaussianNB

This is a follow-up to the baseline solution.  The baseline solution utilized a basic sequential approach to finding the party names in an agreement.<br>
Format:  Agreement Name, Party1:  YOU, Party2: Apple

For this solution, **SyntaxNet**-processed data and **machine learning** are utilized.

In [1]:
import numpy as np # scientific computing
import pandas as pd # data analysis (dataframes) 
import time # to measure how long it takes to train and test a model
from sklearn.cross_validation import train_test_split #to split between training and testing
from sklearn.naive_bayes import GaussianNB # Naive Bayes algorithm
from sklearn.metrics import f1_score # measures accuracy of predictions.  Scale: 0 (bad) to 1 (good)
from sklearn.decomposition import PCA # Used to prevent the curse of dimensionality
import matplotlib.pyplot as plt

# Tell iPython to include plots inline in the notebook
%matplotlib inline

In [2]:
# Read training csv file (path is relative to the notebook's working directory).
agreements_data = pd.read_csv("data/agreements_dataset2.csv")
# Single-argument print(...) writes the same bytes as the former Python 2
# statement `print "\n", "...", "\n"` and is also valid on Python 3.
print("\nTraining data read successfully! \n")


Training data read successfully! 



In [3]:
# Explore the data

n_rows = len(agreements_data)
n_features = agreements_data.shape[1] - 1 # added -1 as the last column is the target

# Single-argument print(...) calls keep the output identical to the former
# Python 2 print statements while remaining valid on Python 3.
print("\nTotal number of rows: {}".format(n_rows))
print("Number of features: {} \n".format(n_features))



Total number of rows: 2223
Number of features: 10 



In [4]:
# Split the column labels: the final column is the target/label (party names),
# every column before it is a feature.
target_col = agreements_data.columns[-1]
feature_cols = agreements_data.columns[:-1].tolist()

# Label vector and feature matrix for all agreements
y_all = agreements_data[target_col]
X_all = agreements_data[feature_cols]


In [5]:
# Preprocess feature columns
def preprocess_features(X):
    """Turn raw feature columns into model-ready columns.

    Each column of ``X`` is handled independently:

    * columns whose dtype is not ``object`` are passed through unchanged;
    * in ``object`` columns, 'T'/'F' are replaced with 1/0, then '_' with 0;
    * columns that are still ``object`` afterwards are expanded with
      ``pd.get_dummies`` into indicator columns named ``<column>_<value>``.

    Parameters
    ----------
    X : pandas.DataFrame
        Raw feature columns.

    Returns
    -------
    pandas.DataFrame
        A new frame on ``X``'s index; ``X`` itself is not modified.
    """
    outX = pd.DataFrame(index=X.index)  # output dataframe, initially empty

    # Walk the columns by position rather than with DataFrame.iteritems(),
    # which pandas 2.0 removed; positional access works on old and new pandas.
    for position, col in enumerate(X.columns):
        col_data = X.iloc[:, position]

        # If data type is non-numeric, try to replace all 'T'/'F' values with 1/0
        if col_data.dtype == object:
            col_data = col_data.replace(['T', 'F'], [1, 0])

        # If data type is still non-numeric, try to replace all "_" values with 0
        if col_data.dtype == object:
            col_data = col_data.replace(['_'], [0])

        # NOTE(review): a column only stops being `object` here if Series.replace
        # narrows its dtype once no strings remain - confirm this holds on the
        # installed pandas; otherwise such columns fall through to get_dummies.
        # If still non-numeric, convert to one or more dummy variables
        if col_data.dtype == object:
            col_data = pd.get_dummies(col_data, prefix=col) # e.g. 'feature_3' => 'feature_3_ADJ', 'feature_3_ADP'

        outX = outX.join(col_data)  # collect column(s) in output dataframe

    return outX

# Replace the raw feature frame with its preprocessed version
X_all = preprocess_features(X_all)

# Parenthesised so the line is valid on Python 3 as well; the output is unchanged
print("Total columns after pre-processing: {}\n".format(len(X_all.columns)))

Total columns after pre-processing: 692



In [6]:
# Split data into training and testing datasets

# Sequential (non-shuffled) split: rows before `rowNumber` are used for training,
# the remaining rows for testing.
# NOTE(review): 2177 is hard-coded for this dataset - confirm it still marks the
# intended boundary whenever the CSV changes.
rowNumber = 2177

# training data
X_train = X_all[:rowNumber]
y_train = y_all[:rowNumber]

# testing data
X_test = X_all[rowNumber:]
y_test = y_all[rowNumber:]

# Single-argument print(...) calls keep the output identical to the former
# Python 2 print statements while remaining valid on Python 3.
print("\nTraining set: {} samples".format(X_train.shape[0]))
print("Test set: {} samples \n".format(X_test.shape[0]))


Training set: 2177 samples
Test set: 46 samples 



In [7]:
# Train model

def train_classifier(clf, X_train, y_train):
    """Fit ``clf`` on the training data and print how long the fit took.

    Parameters
    ----------
    clf : object
        Estimator exposing ``fit(X, y)``; it is fitted in place.
    X_train, y_train
        Training features and labels, passed straight to ``clf.fit``.

    Returns
    -------
    None
    """
    # Single-argument print(...) calls write the same bytes as the former
    # Python 2 print statements and are also valid on Python 3.
    print("\nTraining {}...".format(clf.__class__.__name__))
    start = time.time()
    clf.fit(X_train, y_train)
    end = time.time()
    print("Training time (secs): {:.3f} \n".format(end - start))

# Instantiate the Gaussian Naive Bayes algorithm
clf = GaussianNB() # clf -> classifier

# Fit model to training data (train_classifier also prints the elapsed fit time)
train_classifier(clf, X_train, y_train) 


Training GaussianNB...
Training time (secs): 0.065 



In [8]:
# Predict on training set and compute F1 score

def predict_labels(clf, features, target):
    """Predict labels for ``features`` and score them against ``target``.

    Prints the classifier's class name and the time spent in ``clf.predict``.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(features)``.
    features
        Feature rows passed to ``clf.predict``.
    target
        True labels; must expose ``.values`` (e.g. a pandas Series).

    Returns
    -------
    The value of ``f1_score`` computed with 'T' as the positive label.
    """
    # Single-argument print(...) calls write the same bytes as the former
    # Python 2 print statements and are also valid on Python 3.
    print("\nPredicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    end = time.time()
    print("Prediction time (secs): {:.3f}".format(end - start))
    return f1_score(target.values, y_pred, pos_label='T')

# Score the classifier on the data it was trained on
train_f1_score = predict_labels(clf, X_train, y_train)
# Single-argument print(...) keeps the output identical to the former
# Python 2 print statement while remaining valid on Python 3.
print("F1 score for training set: {} \n".format(train_f1_score))


Predicting labels using GaussianNB...
Prediction time (secs): 0.078
F1 score for training set: 0.52736318408 



In [9]:
# Predict on test data

def predict_labels_test(clf, features, target):
    """Predict labels for ``features``, print them, and score against ``target``.

    Prints the classifier's class name, the raw predictions, and the time
    spent in ``clf.predict``.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(features)``.
    features
        Feature rows passed to ``clf.predict``.
    target
        True labels; must expose ``.values`` (e.g. a pandas Series).

    Returns
    -------
    The value of ``f1_score`` computed with 'T' as the positive label.
    """
    # Single-argument print(...) calls write the same bytes as the former
    # Python 2 print statements and are also valid on Python 3.
    print("\nPredicting labels using {}...".format(clf.__class__.__name__))
    start = time.time()
    y_pred = clf.predict(features)
    # Stop the clock before printing so console output is not counted as prediction time
    end = time.time()
    print("y_pred on test: ")
    print(y_pred)
    print("Prediction time (secs): {:.3f}".format(end - start))
    return f1_score(target.values, y_pred, pos_label='T')

# Score the classifier on the held-out rows; predict_labels_test prints its own
# progress lines before the score is formatted, as before.
print("F1 score for test set: {} \n".format(predict_labels_test(clf, X_test, y_test)))


Predicting labels using GaussianNB...
y_pred on test: 
['T' 'F' 'F' 'T' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F' 'F'
 'F' 'T' 'F' 'F' 'F' 'F' 'F' 'T' 'F' 'T' 'T' 'F' 'F' 'F' 'F' 'F' 'T' 'F'
 'T' 'F' 'T' 'F' 'F' 'T' 'F' 'F' 'F' 'F']
Prediction time (secs): 0.003
F1 score for test set: 0.461538461538 



In [None]:
# TODO: Apply code to extract the actual words identified by the party name prediction above
# (as opposed to showing only the 'T' and 'F' labels)