In [1]:
import csv
import itertools
import operator
import numpy as np
import nltk
import sys
from datetime import datetime
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.neural_network import MLPClassifier

import matplotlib.pyplot as plt
%matplotlib inline

After loading the word-definition pairs from the data file, each definition has to be tokenized into words so that every word can later be mapped to a numerical index.

In [54]:
unknown_token = "UNKNOWN_TOKEN"

# Load the (word, definition) pairs; column 0 holds the word, column 1 its definition.
print("Reading CSV file...")
df = pd.read_csv('training_data/60Def50Words.csv', header=None, skipinitialspace=True)
words = list(df.iloc[:, 0])
definitions = list(df.iloc[:, 1])

print("Parsed %d definitions." % len(definitions))

# Split every definition into word tokens.
tokenized_definitions = [nltk.word_tokenize(definition) for definition in definitions]

# Frequency of every token across all definitions.
word_freq = nltk.FreqDist(itertools.chain.from_iterable(tokenized_definitions))
print("Found %d unique words tokens." % len(word_freq))

# One vocabulary slot per distinct token, plus one for the unknown token.
vocabulary_size = len(word_freq) + 1
classes = set(words)
class_size = len(classes)

# index_to_word: tokens ordered by descending frequency, unknown token last.
vocab = word_freq.most_common(vocabulary_size - 1)
index_to_word = [token for token, _count in vocab]
index_to_word.append(unknown_token)
word_to_index = {token: position for position, token in enumerate(index_to_word)}

# index_to_class / class_to_index: label <-> class id.
# NOTE(review): the ids follow set iteration order, which is arbitrary --
# confirm whether a stable (e.g. sorted) order is wanted before persisting models.
index_to_class = list(classes)
class_to_index = {label: position for position, label in enumerate(index_to_class)}

print("Using vocabulary size %d." % vocabulary_size)

# Map every token that is not in the vocabulary to the unknown token (in place).
tokenized_definitions[:] = [
    [token if token in word_to_index else unknown_token for token in sent]
    for sent in tokenized_definitions
]

print("\nExample sentence: '%s'" % definitions[0])
print("\nExample sentence after Pre-processing: '%s'" % tokenized_definitions[0])

Reading CSV file...
Parsed 2998 definitions.
Found 5740 unique words tokens.
Using vocabulary size 5741.

Example sentence: 'a unit of angular measurement equal to one sixtieth of a degree or seconds'

Example sentence after Pre-processing: '['a', 'unit', 'of', 'angular', 'measurement', 'equal', 'to', 'one', 'sixtieth', 'of', 'a', 'degree', 'or', 'seconds']'


In [55]:
# Create the raw data.
# Definitions differ in length, so X is ragged: ask for dtype=object explicitly
# so NumPy stores one index list per definition. Relying on the implicit
# conversion is deprecated since NumPy 1.19 and raises ValueError from 1.24 on.
X = np.asarray([[word_to_index[w] for w in sent] for sent in tokenized_definitions], dtype=object)
# y holds the class id of the word each definition belongs to.
y = np.asarray([class_to_index[w] for w in words])

In [56]:
# Print a training data example
x_example, y_example = X[17], y[17]
print("x:\n%s\n%s" % (" ".join([index_to_word[idx] for idx in x_example]), x_example))
# Look the class name up by the label itself (y_example). 17 is the sample
# index; using it as a class id only works when the two happen to coincide.
print("\ny:\n%s\n%s" % (index_to_class[y_example], y_example))

x:
a unit of angle equal to one sixtieth of a degree
[0, 36, 2, 729, 195, 4, 9, 570, 2, 0, 194]

y:
minute
17


In [57]:
# Zeroed column vector with one row per vocabulary entry (shape: vocabulary_size x 1).
X_vect = np.zeros((vocabulary_size, 1))

In [58]:
# Display the word-index sequence of sample 17.
X[17]

[0, 36, 2, 729, 195, 4, 9, 570, 2, 0, 194]

Now we generate a one-hot vector for each word and sum these vectors over the words of a definition, which yields a single count vector (bag of words) of length `vocabulary_size` per definition.

In [59]:
# Bag-of-words features: one count vector of length vocabulary_size per
# definition, stored as a plain list of floats.
index_to_vec = []
for word_indices in X:
    counts = np.zeros(vocabulary_size)
    for word_index in word_indices:
        counts[word_index] += 1  # every occurrence of a word adds one
    index_to_vec.append(counts.tolist())

In [60]:
# Feature vectors and labels must line up one-to-one.
print("%d %d" % (len(index_to_vec), len(y)))

2998 2998


In [61]:
# Hold out 33% of the samples for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    index_to_vec,
    y,
    test_size=0.33,
    random_state=5,
)

In [62]:
# Summarise the split sizes and the class-name -> class-id mapping.
n_train, n_test = len(X_train), len(X_test)
print('#Training data points: {}'.format(n_train))
print('#Testing data points: {}'.format(n_test))
print('Class labels: {}'.format(class_to_index))

#Training data points: 2008
#Testing data points: 990
Class labels: {'right': 0, 'force': 1, 'family': 2, 'point': 3, 'job': 45, 'house': 4, 'number': 5, 'president': 33, 'home': 7, 'girl': 8, 'air': 46, 'law': 34, 'end': 11, 'power': 25, 'service': 13, 'night': 48, 'question': 10, 'state': 15, 'book': 16, 'change': 36, 'lot': 18, 'party': 20, 'kid': 37, 'issue': 22, 'father': 14, 'head': 24, 'business': 12, 'eye': 26, 'company': 27, 'mother': 28, 'back': 23, 'reason': 30, 'part': 31, 'group': 6, 'hand': 29, 'day': 35, 'minute': 17, 'man': 21, 'case': 38, 'school': 39, 'word': 40, 'name': 41, 'level': 42, 'car': 43, 'study': 44, 'game': 32, 'face': 9, 'thing': 47, 'time': 19, 'side': 49}


In [63]:
# Fit the scaler on the training split only, then apply it to both splits.
sc = StandardScaler().fit(X_train)
X_train_std, X_test_std = sc.transform(X_train), sc.transform(X_test)

In [64]:
# One hidden layer of 30 tanh units, trained with L-BFGS; fixed seed for
# reproducible weight initialisation.
nn = MLPClassifier(
    hidden_layer_sizes=(30,),
    activation='tanh',
    solver='lbfgs',
    alpha=1e-5,
    max_iter=1000,
    random_state=1,
)
nn.fit(X_train_std, y_train)

MLPClassifier(activation='tanh', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(30,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

In [65]:
# Top-1 evaluation on the held-out split.
y_p = nn.predict(X_test_std)
n_wrong = np.sum(y_test != y_p)
print('Misclassified samples: %d' % n_wrong)
print('Accuracy: %.2f' % accuracy_score(y_test, y_p))

Misclassified samples: 471
Accuracy: 0.52


In [66]:
def print_top_pred(definition, no_results):
    """
    Print the `no_results` most probable classes for a free-text definition.

    The definition is tokenized, encoded as a word-count vector over the
    notebook's vocabulary (`word_to_index`, `vocabulary_size`), scaled with
    `sc` and scored with `nn.predict_proba`. One line per result is printed,
    most probable first, showing the class name and its probability.

    Parameters
    ----------
    definition : str
        Text to classify. Tokens that are not in `word_to_index` are mapped
        to `unknown_token`.
    no_results : int
        Number of classes to print. Values <= 0 print nothing.
    """
    tokens = [w if w in word_to_index else unknown_token
              for w in nltk.word_tokenize(definition)]

    # Count every occurrence, exactly like the training features. Indexing
    # with the whole index array at once (`vec[indices] += 1`) would bump a
    # repeated word only once, because NumPy does not accumulate repeated
    # indices in fancy-index assignment.
    vec = np.zeros(vocabulary_size)
    for w in tokens:
        vec[word_to_index[w]] += 1
    vec = sc.transform([vec.tolist()])

    pred = nn.predict_proba(vec)[0]

    # Reverse first, then slice: `[-no_results:]` would select every class
    # for no_results == 0 because -0 == 0.
    top = np.argsort(pred)[::-1][:max(no_results, 0)]
    for class_index in top:
        print("%s\t->\t%f" % (index_to_class[class_index], pred[class_index]))
        
def get_top_pred(y_pred_prob, no_results):
    """
    Return, for every prediction, the indices of its `no_results` highest scores.

    Parameters
    ----------
    y_pred_prob : iterable of 1-D score sequences
        One row of scores per sample.
    no_results : int
        How many indices to keep per row. Values <= 0 yield empty results;
        values larger than the row length yield every index.

    Returns
    -------
    list of numpy.ndarray
        One index array per row, ordered from highest to lowest score.
    """
    keep = max(no_results, 0)
    # Reverse before slicing: a `[-no_results:]` slice selects the WHOLE row
    # when no_results == 0, because -0 == 0.
    return [np.argsort(scores)[::-1][:keep] for scores in y_pred_prob]

def get_miss_match(y_test, y_pred, no_results):
    """
    Count how often the true label is among the top `no_results` predictions
    of its sample (a match) and how often it is not (a miss).

    `y_pred` holds one row of scores per sample; the top entries of each row
    are obtained with `get_top_pred`. Returns a `(miss, match)` tuple.
    """
    top_predictions = get_top_pred(y_pred, no_results)
    hits = [label in candidates for label, candidates in zip(y_test, top_predictions)]
    match = sum(hits)
    miss = len(hits) - match
    return miss, match

def get_accuracy_score(miss, match):
    """
    Return the share of matches among all counted samples as a float,
    i.e. match / (miss + match).

    Raises ZeroDivisionError when both counts are zero.
    """
    return float(match) / (miss + match)

In [67]:
# Top-5 evaluation: a sample counts as a match when its true class is among
# the five most probable classes.
y_pred = nn.predict_proba(X_test_std)
miss, match = get_miss_match(y_test, y_pred, no_results=5)
print('Misclassified samples: %d' % miss)
print('Accuracy: %.2f' % get_accuracy_score(miss=miss, match=match))

Misclassified samples: 300
Accuracy: 0.70


In [68]:
# Try the classifier on a hand-written definition and show its five best guesses.
print_top_pred(definition="little female human", no_results=5)

girl	->	0.994547
mother	->	0.003301
thing	->	0.001450
law	->	0.000314
man	->	0.000154
