In [20]:
import sys, os, glob

from collections import Counter
from math import log
from numpy import mean
import numpy as np

from nltk.stem.wordnet import WordNetLemmatizer
import nltk
# Fetch the 'wordnet' NLTK package; the log below shows this is a no-op when
# the package is already up to date. Presumably required by WordNetLemmatizer.
nltk.download('wordnet')
# Lemmatizer instance. NOTE(review): no visible cell references `lm`
# (documents are loaded with lemmatize=False) - confirm it is still needed.
lm = WordNetLemmatizer()

from evaluation import Eval

from nbmodel import load_docs

from sklearn.metrics import confusion_matrix

[nltk_data] Downloading package wordnet to /Users/arifali/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [21]:
def extract_feats(doc):
    """
    Build the feature vector (percepts) for one document.

    Every distinct item yielded by iterating `doc` becomes a feature with
    value 1 (presence, not frequency), and a constant 'bias_term' feature
    with value 1 is always included. The features depend only on the
    document, never on a candidate label.

    Returns a Counter mapping feature name -> 1.
    """
    # Presence features: repeated words collapse to a single entry of 1.
    feats = Counter({token: 1 for token in doc})
    # Always-on feature that lets each label learn a bias weight.
    feats['bias_term'] = 1
    return feats

In [22]:
def load_featurized_docs(datasplit):
    """
    Load one data split and convert every document to its feature vector.

    The split is read with load_docs(datasplit, lemmatize=False) and each
    raw document is passed through extract_feats().

    Returns a pair (featurized_docs, labels), where featurized_docs is a
    list parallel to labels.

    Raises AssertionError (with `datasplit` as the message) when the split
    is empty or the document and label counts differ.
    """
    raw_documents, labels = load_docs(datasplit, lemmatize=False)
    # Chained comparison: the two lengths must match AND be non-zero.
    assert len(raw_documents) == len(labels) > 0, datasplit
    return [extract_feats(document) for document in raw_documents], labels

In [23]:
# Label inventory. The order matters: predict() breaks score ties in favour
# of the label listed first.
CLASSES = [
    'ARA', 'DEU', 'FRA', 'HIN', 'ITA', 'JPN',
    'KOR', 'SPA', 'TEL', 'TUR', 'ZHO',
]
# Upper bound on the number of passes over the training data.
MAX_ITERATIONS = 10
# Featurized development split.
dev_docs, dev_labels = load_featurized_docs('dev')
# One weight vector per label; a Counter reads unseen features as 0.
weights = dict((label, Counter()) for label in CLASSES)

In [24]:
# Load the training split as (list of feature Counters, list of labels).
train_docs, train_labels = load_featurized_docs('train')

In [25]:
# Label inventory used by score()/predict() below. The order matters:
# predict() resolves score ties in favour of the label listed first.
CLASSES = ['ARA', 'DEU', 'FRA', 'HIN', 'ITA', 'JPN', 'KOR', 'SPA', 'TEL', 'TUR', 'ZHO']

# (Re)initialise the model: one weight vector per label. Counter returns 0
# for features that have never been updated, so an untrained model scores
# every label 0. Running this cell discards any previously learned weights.
#
# MAX_ITERATIONS, dev_docs and dev_labels are deliberately not touched here:
# the former self-assignments (`x = x`) did nothing except raise NameError
# when the names had not been defined yet.
weights = {l: Counter() for l in CLASSES}

def score(doc, label):
    """
    Score `label` for `doc` under the current model.

    The score is the dot product of the document's feature values with the
    module-level weight vector weights[label]. An empty document scores 0.
    """
    total = 0
    for feature, value in doc.items():
        total += weights[label][feature] * value
    return total

def predict(doc):
    """
    Return the highest-scoring label for the document under the current model.

    Every label in CLASSES is scored with score(); when several labels tie
    for the top score, the one listed first in CLASSES is returned.
    """
    label_scores = [score(doc, label) for label in CLASSES]
    best = 0
    # Strict '>' means a later label only replaces the current best by
    # beating it, so ties resolve to the earliest label in CLASSES.
    for idx in range(1, len(label_scores)):
        if label_scores[idx] > label_scores[best]:
            best = idx
    return CLASSES[best]

In [26]:
# Reload and featurize the dev split. The In [23] cell above performs the
# same load, so this only refreshes dev_docs / dev_labels.
dev_docs,  dev_labels  = load_featurized_docs('dev')

In [27]:
# Train the multiclass perceptron in place on the module-level `weights`.
for iteration in range(MAX_ITERATIONS):
    # One epoch: a single pass over the training documents in stored order.
    update = 0  # misclassified documents (= weight updates) in this epoch
    for doc, label in zip(train_docs, train_labels):
        yhat = predict(doc)
        if yhat == label:
            continue
        # Perceptron update on a mistake: add the document's feature values
        # to the correct label's weights and subtract them from the weights
        # of the label that was wrongly predicted.
        for word, value in doc.items():
            weights[label][word] += value
            weights[yhat][word] -= value
        update += 1
    # Converged: an epoch without a single update classified every training
    # document correctly, so further epochs would not change the weights.
    # (Integer test; the previous float ratio == 1.0 check was NaN, and so
    # never true, for an empty training set.)
    if update == 0:
        break
    # Per-epoch report: the ten highest- and lowest-weighted features and the
    # bias weight of every label. Skipped for the converged epoch by the
    # break above.
    for l in CLASSES:
        label_weights = weights[l]
        print("max weights for " + l + ": " + str(sorted(label_weights, key=label_weights.get, reverse = True)[:10]))
        print("min weights for " + l + ": " + str(sorted(label_weights, key=label_weights.get)[:10]))
        print("bias term for "+ l + ": " + str(label_weights['bias_term']))

max weights for ARA: ['alot', 'statment', 'thier', 'self', 'any', 'its', 'right', 'i', 'Also', 'Finally']
min weights for ARA: ['If', 'out', 'difficult', 'lot', 'little', 'statement', 'why', 'already', 'various', 'everybody']
bias term for ARA: 6
max weights for DEU: ['statement', 'often', 'opinion', 'on', 'beeing', 'next', 'why', 'faster', 'you', 'special']
min weights for DEU: ["'s", 'we', 'i', 'However', 'person', 'those', ';', 'by', 'find', 'forget']
bias term for DEU: -6
max weights for FRA: ['Indeed', '...', 'fact', 'instance', 'It', 'exemple', 'France', 'question', 'contrary', 'society']
min weights for FRA: ['from', 'get', 'above', 'agree', 'public', 'reach', 'with', 'us', 'information', 'makes']
bias term for FRA: 1
max weights for HIN: ['then', 'towards', 'which', 'has', 'its', 'i', 'So', 'particular', 'u', 'upon']
min weights for HIN: ['Finally', 'who', 'years', 'reasons', 'others', 'I', 'other', 'success', 'probably', 'instance']
bias term for HIN: 1
max weights for ITA: ['

In [28]:
# Call the scikit-learn function through its module. The result below is
# stored under the name `confusion_matrix` (later cells index it), which hides
# the function imported at the top of the notebook; going through the module
# keeps this cell re-runnable instead of failing with
# "'numpy.ndarray' object is not callable" on the second execution.
from sklearn import metrics as sk_metrics

# Predicted label for every dev document under the current weights.
pred_labels = [predict(d) for d in dev_docs]
# Rows are true labels and columns are predicted labels, both in CLASSES order.
confusion_matrix = sk_metrics.confusion_matrix(dev_labels, pred_labels, labels=CLASSES)
print(confusion_matrix)

[[38  0  0  1  0  1  2  3  3  1  2]
 [ 0 25  2  0  0  3  0  1  0  1  2]
 [ 2  6 36  0  2  1  1  2  2  1  0]
 [ 2  3  1 20  1  0  2  0 17  1  0]
 [ 4  1  5  0 36  0  2  2  1  0  2]
 [ 6  0  0  0  1 35 15  0  0  0  3]
 [ 1  0  0  0  0  6 44  0  1  0  8]
 [ 7  1  3  0  3  4  2 29  1  0  2]
 [ 3  0  0  8  0  0  0  1 50  0  0]
 [ 9  6  3  0  0  1  4  0  1 27  6]
 [ 4  1  1  1  0  3  4  0  3  0 52]]


In [42]:
# Row 2 / column 1 in CLASSES order: documents labelled 'FRA' that were
# predicted as 'DEU'.
confusion_matrix[2][1]

6

In [None]:
if __name__ == "__main__":
    # Script-style entry point: the first command-line argument is the
    # number of training iterations.
    args = sys.argv[1:]
    niters = int(args[0])

    # Load each split and report its size and mean feature count on stderr.
    train_docs, train_labels = load_featurized_docs('train')
    print(len(train_docs), 'training docs with',
          sum(map(len, train_docs)) / len(train_docs), 'percepts on avg',
          file=sys.stderr)

    dev_docs, dev_labels = load_featurized_docs('dev')
    print(len(dev_docs), 'dev docs with',
          sum(map(len, dev_docs)) / len(dev_docs), 'percepts on avg',
          file=sys.stderr)

    test_docs, test_labels = load_featurized_docs('test')
    print(len(test_docs), 'test docs with',
          sum(map(len, test_docs)) / len(test_docs), 'percepts on avg',
          file=sys.stderr)

    # NOTE(review): Perceptron is neither defined nor imported in the visible
    # cells - confirm where it is expected to come from.
    ptron = Perceptron(train_docs, train_labels, MAX_ITERATIONS=niters,
                       dev_docs=dev_docs, dev_labels=dev_labels)
    acc = ptron.test_eval(test_docs, test_labels)
    print(acc, file=sys.stderr)

In [29]:
# Number of training iterations handed to the Perceptron below.
niters = 30

# Load each split and print its size and mean feature count.
train_docs, train_labels = load_featurized_docs('train')
print(len(train_docs), 'training docs with',
      sum(map(len, train_docs)) / len(train_docs), 'percepts on avg')

dev_docs, dev_labels = load_featurized_docs('dev')
print(len(dev_docs), 'dev docs with',
      sum(map(len, dev_docs)) / len(dev_docs), 'percepts on avg')

test_docs, test_labels = load_featurized_docs('test')
print(len(test_docs), 'test docs with',
      sum(map(len, test_docs)) / len(test_docs), 'percepts on avg')

# NOTE(review): Perceptron is neither defined nor imported in the visible
# cells, and the recorded run of this cell ends in a TypeError - confirm the
# class definition before relying on this cell.
ptron = Perceptron(train_docs, train_labels, MAX_ITERATIONS=niters,
                   dev_docs=dev_docs, dev_labels=dev_labels)
acc = ptron.test_eval(test_docs, test_labels)
print(acc)

5366 training docs with 154.68747670518076 percepts on avg
598 dev docs with 154.81939799331104 percepts on avg
604 test docs with 153.6341059602649 percepts on avg


TypeError: 'int' object is not subscriptable

In [25]:
# Inspect the weight of the feature 'add' for label 'ARA'.
# NOTE(review): the recorded TypeError ('int' object is not subscriptable)
# suggests `weights` was not the dict of per-label Counters when this ran -
# presumably stale kernel state; confirm with Restart & Run All.
weights['ARA']['add']

TypeError: 'int' object is not subscriptable