In [11]:
#!/usr/bin/env python3
"""
ENLP A1 Part II: Perceptron

Usage: python perceptron.py NITERATIONS

(Adapted from Alan Ritter)
"""
import sys, os, glob

from collections import Counter
from math import log
from numpy import mean

from nltk.stem.wordnet import WordNetLemmatizer

from evaluation import Eval

from nbmodel import load_docs

In [15]:
def extract_feats(doc):
    """
    Build the percept (input feature) map for one document.

    Every distinct item yielded by iterating over `doc` becomes a percept
    with value 1 (presence, not frequency), and a constant 'bias_term'
    percept with value 1 is always included. The result does not depend
    on any label.

    Returns a Counter mapping percept name -> value.
    """
    # dict.fromkeys collapses repeats, so each distinct item is recorded once.
    percepts = Counter(dict.fromkeys(doc, 1))
    percepts['bias_term'] = 1
    return percepts

In [7]:
def load_featurized_docs(datasplit):
    """
    Load one data split and convert every document into its percept map.

    `datasplit` is passed straight to load_docs (called with
    lemmatize=False); the cells in this notebook use 'train', 'dev' and
    'test'. Fails an assertion, with the split name as the message, if the
    split is empty or the document and label counts differ.

    Returns (featdocs, labels): a list with one extract_feats() result per
    document, and the labels exactly as load_docs returned them.
    """
    documents, gold = load_docs(datasplit, lemmatize=False)
    assert len(documents) == len(gold) > 0, datasplit
    return [extract_feats(document) for document in documents], gold

In [8]:
# Class labels the model chooses among.
CLASSES = [
    'ARA', 'DEU', 'FRA', 'HIN', 'ITA', 'JPN',
    'KOR', 'SPA', 'TEL', 'TUR', 'ZHO',
]
# Upper bound on the number of training passes.
MAX_ITERATIONS = 30
dev_docs, dev_labels = load_featurized_docs('dev')
# One initially empty weight vector per class label.
weights = dict((label, Counter()) for label in CLASSES)
# Training call left disabled: learn(train_docs, train_labels)

In [10]:
# Load the training split: one percept Counter per document, plus the labels.
train_docs, train_labels = load_featurized_docs('train')

In [None]:
def copy_weights(self):
    """
    Return a snapshot of self.weights.

    The result is a new dict holding a new Counter for every label, so
    later changes to the live weights do not affect the snapshot (and
    vice versa).
    """
    snapshot = {}
    for label, vector in self.weights.items():
        snapshot[label] = Counter(vector)
    return snapshot

def learn(self, train_docs, train_labels):
    """
    Train on the provided data with the perceptron algorithm.

    Makes up to self.MAX_ITERATIONS passes over the training data and
    stops early after a pass with no mistakes. At the end of training,
    self.weights contains the final model parameters.

    Each document is scored against every label in self.weights (dot
    product of the document's percepts with that label's weight vector).
    When the best-scoring label differs from the gold label, the document's
    percept values are added to the gold label's weights and subtracted
    from the wrongly predicted label's weights. Score ties go to the label
    that comes first in self.weights.

    Parameters:
        train_docs: sequence of percept mappings (name -> numeric value),
            as produced by extract_feats.
        train_labels: gold labels, parallel to train_docs; every label must
            be a key of self.weights.

    Raises:
        ValueError: if train_docs and train_labels differ in length, or if
            self.weights has no labels while there is data to train on.
        KeyError: if a gold label is missing from self.weights.
    """
    if len(train_docs) != len(train_labels):
        raise ValueError('train_docs and train_labels must have the same length')

    def dot(label, doc):
        # .get() reads a missing weight as 0 without inserting a key.
        vector = self.weights[label]
        return sum(vector.get(feat, 0) * val for feat, val in doc.items())

    for _ in range(self.MAX_ITERATIONS):
        nmistakes = 0
        for doc, gold in zip(train_docs, train_labels):
            # max() returns the first maximal label, so ties are deterministic.
            guess = max(self.weights, key=lambda label: dot(label, doc))
            if guess == gold:
                continue
            nmistakes += 1
            gold_w = self.weights[gold]
            guess_w = self.weights[guess]
            for feat, val in doc.items():
                gold_w[feat] = gold_w.get(feat, 0) + val
                guess_w[feat] = guess_w.get(feat, 0) - val
        if nmistakes == 0:
            # A clean pass means further passes would change nothing.
            break

def score(self, doc, label):
    """
    Returns the current model's score of labeling the given document
    with the given label.

    The score is the dot product of the document's percepts with the
    weight vector stored for `label` in self.weights; a percept with no
    stored weight contributes 0. An empty document scores 0.

    Parameters:
        doc: mapping of percept name -> numeric value (as produced by
            extract_feats).
        label: a key of self.weights.

    Raises:
        KeyError: if `label` is not a key of self.weights.
    """
    vector = self.weights[label]
    # .get() reads a missing weight as 0 without inserting a key.
    return sum(vector.get(feat, 0) * val for feat, val in doc.items())

def predict(self, doc):
    """
    Return the highest-scoring label for the document under the current model.

    Each label in self.weights is scored as the dot product of the
    document's percepts with that label's weight vector (a percept with no
    stored weight contributes 0). When several labels tie for the best
    score, the one that comes first in self.weights is returned.

    Parameters:
        doc: mapping of percept name -> numeric value (as produced by
            extract_feats).

    Raises:
        ValueError: if self.weights contains no labels.
    """
    def dot(label):
        vector = self.weights[label]
        return sum(vector.get(feat, 0) * val for feat, val in doc.items())

    # max() keeps the first maximal item, which makes tie-breaking deterministic.
    return max(self.weights, key=dot)

In [None]:
    def test_eval(self, test_docs, test_labels):
        """
        Predict a label for every document in test_docs and return the
        value of Eval(test_labels, predictions).accuracy().
        """
        guesses = list(map(self.predict, test_docs))
        return Eval(test_labels, guesses).accuracy()
    
if __name__ == "__main__":
    # The single command-line argument is the number of training iterations.
    args = sys.argv[1:]
    try:
        niters = int(args[0])
    except (IndexError, ValueError):
        # Missing or non-integer argument: report usage on stderr and exit
        # with a non-zero status instead of an opaque traceback.
        sys.exit('Usage: python perceptron.py NITERATIONS')

    # Load each split and report its size and mean percept count on stderr.
    train_docs, train_labels = load_featurized_docs('train')
    print(len(train_docs), 'training docs with',
        sum(len(d) for d in train_docs)/len(train_docs), 'percepts on avg', file=sys.stderr)

    dev_docs,  dev_labels  = load_featurized_docs('dev')
    print(len(dev_docs), 'dev docs with',
        sum(len(d) for d in dev_docs)/len(dev_docs), 'percepts on avg', file=sys.stderr)


    test_docs,  test_labels  = load_featurized_docs('test')
    print(len(test_docs), 'test docs with',
        sum(len(d) for d in test_docs)/len(test_docs), 'percepts on avg', file=sys.stderr)

    # Build the model from the train and dev splits, then report test accuracy on stderr.
    ptron = Perceptron(train_docs, train_labels, MAX_ITERATIONS=niters, dev_docs=dev_docs, dev_labels=dev_labels)
    acc = ptron.test_eval(test_docs, test_labels)
    print(acc, file=sys.stderr)

In [17]:
# Load each data split and print its document count and the mean number of
# percepts per document.
train_docs, train_labels = load_featurized_docs('train')
print(len(train_docs), 'training docs with',
      sum(map(len, train_docs)) / len(train_docs), 'percepts on avg')

dev_docs, dev_labels = load_featurized_docs('dev')
print(len(dev_docs), 'dev docs with',
      sum(map(len, dev_docs)) / len(dev_docs), 'percepts on avg')

test_docs, test_labels = load_featurized_docs('test')
print(len(test_docs), 'test docs with',
      sum(map(len, test_docs)) / len(test_docs), 'percepts on avg')

5366 training docs with 154.68747670518076 percepts on avg
598 dev docs with 154.81939799331104 percepts on avg
604 test docs with 153.6341059602649 percepts on avg


In [21]:
# Display the current per-class weight vectors. (A bare `weights =` with no
# right-hand side is a SyntaxError.)
weights

{'ARA': Counter(),
 'DEU': Counter(),
 'FRA': Counter(),
 'HIN': Counter(),
 'ITA': Counter(),
 'JPN': Counter(),
 'KOR': Counter(),
 'SPA': Counter(),
 'TEL': Counter(),
 'TUR': Counter(),
 'ZHO': Counter()}