In [1]:
import sys # to import files in other directories
import numpy as np
import sklearn
import csv

from features import *
from collections import defaultdict

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# sys.path.insert(0, 'class_resources/')
# from distributedwordreps import build, ShallowNeuralNetwork

In [2]:
# Load the consolidated data set from disk.
dat = np.load('data/consolidations/yak_cons_2015-05-27 14:01:41.586356.npy')
print("%d data points loaded" % len(dat))

# Fixed seed so that the shuffle below (and therefore the train/test split)
# is the same on every Restart & Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

# since the rows of the database are sorted by time, we should shuffle the data set
# so that the train and test sets both cover the same time periods.
np.random.shuffle(dat)
# Floor division keeps the cutoff an integer regardless of how `/` is defined.
cutoff = 4 * len(dat) // 5

3063 data points loaded


In [3]:
# Columns 1-3 of every row are the model inputs.
X = dat[:, 1:4]

# Split point: the first 80% of the rows go to training, the rest to testing.
cutoff = int(0.8*len(X))
X_train, X_test = X[:cutoff], X[cutoff:]

# Column 4 becomes the label: 1 when it parses to a negative number, else -1.
y = []
for val in dat[:, 4]:
    y.append(1 if float(val) < 0 else -1)
y_train = y[:cutoff]
y_test = y[cutoff:]

In [4]:
def featurizer(X, y, feature_function):
    """Apply feature_function to every example in X and pair the results with y.

    Parameters
    ----------
    X : iterable
        Examples; each one is passed unchanged to feature_function.
    y : iterable
        Gold labels, aligned position-by-position with X.
    feature_function : callable
        Takes a single example and returns its features.

    Returns
    -------
    (feats, labels) : tuple of two lists
        feats[i] is feature_function applied to the i-th example and
        labels[i] is the i-th label. The pairs are produced with zip, so
        iteration stops at the shorter of X and y.
    """
    feats = []
    labels = []
    for yak, label in zip(X, y):
        feats.append(feature_function(yak))
        labels.append(label)
    return (feats, labels)

In [5]:
def random_feature(yak):
    """Return a feature dict holding a single freshly drawn random number.

    The argument is not inspected. The result is a defaultdict(float) whose
    only entry, "random_feature", is the value of one np.random.random() call.
    """
    value = np.random.random()
    return defaultdict(float, {"random_feature": value})

In [6]:
def train_classifier(
        X,
        y,
        feature_function,
        feature_selector=SelectFpr(chi2, alpha=0.05), # Use None to stop feature selection
        cv=10, # Number of folds used in cross-validation
        priorlims=np.arange(.1, 3.1, .1)): # regularization priors to explore (we expect something around 1)
    """Featurize (X, y), grid-search a logistic regression, report CV F1 and fit it.

    Parameters
    ----------
    X, y : examples and their labels, handed to featurizer().
    feature_function : callable mapping one example to a dict of features.
    feature_selector : selector with fit_transform/transform, or None to
        skip feature selection. The object passed in is cloned before
        fitting, so it is never modified by this function.
    cv : number of folds for both the grid search and the assessment.
    priorlims : values of C explored by the grid search.

    Returns
    -------
    (mod, vectorizer, feature_selector, feature_function)
        The fitted LogisticRegression, the fitted DictVectorizer, the fitted
        selector (the clone; or the falsy value that was passed in) and the
        feature function, i.e. everything needed to featurize new data the
        same way as the training matrix.
    """
    # Imported here so the cell does not depend on an extra top-level import.
    from sklearn.base import clone

    # Featurize the data:
    feats, labels = featurizer(X, y, feature_function)

    # Map the feature dictionaries to a dense feature matrix (sparse=False):
    vectorizer = DictVectorizer(sparse=False)
    full_matrix = vectorizer.fit_transform(feats)

    ##### FEATURE SELECTION
    # (An optional step; not always productive). By default, we select all
    # the features that pass the chi2 test of association with the
    # class labels at p < 0.05. sklearn.feature_selection has other
    # methods that are worth trying. I've seen particularly good results
    # with the model-based methods, which require some changes to the
    # current code.
    if feature_selector:
        # The default selector is a single object created when this function
        # is defined. Fitting it directly would refit the very selector that
        # an earlier call already returned inside its model tuple, so fit an
        # unfitted copy instead.
        feature_selector = clone(feature_selector)
        # NOTE(review): the selector is fit on the full matrix before any
        # cross-validation, so the CV scores below are computed on features
        # chosen with knowledge of every fold.
        feat_matrix = feature_selector.fit_transform(full_matrix, labels)
    else:
        feat_matrix = full_matrix

    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1)
    # Parameters to grid-search over:
    parameters = {'C':priorlims, 'penalty':['l1','l2']}
    # Cross-validation grid search to find the best hyper-parameters.
    # NOTE(review): no `scoring` is passed here, whereas the assessment below
    # uses "f1_macro" - confirm that selecting on a different metric is intended.
    clf = GridSearchCV(searchmod, parameters, cv=cv)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, C=params['C'], penalty=params['penalty'])

    ##### ASSESSMENT
    # Cross-validation of our favored model; for other summaries, use different
    # values for scoring: http://scikit-learn.org/dev/modules/model_evaluation.html
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")
    print('Best model %s' % (mod,))
    print('%s features selected out of %s total' % (feat_matrix.shape[1], full_matrix.shape[1]))
    print('F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2))

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector, feature_function)


In [24]:
# List of the feature functions to combine. Only bigram_feature is active;
# the candidates that were commented out are random_feature, cap_feature,
# punc_feature, imbalance_feature and interrogative_feature.
all_feature_generators = [bigram_feature]

def get_all_features(fs):
    """Build one feature function out of the feature functions in fs.

    The returned function calls every element of fs, in order, on its
    argument and merges the resulting dicts with dict.update into a single
    defaultdict(float). When two generators emit the same key, the value
    from the later one is kept.
    """
    def total_feature_fn(yak):
        merged = defaultdict(float)
        for generate in fs:
            partial = generate(yak)
            merged.update(partial)
        return merged
    return total_feature_fn

# Single feature function that applies every generator in the list above.
all_features = get_all_features(fs=all_feature_generators)

In [None]:
# Train on the training split with feature selection switched off and
# 2-fold cross-validation.
model = train_classifier(
    X=X_train,
    y=y_train,
    feature_function=all_features,
    feature_selector=None,
    cv=2)

In [21]:
def evaluate_trained_classifier(model, X, y):
    """Score a trained model on (X, y) and return a classification report.

    model is the tuple produced by train_classifier: (classifier, vectorizer,
    feature selector, feature function). X and y are featurized with that
    same feature function, pushed through the vectorizer and, when a selector
    is present, through the selector, before predictions are made. The
    feature function is printed as a side effect.
    """
    classifier, vectorizer, selector, feature_fn = model
    print(feature_fn)
    feats, gold = featurizer(X, y, feature_fn)
    matrix = vectorizer.transform(feats)
    if selector:
        matrix = selector.transform(matrix)
    predicted = classifier.predict(matrix)
    return metrics.classification_report(gold, predicted)

In [22]:
# Report on both splits. The loop variables carry a _split suffix so that
# they do not overwrite the notebook-level X and y with the test split.
for readername, X_split, y_split in (('Train', X_train, y_train), ('Dev', X_test, y_test)):
    print("======================================================================")
    print(readername)
    print(evaluate_trained_classifier(model, X_split, y_split))

Train
<function total_feature_fn at 0x1073a5ed8>
             precision    recall  f1-score   support

         -1       0.74      1.00      0.85      1825
          1       0.00      0.00      0.00       625

avg / total       0.55      0.74      0.64      2450

Dev
<function total_feature_fn at 0x1073a5ed8>
             precision    recall  f1-score   support

         -1       0.73      1.00      0.84       445
          1       0.00      0.00      0.00       168

avg / total       0.53      0.73      0.61       613



  'precision', 'predicted', average, warn_for)


In [83]:
# Scratch check of dict.update on defaultdicts: for a key present in both,
# the value from the argument replaces the existing one (it is not added).
d1 = defaultdict(float, {"a": 1.0, "b": 2.0})
d2 = defaultdict(float, {"b": 1.0, "c": 2.0})
d1.update(d2)
d1

defaultdict(<type 'float'>, {'a': 1.0, 'c': 2.0, 'b': 1.0})