In [2]:
import sys # to import files in other directories
import numpy as np
import sklearn
import csv

from features import *
from collections import defaultdict

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn import metrics

# sys.path.insert(0, 'class_resources/')
# from distributedwordreps import build, ShallowNeuralNetwork

In [8]:
dat = np.load('data/consolidations/yak_cons_2015-06-01 02-03-55.904056.npy')
print len(dat), "data points loaded"

# since the rows of the database are sorted by time, we should shuffle the data set
# so that the train and test sets both cover the same time periods.
np.random.shuffle(dat)
cutoff = 4*len(dat)/5

(dat)

6376 data points loaded


array([[u'R/5565171da143656c690b064aceedc', u'vanderbilt', u'',
        u'If you let a dog eat as much as it wants will it just pop?',
        u'1.0'],
       [u'R/5566462a18cd6080b61dc44a168f2', u'dartmouth', u'',
        u'Is foco open?', u'-inf'],
       [u'R/556a9ac4033f4ea262e42f8e5fec2', u'northwestern', u'',
        u'I kinda wanna stay a virgin', u'37.0'],
       [u'R/55652b4610de040042205c2bff795', u'northwestern', u'',
        u"8 AM me: I'll go to the gym tonight     8 PM me: I'll go to Taco Bell tonight ",
        u'75.0'],
       [u'R/556634a9993c54f0f43a43c50459a', u'dartmouth', u'',
        u'PSA Collis market is closed but Collis is still selling salad and premade food right now ',
        u'8.0'],
       [u'R/55651ede800ff62621aa9575c9edc', u'harvard', u'',
        u"Someone just told me I'm cool because other black people freak them out... still trying to figure it out",
        u'11.0'],
       [u'R/55651ddef043cf3371337ad987adf', u'northwestern', u'',
        u"My p

In [4]:
X = dat[:,1:4]
y = [1 if float(val) < 0 else -1 for val in dat[:,4]]
pos = [ind for ind, y_i in enumerate(y) if y_i == 1]
neg = [ind for ind, y_i in enumerate(y) if y_i == -1]
print len(pos),len(neg)
cutoff = int(0.8*len(X))
X_train = X[:cutoff]
X_test = X[cutoff:]
y = y[:]
y_train, y_test = y[:cutoff], y[cutoff:]

1665 4711


array([[u'penn', u'',
        u'Any other penn students living on spruce street for summer  want to hang out? Lonely AF '],
       [u'columbia', u'',
        u"I can live with her Adam's apple and deep voice. But it makes me very insecure that she is packing more than me "],
       [u'dartmouth', u'',
        u'Made my bed and accidentally got an engineering degree from Cornell. '],
       [u'uc berkeley', u'',
        u'I wonder what people who write "u" and "ur" do with all the extra time'],
       [u'uc berkeley', u'',
        u"I wana have a summer love who's down?! Lady 4 lady \u263a\ufe0f"],
       [u'dartmouth', u'',
        u"Where is Dartmouth secure working? Im in Dartmouth hall and it's not picking it up here (ironically)"],
       [u'brown', u'',
        u'When you find out the girl that you fell for because you thought she was innocent is actually a hoe \U0001f494'],
       [u'rice', u'', u'I love alone time'],
       [u'mit', u'', u'b0ss, can i habe de pusi pls'],
       

In [15]:
def featurizer(X, y, feature_function):
    """Map the data in reader to a list of features according to feature_function,
    and create the gold label vector."""
    feats = []
    labels = []
    split_index = None
    for yak, label in zip(X, y):
        d = feature_function(yak)
        feats.append(d)
        labels.append(label)              
    return (feats, labels)

In [16]:
def random_feature(yak):
    features = defaultdict(float)
    features["random_feature"] = np.random.random()
    return features

In [17]:
def train_classifier(
        X,
        y,
        feature_function,
        feature_selector=SelectFpr(chi2, alpha=0.05), # Use None to stop feature selection
        cv=10, # Number of folds used in cross-validation
        priorlims=np.arange(.1, 3.1, .1)): # regularization priors to explore (we expect something around 1)
    # Featurize the data:
    feats, labels = featurizer(X, y, feature_function) 
    
    # Map the count dictionaries to a sparse feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(feats)

    ##### FEATURE SELECTION    
    # (An optional step; not always productive). By default, we select all
    # the features that pass the chi2 test of association with the
    # class labels at p < 0.05. sklearn.feature_selection has other
    # methods that are worth trying. I've seen particularly good results
    # with the model-based methods, which require some changes to the
    # current code.
    feat_matrix = None
    if feature_selector:
        feat_matrix = feature_selector.fit_transform(X, labels)
    else:
        feat_matrix = X
    
    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1)
    # Parameters to grid-search over:
    parameters = {'C':priorlims, 'penalty':['l1','l2']}  
    # Cross-validation grid search to find the best hyper-parameters:   
    clf = GridSearchCV(searchmod, parameters, cv=cv)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, C=params['C'], penalty=params['penalty'])

    ##### ASSESSMENT              
    # Cross-validation of our favored model; for other summaries, use different
    # values for scoring: http://scikit-learn.org/dev/modules/model_evaluation.html
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")       
    print 'Best model', mod
    print '%s features selected out of %s total' % (feat_matrix.shape[1], X.shape[1])
    print 'F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2)

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector, feature_function)


In [22]:
# this will be an array of function pointers
# all_feature_generators = [random_feature]
all_feature_generators = [random_feature,
#                           bigram_feature, 
#                           cap_feature,
#                          punc_feature,
#                          imbalance_feature,
#                           interrogative_feature,
                         ]

def get_all_features(fs):
    def total_feature_fn(yak):
        features = defaultdict(float)
        for f in fs:
            features.update(f(yak))
        return features
    return total_feature_fn

all_features = get_all_features(all_feature_generators)

In [24]:
model = train_classifier(X_train, y_train, all_features, feature_selector=None, cv=2)

Best model LogisticRegression(C=0.10000000000000001, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)
1 features selected out of 1 total
F1 mean: 0.43 (+/- 0.00)


  'precision', 'predicted', average, warn_for)


In [25]:
def evaluate_trained_classifier(model, X, y):
    """Evaluate model, the output of train_classifier, on the data in reader."""
    mod, vectorizer, feature_selector, feature_function = model
    print feature_function
    feats, labels = featurizer(X, y, feature_function)
    feat_matrix = vectorizer.transform(feats)
    if feature_selector:
        feat_matrix = feature_selector.transform(feat_matrix)
    predictions = mod.predict(feat_matrix)
#     print predictions
#     print y
    return metrics.classification_report(labels, predictions)

In [26]:
for readername, X, y in (('Train', X_train, y_train), ('Dev', X_test, y_test)):
    print "======================================================================"
    print readername
    print evaluate_trained_classifier(model, X, y)

Train
<function total_feature_fn at 0x112deded8>
             precision    recall  f1-score   support

         -1       0.74      1.00      0.85      3779
          1       0.00      0.00      0.00      1321

avg / total       0.55      0.74      0.63      5100

Dev
<function total_feature_fn at 0x112deded8>
             precision    recall  f1-score   support

         -1       0.73      1.00      0.84       932
          1       0.00      0.00      0.00       344

avg / total       0.53      0.73      0.62      1276



  'precision', 'predicted', average, warn_for)


In [83]:
d1 = defaultdict(float)
d2 = defaultdict(float)
d1["a"] = 1.0
d1["b"] = 2.0
d2["b"] = 1.0
d2["c"] = 2.0
d1.update(d2)
d1

defaultdict(<type 'float'>, {'a': 1.0, 'c': 2.0, 'b': 1.0})