In [13]:
import sys # to import files in other directories
import numpy as np
import sklearn
import csv
import nltk

# from features import *
from collections import defaultdict

from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectFpr, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import cross_val_score
from sklearn import metrics
from sklearn.feature_selection import SelectFpr, chi2, RFE

# sys.path.insert(0, 'class_resources/')
# from distributedwordreps import build, ShallowNeuralNetwork


In [14]:
dat = np.load('data/consolidations/yak_cons_2015-06-06 19-42-38.950644.npy')
print len(dat), "data points loaded"

# since the rows of the database are sorted by time, we should shuffle the data set
# so that the train and test sets both cover the same time periods.
np.random.shuffle(dat)
cutoff = 4*len(dat)/5

7101 data points loaded


In [15]:
X = dat[:,1:4]
y = [1 if float(val) < 0 else -1 for val in dat[:,4]]
pos = [ind for ind, y_i in enumerate(y) if y_i == 1]
neg = [ind for ind, y_i in enumerate(y) if y_i == -1]
print len(pos),len(neg)
cutoff = int(0.6*len(X))
cutoff_2 = int(0.8*len(X))
X_train, X_dev, X_test = X[:cutoff], X[cutoff:cutoff_2], X[cutoff_2:]
# X_train = X[:cutoff]
# X_test = X[cutoff:]
y = y[:]

y_train, y_dev, y_test = y[:cutoff], y[cutoff:cutoff_2], y[cutoff_2:]
# y_train, y_test = y[:cutoff], y[cutoff:]

1839 5262


In [16]:
def featurizer(X, y, feature_function):
    """Map the data in reader to a list of features according to feature_function,
    and create the gold label vector."""
    feats = []
    labels = []
    split_index = None
    for yak, label in zip(X, y):
        d = feature_function(yak)
        feats.append(d)
        labels.append(label)              
    return (feats, labels)

In [17]:
def random_feature(yak):
    features = defaultdict(float)
    for i in range(10):
        features["random_feature" + str(i)] = np.random.random()
    return features

In [18]:
def train_classifier(
        X,
        y,
        feature_function,
        feature_selector=SelectFpr(chi2, alpha=0.05), # Use None to stop feature selection
        cv=10, # Number of folds used in cross-validation
        priorlims=np.arange(.1, 3.1, .1)): # regularization priors to explore (we expect something around 1)
    # Featurize the data:
    feats, labels = featurizer(X, y, feature_function) 
        
    # Map the count dictionaries to a sparse feature matrix:
    vectorizer = DictVectorizer(sparse=False)
    X = vectorizer.fit_transform(feats)
#     print X.shape
#     print "y.shape"
#     print y.shape
    ##### FEATURE SELECTION    
    # (An optional step; not always productive). By default, we select all
    # the features that pass the chi2 test of association with the
    # class labels at p < 0.05. sklearn.feature_selection has other
    # methods that are worth trying. I've seen particularly good results
    # with the model-based methods, which require some changes to the
    # current code.
    feat_matrix = None
    if feature_selector:
        feat_matrix = feature_selector.fit_transform(X, labels)
    else:
        feat_matrix = X
    
    ##### HYPER-PARAMETER SEARCH
    # Define the basic model to use for parameter search:
    searchmod = LogisticRegression(fit_intercept=True, intercept_scaling=1)
    # Parameters to grid-search over:
    parameters = {'C':priorlims, 'penalty':['l1','l2']}  
    # Cross-validation grid search to find the best hyper-parameters:   
    clf = GridSearchCV(searchmod, parameters, cv=cv)
    clf.fit(feat_matrix, labels)
    params = clf.best_params_

    # Establish the model we want using the parameters obtained from the search:
    mod = LogisticRegression(fit_intercept=True, intercept_scaling=1, C=params['C'], penalty=params['penalty'])

    ##### ASSESSMENT              
    # Cross-validation of our favored model; for other summaries, use different
    # values for scoring: http://scikit-learn.org/dev/modules/model_evaluation.html
    scores = cross_val_score(mod, feat_matrix, labels, cv=cv, scoring="f1_macro")       
    print 'Best model', mod
    print '%s features selected out of %s total' % (feat_matrix.shape[1], X.shape[1])
    print 'F1 mean: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std()*2)

    # TRAIN OUR MODEL:
    mod.fit(feat_matrix, labels)

    # Return the trained model along with the objects we need to
    # featurize test data in a way that aligns with our training
    # matrix:
    return (mod, vectorizer, feature_selector, feature_function)


In [19]:
# Warriner et. al. affect score import
lines = [line.rstrip('\n').split(",") for line in open('Ratings_Warriner_et_al.csv')]
WARRINER_AFFECT = {}
for line in lines[1:]:
    WARRINER_AFFECT[line[1].lower()] = float(line[2])
    
# SentiStrength score importing
lines = [line.rstrip('\n') for line in open('SentStrength_Data_Sept2011/EmotionLookupTable.txt')]
line_splits = [line.split() for line in lines]
SENTI_STRENGTH = {}
for line in line_splits:
    SENTI_STRENGTH[line[0].rstrip('*')] =  float(line[1])
    
# Booster word list
lines = [line.rstrip('\n') for line in open('SentStrength_Data_Sept2011/BoosterWordList.txt')]
line_splits = [line.split() for line in lines]
SENTI_STRENGTH = {}
for line in line_splits:
    SENTI_STRENGTH[line[0].rstrip('*')] =  float(line[1])
    

In [20]:
def build_word_counts(X_train):
    counts = defaultdict(float)
    for yak in X_train:
        text = yak[2].lower()
        tokens = [word for sent in nltk.tokenize.sent_tokenize(text) for word in nltk.tokenize.word_tokenize(sent)]
        words = filter(lambda word: word not in ',-', tokens)
        for word in words:
            counts[word] += 1
    return counts
word_counts = build_word_counts(X_train)

In [21]:
dictionary_words = set(line.rstrip('\n').lower() for line in open('/usr/share/dict/words'))

In [22]:
import string

''' FEATURE FUNCTIONS
'''

def spelling_feature(yak):
    features = defaultdict(float)
    text = yak[2].lower()
    tokens = [word for sent in nltk.tokenize.sent_tokenize(text) for word in nltk.tokenize.word_tokenize(sent)]
    words = filter(lambda word: word not in ',-', tokens)
    
    num_correct = 0
    num_total = len(words)
    for word in words:
        if word in dictionary_words:
            num_correct += 1
    features["correct_spelling_" + yak[0]] = float(num_correct / num_total)
    
    return features

def word_count_feature(yak):
    features = defaultdict(float)
    text = yak[2].lower()
    tokens = [word for sent in nltk.tokenize.sent_tokenize(text) for word in nltk.tokenize.word_tokenize(sent)]
    words = filter(lambda word: word not in ',-', tokens)
    counts = [word_counts[word] for word in words]
    features["num_zero_count_words"] = counts.count(0) if len(counts) > 0 else 0
    features["max_count_word"] = max(counts) if len(counts) > 0 else 0
    nonzeros = [count for count in counts if count > 0]
    features["min_count_word"] = min(nonzeros) if len(nonzeros) > 0 else 0
    return features

# Capitalization counter
# https://github.com/MathieuCliche/Sarcasm_detector
def cap_feature(yak):
    features = defaultdict(float)
    counter = 0
    thresh = 4 # for alternative feature below
    for j in range(len(yak[2])):
        counter += int(yak[2][j].isupper())
        #features['Capitalization'] += int(yak[2][j].isupper())
    features['Capitalization'] = int(counter >= thresh)
    return features

def cap_propor_feature(yak):
    features = defaultdict(float)
    counter = 0
    for j in range(len(yak[2])):
        counter += int(yak[2][j].isupper())
    features["Caps Proportion"] = float(counter / len(yak[2]))
    return features
    
# Punctuation
punc_marks = ['.', '...', ';', ':', '?', '!', '\'', '\"', '-', '(', ')', '*', ',']
def punc_feature(yak):
    features = defaultdict(float)
    punc_total = 0
    for i in punc_marks:
        punc_total += float(i in yak[2])
        features[('Punctuation ' + i)] += float(i in yak[2])
    return features

# Delta affect
def imbalance_feature(yak):
    features = defaultdict(float)
    words = yak[2].split(" ")
    affect = np.array([])
    senti = np.array([])

    for word in words:
        if word.lower() in WARRINER_AFFECT:
            affect = np.append(affect, WARRINER_AFFECT[word.lower()])
        if word.lower() in SENTI_STRENGTH:
            senti = np.append(senti, SENTI_STRENGTH[word.lower()])  
    
    if affect.size > 0:
        features["Delta Affect"] = float(np.amax(affect) - np.amin(affect)) 
    if senti.size > 0:
        features["Delta Sentiment"] = float(np.amax(senti) - np.amin(senti)) 
    
    return features
    
# Is the sentence interrogative?
def interrogative_feature(yak):
    features = defaultdict(float)
    model_words = ['what', 'where', 'when', 'why', 'who']
    auxiliary_verbs = ['am', 'is', 'are', 'was', 'were', 'am', 'do', 'did', 'does']
    words = yak[2].split(" ")
    
    is_interrogative = ((words[0] in model_words) and 
                        (words[1] in auxiliary_verbs) and ('?' in yak[2]))
    features["Interrogative"] = float(is_interrogative)
    return features

# "section leader" and "section leaders"
# punctuation cutting off yaks.
def bigram_feature(yak):
    features = defaultdict(float)
    words = yak[2].lower().split()
    school = yak[0]
    for ind in xrange(len(words) - 1):
        features[(school, words[ind], words[ind + 1])] += 1.0
    return features

# does the yak contain a handle?
def handle_feature(yak):
    features = defaultdict(float)
    features["Handle"] = float(yak[1] != '')
    return features

def handle_school_feature(yak):
    features = defaultdict(float)
    features[(yak[0],yak[1].lower())] += 1.0
    return features

# PRITHVI'S NEW STUFF

def trigram_feature(yak):
    features = defaultdict(float)
    words = yak[2].lower().split()
    school = yak[0]
    for ind in xrange(len(words) - 2):
        features[(school, words[ind], words[ind + 1], words[ind + 2])] += 1.0
    return features

def unigram_feature(yak):
    features = defaultdict(float)
    text = yak[2]
    tokens = [word for sent in nltk.tokenize.sent_tokenize(text) for word in nltk.tokenize.word_tokenize(sent)]
    words = filter(lambda word: word not in ',-', tokens)
    for word in words:
        features[word] += 1.0
    return features

def emoji_feature(yak):
    features = defaultdict(float)
    text = yak[2]
    
    for ch in text:
        if ch not in string.ascii_letters:
            features['Emoji ' + ch] += float(1)
            
    return features

def school_name(yak):
    features = defaultdict(float)
    features['School_name_' + yak[0]] = 1.0
    return features

In [23]:
# this will be an array of function pointers
# all_feature_generators = [random_feature]
all_feature_generators = [#random_feature,
#                           bigram_feature, 
#                           cap_feature,
                         punc_feature,
                         imbalance_feature,
                          interrogative_feature,
                        handle_feature,
                        handle_school_feature,
#                         cap_propor_feature,
                        unigram_feature,
                        word_count_feature,
                        spelling_feature,
                        emoji_feature
                         ]

def get_all_features(fs):
    def total_feature_fn(yak):
        features = defaultdict(float)
        for f in fs:
            features.update(f(yak))
        return features
    return total_feature_fn

all_features = get_all_features(all_feature_generators)

In [24]:
model = train_classifier(X_train, y_train, all_features)

Best model LogisticRegression(C=3.0000000000000004, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0)
464 features selected out of 9510 total
F1 mean: 0.68 (+/- 0.05)


In [143]:
def evaluate_trained_classifier(model, X, y):
    """Evaluate model, the output of train_classifier, on the data in reader."""
    mod, vectorizer, feature_selector, feature_function = model
    print feature_function
    feats, labels = featurizer(X, y, feature_function)
    feat_matrix = vectorizer.transform(feats)
    if feature_selector:
        feat_matrix = feature_selector.transform(feat_matrix)
    predictions = mod.predict(feat_matrix)
#     print predictions
#     print y
    return metrics.classification_report(labels, predictions)

In [150]:
for readername, X, y in (('Train', X_train, y_train), ('Dev', X_dev, y_dev), ('Test', X_test, y_test)):
    print "======================================================================"
    print readername
    print evaluate_trained_classifier(model, X, y)

Train
<function total_feature_fn at 0x1135d6758>
             precision    recall  f1-score   support

         -1       0.83      0.97      0.90      3159
          1       0.84      0.44      0.58      1101

avg / total       0.83      0.83      0.81      4260

Dev
<function total_feature_fn at 0x1135d6758>
             precision    recall  f1-score   support

         -1       0.85      0.97      0.91      1071
          1       0.85      0.47      0.60       349

avg / total       0.85      0.85      0.83      1420

Test
<function total_feature_fn at 0x1135d6758>
             precision    recall  f1-score   support

         -1       0.82      0.96      0.89      1032
          1       0.82      0.45      0.58       389

avg / total       0.82      0.82      0.80      1421



In [83]:
d1 = defaultdict(float)
d2 = defaultdict(float)
d1["a"] = 1.0
d1["b"] = 2.0
d2["b"] = 1.0
d2["c"] = 2.0
d1.update(d2)
d1

defaultdict(<type 'float'>, {'a': 1.0, 'c': 2.0, 'b': 1.0})