# Baselines for the Twitter Sentiment Analysis CIL Project

They will mostly employ simple linear classifiers on naive operations on the word embeddings, such as averaging.

In [1]:
import os
import pickle

import gensim
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.utils import shuffle

In [9]:
# We will work on the preprocessed data, so that we have a common ground with
# the deep learning competitors.
pp = os.path.join('..', 'data', 'preprocessing')

LIMIT = -1

trainX = np.load(os.path.join(pp, 'full-trainX.npy'))
trainY = np.load(os.path.join(pp, 'full-trainY.npy'))
embeddings = np.load(os.path.join(pp, 'full-embeddings.npy'))

trainY = np.argmax(trainY, axis=1)
trainX, trainY = shuffle(trainX, trainY)
trainX = trainX[:LIMIT]
trainY = trainY[:LIMIT]

with open(os.path.join(pp, 'full-vocab.pkl'), 'rb') as f:
    vocab = pickle.load(f)

In [10]:
def mk_avg(tweet):
    """Averages a tweet's representation vectors.
    
    Ignores the padding.
    """
    return np.mean([embeddings[wid] for wid in tweet if wid != 0], axis=0)

def mk_concat(tweet, lim=15):
    """Concatenates the word embeddings in a tweet.
    
    Does not ignore the padding.
    """
    return np.hstack([embeddings[wid] for wid in tweet[:lim]])

In [11]:
ts = mk_concat(trainX[128])

In [12]:
from sklearn.grid_search import *

grid = {
    'alpha': [0.00001, 0.00005, 0.0001, 0.0005, 0.001],
}

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("{2}: Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores),
              i + 1))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [13]:
from sklearn.metrics import *

def eval_avg(trainX, trainY):
    print("Evaluating input of size {0}.".format(trainX.shape))
    print("Doing embedding averaging.")
    avgd_tweets = [mk_avg(tweet) for tweet in trainX]
    avgd_tweets = [(t, label) for (t, label) in zip(avgd_tweets, trainY) if t.shape == (300,)]
    trainX_trimmed = np.array([t for (t, label) in avgd_tweets])
    trainY_trimmed = np.array([label for (t, label) in avgd_tweets])
    print(trainX_trimmed.shape)
    print(trainY_trimmed.shape)
    del avgd_tweets
    trainX_trimmed, trainY_trimmed = shuffle(trainX_trimmed, trainY_trimmed)
    
    gs = GridSearchCV(SGDClassifier(), grid, cv=3, verbose=True)
    print("Starting grid search...")
    res = gs.fit(trainX_trimmed, trainY_trimmed)
    report(res.grid_scores_, n_top=25)
    
    predY = res.predict(trainX_trimmed)
    acc = accuracy_score(trainY_trimmed, predY)
    f1 = accuracy_score(trainY_trimmed, predY)
    
    print("Train accuracy: {0}\nTrain F1 score: {1}".format(acc, f1))
    
    return res    

In [14]:
def eval_concat(trainX, trainY):
    print("Evaluating input of size {0}.".format(trainX.shape))
    print("Doing embedding concatenation.")
    concat_tweets = [mk_concat(tweet) for tweet in trainX]
    concat_tweets = [(t, label) for (t, label) in zip(concat_tweets, trainY) if t.shape == (300 * 15,)]
    trainX_trimmed = np.array([t for (t, label) in concat_tweets])
    trainY_trimmed = np.array([label for (t, label) in concat_tweets])
    print(trainX_trimmed.shape)
    print(trainY_trimmed.shape)
    del concat_tweets
    trainX_trimmed, trainY_trimmed = shuffle(trainX_trimmed, trainY_trimmed)
    
    gs = GridSearchCV(SGDClassifier(), grid, cv=3, n_jobs=1, verbose=True)
    print("Starting grid search...")
    res = gs.fit(trainX_trimmed, trainY_trimmed)
    report(res.grid_scores_, n_top=25)
    return res   

In [None]:
avg_res = eval_avg(trainX, trainY)

Evaluating input of size (2499999, 40).
Doing embedding averaging.


  ret = ret.dtype.type(ret / rcount)


In [None]:
# WARNING: Can get VERY memory-hungry!

concat_res = eval_concat(trainX, trainY)