# Baselines for the Twitter Sentiment Analysis CIL Project

They will mostly employ simple linear classifiers on naive operations on the word embeddings, such as averaging.

 * Attempt 1. uses word embedding vector averaging.
 * Attempt 2. uses embedding vector concatenation and is VERY memory-hungry.
 * The `del`s in the code seek to alleviate some of the memory pressure.

In [1]:
import os
import pickle

import gensim
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import *
from sklearn.utils import shuffle

In [16]:
# We will work on the preprocessed data, so that we have a common ground with
# the deep learning competitors.
pp = os.path.join('..', 'data', 'preprocessing')

# How many Tweets to sample.
# Averaging should work with all of them, but concatenation chokes
# on even a tenth in its current implementation.
LIMIT = 250000

trainX = np.load(os.path.join(pp, 'full-trainX.npy'))
trainY = np.load(os.path.join(pp, 'full-trainY.npy'))
embeddings = np.load(os.path.join(pp, 'full-embeddings.npy'))

trainY = np.argmax(trainY, axis=1)
trainX, trainY = shuffle(trainX, trainY)
trainX = trainX[:LIMIT]
trainY = trainY[:LIMIT]

with open(os.path.join(pp, 'full-vocab.pkl'), 'rb') as f:
    vocab = pickle.load(f)

In [17]:
def mk_avg(tweet):
    """Averages a tweet's representation vectors.
    
    Ignores the padding.
    """
    return np.mean([embeddings[wid] for wid in tweet if wid != 0], axis=0)

def mk_concat(tweet, lim=25):
    """Concatenates the word embeddings in a tweet.
    
    Does not ignore the padding.
    """
    return np.hstack([embeddings[wid] for wid in tweet[:lim]])

In [18]:
from sklearn.grid_search import *

grid = {
    'alpha': [0.00001, 0.00005, 0.0001, 0.0005, 0.001],
}

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("{2}: Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores),
              i + 1))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [24]:
def eval_avg(trainX, trainY):
    """Averages each tweet's word vectors into one, and trains a linear classifier."""

    print("Evaluating input of size {0}.".format(trainX.shape))
    print("Doing embedding averaging.")
    avgd_tweets = [mk_avg(tweet) for tweet in trainX]
    avgd_tweets = [(t, label) for (t, label) in zip(avgd_tweets, trainY) if t.shape == (300,)]
    trainX_trimmed = np.array([t for (t, label) in avgd_tweets])
    trainY_trimmed = np.array([label for (t, label) in avgd_tweets])
    print(trainX_trimmed.shape)
    print(trainY_trimmed.shape)
    del avgd_tweets
    trainX_trimmed, trainY_trimmed = shuffle(trainX_trimmed, trainY_trimmed)
    
    gs = GridSearchCV(SGDClassifier(), grid, cv=3, verbose=True)
    print("Starting grid search...")
    res = gs.fit(trainX_trimmed, trainY_trimmed)
    report(res.grid_scores_, n_top=25)
    
    predY = res.predict(trainX_trimmed)
    acc = accuracy_score(trainY_trimmed, predY)
    f1 = accuracy_score(trainY_trimmed, predY)
    
    print("Train accuracy: {0}\nTrain F1 score: {1}".format(acc, f1))
    
    return res    

In [22]:
def eval_concat(trainX, trainY):
    """Concatenates each tweet's word vectors into one, and trains a linear classifier.
    
    Note: can get VERY memory-hungry.
    """
    
    print("Evaluating input of size {0}.".format(trainX.shape))
    print("Doing embedding concatenation.")
    
    embedding_size = 300
    max_tweet_len = 25
    
    concat_tweets = [mk_concat(tweet, max_tweet_len) for tweet in trainX]
    concat_tweets = [(t, label) for (t, label) in zip(concat_tweets, trainY) if t.shape == (embedding_size * max_tweet_len,)]
    trainX_trimmed = np.array([t for (t, label) in concat_tweets])
    trainY_trimmed = np.array([label for (t, label) in concat_tweets])
    print(trainX_trimmed.shape)
    print(trainY_trimmed.shape)
    del concat_tweets
    trainX_trimmed, trainY_trimmed = shuffle(trainX_trimmed, trainY_trimmed)
    
    gs = GridSearchCV(SGDClassifier(), grid, cv=3, verbose=True)
    print("Starting grid search...")
    res = gs.fit(trainX_trimmed, trainY_trimmed)
    report(res.grid_scores_, n_top=25)
    
    predY = res.predict(trainX_trimmed)
    acc = accuracy_score(trainY_trimmed, predY)
    f1 = accuracy_score(trainY_trimmed, predY)
    
    print("Train accuracy: {0}\nTrain F1 score: {1}".format(acc, f1))
    
    return res   

In [15]:
avg_res = eval_avg(trainX, trainY)

Evaluating input of size (2499999, 40).
Doing embedding averaging.


  ret = ret.dtype.type(ret / rcount)


(2499981, 300)
(2499981,)
Starting grid search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  3.3min finished


1: Mean validation score: 0.774 (std: 0.000)
Parameters: {'alpha': 5e-05}

2: Mean validation score: 0.772 (std: 0.003)
Parameters: {'alpha': 1e-05}

3: Mean validation score: 0.772 (std: 0.000)
Parameters: {'alpha': 0.0001}

4: Mean validation score: 0.764 (std: 0.000)
Parameters: {'alpha': 0.0005}

5: Mean validation score: 0.757 (std: 0.000)
Parameters: {'alpha': 0.001}

Train accuracy: 0.7739754822136649
Train F1 score: 0.7739754822136649


In [23]:
# WARNING: Can get VERY memory-hungry!

concat_res = eval_concat(trainX, trainY)

Evaluating input of size (250000, 40).
Doing embedding concatenation.
(250000, 7500)
(250000,)
Starting grid search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 87.8min finished


1: Mean validation score: 0.774 (std: 0.001)
Parameters: {'alpha': 0.001}

2: Mean validation score: 0.773 (std: 0.001)
Parameters: {'alpha': 0.0005}

3: Mean validation score: 0.751 (std: 0.006)
Parameters: {'alpha': 0.0001}

4: Mean validation score: 0.734 (std: 0.012)
Parameters: {'alpha': 5e-05}

5: Mean validation score: 0.723 (std: 0.013)
Parameters: {'alpha': 1e-05}

