# Baselines for the Twitter Sentiment Analysis CIL Project

They will mostly employ simple linear classifiers on naive operations on the word embeddings, such as averaging.

 * Attempt 1. uses word embedding vector averaging.
 * Attempt 2. uses embedding vector concatenation and is VERY memory-hungry.
 * The `del`s in the code seek to alleviate some of the memory pressure.

In [1]:
import os
import pickle

import gensim
import numpy as np
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import *
from sklearn.utils import shuffle
from sklearn.grid_search import *
from gensim.models.word2vec import Word2Vec

In [2]:
# We will work on the preprocessed data, so that we have a common ground with
# the deep learning competitors.
pp = os.path.join('..', 'data', 'preprocessing')
tr = os.path.join('..', 'data', 'train')
vc = os.path.join('..', 'data', 'word2vec')

TRAINING_SAMPLES = 2500000
MAX_TWEET_LENGHT = 25

# How many Tweets to sample.
# Averaging should work with all of them, but concatenation chokes
# on even a tenth in its current implementation.
LIMIT = 250000

#training files
train_neg_file = os.path.join(tr, 'train_pos_full_orig.txt')
train_pos_file = os.path.join(tr, 'train_neg_full_orig.txt')

#load word2vec model
model = Word2Vec.load(os.path.join(vc, 'word2vec-local-gensim-orig-20.bin'))
EMBEDDING_DIM = model.vector_size
print(EMBEDDING_DIM)

20


In [3]:
def load_training_data_concat():
    trainX = np.zeros((TRAINING_SAMPLES, MAX_TWEET_LENGHT*EMBEDDING_DIM))
    trainY = np.ones((TRAINING_SAMPLES))
    tweet = 0
    for filename in [train_neg_file, train_pos_file]:
        with open(filename) as f:
            for line in f:
                embedded_words = []
                line = line.strip()
                for word in line.split():
                    if word in model:
                        embedded_words.append(model[word])
                embedded_words.append(np.zeros(MAX_TWEET_LENGHT*EMBEDDING_DIM+2))
                tweet_embedded = np.hstack(embedded_words)
                trainX[tweet,:] = tweet_embedded[0:MAX_TWEET_LENGHT*EMBEDDING_DIM]
                if filename == train_neg_file:
                    trainY[tweet] = -1
                tweet += 1
        print(filename)
        print(tweet)
    return trainX, trainY

def load_training_data_average():
    trainX = np.zeros((TRAINING_SAMPLES, EMBEDDING_DIM))
    trainY = np.ones((TRAINING_SAMPLES))
    tweet = 0
    for filename in [train_neg_file, train_pos_file]:
        with open(filename) as f:
            for line in f:
                embedded_words = []
                line = line.strip()
                for word in line.split():
                    if word in model:
                        embedded_words.append(model[word])
                tweet_embedded = np.mean(embedded_words)
                trainX[tweet,:] = tweet_embedded
                if filename == train_neg_file:
                    trainY[tweet] = -1
                tweet += 1
    return trainX, trainY

In [4]:
grid = {
    'alpha': [0.00001, 0.00005, 0.0001, 0.0005, 0.001],
}

def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=lambda x: x[1], reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("{2}: Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores),
              i + 1))
        print("Parameters: {0}".format(score.parameters))
        print("")

In [5]:
def eval_avg(trainX, trainY):
    """Averages each tweet's word vectors into one, and trains a linear classifier."""

    print("Evaluating input of size {0}.".format(trainX.shape))
    print(trainX.shape)
    print(trainY.shape)
    trainX, trainY = shuffle(trainX, trainY)
    
    gs = GridSearchCV(SGDClassifier(), grid, cv=3, verbose=True)
    print("Starting grid search...")
    res = gs.fit(trainX, trainY)
    report(res.grid_scores_, n_top=25)
    
    predY = res.predict(trainX)
    acc = accuracy_score(trainY, predY)
    f1 = accuracy_score(trainY, predY)
    
    print("Train accuracy: {0}\nTrain F1 score: {1}".format(acc, f1))
    
    return res    

In [6]:
def eval_concat(trainX, trainY):
    """Concatenates each tweet's word vectors into one, and trains a linear classifier.
    
    Note: can get VERY memory-hungry.
    """
    
    print("Evaluating input of size {0}.".format(trainX.shape))
    
    trainX, trainY = shuffle(trainX, trainY)
    
    gs = GridSearchCV(SGDClassifier(), grid, cv=2, verbose=True)
    print("Starting grid search...")
    res = gs.fit(trainX, trainY)
    report(res.grid_scores_, n_top=25)
    
    predY = res.predict(trainX)
    acc = accuracy_score(trainY, predY)
    f1 = accuracy_score(trainY, predY)
    
    print("Train accuracy: {0}\nTrain F1 score: {1}".format(acc, f1))
    
    return res   

In [7]:
trainX, trainY = load_training_data_concat()

../data/train/train_pos_full_orig.txt
1250000
../data/train/train_neg_full_orig.txt
2500000


In [8]:
concat_res = eval_concat(np.nan_to_num(trainX), trainY)

Evaluating input of size (2500000, 500).
Starting grid search...
Fitting 2 folds for each of 5 candidates, totalling 10 fits


[Parallel(n_jobs=1)]: Done  10 out of  10 | elapsed: 19.0min finished


1: Mean validation score: 0.677 (std: 0.003)
Parameters: {'alpha': 0.001}

2: Mean validation score: 0.665 (std: 0.005)
Parameters: {'alpha': 0.0005}

3: Mean validation score: 0.655 (std: 0.001)
Parameters: {'alpha': 0.0001}

4: Mean validation score: 0.644 (std: 0.004)
Parameters: {'alpha': 1e-05}

5: Mean validation score: 0.632 (std: 0.010)
Parameters: {'alpha': 5e-05}

Train accuracy: 0.6919524
Train F1 score: 0.6919524


In [9]:
trainX, trainY = load_training_data_average()
avg_res = eval_avg(np.nan_to_num(trainX), trainY)

  ret = ret.dtype.type(ret / rcount)


Evaluating input of size (2500000, 20).
(2500000, 20)
(2500000,)
Starting grid search...
Fitting 3 folds for each of 5 candidates, totalling 15 fits


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:   35.3s finished


1: Mean validation score: 0.634 (std: 0.001)
Parameters: {'alpha': 1e-05}

2: Mean validation score: 0.632 (std: 0.001)
Parameters: {'alpha': 0.001}

3: Mean validation score: 0.631 (std: 0.000)
Parameters: {'alpha': 5e-05}

4: Mean validation score: 0.631 (std: 0.001)
Parameters: {'alpha': 0.0005}

5: Mean validation score: 0.631 (std: 0.000)
Parameters: {'alpha': 0.0001}

Train accuracy: 0.6313264
Train F1 score: 0.6313264
