# 3 Sentiment analysis with word embeddings

In [1]:
import numpy as np 
import pandas as pd 
import os
import re
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
from spacy.symbols import ORTH
import string
import csv
import nltk
from nltk.corpus import stopwords

%matplotlib inline

### process data from files into a dataframe

In [2]:
def read_text(path_dir, prefix, flag):
    """Read every file of one split/label directory into a list of strings.

    Parameters
    ----------
    path_dir : str
        Root directory of the dataset (e.g. ``'aclImdb/'``); a trailing
        separator is optional.
    prefix : str
        Sub-directory of `path_dir` to read from (e.g. ``'train'``).
    flag : str
        Sub-directory of `prefix` to read from (e.g. ``'pos'``).

    Returns
    -------
    list of str
        Contents of each regular file in ``path_dir/prefix/flag``, ordered by
        file name.
    """
    file_path = os.path.join(path_dir, prefix, flag)
    result = []
    # os.listdir() returns entries in arbitrary order; sort so that repeated
    # runs produce the same row order.
    for fi in sorted(os.listdir(file_path)):
        full_path = os.path.join(file_path, fi)
        if not os.path.isfile(full_path):
            continue  # skip stray sub-directories
        # Decode explicitly rather than relying on the platform default.
        # NOTE(review): assumes the review files are UTF-8 encoded -- confirm.
        with open(full_path, encoding='utf-8') as f:
            result.append(f.read())
    return result

def process_file(path_dir='aclImdb/', prefix='train'):
    """Collect the positive and negative reviews of one split into a CSV file.

    Reads the ``pos`` and ``neg`` sub-directories of ``<path_dir><prefix>``
    through `read_text` and writes ``<path_dir><prefix>.csv``: a header row
    ``text,flag`` followed by one row per review, with flag 1 for ``pos`` and
    0 for ``neg``. An existing output file is replaced.

    Parameters
    ----------
    path_dir : str
        Dataset root, including the trailing separator.
    prefix : str
        Name of the split; also used as the stem of the output file name.
    """
    output_file = '%s%s.csv' % (path_dir, prefix)

    # process pos and neg files
    pos_result = read_text(path_dir, prefix, 'pos')
    neg_result = read_text(path_dir, prefix, 'neg')

    # write output file
    # Mode 'w' truncates an existing file, so no separate delete is needed and
    # the previous output survives if reading the inputs above fails.
    # newline='' is what the csv module requires of files handed to a writer;
    # the context manager closes the file even if a write raises.
    with open(output_file, 'w', newline='', encoding='utf-8') as f:
        writer = csv.writer(f)
        writer.writerow(('text', 'flag'))
        writer.writerows((t, 1) for t in pos_result)
        writer.writerows((t, 0) for t in neg_result)

In [3]:
# Build one CSV per split (train first, then test) from the raw review files.
for _split in ('train', 'test'):
    process_file(path_dir='aclImdb/', prefix=_split)

In [4]:
# Load the train and test CSV files into DataFrames.
data, test = (pd.read_csv(csv_path) for csv_path in ('aclImdb/train.csv', 'aclImdb/test.csv'))

## 1. Use the library `spacy` to tokenize your data.

In [5]:
# borrowed from fast.ai (https://github.com/fastai/fastai/blob/master/fastai/nlp.py)

# HTML line-break tags such as <br>, <br/> or < BR />, matched case-insensitively.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)

def sub_br(x):
    """Return `x` with every HTML line-break tag replaced by a newline."""
    return re.sub(re_br, "\n", x)

# Only the tokenizer of this pipeline is used below. The 'en' shortcut name is
# rejected by spacy.load in spaCy v3, so fall back to a blank English pipeline
# when it cannot be loaded.
# NOTE(review): presumably the blank pipeline tokenizes like the packaged
# English model -- confirm if exact reproducibility of the results matters.
try:
    my_tok = spacy.load('en')
except OSError:
    my_tok = spacy.blank('en')

def spacy_tok(x):
    """Tokenize `x` with spaCy after turning HTML <br> tags into newlines.

    Returns the token texts as a list of strings.
    """
    return [tok.text for tok in my_tok.tokenizer(sub_br(x))]

##  2. Download embedding vectors.

## 3. Read the 300 dimensional GloVe embeddings into a dictionary.

In [6]:
def load_glove(filename):
    """
    Read the indicated GloVe text file and return a dictionary mapping
    word:vector where vectors are of numpy `array` type (float64).

    GloVe file lines are of the form:
    the 0.418 0.24968 -0.41242 0.1217 ...
    Each line is split on whitespace; the first element is the word and the
    remaining elements are the vector components. Vectors of any length are
    accepted, and blank lines are skipped.
    """
    embeddings = {}

    # Iterate over the file object instead of readlines() so the whole file is
    # never held in memory at once; `with` closes it even if parsing fails.
    # NOTE(review): assumes the vectors file is UTF-8 encoded -- confirm.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            split_line = line.split()
            if not split_line:
                continue  # blank line: nothing to index
            embeddings[split_line[0]] = np.array([float(v) for v in split_line[1:]])

    return embeddings

In [7]:
# Location of the GloVe vectors: taken from the GLOVE_PATH environment variable
# when it is set, otherwise the original machine-specific path is used.
glove_path = os.environ.get('GLOVE_PATH', '/Users/chuanxu/data/glove/glove.6B.300d.txt')
gloves = load_glove(glove_path)

## 4. Create average feature embedding for each sentence. You may want to ignore stopwords.

In [8]:
# modified from https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb

stops = set(stopwords.words("english"))
def get_non_stopwords(sentence):
    """Return the GloVe vectors of the usable tokens in `sentence`.

    The sentence is converted to str, lower-cased and tokenized with
    `spacy_tok`; tokens that are stop words or have no entry in `gloves` are
    dropped. The result is a numpy array with one row per kept token (an empty
    array when no token qualifies).
    """
    return np.array([gloves[x] for x in spacy_tok(str(sentence).lower()) if x not in stops and x in gloves])

def get_average_feature_embedding(sentence):
    """Return the element-wise mean of the GloVe vectors kept for `sentence`.

    When no token of the sentence is kept, a zero vector with the width of the
    loaded embeddings is returned.
    """
    vectors = get_non_stopwords(sentence)
    if vectors.size == 0:
        # np.mean over an empty array gives a scalar nan (plus a warning),
        # which cannot be stacked with the vectors of the other sentences.
        return np.zeros(len(next(iter(gloves.values()), ())))
    return np.mean(vectors, axis=0)

In [9]:
%%time
# Replace each raw review in the 'text' column with its averaged embedding vector.
# NOTE: 'text' is overwritten in place, so running this cell a second time feeds
# the vectors (not the original strings) back into get_average_feature_embedding;
# re-read the CSV files before re-running.
data['text'] = data['text'].apply(get_average_feature_embedding)
test['text'] = test['text'].apply(get_average_feature_embedding)

CPU times: user 4min 1s, sys: 444 ms, total: 4min 2s
Wall time: 4min 2s


### save data and test

In [10]:
# Persist both frames to the files 'train' and 'test' in the working directory.
for _frame, _target in ((data, 'train'), (test, 'test')):
    _frame.to_pickle(_target)

## 5. Fit an XGBoost classifier to this data. Report test and training errors.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

# Reload the frames saved to the files 'train' and 'test'.
data, test = pd.read_pickle('train'), pd.read_pickle('test')

In [2]:
def scores(bst, d, y, data_set):
    """Format the accuracy and log-loss of `bst` on one data set.

    `bst.predict(d)` is used as the predicted probabilities; a probability
    above 0.5 counts as class 1. `y` holds the true labels and `data_set` is
    the name that prefixes each of the two lines of the returned string.
    """
    probabilities = bst.predict(d)
    predicted_labels = [1 if p > 0.5 else 0 for p in probabilities]
    accuracy = accuracy_score(y, predicted_labels)
    loss = log_loss(y, probabilities)
    return '%s accuracy: %f\n%s logloss: %f' % (data_set, accuracy, data_set, loss)

In [3]:
# Features: the per-review vectors stored in 'text', gathered into one array
# per split. Labels: the 'flag' column.
X_data = np.array(list(data['text']))
y_data = data['flag'].values

X_test = np.array(list(test['text']))
y_test = test['flag'].values

In [4]:
# Booster parameters: logistic objective for the binary label, log-loss as the
# evaluation metric.
# NOTE(review): "silent" appears to be deprecated in favour of "verbosity" in
# newer xgboost releases -- confirm against the installed version.
xgb_pars = {"min_child_weight": 50, "eta": 0.05, "max_depth": 8,
            "subsample": 0.8, "silent" : 1, "nthread": 4,
            "eval_metric": "logloss", "objective": "binary:logistic"}

d_data = xgb.DMatrix(X_data, label=y_data)
d_test = xgb.DMatrix(X_test, label=y_test)

# NOTE(review): the test set is passed as the 'valid' entry that drives early
# stopping, so the test metrics reported later are not from fully held-out data.
watchlist = [(d_data, 'train'), (d_test, 'valid')]

# At most 400 boosting rounds, early stopping after 50 rounds without
# improvement, and an evaluation log line every 50 rounds.
bst = xgb.train(xgb_pars, d_data, 400, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.680421	valid-logloss:0.682339
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.422993	valid-logloss:0.487192
[100]	train-logloss:0.337639	valid-logloss:0.438117
[150]	train-logloss:0.290643	valid-logloss:0.4156
[200]	train-logloss:0.258256	valid-logloss:0.402363
[250]	train-logloss:0.235819	valid-logloss:0.39412
[300]	train-logloss:0.217094	valid-logloss:0.388836
[350]	train-logloss:0.201216	valid-logloss:0.385207
[399]	train-logloss:0.188329	valid-logloss:0.382689


In [5]:
# Report accuracy and log-loss on the training set, then on the test set.
print(scores(bst, d_data, y_data, 'train'), scores(bst, d_test, y_test, 'test'), sep='\n')

train accuracy: 0.958720
train logloss: 0.188329
test accuracy: 0.826120
test logloss: 0.382689


## 6. Compare previous results to fitting XGBoost to a one-hot encoding representation of the data with bag of words. Report test and training errors.

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss
import nltk
from nltk.corpus import stopwords

%matplotlib inline

In [2]:
# Load the raw-text train and test CSV files into DataFrames.
data, test = (pd.read_csv(csv_path) for csv_path in ('aclImdb/train.csv', 'aclImdb/test.csv'))

In [3]:
def scores(bst, d, y, data_set):
    """Format the accuracy and log-loss of `bst` on one data set.

    `bst.predict(d)` is used as the predicted probabilities; a probability
    above 0.5 counts as class 1. `y` holds the true labels and `data_set` is
    the name that prefixes each of the two lines of the returned string.
    """
    probabilities = bst.predict(d)
    predicted_labels = [1 if p > 0.5 else 0 for p in probabilities]
    accuracy = accuracy_score(y, predicted_labels)
    loss = log_loss(y, probabilities)
    return '%s accuracy: %f\n%s logloss: %f' % (data_set, accuracy, data_set, loss)

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# NLTK's English stop-word list.
stops = set(stopwords.words("english"))

# CountVectorizer documents `stop_words` as 'english', a list or None, so hand
# it a list rather than the set; sorting keeps the argument deterministic.
# binary=True records whether a word occurs rather than how often.
# The vocabulary is fitted on the training text only.
count_vect = CountVectorizer(stop_words=sorted(stops), lowercase=True, binary=True).fit(data['text'].tolist())

In [5]:
# Encode both splits with the fitted vectorizer and pull out the labels.
X_data, X_test = (count_vect.transform(frame['text'].tolist()) for frame in (data, test))
y_data, y_test = (frame['flag'].values for frame in (data, test))

In [6]:
# Booster parameters: logistic objective for the binary label, log-loss as the
# evaluation metric.
# NOTE(review): "silent" appears to be deprecated in favour of "verbosity" in
# newer xgboost releases -- confirm against the installed version.
xgb_pars = {"min_child_weight": 50, "eta": 0.05, "max_depth": 8,
            "subsample": 0.8, "silent" : 1, "nthread": 4,
            "eval_metric": "logloss", "objective": "binary:logistic"}

d_data = xgb.DMatrix(X_data, label=y_data)
d_test = xgb.DMatrix(X_test, label=y_test)

# NOTE(review): the test set is passed as the 'valid' entry that drives early
# stopping, so the test metrics reported later are not from fully held-out data.
watchlist = [(d_data, 'train'), (d_test, 'valid')]

# At most 400 boosting rounds, early stopping after 50 rounds without
# improvement, and an evaluation log line every 50 rounds.
bst = xgb.train(xgb_pars, d_data, 400, watchlist, early_stopping_rounds=50, verbose_eval=50)

[0]	train-logloss:0.681675	valid-logloss:0.681531
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.489659	valid-logloss:0.494371
[100]	train-logloss:0.428559	valid-logloss:0.439625
[150]	train-logloss:0.393221	valid-logloss:0.408737
[200]	train-logloss:0.368887	valid-logloss:0.388962
[250]	train-logloss:0.351054	valid-logloss:0.374743
[300]	train-logloss:0.337585	valid-logloss:0.364973
[350]	train-logloss:0.326808	valid-logloss:0.35713
[399]	train-logloss:0.317598	valid-logloss:0.351301


In [7]:
# Report accuracy and log-loss on the training set, then on the test set.
print(scores(bst, d_data, y_data, 'train'), scores(bst, d_test, y_test, 'test'), sep='\n')

train accuracy: 0.873440
train logloss: 0.317598
test accuracy: 0.849600
test logloss: 0.351301
