In [2]:
import numpy as np 
import pandas as pd 
import os
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Read the data into train and test sets

In [3]:
# Root directory of the extracted aclImdb dataset; the 'train' and 'test'
# split folders are read from underneath it.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
path = "/Users/Yimei/Desktop/spring1/ml2/data/aclImdb/"
# Class sub-folder names; a folder's position in this list becomes its integer label.
names = ['neg','pos']

In [6]:
def texts_from_folders(src, names):
    """Load every file under ``src/<name>`` as one labelled text.

    Args:
        src: directory containing one sub-folder per class.
        names: sequence of sub-folder names; the index of a name in this
            sequence is the integer label given to every file in that folder.

    Returns:
        ``(texts, labels)`` where ``texts`` is a list holding the full contents
        of each file and ``labels`` is a NumPy array with the matching class
        indices. Folders are visited in the order given by ``names`` and files
        in sorted filename order.
    """
    texts, labels = [], []
    for idx, name in enumerate(names):
        # Named `folder` (not `path`) so the notebook-level `path` is not shadowed.
        folder = os.path.join(src, name)
        for fname in sorted(os.listdir(folder)):
            # Context manager closes each handle immediately; the bare
            # open(...).read() left thousands of files for the GC to close.
            with open(os.path.join(folder, fname)) as fh:
                texts.append(fh.read())
            labels.append(idx)
    return texts, np.array(labels)

In [7]:
# Load the labelled reviews of each split: texts as a list, labels as an array
# (0 for 'neg', 1 for 'pos', following the order of `names`).
trn,trn_y = texts_from_folders(path+'train',names)
val,val_y = texts_from_folders(path+'test',names)

In [8]:
# Sanity check: container type and the first training review.
type(trn), trn[0]

(list,
 "Story of a man who has unnatural feelings for a pig. Starts out with a opening scene that is a terrific example of absurd comedy. A formal orchestra audience is turned into an insane, violent mob by the crazy chantings of it's singers. Unfortunately it stays absurd the WHOLE time with no general narrative eventually making it just too off putting. Even those from the era should be turned off. The cryptic dialogue would make Shakespeare seem easy to a third grader. On a technical level it's better than you might think with some good cinematography by future great Vilmos Zsigmond. Future stars Sally Kirkland and Frederic Forrest can be seen briefly.")

## Tokenizing with spaCy

In [9]:
import spacy
import string
import re
from spacy.symbols import ORTH
from __future__ import unicode_literals

In [46]:
# Matches HTML line-break tags in any casing, with optional inner spaces and slash.
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return ``x`` with every ``<br>`` tag variant replaced by a newline."""
    return re.sub(re_br, "\n", x)

# Load the English spaCy pipeline once; only its tokenizer is used below.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize ``x`` with spaCy after turning ``<br>`` tags into newlines.

    Returns the token strings as a list.
    """
    pieces = []
    for token in my_tok.tokenizer(sub_br(x)):
        pieces.append(token.text)
    return pieces

In [47]:
# Smoke test of the tokenizer on the first three (lower-cased) training reviews.
trn_trial = [spacy_tok(w.lower()) for w in trn[:3]]

## Creating embedding features - average embedding feature for each review

The pre-trained word vectors come from the GloVe project: https://nlp.stanford.edu/projects/glove/

In [31]:
# Text file with the pre-trained GloVe vectors (the file name indicates the 6B, 300-d set).
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
globe_path = '/Users/Yimei/data/glove/glove.6B/glove.6B.300d.txt'

In [32]:
def load_word_embedings(file=None):
    """Read GloVe-style text embeddings into a ``{token: vector}`` dict.

    Every usable line holds a token followed by its whitespace-separated
    vector components.

    Args:
        file: path to the embeddings text file. When omitted, the
            notebook-level ``globe_path`` is used; it is looked up at call
            time, so re-running the path cell takes effect without having to
            redefine this function.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.
    """
    import io  # local import keeps this cell self-contained

    if file is None:
        file = globe_path
    embeddings = {}
    # GloVe files are UTF-8; decode explicitly instead of relying on the
    # platform default encoding (io.open behaves the same on Python 2 and 3).
    with io.open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            values = line.split()
            if len(values) < 2:
                # Blank line or token without a vector: nothing to store
                # (previously a blank line raised IndexError).
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

In [33]:
# Parse the GloVe file at globe_path into an in-memory {token: vector} dict.
embeddings = load_word_embedings()

In [34]:
# Number of tokens in the loaded embedding table.
len(embeddings)

400000

In [35]:
# Download NLTK's 'stopwords' corpus, then keep the English stop words as a set
# (get_non_stopwords tests membership against it).
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stops = set(stopwords.words("english"))

[nltk_data] Downloading package stopwords to /Users/Yimei/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [50]:
# modified from https://www.kaggle.com/anokas/data-analysis-xgboost-starter-0-35460-lb
def get_non_stopwords(s):
    """Return the distinct tokens of lower-cased ``s`` that are not stop words.

    The tokens are collected as dict keys, so each one appears only once; the
    dict's keys are what is returned.
    """
    tokens = spacy_tok(s.lower())
    kept = dict.fromkeys((tok for tok in tokens if tok not in stops), 1)
    return kept.keys()
    

In [51]:
def sentence_features(s, embeddings=embeddings, emb_size=300):
    """Average the embedding vectors of the usable words in ``s``.

    A word is usable when it is returned by ``get_non_stopwords``, is purely
    alphabetic and has an entry in ``embeddings``.

    Args:
        s: raw review text.
        embeddings: mapping from word to vector.
        emb_size: length of the zero vector returned when no word is usable.

    Returns:
        1-D NumPy array: the element-wise mean of the selected vectors, or
        ``emb_size`` zeros when nothing was selected.
    """
    vectors = [
        embeddings[word]
        for word in get_non_stopwords(s)
        if word.isalpha() and word in embeddings
    ]
    if not vectors:
        # No usable word: fall back to an all-zero feature vector.
        return np.zeros(emb_size)
    return np.array(vectors).mean(axis=0)

In [53]:
# One averaged-embedding feature row per training review.
x_trn = np.array([sentence_features(x) for x in trn])

In [54]:
# One averaged-embedding feature row per review of the held-out ('test') split.
x_val = np.array([sentence_features(x) for x in val])

In [58]:
# Shape check: (number of training reviews, embedding size).
x_trn.shape

(25000, 300)

## Fit XGBoost with average feature embedding 

In [10]:
import xgboost as xgb

In [75]:
# Booster settings: binary logistic objective, evaluated with log loss.
xgb_pars = {
    "min_child_weight": 50,
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "silent": 1,
    "nthread": 4,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
}

# Wrap the averaged-embedding features and their labels in XGBoost's DMatrix.
d_train = xgb.DMatrix(x_trn, label=trn_y)
d_val = xgb.DMatrix(x_val, label=val_y)

# Sets evaluated during training; 'valid' is the one used for early stopping.
# NOTE(review): 'valid' is the IMDB test split, so the reported validation
# loss is also the quantity early stopping is tuned on.
watchlist = [(d_train, 'train'), (d_val, 'valid')]

bst = xgb.train(
    xgb_pars,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)

[0]	train-logloss:0.679729	valid-logloss:0.681471
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.412146	valid-logloss:0.472889
[100]	train-logloss:0.327442	valid-logloss:0.423206
[150]	train-logloss:0.282454	valid-logloss:0.40227
[200]	train-logloss:0.25224	valid-logloss:0.391278
[250]	train-logloss:0.230545	valid-logloss:0.38444
[300]	train-logloss:0.212875	valid-logloss:0.380176
[350]	train-logloss:0.198953	valid-logloss:0.377312


In [67]:
# Model predictions on both splits; the following cells turn them into the
# training and validation log loss.
from sklearn.metrics import log_loss
prob_trn = bst.predict(d_train)
prob_val = bst.predict(d_val)

In [69]:
# Log loss of the predictions on the training split.
log_loss_trn = log_loss(trn_y, prob_trn)

In [70]:
# Log loss of the predictions on the held-out ('test') split.
log_loss_val = log_loss(val_y, prob_val)

In [73]:
# Report both losses. A single-argument print(...) call produces the same line
# on Python 2 and Python 3, whereas the print statement is a SyntaxError on 3.
print("log_loss_trn: {} ; log_loss_val: {}".format(log_loss_trn, log_loss_val))

log_loss_trn: 0.186594901618 ; log_loss_val: 0.37474418679


## Encoding with one-hot-encoded bag of words

In [11]:
from sklearn.feature_extraction.text import CountVectorizer

In [12]:
# Bag-of-words vectorizer with scikit-learn's default settings.
veczr = CountVectorizer()

In [13]:
# Learn the vocabulary from the training reviews and build their term-count matrix.
trn_term_doc = veczr.fit_transform(trn)
# transform() reuses the vocabulary already fitted into veczr, so the
# validation matrix gets the same columns as trn_term_doc.
val_term_doc = veczr.transform(val)

In [14]:
# Summary of the training term-count matrix (shape, dtype, stored entries).
trn_term_doc

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3445861 stored elements in Compressed Sparse Row format>

In [15]:
# Summary of the validation term-count matrix (shape, dtype, stored entries).
val_term_doc

<25000x74849 sparse matrix of type '<class 'numpy.int64'>'
	with 3339346 stored elements in Compressed Sparse Row format>

In [9]:
# Convert the sparse term-count matrix to a sparse-backed pandas DataFrame.
# pd.SparseDataFrame was removed in pandas 1.0, so use the DataFrame.sparse
# accessor when this pandas version has it and fall back to the legacy class
# otherwise. With the accessor, absent entries are stored as 0 rather than NaN.
if hasattr(pd.DataFrame, "sparse"):
    train_df = pd.DataFrame.sparse.from_spmatrix(trn_term_doc)
else:
    train_df = pd.SparseDataFrame(trn_term_doc)

In [10]:
# Replace missing entries (NaN) of the training frame with 0.
train_df_filled_na = train_df.fillna(0)

In [11]:
# Same conversion for the validation matrix. pd.SparseDataFrame was removed in
# pandas 1.0, so prefer the DataFrame.sparse accessor when it is available.
if hasattr(pd.DataFrame, "sparse"):
    val_df = pd.DataFrame.sparse.from_spmatrix(val_term_doc)
else:
    val_df = pd.SparseDataFrame(val_term_doc)
# Replace missing entries (NaN) with 0; a no-op when the accessor path already
# stored absent entries as 0.
val_df_filled_na = val_df.fillna(0)

In [12]:
# Vocabulary learned by the vectorizer, in column order.
# get_feature_names() was removed in scikit-learn 1.2 in favour of
# get_feature_names_out(); support both so the cell runs on old and new versions.
if hasattr(veczr, "get_feature_names_out"):
    vocab = veczr.get_feature_names_out().tolist()
else:
    vocab = veczr.get_feature_names()
# Peek at a few entries.
vocab[5000:5005]

[u'augustine', u'augusto', u'augustus', u'auh', u'auie']

## Fit XGBoost with bag of words

In [16]:
# Booster settings for the bag-of-words model: binary logistic objective,
# evaluated with log loss (no explicit nthread here).
xgb_pars = {
    "min_child_weight": 50,
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "silent": 1,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
}

# XGBoost consumes the sparse term-count matrices directly.
d_train = xgb.DMatrix(trn_term_doc, label=trn_y)
d_val = xgb.DMatrix(val_term_doc, label=val_y)

# Sets evaluated during training, reported as 'train' and 'valid'.
watchlist = [(d_train, 'train'), (d_val, 'valid')]


In [17]:
# Train for up to 400 rounds, logging every 50 and stopping early once the
# 'valid' log loss has not improved for 50 rounds.
bst = xgb.train(
    xgb_pars,
    d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)

[0]	train-logloss:0.681085	valid-logloss:0.681127
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.479689	valid-logloss:0.489127
[100]	train-logloss:0.413813	valid-logloss:0.431985
[150]	train-logloss:0.374572	valid-logloss:0.400454
[200]	train-logloss:0.347537	valid-logloss:0.380255
[250]	train-logloss:0.327139	valid-logloss:0.365904
[300]	train-logloss:0.311089	valid-logloss:0.355359
[350]	train-logloss:0.297842	valid-logloss:0.347335
[399]	train-logloss:0.286724	valid-logloss:0.341357


In [18]:
# Predictions of the bag-of-words booster on both splits.
prob_trn = bst.predict(d_train)
prob_val = bst.predict(d_val)

In [20]:
# log_loss is also imported in an earlier cell; importing it again here lets
# this section run without the embedding section.
from sklearn.metrics import log_loss
# Log loss of the bag-of-words model on the training and held-out splits.
log_loss_trn = log_loss(trn_y, prob_trn)
log_loss_val = log_loss(val_y, prob_val)

In [22]:
# Report both losses. A single-argument print(...) call produces the same line
# on Python 2 and Python 3, whereas the print statement is a SyntaxError on 3.
print("log_loss_trn: {} ; log_loss_val: {}".format(log_loss_trn, log_loss_val))

log_loss_trn: 0.286724204693 ; log_loss_val: 0.341357342403
