In [1]:
import numpy as np
import pandas as pd
from glob import glob

In [2]:
# Dataset root directory; the training reviews are collected per label
# from the train/pos and train/neg sub-folders.
path = './aclImdb'
pos_files = glob(f'{path}/train/pos/*.txt')
neg_files = glob(f'{path}/train/neg/*.txt')

In [3]:
# Same layout as the training files, but under the test/ sub-folder.
pos_files_test = glob(f'{path}/test/pos/*.txt')
neg_files_test = glob(f'{path}/test/neg/*.txt')

In [4]:
# Load every positive training review into one labelled frame (is_positive = 1).
pos_list = []
for file in pos_files:
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the review files are UTF-8 -- TODO confirm).
    with open(file, encoding='utf-8') as f:
        # read() keeps each file as exactly one row, even if the text contains
        # line breaks; readlines() would create one row per line.
        pos_list.append(pd.DataFrame({'comment': [f.read()], 'is_positive': 1}))
# ignore_index gives unique row labels 0..n-1 instead of a repeated 0.
pos_df = pd.concat(pos_list, ignore_index=True)

In [5]:
# Load every negative training review into one labelled frame (is_positive = 0).
neg_list = []
for file in neg_files:
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the review files are UTF-8 -- TODO confirm).
    with open(file, encoding='utf-8') as f:
        # read() keeps each file as exactly one row, even if the text contains
        # line breaks; readlines() would create one row per line.
        neg_list.append(pd.DataFrame({'comment': [f.read()], 'is_positive': 0}))
# ignore_index gives unique row labels 0..n-1 instead of a repeated 0.
neg_df = pd.concat(neg_list, ignore_index=True)

In [6]:
# Load every positive test review into one labelled frame (is_positive = 1).
pos_list_test = []
for file in pos_files_test:
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the review files are UTF-8 -- TODO confirm).
    with open(file, encoding='utf-8') as f:
        # read() keeps each file as exactly one row, even if the text contains
        # line breaks; readlines() would create one row per line.
        pos_list_test.append(pd.DataFrame({'comment': [f.read()], 'is_positive': 1}))
# ignore_index gives unique row labels 0..n-1 instead of a repeated 0.
pos_df_test = pd.concat(pos_list_test, ignore_index=True)

In [7]:
# Load every negative test review into one labelled frame (is_positive = 0).
neg_list_test = []
for file in neg_files_test:
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the review files are UTF-8 -- TODO confirm).
    with open(file, encoding='utf-8') as f:
        # read() keeps each file as exactly one row, even if the text contains
        # line breaks; readlines() would create one row per line.
        neg_list_test.append(pd.DataFrame({'comment': [f.read()], 'is_positive': 0}))
# ignore_index gives unique row labels 0..n-1 instead of a repeated 0.
neg_df_test = pd.concat(neg_list_test, ignore_index=True)

In [8]:
# Stack the positive rows on top of the negative rows and renumber the
# result 0..n-1, discarding the per-frame row labels.
data_df = pd.concat([pos_df, neg_df]).reset_index(drop=True)
data_df.head()

Unnamed: 0,comment,is_positive
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [9]:
# Raw training texts and labels as NumPy arrays.
# Taking the whole column (instead of indexing range(25000)) keeps this cell
# correct for any number of loaded reviews.
# NOTE(review): np.array over Python strings yields a fixed-width unicode
# array sized by the longest comment, which can be memory-hungry.
data_X = np.array(data_df['comment'].tolist())
data_y = np.array(data_df['is_positive'])

In [10]:
# Stack the positive test rows on top of the negative ones and renumber the
# result 0..n-1, discarding the per-frame row labels.
test_df = pd.concat([pos_df_test, neg_df_test]).reset_index(drop=True)
test_df.head()

Unnamed: 0,comment,is_positive
0,I went and saw this movie last night after bei...,1
1,Actor turned director Bill Paxton follows up h...,1
2,As a recreational golfer with some knowledge o...,1
3,"I saw this film in a sneak preview, and it is ...",1
4,Bill Paxton has taken the true story of the 19...,1


In [11]:
# Raw test texts and labels as NumPy arrays.
# Taking the whole column (instead of indexing range(25000)) keeps this cell
# correct for any number of loaded reviews.
# NOTE(review): np.array over Python strings yields a fixed-width unicode
# array sized by the longest comment, which can be memory-hungry.
test_X = np.array(test_df['comment'].tolist())
test_y = np.array(test_df['is_positive'])

## Tokenizing

In [61]:
#!python3 -m spacy download en

In [14]:
import spacy
import string
import re
from spacy.symbols import ORTH

In [15]:
# Matches HTML line-break tags such as <br>, <br/>, <BR /> (case-insensitive).
re_br = re.compile(r'<\s*br\s*/?>', re.IGNORECASE)


def sub_br(x):
    """Return ``x`` with every HTML line-break tag replaced by a newline."""
    return re.sub(re_br, "\n", x)

# spaCy pipeline object; only its tokenizer is used below.
my_tok = spacy.load('en')


def spacy_tok(x):
    """Tokenize ``x`` and return the token texts as a list of strings.

    HTML line-break tags are first turned into newlines via ``sub_br``.
    """
    cleaned = sub_br(x)
    return [token.text for token in my_tok.tokenizer(cleaned)]

In [16]:
# Download the NLTK stop-word corpus (status is printed), then collect the
# English stop words into a set for fast membership checks.
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

stops = set()
stops.update(stopwords.words("english"))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/libingyi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [17]:
def get_non_stopwords(comment):
    """Return the distinct non-stopword tokens of ``comment`` as a list.

    ``comment`` is converted with ``str()`` and lower-cased, tokenized with
    ``spacy_tok``, and every token found in ``stops`` is dropped. Each
    remaining token appears once, in order of first occurrence.

    Returns a real ``list`` (as the summary line promises) rather than a
    dict keys view, so the result can be indexed, stored and pickled.
    """
    tokens = spacy_tok(str(comment).lower())
    # dict.fromkeys de-duplicates while preserving first-occurrence order.
    return list(dict.fromkeys(tok for tok in tokens if tok not in stops))

In [137]:
#train_df['comment'] = train_df['comment'].apply(get_non_stopwords)

## Embedding

In [18]:
# Path of the embeddings text file; used as the default `file` argument of
# load_word_embedings.
emb_path = "./glove.6B/glove.6B.300d.txt"

In [19]:
def load_word_embedings(file=emb_path):
    """Parse a whitespace-separated embeddings text file into a dict.

    Each non-blank line is expected to hold a token followed by its vector
    components. Returns ``{token: np.ndarray(dtype=float32)}``.

    Parameters
    ----------
    file : str
        Path of the embeddings file (defaults to ``emb_path``).

    Raises
    ------
    ValueError
        If a vector component cannot be converted to float32.
    """
    embeddings = {}
    # Decode explicitly instead of relying on the platform default encoding
    # (assumes the file is UTF-8 -- TODO confirm).
    with open(file, 'r', encoding='utf-8') as infile:
        for line in infile:
            values = line.split()
            if not values:
                # A blank line (e.g. a trailing newline) carries no token.
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

In [20]:
# Read the embeddings file at the default path into a {token: vector} dict.
embeddings = load_word_embedings()

In [24]:
def sentence_features_v2(s, embeddings=embeddings, emb_size=300):
    """Return the mean embedding vector of the usable tokens in ``s``.

    Tokens come from ``get_non_stopwords(s)``; only purely alphabetic tokens
    that have an entry in ``embeddings`` are averaged. When no token
    qualifies, a zero vector of length ``emb_size`` is returned instead.
    """
    vectors = [
        embeddings[token]
        for token in get_non_stopwords(s)
        if token.isalpha() and token in embeddings
    ]
    if not vectors:
        return np.zeros(emb_size)
    return np.array(vectors).mean(axis=0)

In [25]:
# One averaged embedding vector per training comment, stored as a new column.
data_df['emb_comment'] = data_df['comment'].map(sentence_features_v2)
data_df.head()

Unnamed: 0,comment,is_positive,emb_comment
0,Bromwell High is a cartoon comedy. It ran at t...,1,"[-0.08725257, 0.055149537, 0.023050005, 0.0222..."
1,Homelessness (or Houselessness as George Carli...,1,"[-0.034318715, 0.029895563, 0.015404149, -0.04..."
2,Brilliant over-acting by Lesley Ann Warren. Be...,1,"[-0.06797261, 0.06756984, -0.0057137716, -0.00..."
3,This is easily the most underrated film inn th...,1,"[-0.021300474, 0.01583602, 0.03760675, -0.0153..."
4,This is not the typical Mel Brooks film. It wa...,1,"[-0.112299, 0.07660703, -0.006365776, -0.08404..."


In [26]:
# One averaged embedding vector per test comment, stored as a new column.
test_df['emb_comment'] = test_df['comment'].map(sentence_features_v2)

In [27]:
# Per-row split flag: 1 with probability 0.8, 0 with probability 0.2.
# A dedicated seeded generator makes the assignment identical on every run,
# so downstream metrics are reproducible.
index = np.random.RandomState(42).choice(np.arange(0, 2), size=len(data_df), p=[0.2, 0.8])

In [28]:
# Stack the per-comment embedding vectors into 2-D feature matrices.
# Taking the whole column (instead of indexing range(25000)) keeps this cell
# correct for any number of loaded reviews.
data_X_E = np.array(data_df['emb_comment'].tolist())
test_X_E = np.array(test_df['emb_comment'].tolist())

# Matching label vectors.
data_y_E = data_df['is_positive'].values
test_y_E = test_df['is_positive'].values

## XGBoost on embedding features

In [29]:
import xgboost as xgb

In [30]:
# Booster configuration for binary classification, evaluated with log-loss.
xgb_pars = {
    "min_child_weight": 50,
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "silent": 1,
    "nthread": 4,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
}

# Rows flagged index == 1 are fitted on; rows flagged index == 0 are held out
# for validation.
d_train = xgb.DMatrix(data=data_X_E[index == 1], label=data_y_E[index == 1])
d_val = xgb.DMatrix(data=data_X_E[index == 0], label=data_y_E[index == 0])

# Both sets are reported during training, with 'valid' listed last.
watchlist = [(d_train, 'train'), (d_val, 'valid')]

bst = xgb.train(
    params=xgb_pars,
    dtrain=d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)

[0]	train-logloss:0.679883	valid-logloss:0.682
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.412888	valid-logloss:0.476838
[100]	train-logloss:0.328801	valid-logloss:0.427204
[150]	train-logloss:0.28219	valid-logloss:0.404869
[200]	train-logloss:0.252317	valid-logloss:0.39348
[250]	train-logloss:0.229782	valid-logloss:0.385967
[300]	train-logloss:0.211535	valid-logloss:0.380778
[350]	train-logloss:0.196332	valid-logloss:0.376527
[399]	train-logloss:0.183322	valid-logloss:0.374052


In [40]:
# Round predicted probabilities to hard 0/1 labels and report the share of
# mismatches.
# NOTE(review): data_X_E holds every row, including those with index == 0
# that were used as the validation set.
pred_train = np.rint(bst.predict(xgb.DMatrix(data_X_E)))
print('training error:', (pred_train != data_y_E).mean())

training error: 0.06704


In [41]:
# Round predicted probabilities to hard 0/1 labels and report the share of
# test rows that are misclassified.
pred_test = np.rint(bst.predict(xgb.DMatrix(test_X_E)))
print('test error:', (pred_test != test_y_E).mean())

test error: 0.17168


## XGBoost on bag-of-words count features

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

In [45]:
# Fit the vocabulary on the training comments only, then reuse it to encode
# the test comments as token-count vectors.
count = CountVectorizer()
data_X_bag = count.fit_transform(data_df['comment'].tolist())
test_X_bag = count.transform(test_df['comment'].tolist())

# Matching label vectors.
data_y_bag = np.array(data_df['is_positive'])
test_y_bag = np.array(test_df['is_positive'])

In [77]:
# Booster configuration for binary classification, evaluated with log-loss.
xgb_pars = {
    "min_child_weight": 50,
    "eta": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "silent": 1,
    "nthread": 4,
    "eval_metric": "logloss",
    "objective": "binary:logistic",
}

# Rows flagged index == 1 are fitted on; rows flagged index == 0 are held out
# for validation.
d_train = xgb.DMatrix(data=data_X_bag[index == 1], label=data_y_bag[index == 1])
d_val = xgb.DMatrix(data=data_X_bag[index == 0], label=data_y_bag[index == 0])

# Both sets are reported during training, with 'valid' listed last.
watchlist = [(d_train, 'train'), (d_val, 'valid')]

bst = xgb.train(
    params=xgb_pars,
    dtrain=d_train,
    num_boost_round=400,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=50,
)

[0]	train-logloss:0.681197	valid-logloss:0.68145
Multiple eval metrics have been passed: 'valid-logloss' will be used for early stopping.

Will train until valid-logloss hasn't improved in 50 rounds.
[50]	train-logloss:0.481588	valid-logloss:0.486601
[100]	train-logloss:0.415469	valid-logloss:0.428351
[150]	train-logloss:0.377008	valid-logloss:0.39646
[200]	train-logloss:0.34992	valid-logloss:0.375758
[250]	train-logloss:0.330063	valid-logloss:0.362617
[300]	train-logloss:0.314415	valid-logloss:0.353027
[350]	train-logloss:0.300773	valid-logloss:0.345136
[399]	train-logloss:0.289447	valid-logloss:0.338873


In [78]:
# Round predicted probabilities to hard 0/1 labels and report the share of
# mismatches.
# NOTE(review): data_X_bag holds every row, including those with index == 0
# that were used as the validation set.
pred_train = np.rint(bst.predict(xgb.DMatrix(data_X_bag)))
print('training error:', (pred_train != data_y_bag).mean())

training error: 0.11364


In [79]:
# Round predicted probabilities to hard 0/1 labels and report the share of
# test rows that are misclassified.
pred_test = np.rint(bst.predict(xgb.DMatrix(test_X_bag)))
print('test error:', (pred_test != test_y_bag).mean())

test error: 0.14976
