In [185]:
import numpy as np
import pandas as pd
import gensim
from tqdm import tqdm
from BeautifulSoup import BeautifulSoup
import gc
import scipy

# Models For Experimentation
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, LinearSVR
from sklearn.naive_bayes import MultinomialNB, GaussianNB
import lightgbm as lgb 
import xgboost as xgb

# For Model Assessment
from sklearn.metrics import classification_report, accuracy_score, auc

# Data Split
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV

# List of English StopWords
from nltk.corpus import stopwords
stopwords = list(set(stopwords.words('english')))

In [186]:
dtypes = {
    'review'      : np.unicode ,
    'cleaned_review': np.unicode ,
    'tokens': np.unicode ,
    'sentiment' : np.int16
}
data_snt = pd.read_csv("cleaned_data/cleaned_training_reviews.csv", usecols=['review','sentiment','cleaned_review','tokens'],
                                                    encoding="utf-8", dtype=dtypes)
data_snt = data_snt.loc[np.random.permutation(data_snt.index)]

In [187]:
train, valid, train_l, valid_l = train_test_split(data_snt['cleaned_review'],
                                                          data_snt['sentiment'], test_size=0.2, random_state=42)

In [86]:
import re, string
re_tok = re.compile(u'([{string.punctuation}“”¨«»®´·º½¾¿¡§£₤‘’])')

def tokenize(s):
    return re_tok.sub(r' \1 ', s).split()

from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

# Try Pretrained Word2Vec Model

## Let's start by Glove Model

In [28]:
# load the GloVe vectors in a dictionary:
embeddings_index_glove = {}
f = open('glove.840B.300d.txt')
for line in tqdm(f):
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index_glove[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_glove))

2196017it [02:02, 17954.72it/s]

Found 2196016 word vectors.





In [155]:
# this function creates a normalized vector for the whole sentence
def sent2vec(s, embeddings_index):
    words = str(s).lower().decode('utf-8')
    words = tokenize(words)
    words = [w for w in words if not w in stop_words]
    words = [w for w in words if w.isalpha()]
    M = []
    for w in words:
        try:
            M.append(embeddings_index[w])
        except:
            continue
    M = np.array(M)
    v = M.sum(axis=0)
    if type(v) != np.ndarray:
        return np.zeros(300)
    return v / np.sqrt((v ** 2).sum())

In [188]:
# create sentence vectors using the above function for training and validation set
xtrain_glove = [sent2vec(x, embeddings_index_glove) for x in tqdm(train)]
xvalid_glove = [sent2vec(x, embeddings_index_glove) for x in tqdm(valid)]

xtrain_glove = np.array(xtrain_glove)
xvalid_glove = np.array(xvalid_glove)

100%|██████████| 19923/19923 [00:21<00:00, 938.13it/s]
100%|██████████| 4981/4981 [00:04<00:00, 1170.71it/s]


After generating the features its time to build the model

In [190]:
log_reg = LogisticRegression(C= 1, class_weight='balanced', solver='sag', n_jobs=-1, random_state=42) 
log_reg.fit(xtrain_glove, train_l)
y_pred_log = log_reg.predict(xvalid_glove)
report_log = classification_report(valid_l, y_pred_log)

In [191]:
print report_log

             precision    recall  f1-score   support

          0       0.64      0.61      0.63      2493
          1       0.63      0.66      0.64      2488

avg / total       0.64      0.64      0.64      4981



In [192]:
sub_sg = SGDClassifier(alpha= 1e-05, max_iter= 10000, loss='log', penalty= 'l2', n_jobs=-1, random_state=42)
sub_sg.fit(xtrain_glove, train_l)
y_pred = sub_sg.predict(xvalid_glove)
report = classification_report(valid_l, y_pred)

In [193]:
print(report)

             precision    recall  f1-score   support

          0       0.67      0.64      0.66      2493
          1       0.66      0.68      0.67      2488

avg / total       0.66      0.66      0.66      4981



In [92]:
data_test = pd.read_csv("cleaned_data/cleaned_testing_reviews.csv", usecols=['review','cleaned_review','tokens'],
                                                    encoding="utf-8", dtype=dtypes)

In [93]:
xtest_glove = [sent2vec(x, embeddings_index_glove) for x in tqdm(data_test['cleaned_review'])]
xtest_glove = np.array(xtest_glove)

100%|██████████| 25000/25000 [00:19<00:00, 1284.59it/s]


In [94]:
predictions = sub_sg.predict(xtest_glove)

In [95]:
subm = pd.read_csv('data/sampleSubmission.csv')    
submid = pd.DataFrame({'id': subm["id"], 'sentiment': predictions})
submid.to_csv('submissions/sample_submission_Logsgd_glove.csv', index=False)

    ### I got AUC curve of 0.65272 

## Then the Google Word2Vec Model

- First we 'll load the binary model of Googlenews Word2vec model then we will save the wors and its word vectors in a txt file

In [6]:
from gensim.models.keyedvectors import KeyedVectors

model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)
model.save_word2vec_format('GoogleNews-vectors-negative300.txt', binary=False)

In [37]:
# load the GloVe vectors in a dictionary:
embeddings_index_google = {}
f = open('GoogleNews-vectors-negative300.txt')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index_google[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index_google))

Found 3000001 word vectors.


In [189]:
# create sentence vectors using the above function for training and validation set
xtrain_google = [sent2vec(x, embeddings_index_google) for x in tqdm(train)]
xvalid_google = [sent2vec(x, embeddings_index_google) for x in tqdm(valid)]

xtrain_google = np.array(xtrain_google)
xvalid_google = np.array(xvalid_google)

100%|██████████| 19923/19923 [00:17<00:00, 1119.97it/s]
100%|██████████| 4981/4981 [00:03<00:00, 1295.96it/s]


In [97]:
xtest_google = [sent2vec(x, embeddings_index_google) for x in tqdm(data_test['cleaned_review'])]
xtest_google = np.array(xtest_google)

100%|██████████| 25000/25000 [00:19<00:00, 1296.36it/s]


In [98]:
log_reg = LogisticRegression(C= 1, class_weight='balanced', solver='sag', n_jobs=-1, random_state=42) 
log_reg.fit(xtrain_google, train_l)
y_pred_log = log_reg.predict(xvalid_google)
report_log = classification_report(valid_l, y_pred_log)

In [99]:
print report_log

             precision    recall  f1-score   support

          0       0.64      0.62      0.63      2443
          1       0.65      0.67      0.66      2538

avg / total       0.64      0.64      0.64      4981



In [100]:
sub_sg = SGDClassifier(alpha= 1e-05, max_iter= 10000, loss='log', penalty= 'l2', n_jobs=-1, random_state=42)
sub_sg.fit(xtrain_google, train_l)
y_pred = sub_sg.predict(xvalid_google)
report = classification_report(valid_l, y_pred)

In [101]:
print(report)

             precision    recall  f1-score   support

          0       0.67      0.64      0.65      2443
          1       0.67      0.70      0.68      2538

avg / total       0.67      0.67      0.67      4981



In [102]:
predictions_google = sub_sg.predict(xtest_google)

In [103]:
subm = pd.read_csv('data/sampleSubmission.csv')    
submid = pd.DataFrame({'id': subm["id"], 'sentiment': predictions_google})
submid.to_csv('submissions/sample_submission_Logsgd_google.csv', index=False)

    ### I got AUC curve of 0.50000 

## Note

definelty we can do better by doing some hyperparameters Grid Search Optimization




# I will try to add some engineered features and see how things will turn out


In [194]:
train_df = pd.DataFrame(train)
valid_df = pd.DataFrame(valid)

In [195]:
data = [train_df, valid_df, data_test]
for element in data:
    element['total_length'] = element['cleaned_review'].apply(len)
    element['capitals'] = element['cleaned_review'].apply(lambda comment: sum(1 for c in comment if c.isupper()))
    element['caps_vs_length'] = element.apply(lambda row: float(row['capitals'])/float(row['total_length']), axis=1)
    element['num_exclamation_marks'] = element['cleaned_review'].apply(lambda comment: comment.count('!'))
    element['num_question_marks'] = element['cleaned_review'].apply(lambda comment: comment.count('?'))
    element['num_punctuation'] = element['cleaned_review'].apply(lambda comment: sum(comment.count(w) for w in '.,;:'))
    element['num_symbols'] = element['cleaned_review'].apply(lambda comment: sum(comment.count(w) for w in '*&$%'))
    element['num_words'] = element['cleaned_review'].apply(lambda comment: len(comment.split()))
    element['num_unique_words'] = element['cleaned_review'].apply(lambda comment: len(set(w for w in comment.split())))
    element['words_vs_unique'] = element['num_unique_words'] / element['num_words']
    element['num_smilies'] = element['cleaned_review'].apply(lambda comment: sum(comment.count(w) for w in (':-)', ':)', ';-)', ';)')))
    
col = ['total_length', 'capitals', 'caps_vs_length',
       'num_exclamation_marks', 'num_question_marks', 'num_punctuation',
       'num_symbols', 'num_words', 'num_unique_words', 'words_vs_unique',
       'num_smilies']


In [196]:
train = scipy.sparse.csr_matrix(train_df[col].values)
valid = scipy.sparse.csr_matrix(valid_df[col].values)
test = scipy.sparse.csr_matrix(data_test[col].values)

reviews_train = scipy.sparse.hstack([train,xtrain_glove])
reviews_valid = scipy.sparse.hstack([valid, xvalid_glove])
reviews_test = scipy.sparse.hstack([test, xtest_glove])

In [197]:
log_reg = LogisticRegression(C=0.1, class_weight='balanced', solver='sag', n_jobs=-1, random_state=42) 
log_reg.fit(reviews_train, train_l)
y_pred_log = log_reg.predict(reviews_valid)
report_log = classification_report(valid_l, y_pred_log)

print(report_log)

             precision    recall  f1-score   support

          0       0.54      0.72      0.62      2493
          1       0.58      0.39      0.46      2488

avg / total       0.56      0.55      0.54      4981



In [198]:
sub_sg = SGDClassifier(alpha= 1e-05, max_iter= 10000, loss='log', penalty= 'l2', n_jobs=-1, random_state=42)
sub_sg.fit(reviews_train, train_l)
y_pred = sub_sg.predict(reviews_valid)
report = classification_report(valid_l, y_pred)

print(report)

             precision    recall  f1-score   support

          0       0.59      0.36      0.45      2493
          1       0.54      0.75      0.63      2488

avg / total       0.57      0.56      0.54      4981



In [217]:
def runXGB(train_X, train_y, test_X, test_y=None, feature_names=None, seed_val=2017, num_rounds=500):
    param = {}
    param['objective'] = 'binary:logistic'
    param['eta'] = 0.1
    param['max_depth'] = 6
    param['silent'] = 1
    param['eval_metric'] = 'auc'
    param['min_child_weight'] = 1
    param['subsample'] = 0.7
    param['colsample_bytree'] = 0.7
    param['seed'] = seed_val
    num_rounds = num_rounds

    plst = list(param.items())
    xgtrain = xgb.DMatrix(train_X, label=train_y)

    if test_y is not None:
        xgtest = xgb.DMatrix(test_X, label=test_y)
        watchlist = [ (xgtrain,'train'), (xgtest, 'test') ]
        model = xgb.train(plst, xgtrain, num_rounds, watchlist, early_stopping_rounds=20)
    else:
        xgtest = xgb.DMatrix(test_X)
        model = xgb.train(plst, xgtrain, num_rounds)

    return model 

In [218]:
model = runXGB(reviews_train, train_l, reviews_valid, valid_l)

[0]	train-auc:0.669302	test-auc:0.619819
Multiple eval metrics have been passed: 'test-auc' will be used for early stopping.

Will train until test-auc hasn't improved in 20 rounds.
[1]	train-auc:0.70475	test-auc:0.63373
[2]	train-auc:0.722843	test-auc:0.646428
[3]	train-auc:0.738798	test-auc:0.653257
[4]	train-auc:0.749734	test-auc:0.659659
[5]	train-auc:0.758851	test-auc:0.665239
[6]	train-auc:0.765508	test-auc:0.668008
[7]	train-auc:0.772548	test-auc:0.669871
[8]	train-auc:0.776613	test-auc:0.672737
[9]	train-auc:0.781289	test-auc:0.673908
[10]	train-auc:0.786729	test-auc:0.675794
[11]	train-auc:0.791046	test-auc:0.67751
[12]	train-auc:0.795207	test-auc:0.676915
[13]	train-auc:0.800332	test-auc:0.678737
[14]	train-auc:0.803187	test-auc:0.682103
[15]	train-auc:0.807179	test-auc:0.685199
[16]	train-auc:0.811374	test-auc:0.686277
[17]	train-auc:0.815709	test-auc:0.686354
[18]	train-auc:0.819212	test-auc:0.68676
[19]	train-auc:0.822477	test-auc:0.686165
[20]	train-auc:0.826239	test-auc:

In [219]:
preds = model.predict(xgb.DMatrix(reviews_test), ntree_limit = model.best_ntree_limit)
preds[:10]

array([0.7374206 , 0.19246723, 0.32274005, 0.33006647, 0.48401198,
       0.42097735, 0.21041808, 0.6455684 , 0.81396955, 0.90532845],
      dtype=float32)

In [220]:
subm = pd.read_csv('data/sampleSubmission.csv')    
submid = pd.DataFrame({'id': subm["id"], 'sentiment': preds})
submid.to_csv('submissions/sample_submission_xgboost.csv', index=False)

     ### I got AUC of 0.70822