In [3]:
import numpy as np 
import pandas as pd 
import fasttext
import bz2
import string
import csv
from sklearn.metrics import roc_auc_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
import re
import random
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, roc_auc_score, accuracy_score
%matplotlib inline

I've tried working with all 3,6 millions of reviews, then I tried working with 500k reviews, but my laptop does not have enough computational power, so I settled on working with 360k.

Labels are evenly distributed in the set, so it won't be a problem, but accuracy might take a hit. 

## Fasttext

In [34]:
data = bz2.BZ2File("train.ft.txt.bz2")
data = data.readlines()
data = [x.decode('utf-8') for x in data]
len(data)

3600000

In [40]:
sample_data = data[:360000]
len(sample_data)

360000

In [41]:
sample_data = pd.DataFrame(sample_data)
sample_data.to_csv("sample_train.txt", index=False, sep=' ', header=False, quoting=csv.QUOTE_NONE, quotechar="", escapechar=" ")

In [42]:
model = fasttext.train_supervised('sample_train.txt',label_prefix='__label__', thread=4, epoch = 10)
print(model.labels, 'are the labels or targets the model is predicting')

['__label__2', '__label__1'] are the labels or targets the model is predicting


In [43]:
# Load the test data 
test = bz2.BZ2File("test.ft.txt.bz2")
test = test.readlines()
test = [x.decode('utf-8') for x in test]

In [44]:
sample_test = test[:40000] 
len(sample_test)

40000

In [45]:
new = [w.replace('__label__2 ', '') for w in sample_test]
new = [w.replace('__label__1 ', '') for w in new]
new = [w.replace('\n', '') for w in new]

In [46]:
pred = model.predict(new)

In [49]:
labels = [0 if x.split(' ')[0] == '__label__1' else 1 for x in sample_test]
pred_labels = [0 if x == ['__label__1'] else 1 for x in pred[0]]

In [50]:
# run the accuracy measure. 
print(roc_auc_score(labels, pred_labels))

0.8988120119772891


This is a relatively high result, but I've tried this with all 3,6 millions and the roc_auc was somewhere around 91%. But it wouldn't be logical to compare models that were trained on different scales of data, so I guess I'll have to sacrifice those 2% for getting any results from machine learning algorithms. 

## Machine learning

In [4]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [5]:
train_sample = train.sample(n = 360000, random_state=123).reset_index(drop=True)
train_sample['label'].value_counts()

__label__1    180295
__label__2    179705
Name: label, dtype: int64

In [6]:
test_sample = test.sample(n = 40000, random_state=123).reset_index(drop=True)
test_sample['label'].value_counts()

__label__1    20127
__label__2    19873
Name: label, dtype: int64

In [7]:
labels = {'__label__2': 1, '__label__1': 0}
train_sample['label'] = train_sample['label'].map(labels).astype(int) # changed to make further calculations easier
test_sample['label'] = test_sample['label'].map(labels).astype(int)

In [6]:
def process_data(data): 
    
    data = data.str.lower() # lower case
     
    data = data.apply(lambda x: x.translate(str.maketrans('', '', string.punctuation))) # remove punctuation
    
    return data

In [7]:
train_sample['clean'] = process_data(train_sample['review'])
test_sample['clean'] = process_data(test_sample['review'])

In [8]:
train_sample

Unnamed: 0,label,review,clean
0,0,this movie sucks: This movie supposedly about ...,this movie sucks this movie supposedly about m...
1,0,Good Entertainment: This program a well edited...,good entertainment this program a well edited ...
2,1,Does the job: This hamper does the job in my k...,does the job this hamper does the job in my ki...
3,0,"Buffett Mails it In: Being a huge Buffett fan,...",buffett mails it in being a huge buffett fan i...
4,1,Sharp as a razor... almost.: Wow! My replaceme...,sharp as a razor almost wow my replacement is ...
...,...,...,...
359995,1,A different perspective on an often portrayed ...,a different perspective on an often portrayed ...
359996,1,What a tragic waste. She had my favourite sung...,what a tragic waste she had my favourite sungl...
359997,0,"Worst book ever, seriously. . .: First, the ra...",worst book ever seriously first the rating i...
359998,1,Great Bed Rail: This rail has worked perfectly...,great bed rail this rail has worked perfectly ...


In [9]:
train_list = train_sample['clean'].values.tolist()
test_list = test_sample['clean'].values.tolist()

In [10]:
vectorizer = TfidfVectorizer(max_features=2500, stop_words=stopwords.words('english'))
processed_train = vectorizer.fit_transform(train_list).toarray()
processed_test = vectorizer.fit_transform(test_list).toarray()

In [11]:
X_train = processed_train
y_train = train_sample['label']
X_test = processed_test
y_test = test_sample['label']

In [12]:
from xgboost import XGBClassifier
XGboost = XGBClassifier(n_jobs = 1, random_state=1234, learning_rate=0.9)
XGboost.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster=None, colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints=None,
              learning_rate=0.9, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints=None,
              n_estimators=100, n_jobs=1, num_parallel_tree=1,
              objective='binary:logistic', random_state=1234, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method=None,
              validate_parameters=False, verbosity=None)

In [13]:
xgb = XGboost.predict_proba(X_test)

xgb

array([[0.37864685, 0.62135315],
       [0.82208693, 0.17791308],
       [0.995191  , 0.00480904],
       ...,
       [0.84900904, 0.15099095],
       [0.2113288 , 0.7886712 ],
       [0.54813254, 0.45186743]], dtype=float32)

In [14]:
np.shape(xgb)

(40000, 2)

In [15]:
xgb_whole = xgb[:,1]>=0.4 # cut-off value

# converting the results to integer type
xgb_int=xgb_whole.astype(np.int)

In [45]:
xgb_new = xgb[:,1]>=0.4
xgb_new_int = xgb_new.astype(np.int)

In [30]:
xgb_new_int

array([1, 0, 0, ..., 0, 1, 1])

In [31]:
y_test

0        0
1        1
2        0
3        0
4        1
        ..
39995    0
39996    1
39997    0
39998    0
39999    1
Name: label, Length: 40000, dtype: int64

In [46]:
print(roc_auc_score(y_test, xgb_new_int))

0.5864640464465128


We can see that both roc_auc and F1 scores are very low for this model. It might be because I didn't preprocess words (no lemmatization, no tokenization - but I've read that it's not recommended for sentiment analysis). Fasttext obviously gave higher results. 

For learning rate 0.1 - ROC_AUC - 58,22%
For learning rate 0.9 - ROC_AUC - 58,64%

I tried different cut-off values but the 0.4 one gave the highest score.

## Neural network

https://www.kaggle.com/muonneutrino/sentiment-analysis-with-amazon-reviews

In [18]:
from tensorflow.python.keras import models, layers, optimizers
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer, text_to_word_sequence
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [19]:
train_texts, val_texts, train_labels, val_labels = train_test_split(
    train_sample['review'], train_sample['label'], random_state=1234, test_size=0.2)

In [20]:
test_labels = test_sample['label']
test_texts = test_sample['review']

In [21]:
FEATURES = 10000
tokenizer = Tokenizer(num_words=FEATURES)
tokenizer.fit_on_texts(train_texts)

In [22]:
train_texts = tokenizer.texts_to_sequences(train_texts)
val_texts = tokenizer.texts_to_sequences(val_texts)
test_texts = tokenizer.texts_to_sequences(test_texts)

In [23]:
LENGTH = max(len(train_ex) for train_ex in train_texts)
train_texts = pad_sequences(train_texts, maxlen=LENGTH)
val_texts = pad_sequences(val_texts, maxlen=LENGTH)
test_texts = pad_sequences(test_texts, maxlen=LENGTH)

In [24]:
vals = np.asarray(val_labels)
trains = np.asarray(train_labels)

In [25]:
input_layer = layers.Input(shape=(LENGTH,))
embedding = layers.Embedding(FEATURES, 64)(input_layer)
text_layer = layers.Conv1D(128, 3, activation='relu')(embedding)
text_layer = layers.BatchNormalization()(text_layer)
text_layer = layers.MaxPooling1D(3)(text_layer)
text_layer = layers.Conv1D(128, 3, activation='relu')(text_layer)
text_layer = layers.BatchNormalization()(text_layer)
text_layer = layers.MaxPooling1D(3)(text_layer)
text_layer = layers.Conv1D(128, 3, activation='relu')(text_layer)
text_layer = layers.BatchNormalization()(text_layer)
text_layer = layers.MaxPooling1D(3)(text_layer)
text_layer = layers.Conv1D(128, 3, activation='relu')(text_layer)
text_layer = layers.GlobalMaxPooling1D()(text_layer)
text_layer = layers.Flatten()(text_layer)
text_layer = layers.Dense(128, activation='relu')(text_layer)
output_layer = layers.Dense(1, activation='sigmoid')(text_layer)
model = models.Model(input_layer, output_layer)

In [26]:
model.compile(
    optimizer='Adam',
    loss='binary_crossentropy',
    metrics=['binary_accuracy']
    )

In [27]:
model.fit(
    train_texts, 
    trains, 
    batch_size=128,
    epochs=2,
    validation_data=(val_texts, vals))

Train on 288000 samples, validate on 72000 samples
Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x1a88383b10>

In [28]:
predictions = model.predict(test_texts)

In [29]:
print('ROC AUC score: {:0.4}'.format(roc_auc_score(test_labels, predictions)))

ROC AUC score: 0.9808


CNN with 16 layers gave the best results among 3 models. I first tried working with 64 neurons, which gave me ROC-AUC score of around 97%, and with 12 layers, which gave me around 96% ROC-AUC score.

## Conclusions

I chose ROC-AUC score because it basically averages over all possiblt tresholds and it has F score values. Also, the dataset was balanced, so it too affected the choice of the metric.

FastText score: 89.88%

TFIDF + XGBoost score: 58.64% (yikes)

CNN 1D score: 98%

So, the neural network did the best job out of all three. Go neural networks!