# Import packages

In [15]:
# Packages
from time import time
import re
import os
import nltk
import numpy as np
import multiprocessing as mp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from tqdm import tqdm
from time import time
import multiprocessing as mp
import pickle
import string

from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from emoji import demojize

from sklearn import naive_bayes, svm, linear_model
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, confusion_matrix, roc_curve
from statlearning import plot_confusion_matrix

# Settings
sns.set_context('notebook') 
sns.set_style('ticks') 
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours)
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)
tqdm.pandas(desc="Progress bar")
cores = mp.cpu_count()
warnings.filterwarnings('ignore')
Tokenizer = TweetTokenizer()

In [16]:
def _apply_df(args):
    df, func, kwargs = args
    return df.progress_apply(func, **kwargs)

def multi_apply(df, func, **kwargs):
    workers = kwargs.pop('workers')
    pool = mp.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, kwargs) for d in np.array_split(df, workers)])
    pool.close()
    return pd.concat(list(result))

# Import dataset

In [4]:
train = pd.read_csv('train.csv', header=None)

# Merge title and content
train['Text'] = train[1]+' '+train[2]
train = train.drop(columns=[1,2])

# Negative = 0, Positive = 1
train[0] = train[0].map(lambda x: x-1)
train.rename(columns={0:'Sentiment'}, inplace=True)

In [5]:
test = pd.read_csv('test.csv', header=None)

# Merge title and content
test['Text'] = test[1]+' '+test[2]
test = test.drop(columns=[1,2])

# Negative = 0, Positive = 1
test[0] = test[0].map(lambda x: x-1)
test.rename(columns={0:'Sentiment'}, inplace=True)

In [6]:
# Merge dataset
fullset = pd.concat([train,test], axis=0, ignore_index=True)
del(train)
del(test)
fullset.to_csv('fullset.csv', index=0)

In [4]:
fullset = pd.read_csv('fullset.csv')

# Data Observation

In [7]:
fullset.tail()

Unnamed: 0,Sentiment,Text
3999995,0,Unbelievable- In a Bad Way We bought this Thom...
3999996,0,"Almost Great, Until it Broke... My son recieve..."
3999997,0,Disappointed !!! I bought this toy for my son ...
3999998,1,Classic Jessica Mitford This is a compilation ...
3999999,0,"Comedy Scene, and Not Heard This DVD will be a..."


In [8]:
fullset['Sentiment'].value_counts()

1    2000000
0    2000000
Name: Sentiment, dtype: int64

In [5]:
fullset['Text'] = fullset['Text'].apply(str)

In [10]:
(fullset['Text'].apply(len)).describe(percentiles=[.95, .99]).round(0)

count    4000000.0
mean         431.0
std          238.0
min            3.0
50%          382.0
95%          894.0
99%          988.0
max         1014.0
Name: Text, dtype: float64

# Data Preprocessing

In [57]:
# Text lowercase + Stemming
def pre_proc(text):
    return ''.join([PorterStemmer().stem(x) for x in Tokenizer.tokenize(text.lower()) if x != ''])

In [None]:
fullset['Token'] = multi_apply(fullset['Text'], pre_proc, workers=cores-1)

Progress bar: 100%|██████████| 571428/571428 [53:35<00:00, 177.72it/s]  
Progress bar: 100%|██████████| 571428/571428 [53:53<00:00, 176.70it/s]
Progress bar: 100%|██████████| 571429/571429 [54:11<00:00, 175.73it/s]
Progress bar: 100%|██████████| 571428/571428 [54:45<00:00, 173.95it/s]
Progress bar: 100%|██████████| 571429/571429 [54:50<00:00, 173.65it/s]
Progress bar: 100%|██████████| 571429/571429 [55:10<00:00, 172.60it/s]
Progress bar: 100%|██████████| 571429/571429 [55:31<00:00, 171.52it/s]


In [None]:
fullset = pd.read_csv('fullset_ez_processed.csv')

In [28]:
fdist = nltk.FreqDist()

In [None]:
for i in fullset['Token']:
    for word in i.split():
        fdist[word] += 1

In [34]:
# Count the number of UNIQUE word
features = pd.Series(dict(fdist))
features.describe(percentiles=[.95, .99]).round(0)

count     1804633.0
mean          175.0
std         21403.0
min             1.0
50%             1.0
95%            17.0
99%           271.0
max      15792716.0
dtype: float64

In [37]:
features_1 = features[features==1]
print('There are',len(features_1),'features which only appear once.')

There are 1246917 words which only appear once.


In [72]:
print('Some once features are like this:','\''+features_1.index[1]+'\'')
print('So need to re-split them.')

Some once features are like this: 'peaceful.on'
So need to re-split them.


In [159]:
features_re = features[features<=1]

In [162]:
relist = [x for x in features_1.index if (not x.isalpha())]

In [163]:
print('There are',len(relist),'features with punctuations.')

There are 824988 features with punctuations.


In [164]:
relist_str = ''.join(relist)

In [181]:
separator = '|'.join(list(set([r'{}'.format(x) for x in relist_str if not x.isalpha()])))
print(separator,'\n')
print(demojize(separator))

🎥|̣|∂|😁|„|'||7|6|̇|~|-|😠|😔|]|≠|☆|#|😍|/|[|9|【|.|_|😩|↓|5|₤||)|⊖|👏|♠|8|💜|1|∅|℉|😡|&||␟|╚|👎|ً|🎉|$|💅|（|💖|⌫|:|😀|}|,|⟨||😎|0|=|>|╝|"|☼|%|<|♡|⊕|;|】|4|′||+|?|†||^|⊂|\|）|‼|{|!|2|😉|̄|@|*|─||♣|||(|3 

:movie_camera:|̣|∂|:beaming_face_with_smiling_eyes:|„|'||7|6|̇|~|-|:angry_face:|:pensive_face:|]|≠|☆|#|:smiling_face_with_heart-eyes:|/|[|9|【|.|_|:weary_face:|↓|5|₤||)|⊖|:clapping_hands:|:spade_suit:|8|:purple_heart:|1|∅|℉|:pouting_face:|&||␟|╚|:thumbs_down:|ً|:party_popper:|$|:nail_polish:|（|:sparkling_heart:|⌫|:|:grinning_face:|}|,|⟨||:smiling_face_with_sunglasses:|0|=|>|╝|"|☼|%|<|♡|⊕|;|】|4|′||+|?|†||^|⊂|\|）|:double_exclamation_mark:|{|!|2|:winking_face:|̄|@|*|─||:club_suit:|||(|3


In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
def resplit(text):
    # Translate emojis
    text = demojize(text)
    # Remove punctuation
    for i in string.punctuation:
        text = text.replace(i,' ')
    # Token
    text = Tokenizer.tokenize(text)

    return text

In [None]:
fullset.Token = fullset.Token.apply(resplit)

In [49]:
fullset.tail()

Unnamed: 0,Sentiment,Token
3999995,0,"[unbeliev, in, a, bad, way, we, bought, thi, t..."
3999996,0,"[almost, great, until, it, broke, my, son, rec..."
3999997,0,"[disappoint, i, bought, thi, toy, for, my, son..."
3999998,1,"[classic, jessica, mitford, thi, is, a, compil..."
3999999,0,"[comedi, scene, and, not, heard, thi, dvd, wil..."


In [50]:
with open('fullset_resplit.pickle', 'wb') as f:
    pickle.dump(fullset, f)

In [53]:
# Re-count
fdist = nltk.FreqDist()

for i in tqdm(fullset['Token']):
    for word in i:
        fdist[word] += 1

100%|██████████| 4000000/4000000 [06:07<00:00, 10893.40it/s]


In [54]:
fdist

FreqDist({'the': 15809493, 'i': 9294446, 'and': 8586104, 'a': 8061406, 'to': 7712911, 'it': 7578653, 'of': 6298197, 'thi': 5911844, 'is': 5530069, 'in': 3713530, ...})

In [55]:
# Count the number of UNIQUE word
features = pd.Series(dict(fdist))
features.describe(percentiles=[.95, .99]).round(0)

count      858011.0
mean          375.0
std         31793.0
min             1.0
50%             1.0
95%            51.0
99%          1065.0
max      15809493.0
dtype: float64

In [56]:
features_1 = features[features==3]
print('There are',len(features_1),'features which appear only once.')

There are 47820 features which only appear once.


In [57]:
def rmonce(token):
    return [x for x in token if x not in features_1.index]

In [58]:
# Remove words which appear only once.
fullset.Token = fullset.Token.progress_apply(rmonce)

Progress bar: 100%|██████████| 4000000/4000000 [14:54<00:00, 4471.89it/s]   


In [87]:
# Count the length of sentences
(fullset.Token.apply(len)).describe(percentiles=[.95, .99]).round(0)

count    4000000.0
mean          80.0
std           44.0
min            0.0
50%           72.0
95%          165.0
99%          186.0
max          257.0
Name: Token, dtype: float64

In [64]:
fullset_original = pd.read_csv('fullset.csv')

In [66]:
print(fullset_original.Text[fullset.Token.apply(len)==0])

294435     ........ ............ ..... ..... ...... ........
3584048    -_- ' ' '''' '''' '' '' ''' '''''? '' '' ' '' ...
Name: Text, dtype: object


In [88]:
fullset = fullset[fullset.Token.apply(len)!=0]

In [89]:
(fullset.Token.apply(len)).describe(percentiles=[.95, .99]).round(0)

count    3999998.0
mean          80.0
std           44.0
min            1.0
50%           72.0
95%          165.0
99%          186.0
max          257.0
Name: Token, dtype: float64

In [90]:
with open('fullset_resplit_rmonce.pickle', 'wb') as f:
    pickle.dump(fullset, f, -1)

In [91]:
# The most frequent 20 words
fdist.most_common()[:20]

[('the', 15809493),
 ('i', 9294446),
 ('and', 8586104),
 ('a', 8061406),
 ('to', 7712911),
 ('it', 7578653),
 ('of', 6298197),
 ('thi', 5911844),
 ('is', 5530069),
 ('in', 3713530),
 ('for', 3524304),
 ('that', 3245780),
 ('you', 2776538),
 ('wa', 2680975),
 ('not', 2612517),
 ('book', 2498410),
 ('but', 2345642),
 ('with', 2308551),
 ('on', 2281343),
 ('have', 2190331)]

In [10]:
stopwordls = stopwords.words('english')

In [11]:
def rmstopword(token):
    return [x for x in token if x not in stopwordls]

In [13]:
# Remove stopwords
fullset.Token = fullset.Token.apply(rmstopword)

In [16]:
# Re-count
fdist = nltk.FreqDist()

for i in fullset['Token']:
    for word in i:
        fdist[word] += 1

In [34]:
# The most frequent 20 words
fdist.most_common(20)

[('thi', 5911844),
 ('wa', 2680975),
 ('book', 2498410),
 ('one', 1590682),
 ('like', 1289538),
 ('great', 1201557),
 ('veri', 1183127),
 ('good', 1167851),
 ('read', 1079779),
 ('use', 1002323),
 ('get', 995575),
 ('time', 944117),
 ('would', 939644),
 ('work', 875548),
 ('ha', 868094),
 ('movi', 773386),
 ('love', 772790),
 ('onli', 714689),
 ('hi', 665482),
 ('realli', 639812)]

In [38]:
# How many UNIQUE words
len(fdist)

810038

In [42]:
(fullset.Token.apply(len)).describe(percentiles=[.95, .99]).round(0)

count    3999998.0
mean          44.0
std           24.0
min            1.0
50%           39.0
95%           90.0
99%          102.0
max          212.0
Name: Token, dtype: float64

In [53]:
fq25, fq50, fq75, fq95, fq99 = 0, 0, 0, 0, 0
count = 0
for i in fdist_df[0]:
    fq25 += fdist.freq(i)
    fq50 += fdist.freq(i)
    fq75 += fdist.freq(i)
    fq95 += fdist.freq(i)
    fq99 += fdist.freq(i)
    count += 1
    if fq25 > 0.25:
        print('The most frequent',count, 'words have 25% portion.')
        fq25 -= 1
    if fq50 > 0.50:
        print('The most frequent',count, 'words have 50% portion.')
        fq50 -= 1
    if fq75 > 0.75:
        print('The most frequent',count, 'words have 75% portion.')
        fq75 -= 1
    if fq95 > 0.95:
        print('The most frequent',count, 'words have 95% portion.')
        fq95 -= 1
    if fq99 > 0.99:
        print('The most frequent',count, 'words have 99% portion.')
        fq99 -= 1
        

The most frequent 55 words have 25% portion.
The most frequent 294 words have 50% portion.
The most frequent 1263 words have 75% portion.
The most frequent 10323 words have 95% portion.
The most frequent 68782 words have 99% portion.


In [14]:
with open('fullset_resplit_rmonce_nostopword.pickle', 'wb') as f:
    pickle.dump(fullset, f, -1)

# Feature Engineering

In [17]:
with open('fullset_resplit_rmonce_nostopword.pickle', 'rb') as f:
    fullset = pickle.load(f)

ModuleNotFoundError: No module named 'pandas.core.internals.managers'; 'pandas.core.internals' is not a package

In [None]:
train_x, test_x, train_y, test_y = train_test_split(fullset['Token'], fullset['Sentiment'], train_size=0.8, random_state=1, stratify=fullset.Sentiment)

In [None]:
from sklearn.feature_extraction.text import DictVectorizer, CountVectorizer, TfidfVectorizer

## One-hot

In [None]:
t0 = time()
one_hot = DictVectorizer()
one_hot.fit((fullset['Token']))
print('# One-hot Fit:', time()-t0)

In [None]:
t0 = time()
one_hot_bs = one_hot.transform(train_x)
one_hot_bs = one_hot.transform(test_x)
print('# One-hot Transform:', time()-t0)

## Count Vectors + N-gram

In [None]:
t0 = time()
count_vect_1n = CountVectorizer(analyzer='word', ngram_range=(1,1), token_pattern=r'\w{1,}', max_features=10323)
count_vect_1n.fit(fullset['Token'])
print('# CV + 1-gram:', time()-t0)

t0 = time()
count_vect_2n = CountVectorizer(analyzer='word', ngram_range=(1,2), token_pattern=r'\w{1,}', max_features=10323)
count_vect_2n.fit(fullset['Token'])
print('# CV + 2-gram:', time()-t0)

t0 = time()
count_vect_3n = CountVectorizer(analyzer='word', ngram_range=(1,3), token_pattern=r'\w{1,}', max_features=10323)
count_vect_3n.fit(fullset['Token'])
print('# CV + 3-gram:', time()-t0)

In [None]:
# Transform the training and testing data using count vectorizer object
t0 = time()
xtrain_count_1n =  count_vect_1n.transform(train_x)
xvalid_count_1n =  count_vect_1n.transform(test_x)
print('# Finish CV 1n:', time()-t0)

t0 = time()
xtrain_count_2n =  count_vect_2n.transform(train_x)
xvalid_count_2n =  count_vect_2n.transform(test_x)
print('# Finish CV 2n:', time()-t0)

t0 = time()
xtrain_count_3n =  count_vect_3n.transform(train_x)
xvalid_count_3n =  count_vect_3n.transform(test_x)
print('# Finish CV 3n:', time()-t0)

## TF-IDF Vectors + N-gram

In [1]:
t0 = time()
tfidf_vect_1n = TfidfVectorizer(analyzer='word', ngram_range=(1,1), token_pattern=r'\w{1,}', max_features=10323)
tfidf_vect_1n.fit(fullset['Token'])
print('# TF-IDF + 1-gram:', time()-t0)

t0 = time()
tfidf_vect_2n = TfidfVectorizer(analyzer='word', ngram_range=(1,2), token_pattern=r'\w{1,}', max_features=10323)
tfidf_vect_2n.fit(fullset['Token'])
print('# TF-IDF + 2-gram:', time()-t0)

t0 = time()
tfidf_vect_3n = TfidfVectorizer(analyzer='word', ngram_range=(1,3), token_pattern=r'\w{1,}', max_features=10323)
tfidf_vect_3n.fit(fullset['Token'])
print('# TF-IDF + 3-gram:', time()-t0)

# TF-IDF + 1-gram


NameError: name 'TfidfVectorizer' is not defined

In [None]:
t0 = time()
xtrain_tfidf_1n =  tfidf_vect_1n.transform(train_x)
xvalid_tfidf_1n =  tfidf_vect_1n.transform(test_x)
print('# Finish TF-IDF 1n:', time()-t0)

t0 = time()
xtrain_tfidf_2n =  tfidf_vect_2n.transform(train_x)
xvalid_tfidf_2n =  tfidf_vect_2n.transform(test_x)
print('# Finish TF-IDF 2n:', time()-t0)

t0 = time()
xtrain_tfidf_3n =  tfidf_vect_3n.transform(train_x)
xvalid_tfidf_3n =  tfidf_vect_3n.transform(test_x)
print('# Finish TF-IDF 3n:', time()-t0)

## Word2vec

# Traditional ML

In [11]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_not_svm=True):
    # fit the training dataset on the classifier
    trm = classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = trm.predict(feature_vector_valid)
    
    print('Accuracy:', accuracy_score(predictions, valid_y).round(4))
    print('Precision:', precision_score(predictions, valid_y).round(4))
    print('Recall:', recall_score(predictions, valid_y).round(4))
    print('F1 Score:', f1_score(predictions, valid_y).round(4))
    '''
    confusion = confusion_matrix(predictions, valid_y)
    fig, ax = plt.subplots(figsize=(8,6))
    plot_confusion_matrix(confusion, classes=['negative','positive'], normalize=True)
    plt.show()
    '''
    #print('AUC:', roc_auc_score(valid_y, prob[:,1]).round(4),'\n')
    if is_not_svm:
        prob = trm.predict_proba(feature_vector_valid)
        fpr, tpr, _ = roc_curve(valid_y, prob[:,1])
        auc = roc_auc_score(valid_y, prob[:,1])

        fig, ax= plt.subplots(figsize=(8,6))
        ax.plot(fpr, tpr, label='ROC curve (AUC = {:.4f})'.format(auc))
        ax.set_xlabel('False positive rate')
        ax.set_ylabel('True Positive Rate')
        ax.set_title('ROC curve', fontsize=14)
        sns.despine()
        plt.legend()
        plt.show()

## Naive Bayes

In [None]:
t0 = time()
train_model(naive_bayes.BernoulliNB(), xtrain_count_1n, train_y, xvalid_count_1n)
print("# Naive Bayes + Count Vectors + 1-gram:", time()-t0)

t0 = time()
train_model(naive_bayes.BernoulliNB(), xtrain_count_2n, train_y, xvalid_count_2n)
print("# Naive Bayes + Count Vectors + 2-gram", time()-t0)

t0 = time()
train_model(naive_bayes.BernoulliNB(), xtrain_count_3n, train_y, xvalid_count_3n)
print("# Naive Bayes + Count Vectors + 3-gram", time()-t0)

In [None]:
t0 = time()
train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_1n, train_y, xvalid_tfidf_1n)
print("# Naive Bayes + TF-IDF + 1-gram", time()-t0)

t0 = time()
train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_2n, train_y, xvalid_tfidf_2n)
print("# Naive Bayes + TF-IDF + 2-gram", time()-t0)

t0 = time()
train_model(naive_bayes.BernoulliNB(), xtrain_tfidf_3n, train_y, xvalid_tfidf_3n)
print("# Naive Bayes + TF-IDF + 3-gram", time()-t0)

In [None]:
t0 = time()
train_model(naive_bayes.BernoulliNB(), train_vec, train_y, valid_vec)
print("# Naive Bayes + 1234 + Word2vec")

## SVM

In [None]:
t0 = time()
train_model(svm.LinearSVC(), xtrain_count_1n, train_y, xvalid_count_1n)
print("# SVM + Count Vectors + 1-gram", time()-t0)

t0 = time()
train_model(svm.LinearSVC(), xtrain_count_2n, train_y, xvalid_count_2n)
print("# SVM + Count Vectors + 2-gram", time()-t0)

t0 = time()
train_model(svm.LinearSVC(), xtrain_count_3n, train_y, xvalid_count_3n)
print("# SVM + Count Vectors + 3-gram", time()-t0)

In [None]:
t0 = time()
train_model(svm.LinearSVC(), xtrain_tfidf_1n, train_y, xvalid_tfidf_1n)
print("# SVM + TF-IDF + 1-gram", time()-t0)

t0 = time()
train_model(svm.LinearSVC(), xtrain_tfidf_2n, train_y, xvalid_tfidf_2n)
print("# SVM + TF-IDF + 2-gram", time()-t0)

t0 = time()
train_model(svm.LinearSVC(), xtrain_tfidf_3n, train_y, xvalid_tfidf_3n)
print("# SVM + TF-IDF + 3-gram", time()-t0)

In [None]:
t0 = time()
train_model(svm.LinearSVC(), train_vec, train_y, valid_vec)
print("# SVM + Word2vec", time()-t0)

## Logistic Regression

In [None]:
t0 = time()
train_model(linear_model.LogisticRegression(), xtrain_count_1n, train_y, xvalid_count_1n)
print("Logistic Regression + Count Vectors + 1-gram", time()-t0)

t0 = time()
train_model(linear_model.LogisticRegression(), xtrain_count_2n, train_y, xvalid_count_2n)
print("\nLogistic Regression + Count Vectors + 2-gram", time()-t0)

t0 = time()
train_model(linear_model.LogisticRegression(), xtrain_count_3n, train_y, xvalid_count_3n)
print("\nLogistic Regression + Count Vectors + 3-gram", time()-t0)

In [None]:
t0 = time()
train_model(linear_model.LogisticRegression(), xtrain_tfidf_1n, train_y, xvalid_tfidf_1n)
print("Logistic Regression + TF-IDF + 1-gram", time()-t0)

t0 = time()
train_model(linear_model.LogisticRegression(), xtrain_tfidf_2n, train_y, xvalid_tfidf_2n)
print("\nLogistic Regression + TF-IDF + 2-gram", time()-t0)

t0 = time()
train_model(linear_model.LogisticRegression(), xtrain_tfidf_3n, train_y, xvalid_tfidf_3n)
print("\nLogistic Regression + TF-IDF + 3-gram", time()-t0)

In [None]:
t0 = time()
train_model(linear_model.LogisticRegression(), train_vec, train_y, valid_vec)
print("# Logistic Regression + Word2vec", time()-t0)

# Neural Network

In [None]:
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

## CNN

## LSTM (RNN)

## CNN + LSTM