# Import packages

In [2]:
# Packages
from time import time
import re
import os
import nltk
import numpy as np
import multiprocessing as mp
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import warnings
from tqdm import tqdm
tqdm.pandas(desc="Progress bar")
from sklearn.model_selection import train_test_split, cross_val_score
import multiprocessing as mp
import pickle

cores = mp.cpu_count()
warnings.filterwarnings('ignore')

from nltk.tokenize import TweetTokenizer
Tokenizer = TweetTokenizer()
import string
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from emoji import demojize

# Plot settings
sns.set_context('notebook') 
sns.set_style('ticks') 
colours = ['#1F77B4', '#FF7F0E', '#2CA02C', '#DB2728', '#9467BD', '#8C564B', '#E377C2','#7F7F7F', '#BCBD22', '#17BECF']
crayon = ['#4E79A7','#F28E2C','#E15759','#76B7B2','#59A14F', '#EDC949','#AF7AA1','#FF9DA7','#9C755F','#BAB0AB']
sns.set_palette(colours)
%matplotlib inline
plt.rcParams['figure.figsize'] = (9, 6)


In [3]:
def _apply_df(args):
    df, func, kwargs = args
    return df.progress_apply(func, **kwargs)

def multi_apply(df, func, **kwargs):
    workers = kwargs.pop('workers')
    pool = mp.Pool(processes=workers)
    result = pool.map(_apply_df, [(d, func, kwargs) for d in np.array_split(df, workers)])
    pool.close()
    return pd.concat(list(result))

# Import dataset

In [4]:
train = pd.read_csv('train.csv', header=None)

# Merge title and content
train['Text'] = train[1]+' '+train[2]
train = train.drop(columns=[1,2])

# Negative = 0, Positive = 1
train[0] = train[0].map(lambda x: x-1)
train.rename(columns={0:'Sentiment'}, inplace=True)

In [5]:
test = pd.read_csv('test.csv', header=None)

# Merge title and content
test['Text'] = test[1]+' '+test[2]
test = test.drop(columns=[1,2])

# Negative = 0, Positive = 1
test[0] = test[0].map(lambda x: x-1)
test.rename(columns={0:'Sentiment'}, inplace=True)

In [6]:
# Merge dataset
fullset = pd.concat([train,test], axis=0, ignore_index=True)
del(train)
del(test)
fullset.to_csv('fullset.csv', index=0)

In [4]:
fullset = pd.read_csv('fullset.csv')

# Data Observation

In [7]:
fullset.tail()

Unnamed: 0,Sentiment,Text
3999995,0,Unbelievable- In a Bad Way We bought this Thom...
3999996,0,"Almost Great, Until it Broke... My son recieve..."
3999997,0,Disappointed !!! I bought this toy for my son ...
3999998,1,Classic Jessica Mitford This is a compilation ...
3999999,0,"Comedy Scene, and Not Heard This DVD will be a..."


In [8]:
fullset['Sentiment'].value_counts()

1    2000000
0    2000000
Name: Sentiment, dtype: int64

In [5]:
fullset['Text'] = fullset['Text'].apply(str)

In [10]:
(fullset['Text'].apply(len)).describe(percentiles=[.95, .99]).round(0)

count    4000000.0
mean         431.0
std          238.0
min            3.0
50%          382.0
95%          894.0
99%          988.0
max         1014.0
Name: Text, dtype: float64

# Data Preprocessing

In [57]:
# Text lowercase + Stemming
def pre_proc(text):
    return ''.join([PorterStemmer().stem(x) for x in Tokenizer.tokenize(text.lower()) if x != ''])

In [None]:
fullset['Token'] = multi_apply(fullset['Text'], pre_proc, workers=cores-1)

Progress bar: 100%|██████████| 571428/571428 [53:35<00:00, 177.72it/s]  
Progress bar: 100%|██████████| 571428/571428 [53:53<00:00, 176.70it/s]
Progress bar: 100%|██████████| 571429/571429 [54:11<00:00, 175.73it/s]
Progress bar: 100%|██████████| 571428/571428 [54:45<00:00, 173.95it/s]
Progress bar: 100%|██████████| 571429/571429 [54:50<00:00, 173.65it/s]
Progress bar: 100%|██████████| 571429/571429 [55:10<00:00, 172.60it/s]
Progress bar: 100%|██████████| 571429/571429 [55:31<00:00, 171.52it/s]


In [None]:
fullset = pd.read_csv('fullset_ez_processed.csv')

In [28]:
fdist = nltk.FreqDist()

In [None]:
for i in fullset['Token']:
    for word in i.split():
        fdist[word] += 1

In [34]:
# Count the number of UNIQUE word
features = pd.Series(dict(fdist))
features.describe(percentiles=[.95, .99]).round(0)

count     1804633.0
mean          175.0
std         21403.0
min             1.0
50%             1.0
95%            17.0
99%           271.0
max      15792716.0
dtype: float64

In [37]:
features_1 = features[features==1]
print('There are',len(features_1),'features which only appear once.')

There are 1246917 words which only appear once.


In [72]:
print('Some once features are like this:','\''+features_1.index[1]+'\'')
print('So need to re-split them.')

Some once features are like this: 'peaceful.on'
So need to re-split them.


In [159]:
features_re = features[features<=1]

In [162]:
relist = [x for x in features_1.index if (not x.isalpha())]

In [163]:
print('There are',len(relist),'features with punctuations.')

There are 824988 features with punctuations.


In [164]:
relist_str = ''.join(relist)

In [181]:
separator = '|'.join(list(set([r'{}'.format(x) for x in relist_str if not x.isalpha()])))
print(separator,'\n')
print(demojize(separator))

🎥|̣|∂|😁|„|'||7|6|̇|~|-|😠|😔|]|≠|☆|#|😍|/|[|9|【|.|_|😩|↓|5|₤||)|⊖|👏|♠|8|💜|1|∅|℉|😡|&||␟|╚|👎|ً|🎉|$|💅|（|💖|⌫|:|😀|}|,|⟨||😎|0|=|>|╝|"|☼|%|<|♡|⊕|;|】|4|′||+|?|†||^|⊂|\|）|‼|{|!|2|😉|̄|@|*|─||♣|||(|3 

:movie_camera:|̣|∂|:beaming_face_with_smiling_eyes:|„|'||7|6|̇|~|-|:angry_face:|:pensive_face:|]|≠|☆|#|:smiling_face_with_heart-eyes:|/|[|9|【|.|_|:weary_face:|↓|5|₤||)|⊖|:clapping_hands:|:spade_suit:|8|:purple_heart:|1|∅|℉|:pouting_face:|&||␟|╚|:thumbs_down:|ً|:party_popper:|$|:nail_polish:|（|:sparkling_heart:|⌫|:|:grinning_face:|}|,|⟨||:smiling_face_with_sunglasses:|0|=|>|╝|"|☼|%|<|♡|⊕|;|】|4|′||+|?|†||^|⊂|\|）|:double_exclamation_mark:|{|!|2|:winking_face:|̄|@|*|─||:club_suit:|||(|3


In [30]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [11]:
def resplit(text):
    # Translate emojis
    text = demojize(text)
    # Remove punctuation
    for i in string.punctuation:
        text = text.replace(i,' ')
    # Token
    text = Tokenizer.tokenize(text)

    return text

In [None]:
fullset.Token = fullset.Token.apply(resplit)

In [49]:
fullset.tail()

Unnamed: 0,Sentiment,Token
3999995,0,"[unbeliev, in, a, bad, way, we, bought, thi, t..."
3999996,0,"[almost, great, until, it, broke, my, son, rec..."
3999997,0,"[disappoint, i, bought, thi, toy, for, my, son..."
3999998,1,"[classic, jessica, mitford, thi, is, a, compil..."
3999999,0,"[comedi, scene, and, not, heard, thi, dvd, wil..."


In [50]:
with open('fullset_resplit.pickle', 'wb') as f:
    pickle.dump(fullset, f)

In [53]:
# Re-count
fdist = nltk.FreqDist()

for i in tqdm(fullset['Token']):
    for word in i:
        fdist[word] += 1

100%|██████████| 4000000/4000000 [06:07<00:00, 10893.40it/s]


In [54]:
fdist

FreqDist({'the': 15809493, 'i': 9294446, 'and': 8586104, 'a': 8061406, 'to': 7712911, 'it': 7578653, 'of': 6298197, 'thi': 5911844, 'is': 5530069, 'in': 3713530, ...})

In [55]:
# Count the number of UNIQUE word
features = pd.Series(dict(fdist))
features.describe(percentiles=[.95, .99]).round(0)

count      858011.0
mean          375.0
std         31793.0
min             1.0
50%             1.0
95%            51.0
99%          1065.0
max      15809493.0
dtype: float64

In [56]:
features_1 = features[features==3]
print('There are',len(features_1),'features which appear only once.')

There are 47820 features which only appear once.


In [57]:
def rmonce(token):
    return [x for x in token if x not in features_1.index]

In [58]:
# Remove words which appear only once.
fullset.Token = fullset.Token.progress_apply(rmonce)

Progress bar: 100%|██████████| 4000000/4000000 [14:54<00:00, 4471.89it/s]   


In [87]:
# Count the length of sentences
(fullset.Token.apply(len)).describe(percentiles=[.95, .99]).round(0)

count    4000000.0
mean          80.0
std           44.0
min            0.0
50%           72.0
95%          165.0
99%          186.0
max          257.0
Name: Token, dtype: float64

In [64]:
fullset_original = pd.read_csv('fullset.csv')

In [66]:
print(fullset_original.Text[fullset.Token.apply(len)==0])

294435     ........ ............ ..... ..... ...... ........
3584048    -_- ' ' '''' '''' '' '' ''' '''''? '' '' ' '' ...
Name: Text, dtype: object


In [88]:
fullset = fullset[fullset.Token.apply(len)!=0]

In [89]:
(fullset.Token.apply(len)).describe(percentiles=[.95, .99]).round(0)

count    3999998.0
mean          80.0
std           44.0
min            1.0
50%           72.0
95%          165.0
99%          186.0
max          257.0
Name: Token, dtype: float64

In [90]:
with open('fullset_resplit_rmonce.pickle', 'wb') as f:
    pickle.dump(fullset, f, -1)

In [91]:
# The most frequent 20 words
fdist.most_common()[:20]

[('the', 15809493),
 ('i', 9294446),
 ('and', 8586104),
 ('a', 8061406),
 ('to', 7712911),
 ('it', 7578653),
 ('of', 6298197),
 ('thi', 5911844),
 ('is', 5530069),
 ('in', 3713530),
 ('for', 3524304),
 ('that', 3245780),
 ('you', 2776538),
 ('wa', 2680975),
 ('not', 2612517),
 ('book', 2498410),
 ('but', 2345642),
 ('with', 2308551),
 ('on', 2281343),
 ('have', 2190331)]

In [10]:
stopwordls = stopwords.words('english')

In [11]:
def rmstopword(token):
    return [x for x in token if x not in stopwordls]

In [13]:
# Remove stopwords
fullset.Token = fullset.Token.apply(rmstopword)

In [16]:
# Re-count
fdist = nltk.FreqDist()

for i in fullset['Token']:
    for word in i:
        fdist[word] += 1

In [34]:
# The most frequent 20 words
fdist.most_common(20)

[('thi', 5911844),
 ('wa', 2680975),
 ('book', 2498410),
 ('one', 1590682),
 ('like', 1289538),
 ('great', 1201557),
 ('veri', 1183127),
 ('good', 1167851),
 ('read', 1079779),
 ('use', 1002323),
 ('get', 995575),
 ('time', 944117),
 ('would', 939644),
 ('work', 875548),
 ('ha', 868094),
 ('movi', 773386),
 ('love', 772790),
 ('onli', 714689),
 ('hi', 665482),
 ('realli', 639812)]

In [38]:
# How many UNIQUE words
len(fdist)

810038

In [42]:
(fullset.Token.apply(len)).describe(percentiles=[.95, .99]).round(0)

count    3999998.0
mean          44.0
std           24.0
min            1.0
50%           39.0
95%           90.0
99%          102.0
max          212.0
Name: Token, dtype: float64

In [53]:
fq25, fq50, fq75, fq95, fq99 = 0, 0, 0, 0, 0
count = 0
for i in fdist_df[0]:
    fq25 += fdist.freq(i)
    fq50 += fdist.freq(i)
    fq75 += fdist.freq(i)
    fq95 += fdist.freq(i)
    fq99 += fdist.freq(i)
    count += 1
    if fq25 > 0.25:
        print('The most frequent',count, 'words have 25% portion.')
        fq25 -= 1
    if fq50 > 0.50:
        print('The most frequent',count, 'words have 50% portion.')
        fq50 -= 1
    if fq75 > 0.75:
        print('The most frequent',count, 'words have 75% portion.')
        fq75 -= 1
    if fq95 > 0.95:
        print('The most frequent',count, 'words have 95% portion.')
        fq95 -= 1
    if fq99 > 0.99:
        print('The most frequent',count, 'words have 99% portion.')
        fq99 -= 1
        

The most frequent 55 words have 25% portion.
The most frequent 294 words have 50% portion.
The most frequent 1263 words have 75% portion.
The most frequent 10323 words have 95% portion.
The most frequent 68782 words have 99% portion.


In [14]:
with open('fullset_resplit_rmonce_nostopword.pickle', 'wb') as f:
    pickle.dump(fullset, f, -1)