# Deep Learning - Recurrent Neural Network for IMDB Movie Review Sentiment

Import Libraries and Set Up Data

In [81]:
# Import Libraries
import swat
import pandas as pd
from sklearn.model_selection import train_test_split

In [82]:
# Open the CAS session that every later cell talks to.
# NOTE(review): host, portnum, username and password are not assigned in any
#  cell shown in this notebook -- confirm they are defined (ideally read from
#  the environment or an authinfo file, not hardcoded) before a fresh-kernel run.
conn = swat.CAS(hostname=host,
                port=portnum,
                username=username,
                password=password)

In [83]:
# Load the CAS action sets used by the cells below
for actionset_name in ('deepLearn', 'table'):
    conn.loadactionset(actionset_name)

NOTE: Added action set 'deepLearn'.
NOTE: Added action set 'table'.


In [84]:
# Read the movie-review CSV into a pandas DataFrame and show the first rows
imdb_csv_path = '/data/andre_data/SAS_Demo_Deep_Learning_IMDB_Sentiment/movieSent.csv'
imdb_df = pd.read_csv(imdb_csv_path)
imdb_df.head(5)

Unnamed: 0,class,text
0,Pos,films adapted from comic books have had plent...
1,Pos,every now and then a movie comes along from a...
2,Pos,you ve got mail works alot better than it des...
3,Pos,jaws is a rare film that grabs your atte...
4,Pos,moviemaking is a lot like being the general m...


In [85]:
# Create Binary Variable for Polarity, Shuffle the Dataset Randomly, and Preview
# sentBinary is 1 where class == 'Pos' and 0 for every other value.
imdb_df['sentBinary'] = (imdb_df['class'] == 'Pos').astype('int64')
# frac=1 returns every row in random order. random_state pins that order so a
#  Restart & Run All reproduces the same shuffle; 52318 is the seed value that
#  the dlTrain call further down already uses.
imdb_shuffled = imdb_df.sample(frac=1, random_state=52318)
imdb_shuffled.head(n=5)

Unnamed: 0,class,text,sentBinary
251,Pos,the film magnolia can be compared to a si...,1
8,Pos,after bloody clashes and independence won l...,1
246,Pos,the tailor of panama is a different kind...,1
313,Pos,matthew broderick and high school comedy t...,1
1376,Neg,robin williams has the rarest of gifts the ...,0


In [86]:
# Split Data into Train and Test
# 80/20 hold-out split. random_state fixes which row positions land in each
#  split, so for a given input ordering the class counts shown in the next
#  cells are repeatable; 52318 is the seed value the dlTrain call also uses.
imdb_train, imdb_test = train_test_split(imdb_shuffled,
                                         test_size=0.2,
                                         random_state=52318)

In [87]:
# Class balance check, training split: row count per 'class' label
pd.crosstab(imdb_train['class'], columns='count')

col_0,count
class,Unnamed: 1_level_1
Neg,804
Pos,796


In [88]:
# Class balance check, test split: row count per 'class' label
pd.crosstab(imdb_test['class'], columns='count')

col_0,count
class,Unnamed: 1_level_1
Neg,196
Pos,204


In [89]:
# Push the two pandas splits to CAS as in-memory tables.
# Only the review text and the binary label are uploaded.
model_columns = ['text', 'sentBinary']
imdb_train_cas = conn.upload_frame(imdb_train[model_columns],
                                   casout={'name': 'Python_tr', 'replace': True})
imdb_test_cas = conn.upload_frame(imdb_test[model_columns],
                                  casout={'name': 'Python_te', 'replace': True})
imdb_train_cas.head(5)

NOTE: Cloud Analytic Services made the uploaded file available as table PYTHON_TR in caslib CASUSER(sas).
NOTE: The table PYTHON_TR has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.
NOTE: Cloud Analytic Services made the uploaded file available as table PYTHON_TE in caslib CASUSER(sas).
NOTE: The table PYTHON_TE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.


Unnamed: 0,text,sentBinary
0,psycho meets the exorcist with no holds ba...,1.0
1,what if one of our cities became the target f...,1.0
2,i swear i have seen the edge before in fac...,1.0
3,usually a movie is about something more than ...,1.0
4,i have never been a star trek fan and ...,0.0


## Deep Learning Modeling

In [90]:
# Build Recurrent Neural Network
# Topology: Input -> (LSTM_1, LSTM_2) -> (LSTM_3, LSTM_4) -> LSTM_5 -> Output.
# Layers are added in the same order as listed here.
model_name = 'IMDB_Setiment_Model'

conn.buildmodel(model=dict(name=model_name, replace=True), type='RNN')
conn.addLayer(model=model_name, name='Input_Layer', layer=dict(type='input'))

# Settings shared by every recurrent layer in the stack
lstm_common = dict(type='RECURRENT', rnnType='LSTM', n=32, init='XAVIER')

# (layer name, source layer(s), layer-specific settings)
# NOTE(review): LSTM_2 and LSTM_4 originally carried a trailing "# False" marker
#  next to reverse=True. As written, all four 'samelength' layers use
#  reverse=True -- confirm whether a forward/backward pairing was intended.
lstm_layer_specs = [
    ('LSTM_1', 'Input_Layer',        dict(outputType='samelength', reverse=True)),
    ('LSTM_2', 'Input_Layer',        dict(outputType='samelength', reverse=True)),
    ('LSTM_3', ['LSTM_1', 'LSTM_2'], dict(outputType='samelength', reverse=True)),
    ('LSTM_4', ['LSTM_1', 'LSTM_2'], dict(outputType='samelength', reverse=True)),
    ('LSTM_5', ['LSTM_3', 'LSTM_4'], dict(outputType='encoding')),
]
for layer_name, source_layers, layer_specifics in lstm_layer_specs:
    conn.addLayer(model=model_name, name=layer_name, srclayers=source_layers,
                  layer=dict(lstm_common, **layer_specifics))

# Kept as the last expression so its result table is what the cell displays
conn.addLayer(model=model_name, name='Output_Layer', srclayers='LSTM_5',
              layer=dict(type='output'))

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(sas),imdb_setiment_model,90,5,"CASTable('imdb_setiment_model', caslib='CASUSE..."


In [91]:
# Upload the pre-trained GloVe word vectors to the CAS table 'glove'
#
# From the GloVe project description:
# "GloVe is an unsupervised learning algorithm for obtaining vector representations for words.
#   Training is performed on aggregated global word-word co-occurrence statistics from a corpus,
#   and the resulting representations showcase interesting linear substructures of the word
#   vector space."

glove_path = r'/data/andre_data/SAS_Demo_Deep_Learning_IMDB_Sentiment/glove_100d_tab_clean.txt'

# The file is read as tab-delimited text; the trailing ';' hides the result repr
conn.upload(glove_path,
            casout={'name': 'glove', 'replace': True},
            importoptions={'fileType': 'delimited', 'delimiter': '\t'});

NOTE: Cloud Analytic Services made the uploaded file available as table GLOVE in caslib CASUSER(sas).
NOTE: The table GLOVE has been created in caslib CASUSER(sas) from binary data uploaded to Cloud Analytic Services.


In [92]:
# View CAS Tables Available in Session

# Notice the GloVe table has almost 400k rows by 101 columns.
#  Each row is one vocabulary entry. Presumably the 101 columns are the word
#  itself plus a 100-dimensional vector (the file is named glove_100d) -- TODO confirm.
#  Per the GloVe description quoted in the previous cell, these are learned vector
#  representations trained on word-word co-occurrence statistics, not raw counts.
#  The original note says the vocabulary is the most frequent words from Wikipedia
#  and web-crawl corpora and that rows are normalized -- not verifiable from this
#  notebook; confirm against the source of the embeddings file.

conn.tableinfo()

Unnamed: 0,Name,Rows,Columns,IndexedColumns,Encoding,CreateTimeFormatted,ModTimeFormatted,AccessTimeFormatted,JavaCharSet,CreateTime,ModTime,AccessTime,Global,Repeated,View,SourceName,SourceCaslib,Compressed,Creator,Modifier
0,PYTHON_TR,1600,2,0,utf-8,24May2018:18:37:15,24May2018:18:37:15,24May2018:18:37:16,UTF8,1842806000.0,1842806000.0,1842806000.0,0,0,0,,,0,sas,
1,PYTHON_TE,400,2,0,utf-8,24May2018:18:37:16,24May2018:18:37:16,24May2018:18:37:16,UTF8,1842806000.0,1842806000.0,1842806000.0,0,0,0,,,0,sas,
2,IMDB_SETIMENT_MODEL,90,5,0,utf-8,24May2018:18:37:24,24May2018:18:37:24,24May2018:18:37:24,UTF8,1842806000.0,1842806000.0,1842806000.0,0,0,0,,,0,sas,
3,GLOVE,399857,101,0,utf-8,24May2018:18:37:31,24May2018:18:37:31,24May2018:18:37:31,UTF8,1842806000.0,1842806000.0,1842806000.0,0,0,0,,,0,sas,


In [93]:
# Fit the Built Model on the Training Data. Show both Training and Test Error
# The test table is passed as validtable, so the ValidLoss / ValidError columns
#  in the iteration history are measured on the test split.

# Text handling: seed the embedding layer from the uploaded 'glove' table, feed
#  raw text rather than term ids, and keep the embeddings frozen during training.
text_settings = dict(initEmbeddings='glove',
                     hasInputTermIds=False,
                     embeddingTrainable=False)

##### LEARNING RATE POLICY #####
# lrPolicy='step': the learning rate is multiplied by gamma once per group of
#  stepSize epochs. With the values below it starts at 0.001 and is halved
#  every 20 epochs.
# gamma is that multiplicative decay factor for the learning rate (it is not a
#  reinforcement-learning style reward discount).

##### ALGOS #####
# Methods listed in the original notes:
#  'vanilla', 'vanillaSGD', 'momentum', 'momentumSGD', 'adam', 'adamSGD', 'lbfgs'

# method='ADAM': adapts a separate learning rate for each network weight as
#  training unfolds. Plain SGD, by contrast, applies one learning rate to all
#  weight updates.
# Adam keeps exponential moving averages of the gradient (first moment, the
#  mean) and of the squared gradient (second moment, the uncentered variance).
#  beta1 and beta2 control the decay rates of those two averages; the Adam
#  authors propose defaults of beta1 = 0.9 and beta2 = 0.999.
adam_settings = dict(method='ADAM',
                     learningRate=0.001,
                     stepSize=20,
                     gamma=0.5,
                     lrPolicy='step')

optimizer_settings = dict(miniBatchSize=2,
                          maxEpochs=50,
                          algorithm=adam_settings)

# Kept as the last expression so the model summary and iteration history display
conn.dlTrain(table=imdb_train_cas,
             model='IMDB_Setiment_Model',
             validtable=imdb_test_cas,
             modelWeights=dict(name='IMDB_Setiment_Model_Weights',
                               replace=True),
             textParms=text_settings,
             target='sentBinary',
             inputs=['text'],
             texts=['text'],
             nominals=['sentBinary'],
             optimizer=optimizer_settings,
             seed=52318)


Unnamed: 0,Descr,Value
0,Model Name,imdb_setiment_model
1,Model Type,Recurrent Neural Network
2,Number of Layers,7
3,Number of Input Layers,1
4,Number of Output Layers,1
5,Number of Convolutional Layers,0
6,Number of Pooling Layers,0
7,Number of Fully Connected Layers,0
8,Number of Recurrent Layers,5
9,Number of Weight Parameters,71200

Unnamed: 0,Epoch,LearningRate,Loss,FitError,ValidLoss,ValidError
0,0.0,0.001,0.714327,0.521875,0.69876,0.51
1,1.0,0.001,0.695112,0.490625,0.701764,0.51
2,2.0,0.001,0.681221,0.43375,0.72221,0.4775
3,3.0,0.001,0.676756,0.40375,0.662254,0.39
4,4.0,0.001,0.647985,0.360625,0.688002,0.4125
5,5.0,0.001,0.645585,0.35875,0.658938,0.37
6,6.0,0.001,0.633878,0.349375,0.66745,0.4
7,7.0,0.001,0.628729,0.345625,0.660535,0.3575
8,8.0,0.001,0.617147,0.330625,0.654112,0.3925
9,9.0,0.001,0.60652,0.315625,0.656461,0.3775

Unnamed: 0,casLib,Name,Rows,Columns,casTable
0,CASUSER(sas),IMDB_Setiment_Model_Weights,71842,3,"CASTable('IMDB_Setiment_Model_Weights', caslib..."


View Test Misclassification Error %

In [94]:
# Score the test table with the trained weights.
# The same table was used as validtable during training, so this is a condensed
#  view of that result rather than a fresh hold-out estimate.
conn.dlScore(
    table=imdb_test_cas,
    model='IMDB_Setiment_Model',
    initWeights='IMDB_Setiment_Model_Weights',
    copyVars=['text', 'sentBinary'],
    textParms={'initInputEmbeddings': 'glove'},
)

Unnamed: 0,Descr,Value
0,Number of Observations Read,400.0
1,Number of Observations Used,400.0
2,Misclassification Error (%),23.0
3,Loss Error,0.810873
