In [12]:
import pandas as pd
from keras.datasets import imdb
from keras import preprocessing
import keras
# Vocabulary cap handed to imdb.load_data as `num_words`.
max_features = 1000
# Fixed sequence length, used below as `maxlen` for padding and as the
# Embedding layer's `input_length`.
max_length = 50

# load_data returns a (train, test) pair, each an (inputs, labels) pair.
imdb_train, imdb_test = imdb.load_data(num_words=max_features)
x_train, y_train = imdb_train
x_test, y_test = imdb_test


In [13]:
# Display the raw IMDB training labels as returned by imdb.load_data.
y_train

array([1, 0, 0, ..., 0, 1, 0], dtype=int64)

In [14]:
# Bring every IMDB training review to a fixed length of `max_length` ids,
# then display the resulting array.
x_train = preprocessing.sequence.pad_sequences(
    sequences=x_train,
    maxlen=max_length,
)
x_train

array([[  2,  56,  26, ...,  19, 178,  32],
       [  2,   5,   2, ...,  16, 145,  95],
       [215,  28, 610, ...,   7, 129, 113],
       ...,
       [  4,  65, 496, ...,   4,   2,   2],
       [ 13,  18,  31, ...,  12,   9,  23],
       [  2,   8,   2, ..., 204, 131,   9]])

In [15]:
# Same fixed-length padding (maxlen = max_length) for the IMDB test reviews.
x_test= preprocessing.sequence.pad_sequences(x_test, maxlen = max_length)


In [16]:
# Number of target classes; the label vectors become matrices with this many columns.
num_classes = 2
# One-hot encode the IMDB train and test labels in one pass.
y_train, y_test = (
    keras.utils.to_categorical(split_labels, num_classes)
    for split_labels in (y_train, y_test)
)

In [17]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding

# Baseline classifier: embedding -> flatten -> 2-unit softmax.
# NOTE(review): the embedding is sized with the literal 10000 rather than
# max_features (1000); later cells also feed this model ids produced with
# max_words = 10000 — confirm the literal is intentional before changing it.
model = Sequential([
    Embedding(10000, 8, input_length=max_length),
    Flatten(),
    Dense(2, activation='softmax'),
])
model.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 50, 8)             80000     
_________________________________________________________________
flatten_2 (Flatten)          (None, 400)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 2)                 802       
Total params: 80,802
Trainable params: 80,802
Non-trainable params: 0
_________________________________________________________________


**Training and testing a model on IMDB review data**

In [18]:
# Fit the baseline on the padded IMDB training data; validation_split=0.2
# sets aside a fifth of it for validation (20000 / 5000 in the recorded log).
history = model.fit(x_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [19]:
# Score the baseline on the IMDB test split; the model was compiled with one
# loss and the 'acc' metric, so two numbers are reported.
model.evaluate(x_test, y_test)



[0.4756104110717773, 0.77988]

**Tokenizing our transcripts and building a basic model**

In [20]:
# Load the transcript table (the columns used later are TranslatedText and Annotation).
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that exact file exists; consider a configurable data directory.
transcripts = pd.read_csv("C:/University of Chicago/Project/MOUD/TranslatedTransripts/AllText.csv")

In [21]:
# Drop every row whose Annotation equals 0, then preview the first ten rows.
has_annotation = transcripts["Annotation"] != 0
transcripts = transcripts[has_annotation]
transcripts.head(10)

Unnamed: 0.1,Unnamed: 0,TranslatedText,Annotation
0,1,I had seen remarks that said it stings when yo...,-1
1,2,and the truth is that if I use it once and t a...,-1
2,3,and I said no it could be possible so much I w...,-1
3,4,This also pulls a little hair but do it as it ...,-1
4,5,but the same with the washings has stopped bei...,1
5,6,em they wash super easy they dry fast they dry...,1
6,7,and good with the washings no longer scrape so...,1
7,8,it is already bearable,1
8,9,well yes it stings a p,-1
9,10,And what I love about this movie is that it39s...,1


In [22]:
# Annotation column as a plain Python list (one label per remaining row).
labels = transcripts["Annotation"].tolist()

In [23]:
# TranslatedText column as a plain Python list, aligned row-for-row with `labels`.
texts = transcripts["TranslatedText"].tolist()

In [24]:
# Sanity check: there must be exactly one label per text.
len(labels)==len(texts)

True

In [25]:
# Binarise the annotations: a value of 1 stays 1, every other value becomes 0.
labels = [int(annotation == 1) for annotation in labels]

In [26]:
# tokenizing the text
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

In [27]:
# Settings for the transcript experiments.
maxlen=50  # sequence length for pad_sequences and for the Embedding layers' input_length
training_samples=250  # rows sliced off the shuffled data as x_train2 / y_train2
validation_samples = 200  # rows sliced off after those as x_test2 / y_test2
max_words = 10000  # Tokenizer num_words; also the row count of embedding_matrix and the Embedding layers


In [28]:
# Tokenizer capped at max_words via num_words.
tokenizer= Tokenizer(num_words= max_words)

In [29]:
# Fit the tokenizer on the transcript texts only.
tokenizer.fit_on_texts(texts)

In [30]:
# Encode each transcript text as a sequence using the fitted tokenizer.
sequences = tokenizer.texts_to_sequences(texts)

In [31]:
# The tokenizer's word -> index mapping; iterated as (word, index) pairs when
# the embedding matrix is filled in further down.
word_index = tokenizer.word_index

In [32]:
# Report how many entries the tokenizer's word index holds.
print("found {} unique tokens".format(len(word_index)))

found 1241 unique tokens


In [33]:
# Turn the label list into a NumPy array so it can later be reordered with an index array.
labels = np.asarray(labels)

In [77]:
# Fixed-length (maxlen) integer matrix built from the transcript sequences.
data = pad_sequences(sequences, maxlen=maxlen)

In [78]:
# NOTE(review): repeats the earlier `labels = np.asarray(labels)` cell; it has
# no further effect once `labels` is already an array and could be removed.
labels = np.asarray(labels)

In [79]:
# Print the shapes of the padded transcript matrix and of the label array.
# NOTE(review): the recorded output shows labels as (450, 2), i.e. already
# one-hot encoded, although the one-hot cell for `labels` appears further
# down — the saved output seems to come from out-of-order execution.
print("shape of data tensor:",data.shape)
print("shape of labels tensor",labels.shape)

shape of data tensor: (450, 50)
shape of labels tensor (450, 2)


In [80]:
# One index per transcript row; shuffled next and used to reorder data and labels together.
indices = np.arange(data.shape[0])

In [81]:
# Shuffle the row order in place with a dedicated, seeded generator so the
# train/held-out split derived from it is identical on every run.
# A local RandomState is used so NumPy's global random state is left untouched.
np.random.RandomState(42).shuffle(indices)

In [82]:
# Reorder inputs and labels with the same index array so rows stay paired.
data, labels = data[indices], labels[indices]

In [71]:
# Slice the transcript rows into a training part (first `training_samples`
# rows) and a second part (the next `validation_samples` rows).
holdout_end = training_samples + validation_samples
x_train2, y_train2 = data[:training_samples], labels[:training_samples]
x_test2, y_test2 = (
    data[training_samples:holdout_end],
    labels[training_samples:holdout_end],
)
# Convert both label slices to one-hot matrices with `num_classes` columns.
y_train2 = keras.utils.to_categorical(y_train2, num_classes)
y_test2 = keras.utils.to_categorical(y_test2, num_classes)

In [42]:
num_classes = 2
# One-hot encode the full transcript label vector.
# Guarded on the array rank so the cell is idempotent: re-running it (or
# running it after `labels` was already encoded) must not encode the one-hot
# matrix a second time.
if labels.ndim == 1:
    labels = keras.utils.to_categorical(labels, num_classes)


In [43]:
# Score the IMDB-trained baseline on the full transcript set.
# NOTE(review): `model` was fit on ids from imdb.load_data, while `data` holds
# ids from a Tokenizer fitted on the transcripts — presumably two different
# word->id vocabularies, so the same id need not mean the same word; confirm.
model.evaluate(data, labels)



[1.051520997153388, 0.5622222222222222]

**Parsing the GloVe word-embeddings file to use pre-trained GloVe embeddings**

In [44]:
import os
# Path of the GloVe vectors text file (despite the variable name, this is a
# file path, not a directory).
glove_dir = "C:/University of Chicago/Project/glove.6B/glove.6B.50d.txt"

# word -> float32 coefficient vector, one entry per non-empty line of the file.
embeddings_index = {}
# `with` closes the file even when a line fails to parse.
with open(glove_dir, encoding="utf8") as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline): nothing to parse.
            continue
        # First field is the word, the remaining fields are its coefficients.
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs


**Preparing the GloVe word-embedding matrix**

In [45]:
# Width of each embedding row; must equal the length of the parsed vectors.
embedding_dim = 50
embedding_vector = ()
# Row i holds the vector of the word whose tokenizer index is i; rows without
# a parsed vector stay all-zero.
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in word_index.items():
    if i >= max_words:
        # Index falls outside the matrix.
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # Word missing from the parsed embeddings: keep the zero row.
        continue
    embedding_matrix[i] = embedding_vector

In [46]:
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding


**Training the model on imdb data with pretrained glove embeddings and Dense layers**

In [48]:
# Model definition: embedding -> flatten -> Dense(32, relu) -> 2-unit softmax.
# The embedding is sized (max_words, embedding_dim) so that embedding_matrix
# can be loaded into it as weights.
model1 = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])
model1.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model1.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 50, 50)            500000    
_________________________________________________________________
flatten_3 (Flatten)          (None, 2500)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 32)                80032     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 66        
Total params: 580,098
Trainable params: 580,098
Non-trainable params: 0
_________________________________________________________________


In [49]:
# Load embedding_matrix into the first layer (the Embedding) and mark it
# non-trainable so fitting leaves those weights unchanged.
glove_layer = model1.layers[0]
glove_layer.set_weights([embedding_matrix])
glove_layer.trainable = False

In [50]:
# Compile again: the embedding layer's `trainable` flag was changed after the
# first compile, and the change only takes effect on compile.
model1.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])

# Fit on the transcript training split, not on the IMDB arrays.
# The frozen embedding rows are indexed by the transcript tokenizer's
# word_index, so only ids produced by that tokenizer select the right vectors;
# the IMDB ids in x_train come from a different vocabulary. This also matches
# the recorded log for this cell (200 training / 50 validation samples = the
# 250-row x_train2 with validation_split=0.2).
# NOTE(review): the section heading still mentions IMDB data — confirm the
# heading rather than the data if IMDB training was really intended.
history = model1.fit(x_train2, y_train2, epochs=10, batch_size=32, validation_split=0.2)

# Persist the trained weights so they can be reloaded later.
model1.save_weights('pre_trained_glove_model.h5')

Train on 200 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [51]:
# Reload the weights saved after fitting, then score the model.
# NOTE(review): x_train2 / y_train2 is the slice named as the training split;
# x_test2 / y_test2 is the separate slice of the same data — confirm which one
# this metric is meant to report.
model1.load_weights('pre_trained_glove_model.h5')
model1.evaluate(x_train2,y_train2)



[0.7905377435684204, 0.675]

**Training the same Dense-layer model without pretrained embeddings**


In [54]:
# Same layer stack as model1, but the embedding weights are left at their
# initial values and remain trainable (no embedding_matrix is loaded).
model2 = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Flatten(),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])
model2.compile(
    optimizer='rmsprop',
    loss='binary_crossentropy',
    metrics=['acc'],
)
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 50, 50)            500000    
_________________________________________________________________
flatten_5 (Flatten)          (None, 2500)              0         
_________________________________________________________________
dense_7 (Dense)              (None, 32)                80032     
_________________________________________________________________
dense_8 (Dense)              (None, 2)                 66        
Total params: 580,098
Trainable params: 580,098
Non-trainable params: 0
_________________________________________________________________


In [55]:
# Compile (same settings as in the definition cell, so this repeats it), fit,
# and save the weights.
# NOTE(review): the recorded log for this cell reports 200 training / 50
# validation samples, whereas fitting on x_train elsewhere reports
# 20000 / 5000 — the saved output seems to come from a different input than
# the one written here; confirm which arrays are intended.
model2.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
history = model2.fit(x_train, y_train, epochs=10, batch_size=32,validation_split=0.2)
model2.save_weights('densenonpretrained')


Train on 200 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [129]:
# Score model2 on the full transcript set (`data` / one-hot `labels`).
# NOTE(review): `data` also contains the rows sliced off as x_train2, so if the
# model was fit on that slice this metric includes training samples — confirm.
model2.evaluate(data,labels)



[1.0038612450493707, 0.5355555555555556]

**Training a model with IMDB data and RNN layers**

In [58]:
from keras.layers import SimpleRNN

In [116]:
# Recurrent variant: embedding -> two SimpleRNN(32) layers that both return
# the full sequence -> flatten -> Dense(32, relu) -> 2-unit softmax.
# Only defined and summarised here; compiling happens in a later cell.
model3 = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    SimpleRNN(32, return_sequences=True),
    SimpleRNN(32, return_sequences=True),
    Flatten(),
    Dense(32, activation="relu"),
    Dense(2, activation="softmax"),
])
model3.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 50, 50)            500000    
_________________________________________________________________
simple_rnn_13 (SimpleRNN)    (None, 50, 32)            2656      
_________________________________________________________________
simple_rnn_14 (SimpleRNN)    (None, 50, 32)            2080      
_________________________________________________________________
flatten_14 (Flatten)         (None, 1600)              0         
_________________________________________________________________
dense_25 (Dense)             (None, 32)                51232     
_________________________________________________________________
dense_26 (Dense)             (None, 2)                 66        
Total params: 556,034
Trainable params: 556,034
Non-trainable params: 0
_________________________________________________________________


In [117]:
# Display the sequence length used as the Embedding layers' input_length.
maxlen

50

In [118]:
# Display the embedding width used by the Embedding layers.
embedding_dim

50

In [119]:
# Compile model3 with the same optimizer, loss and metric as the other models.
model3.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])

In [120]:
# Fit model3 on the padded IMDB training data, with a fifth held out for validation.
# Note that `history` is reassigned here, replacing the earlier models' history.
history = model3.fit(x_train, y_train, epochs=10, batch_size=32,validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [101]:
#model3.save_weights('simpleRNN')

In [123]:
# Score the IMDB-trained RNN on the full transcript set.
# NOTE(review): the training ids come from imdb.load_data and the ids in
# `data` from the transcript Tokenizer — presumably different vocabularies;
# confirm the ids are comparable before interpreting this number.
model3.evaluate(data,labels)



[0.6929802269405789, 0.5511111111111111]

**Training the model with IMDB data, GloVe embeddings and simple LSTM layers**

In [134]:
##training the model with embedding and simple LSTM layers
from keras.layers import LSTM 
from keras.layers import Dropout

In [136]:
# LSTM variant: embedding -> three LSTM(32) layers that all return the full
# sequence, with dropout after the first two -> flatten -> Dense(32, relu)
# -> dropout -> 2-unit softmax.
# Only defined and summarised here; compiling happens in a later cell.
model4 = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    LSTM(32, return_sequences=True),
    Dropout(0.5),
    LSTM(32, return_sequences=True),
    Dropout(0.25),
    LSTM(32, return_sequences=True),
    Flatten(),
    Dense(32, activation="relu"),
    Dropout(0.25),
    Dense(2, activation="softmax"),
])
model4.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_18 (Embedding)     (None, 50, 50)            500000    
_________________________________________________________________
lstm_10 (LSTM)               (None, 50, 32)            10624     
_________________________________________________________________
dropout_3 (Dropout)          (None, 50, 32)            0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 50, 32)            8320      
_________________________________________________________________
dropout_4 (Dropout)          (None, 50, 32)            0         
_________________________________________________________________
lstm_12 (LSTM)               (None, 50, 32)            8320      
_________________________________________________________________
flatten_17 (Flatten)         (None, 1600)              0         
__________

In [137]:
# Load embedding_matrix into the first layer (the Embedding) and mark it non-trainable.
# NOTE(review): embedding_matrix rows are indexed by the transcript
# Tokenizer's word_index, while this model is fit on x_train from
# imdb.load_data — presumably a different word->id vocabulary, in which case
# the frozen rows do not correspond to the words being fed in. Confirm, and if
# so build the matrix from the IMDB word index instead.
model4.layers[0].set_weights([embedding_matrix])
model4.layers[0].trainable = False

In [138]:
# First compile of model4; it runs after the embedding layer's `trainable`
# flag was set to False in the preceding cell.
model4.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])

In [139]:
# Fit model4 on the padded IMDB training data, with a fifth held out for validation.
history = model4.fit(x_train, y_train, epochs=10, batch_size=32,validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [140]:
# Persist the trained LSTM weights under the name 'LSTM' (relative to the working directory).
model4.save_weights('LSTM')

In [143]:
# Score the IMDB-trained LSTM on the full transcript set (`data` / one-hot `labels`).
model4.evaluate(data,labels)



[0.9248037634955513, 0.47555555555555556]