In [16]:
import pandas as pd
from keras.datasets import imdb
from keras import preprocessing
import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Dropout, Conv1D, MaxPooling1D,LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np


**importing imdb dataset**

In [17]:
# Vocabulary cap (passed to imdb.load_data as num_words) and the sequence
# length used when the reviews are padded in the next cell.
max_features, max_length = 1000, 50
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)


In [18]:
# Run both IMDB splits through pad_sequences with the same maxlen so they
# share one sequence length.
x_train, x_test = (
    preprocessing.sequence.pad_sequences(split, maxlen=max_length)
    for split in (x_train, x_test)
)

In [19]:
# Two sentiment classes; labels become one-hot rows with `num_classes` columns.
num_classes = 2
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes)

**importing and tokenising MOUD dataset**

In [20]:
# Load the translated MOUD transcripts and discard every row whose
# annotation is 0; only the remaining rows are used below.
transcripts = pd.read_csv("C:/University of Chicago/Project/MOUD/TranslatedTransripts/AllText.csv")
transcripts = transcripts[transcripts.Annotation != 0]
texts = transcripts["TranslatedText"].tolist()
# Binary targets: annotation 1 -> 1, every other remaining annotation -> 0
# (presumably the 0/1 label format used in Chollet's examples - confirm).
labels = [1 if x == 1 else 0 for x in transcripts["Annotation"].tolist()]


In [21]:
# Settings for the MOUD experiment.
max_words = 10000         # vocabulary cap handed to the Tokenizer
maxlen = 50               # sequence length handed to pad_sequences
training_samples = 250    # leading rows (after shuffling) used for training
validation_samples = 200  # rows taken right after the training rows for the test split

In [22]:
# Fit a tokenizer on the MOUD texts, convert every text to a list of token
# ids, and turn the Python label list into a NumPy array for fancy indexing.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(texts)
labels = np.asarray(labels)
print("found {} unique tokens".format(len(word_index)))

found 1241 unique tokens


In [23]:
# Turn the token-id lists into a single 2-D array with `maxlen` columns.
data = pad_sequences(sequences, maxlen=maxlen)

In [24]:
# Sanity check: both arrays must have the same number of rows.
print("shape of data tensor:",data.shape)
print("shape of labels tensor",labels.shape)

shape of data tensor: (450, 50)
shape of labels tensor (450,)


In [25]:
# Row positions 0..n-1 of the MOUD matrix; shuffled in the next cell.
indices = np.arange(len(data))

In [26]:
# Shuffle rows and labels with the same permutation so every text keeps its
# label. A dedicated, seeded generator makes the train/test split (and hence
# the reported accuracy) reproducible across kernel restarts without
# touching NumPy's global random state.
np.random.RandomState(42).shuffle(indices)
data = data[indices]
labels = labels[indices]

In [27]:
# The first `training_samples` shuffled rows train the model; the following
# `validation_samples` rows form the test split.
x_train2, y_train2 = data[:training_samples], labels[:training_samples]
x_test2, y_test2 = (
    data[training_samples:training_samples + validation_samples],
    labels[training_samples:training_samples + validation_samples],
)
# convert class vectors to binary class matrices
y_train2, y_test2 = (
    keras.utils.to_categorical(y_train2, num_classes),
    keras.utils.to_categorical(y_test2, num_classes),
)

In [28]:
# One-hot encode the complete MOUD label vector.
# NOTE(review): `labels` is overwritten in place, so running this cell twice
# feeds an already one-hot matrix back into to_categorical.
num_classes = 2
labels = keras.utils.to_categorical(labels, num_classes=num_classes)


**Importing and tokenising the product reviews dataset (Clothing, Shoes and Jewelry)**

In [29]:

import gzip

def parse(path):
  """Yield one parsed record per line of a gzip-compressed text file.

  Every line must contain a single Python literal (presumably a dict per
  review - confirm against the data file).  Lines are parsed with
  ``ast.literal_eval`` rather than ``eval`` so that the file contents can
  never execute code, and the file handle is closed as soon as the
  generator is exhausted or discarded.

  Raises ``ValueError`` or ``SyntaxError`` for a line that is not a literal.
  """
  import ast  # local import keeps this helper self-contained in its cell

  # Text mode with UTF-8 matches how eval() decoded the raw bytes before.
  with gzip.open(path, 'rt', encoding='utf-8') as g:
    for l in g:
      yield ast.literal_eval(l)

def getDF(path):
  """Collect every record yielded by ``parse(path)`` into a DataFrame.

  Records are keyed by their position in the file (0, 1, 2, ...), and that
  position becomes the row index of the returned frame.
  """
  records = dict(enumerate(parse(path)))
  return pd.DataFrame.from_dict(records, orient='index')

# Read the whole Clothing, Shoes and Jewelry review dump into one DataFrame
# (one row per parsed record).
df = getDF('C:/University of Chicago/Project/reviews_Clothing_Shoes_and_Jewelry_5.json.gz')

In [40]:
# Binary sentiment target: ratings of 3.0 and above count as positive (1),
# anything lower as negative (0).
sentiment = [int(rating >= 3.0) for rating in df.overall]


In [35]:
# Raw review text column; the product tokenizer below is fitted on it.
transcript = df['reviewText']

In [51]:
# Re-declared for the product-review pipeline (same values as used earlier):
# sequence length, vocabulary cap and number of target classes.
maxlen, max_words, num_classes = 50, 10000, 2

In [52]:
# Fit a second tokenizer on the product review texts, convert each review to
# a list of token ids, and put the sentiment targets into a NumPy array.
tokenizer2 = Tokenizer(num_words=max_words)
tokenizer2.fit_on_texts(transcript)
word_index2 = tokenizer2.word_index
sequences2 = tokenizer2.texts_to_sequences(transcript)
labels2 = np.asarray(sentiment)
print("found {} unique tokens".format(len(word_index2)))

found 84924 unique tokens


In [53]:
# Turn the product token-id lists into a single 2-D array with `maxlen` columns.
data2 = pad_sequences(sequences2, maxlen=maxlen)

In [54]:
# Sanity check: features and labels must have the same number of rows.
print("shape of data tensor:",data2.shape)
print("shape of labels tensor",labels2.shape)

shape of data tensor: (278677, 50)
shape of labels tensor (278677,)


In [55]:
# Row positions 0..n-1 of the product matrix; shuffled in the next cell.
# Note that this reuses (and overwrites) the `indices` name from the MOUD section.
indices = np.arange(len(data2))

In [57]:
# Shuffle reviews and labels with the same permutation so every review keeps
# its label. A dedicated, seeded generator makes the 70/30 split (and hence
# the reported accuracy) reproducible across kernel restarts without
# touching NumPy's global random state.
np.random.RandomState(42).shuffle(indices)
data2 = data2[indices]
labels2 = labels2[indices]

In [58]:
# 70 % of the shuffled reviews are used for training; the remaining rows
# form the test split.
training_samples2 = int(0.7 * data2.shape[0])
validation_samples2 = data2.shape[0] - training_samples2

In [59]:
# The first `training_samples2` shuffled rows train the model; the following
# `validation_samples2` rows form the test split.
x_train_product, y_train_product = data2[:training_samples2], labels2[:training_samples2]
x_test_product, y_test_product = (
    data2[training_samples2:training_samples2 + validation_samples2],
    labels2[training_samples2:training_samples2 + validation_samples2],
)
# convert class vectors to binary class matrices
y_train_product, y_test_product = (
    keras.utils.to_categorical(y_train_product, num_classes),
    keras.utils.to_categorical(y_test_product, num_classes),
)

In [60]:

# One-hot encode the complete product label vector.
# NOTE(review): `labels2` is overwritten in place, so running this cell twice
# feeds an already one-hot matrix back into to_categorical.
labels2 = keras.utils.to_categorical(labels2, num_classes=num_classes)


**parsing the glove word embeddings file to use pre-trained glove embeddings**

In [61]:
#parsing the glove word embeddings file to use pre-trained glove embeddings
import os
# NOTE: despite its name, `glove_dir` is the path of the vectors file itself.
glove_dir = "C:/University of Chicago/Project/glove.6B/glove.6B.50d.txt"
# Maps each word (first token of a line) to its vector (remaining tokens as float32).
embeddings_index = {}
# The context manager closes the file even if a line fails to parse.
with open(glove_dir, encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype="float32")
        embeddings_index[word] = coefs

In [62]:
#Preparing the Glove word embeddings matrix
# Row i holds the GloVe vector of the word whose tokenizer index is i; rows
# for words missing from GloVe (and row 0) stay all-zero.
# NOTE(review): this uses `word_index` (the MOUD tokenizer), and in the cells
# visible here the matrix is never passed to an Embedding layer - confirm intent.
embedding_dim = 50
embedding_matrix = np.zeros((max_words, embedding_dim))
embedding_vector = ()
for word, i in word_index.items():
    if i >= max_words:
        # index falls outside the vocabulary cap, no row for it
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        continue
    embedding_matrix[i] = embedding_vector

**CNN and LSTM model trained on product dataset**

In [63]:
#model definition
# Embedding -> Conv1D -> MaxPooling -> LSTM -> Dense classifier for the
# product reviews, ending in a 2-way softmax over the one-hot targets.
# NOTE(review): the Embedding layer is created without pre-trained weights,
# so the GloVe `embedding_matrix` prepared earlier is not used here - confirm.
# NOTE(review): a 2-unit softmax with one-hot targets is conventionally paired
# with categorical_crossentropy; confirm binary_crossentropy is intentional.
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(32, return_sequences=True),  # keep every timestep; Flatten collapses them next
    Flatten(),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])
model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 50)            500000    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 46, 64)            16064     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 11, 64)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 11, 32)            12416     
_________________________________________________________________
flatten_1 (Flatten)          (None, 352)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                11296     
__________

In [64]:
# Train on the product training split; validation_split=0.2 holds out 20 % of
# those rows for per-epoch validation (see the sample counts printed below).
history = model.fit(x_train_product, y_train_product, epochs=10, batch_size=32,validation_split=0.2)

Train on 156058 samples, validate on 39015 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [82]:
# Score the product model on the held-out test split; the result is
# [loss, accuracy] because the model was compiled with metrics=['acc'].
model.evaluate(x_test_product,y_test_product)



[0.2083440597051154, 0.925314578244141]

**The model trained on the product data reaches about 92.5% accuracy (loss ≈ 0.21) on the held-out test split.**

**Model using CNN, LSTM and MOUD data**

In [99]:
# Same Embedding -> Conv1D -> MaxPooling -> LSTM -> Dense architecture as the
# product model, built from scratch for the MOUD data.
# NOTE(review): the Embedding layer is created without pre-trained weights,
# so the GloVe `embedding_matrix` prepared earlier is not used here - confirm.
# NOTE(review): a 2-unit softmax with one-hot targets is conventionally paired
# with categorical_crossentropy; confirm binary_crossentropy is intentional.
model2 = Sequential([
    Embedding(max_words, embedding_dim, input_length=maxlen),
    Dropout(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(pool_size=4),
    LSTM(32, return_sequences=True),  # keep every timestep; Flatten collapses them next
    Flatten(),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax'),
])
model2.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['acc'])
model2.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 50, 50)            500000    
_________________________________________________________________
dropout_17 (Dropout)         (None, 50, 50)            0         
_________________________________________________________________
conv1d_26 (Conv1D)           (None, 46, 64)            16064     
_________________________________________________________________
max_pooling1d_24 (MaxPooling (None, 11, 64)            0         
_________________________________________________________________
lstm_14 (LSTM)               (None, 11, 32)            12416     
_________________________________________________________________
flatten_13 (Flatten)         (None, 352)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 32)                11296     
__________

In [100]:
# Train on the MOUD training split; validation_split=0.2 holds out 20 % of
# those rows for per-epoch validation (see the sample counts printed below).
# Note: this rebinds `history`, replacing the product model's training history.
history = model2.fit(x_train2,y_train2, epochs=10, batch_size=32 ,validation_split=0.2)

Train on 200 samples, validate on 50 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [101]:
# Score the MOUD model on its held-out test split; the result is
# [loss, accuracy] because the model was compiled with metrics=['acc'].
model2.evaluate(x_test2,y_test2)



[0.7131406927108764, 0.645]