In [23]:
import pandas as pd
from keras.datasets import imdb
from keras import preprocessing
import keras
from keras.models import Sequential
from keras.layers import Flatten, Dense, Embedding, Dropout, Conv1D, MaxPooling1D,LSTM
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np


In [2]:
max_features = 1000
max_length = 50
(x_train,y_train),(x_test, y_test) = imdb.load_data(num_words = max_features)


In [3]:
x_train= preprocessing.sequence.pad_sequences(x_train, maxlen = max_length)
x_test= preprocessing.sequence.pad_sequences(x_test, maxlen = max_length)

In [4]:
num_classes=2
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [5]:
transcripts = pd.read_csv("C:/University of Chicago/Project/MOUD/TranslatedTransripts/AllText.csv")
transcripts = transcripts[transcripts.Annotation !=0]
transcripts.head(10)
labels = transcripts["Annotation"].tolist()
texts = transcripts["TranslatedText"].tolist()
labels = [1 if x==1 else 0 for x in labels] #converting to format used in the Chollet 


In [6]:
maxlen=50
training_samples=250
validation_samples = 200
max_words = 10000

In [7]:
tokenizer= Tokenizer(num_words= max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print("found {} unique tokens".format(len(word_index)))
labels = np.asarray(labels)

found 1241 unique tokens


In [8]:
data = pad_sequences(sequences, maxlen=maxlen)

In [9]:
print("shape of data tensor:",data.shape)
print("shape of labels tensor",labels.shape)

shape of data tensor: (450, 50)
shape of labels tensor (450,)


In [10]:
indices = np.arange(data.shape[0])

In [11]:
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]

In [12]:
x_train2 = data[:training_samples]
y_train2 = labels[:training_samples]
x_test2= data[training_samples:training_samples+validation_samples]
y_test2 = labels[training_samples:training_samples+validation_samples]
# convert class vectors to binary class matrices
y_train2 = keras.utils.to_categorical(y_train2, num_classes)
y_test2 = keras.utils.to_categorical(y_test2, num_classes)

In [13]:
num_classes=2
labels = keras.utils.to_categorical(labels, num_classes)


In [15]:
#parsing the glove word embeddings file to use pre-trained glove embeddings
import os
glove_dir = "C:/University of Chicago/Project/glove.6B/glove.6B.50d.txt"
embeddings_index={}
f=open(glove_dir,encoding="utf8")
for line in f:
    values = line.split()
    word=values[0]
    coefs = np.asarray(values[1:], dtype="float32")
    embeddings_index[word]=coefs
f.close()

In [16]:
#Preparing the Glove word embeddings matrix
embedding_dim = 50
embedding_vector = ()
embedding_matrix = np.zeros((max_words, embedding_dim))
for word,i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

In [26]:
#model definition
model = Sequential()
model.add(Embedding(max_words,embedding_dim,input_length=maxlen))
model.add(Dropout(0.2))
model.add(Conv1D(64, 5, activation='relu'))
model.add(MaxPooling1D(pool_size=4))
model.add(LSTM(32,return_sequences=True))
model.add(Flatten())
model.add(Dense(32,activation='relu'))
model.add(Dense(2,activation='softmax'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy',metrics=['acc'])
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_6 (Embedding)      (None, 50, 50)            500000    
_________________________________________________________________
dropout_5 (Dropout)          (None, 50, 50)            0         
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 46, 64)            16064     
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 11, 64)            0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 11, 32)            12416     
_________________________________________________________________
flatten_2 (Flatten)          (None, 352)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 32)                11296     
__________

In [27]:
history = model.fit(x_train, y_train, epochs=10, batch_size=32,validation_split=0.2)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [28]:
model.evaluate(data,labels)



[1.0756058547231886, 0.5044444444444445]