In [71]:
import json
import tensorflow as tf
import csv
import random
import numpy as np

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import LSTM, Bidirectional, Dense, Embedding, Dropout, SpatialDropout1D, Conv1D, GlobalMaxPooling1D, Activation

vocab_size = 10000
embedding_dim = 100
max_length = 32
trunc_type='post'
padding_type='post'
oov_tok = "<OOV>"
training_size=16000
test_portion=.1

In [39]:
embeddings_index = {}
with open('glove.6B.100d.txt') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [4]:
num_sentences = 0
corpus = []
with open("training_cleaned.csv") as csvfile:
    reader = csv.reader(csvfile, delimiter=',')
    for row in reader:
        list_item = []
        line = ' '.join(re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)"," ", row[5].lower()).split())
        list_item.append(line)
        list_item.append(0 if row[0] == '0' else 1)
        num_sentences = num_sentences + 1
        corpus.append(list_item)
        if(num_sentences % 100000 == 0):
            print("Completed:", num_sentences)

Completed: 100000
Completed: 200000
Completed: 300000
Completed: 400000
Completed: 500000
Completed: 600000
Completed: 700000
Completed: 800000
Completed: 900000
Completed: 1000000
Completed: 1100000
Completed: 1200000
Completed: 1300000
Completed: 1400000
Completed: 1500000
Completed: 1600000


In [51]:
sentences = []
labels = []
random.shuffle(corpus)
for x in range(training_size):
    sentences.append(corpus[x][0])
    labels.append(corpus[x][1])

In [52]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index
sequences = tokenizer.texts_to_sequences(sentences)
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)
print(len(word_index))

19361


In [57]:
split = int(test_portion * training_size)
test_sequences = np.array(padded_sequences[:split])
training_sequences = np.array(padded_sequences[split:])
test_labels = np.array(labels[:split])
training_labels = np.array(labels[split:])
print(Counter(test_labels), Counter(training_labels))

Counter({0: 814, 1: 786}) Counter({0: 7202, 1: 7198})


In [54]:
embeddings_matrix = np.zeros((vocab_size + 1, embedding_dim))
for word, i in word_index.items():
    if i >= vocab_size:
        continue
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embeddings_matrix[i] = embedding_vector
print(len(embeddings_matrix))

10001


In [78]:
model = tf.keras.Sequential([
    Embedding(vocab_size+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    SpatialDropout1D(0.2),
    LSTM(128, dropout=0.5, recurrent_dropout=0.5),
    Dense(250),
    Dropout(0.2),
    Dense(1),
    Activation('sigmoid')
])
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
model.summary()

Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_14 (Embedding)     (None, 32, 100)           1000100   
_________________________________________________________________
spatial_dropout1d_8 (Spatial (None, 32, 100)           0         
_________________________________________________________________
lstm_11 (LSTM)               (None, 128)               117248    
_________________________________________________________________
dense_17 (Dense)             (None, 250)               32250     
_________________________________________________________________
dropout_9 (Dropout)          (None, 250)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 1)                 251       
_________________________________________________________________
activation_3 (Activation)    (None, 1)               

In [79]:
num_epochs = 50
history = model.fit(training_sequences, training_labels, epochs=num_epochs, validation_data=(test_sequences, test_labels), verbose=2)

print("Training Complete")

Epoch 1/50
450/450 - 14s - loss: 0.6610 - accuracy: 0.6039 - val_loss: 0.5945 - val_accuracy: 0.6694
Epoch 2/50
450/450 - 14s - loss: 0.6372 - accuracy: 0.6467 - val_loss: 0.6143 - val_accuracy: 0.6625
Epoch 3/50
450/450 - 13s - loss: 0.6243 - accuracy: 0.6538 - val_loss: 0.5941 - val_accuracy: 0.6856
Epoch 4/50
450/450 - 13s - loss: 0.6093 - accuracy: 0.6708 - val_loss: 0.5610 - val_accuracy: 0.7144
Epoch 5/50
450/450 - 13s - loss: 0.5989 - accuracy: 0.6799 - val_loss: 0.5411 - val_accuracy: 0.7181
Epoch 6/50
450/450 - 13s - loss: 0.5881 - accuracy: 0.6869 - val_loss: 0.5418 - val_accuracy: 0.7163
Epoch 7/50
450/450 - 13s - loss: 0.5805 - accuracy: 0.6926 - val_loss: 0.5435 - val_accuracy: 0.7219
Epoch 8/50
450/450 - 14s - loss: 0.5752 - accuracy: 0.6940 - val_loss: 0.5298 - val_accuracy: 0.7250
Epoch 9/50
450/450 - 13s - loss: 0.5662 - accuracy: 0.7055 - val_loss: 0.5309 - val_accuracy: 0.7294
Epoch 10/50
450/450 - 13s - loss: 0.5618 - accuracy: 0.7056 - val_loss: 0.5225 - val_accura

KeyboardInterrupt: 