In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Dense, LSTM, Embedding, Dropout, Activation, Input
from keras.models import Model
import matplotlib.pyplot as plt
import csv

In [45]:
# Load data
# header=None tells pandas that the first line is data, not column names,
# so the columns are labelled with the integers 0..3.
df_train = pd.read_csv('data/twitter_training.csv', header=None)
# Show the first rows as the cell output (column 2 holds the sentiment label,
# column 3 the tweet text, as used by the loaders further down).
df_train.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [2]:
# Numeric target assigned to each sentiment label found in column 2 of the CSV.
# 'Irrelevant' and 'Neutral' intentionally share the value 0.
label_to_idx = {
    'Irrelevant': 0,
    'Negative': -1,
    'Neutral': 0,
    'Positive': 1,
}

In [39]:
def find_vocabulary(data_file, top_words = 10000):
    """
    Build a one-hot vocabulary from the most frequent words of a CSV data set.

    The text of every row is taken from the fourth column (index 3),
    lower-cased and split on whitespace. Tokens that contain no alphabetic
    character are ignored. The `top_words` most frequent tokens are kept and
    numbered 1..N in alphabetical order, so index 0 is never assigned to a word.

    Arguments:
    data_file -- path to a CSV file whose fourth column holds the text
    top_words -- maximum vocabulary size; also the length of each one-hot vector

    Returns:
    word_to_vec_map -- dict mapping word -> one-hot numpy array of shape (top_words,)
    word_to_idx -- dict mapping word -> 1-based index
    idx_to_word -- dict mapping 1-based index -> word

    Raises:
    ValueError -- if top_words is negative
    """
    if top_words < 0:
        raise ValueError("top_words must be non-negative")

    word_freq = dict()

    # newline='' is what the csv module asks for; the explicit encoding keeps
    # the result independent of the platform's default text encoding.
    with open(data_file, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            # Blank or truncated lines have no text column; skip them
            # instead of failing with IndexError.
            if len(row) < 4:
                continue
            for w in row[3].lower().split():
                # If there is no alphabet in w, then continue
                if not any(ch.isalpha() for ch in w):
                    continue
                word_freq[w] = word_freq.get(w, 0) + 1

    # Keep the most frequent words (stable sort: ties keep first-seen order),
    # then order the survivors alphabetically before numbering them.
    most_freq_words = sorted(word_freq.items(), key=lambda item: item[1], reverse=True)[:top_words]
    vocabulary = sorted(w for w, _ in most_freq_words)

    word_to_idx = {w: i for i, w in enumerate(vocabulary, start=1)}
    idx_to_word = {i: w for w, i in word_to_idx.items()}

    # Index i maps to position i-1 of the one-hot vector.
    word_to_vec_map = dict()
    for w, idx in word_to_idx.items():
        one_hot = np.zeros((top_words, ))
        one_hot[idx - 1] = 1.0
        word_to_vec_map[w] = one_hot

    return word_to_vec_map, word_to_idx, idx_to_word

In [40]:
# Build the vocabulary from the training file only, keeping at most 15000 words.
# Each kept word gets a one-hot vector of length 15000, so word_to_vec_map can
# hold up to 15000 x 15000 floats.
word_to_vec_map, word_to_idx, idx_to_word = find_vocabulary('data/twitter_training.csv', top_words=15000)

In [41]:
# Inspect the word -> index mapping.
# NOTE(review): this displays the whole dictionary; tokens keep attached
# punctuation (e.g. a leading quote) because sentences are only split on whitespace.
word_to_idx

{'!commands': 1,
 '"a': 2,
 '"amazon': 3,
 '"an': 4,
 '"and': 5,
 '"apex': 6,
 '"are': 7,
 '"baby";': 8,
 '"bad': 9,
 '"ban': 10,
 '"best': 11,
 '"big': 12,
 '"bill': 13,
 '"call': 14,
 '"came': 15,
 '"cyberpunk': 16,
 '"digital': 17,
 '"ebo': 18,
 '"fact': 19,
 '"get': 20,
 '"ghost': 21,
 '"google': 22,
 '"great': 23,
 '"hello': 24,
 '"i': 25,
 '"i\'m': 26,
 '"i\'ve': 27,
 '"if': 28,
 '"in': 29,
 '"is': 30,
 '"it\'s': 31,
 '"johnson': 32,
 '"just': 33,
 '"league': 34,
 '"life': 35,
 '"little': 36,
 '"make': 37,
 '"microsoft': 38,
 '"most': 39,
 '"murgur': 40,
 '"my': 41,
 '"new': 42,
 '"next': 43,
 '"no': 44,
 '"not': 45,
 '"now': 46,
 '"odyssey': 47,
 '"oh': 48,
 '"oh,': 49,
 '"pay': 50,
 '"people': 51,
 '"punk': 52,
 '"racist': 53,
 '"real': 54,
 '"red': 55,
 '"so': 56,
 '"super': 57,
 '"take': 58,
 '"thank': 59,
 '"the': 60,
 '"there': 61,
 '"they': 62,
 '"this': 63,
 '"too': 64,
 '"toxic': 65,
 '"unexplained': 66,
 '"very': 67,
 '"we': 68,
 '"what': 69,
 '"why': 70,
 '"world': 71,

In [46]:
# Length, in whitespace-separated tokens, of the longest training sentence.
# str() is applied to every cell before splitting so that non-string values
# (for example the NaN pandas produces for an empty field) are counted
# instead of raising AttributeError on .strip().
maxLen = max(len(str(text).strip().split()) for text in df_train[3])
maxLen

198

In [47]:
def load_data(label_to_idx, csv_file = 'data/twitter_training.csv'):
    """
    Read sentences and numeric labels from a CSV file.

    The label is read from the third column (index 2) and translated through
    `label_to_idx`; the sentence is read from the fourth column (index 3).
    Rows with fewer than four columns (such as blank lines) are skipped.

    Arguments:
    label_to_idx -- dict mapping the label strings in the file to integers
    csv_file -- path of the CSV file to read

    Returns:
    X -- numpy array of sentences, one string per kept row
    y -- numpy integer array of labels, aligned with X

    Raises:
    KeyError -- if a row carries a label that is not in label_to_idx
    """
    X = []
    y = []
    # newline='' is what the csv module asks for; the explicit encoding keeps
    # the result independent of the platform's default text encoding.
    with open(csv_file, 'r', newline='', encoding='utf-8') as f:
        for row in csv.reader(f):
            # Blank or truncated lines carry no label/text pair.
            if len(row) < 4:
                continue
            # Look the label up first so a bad label never leaves X and y misaligned.
            label = label_to_idx[row[2]]
            X.append(row[3])
            y.append(label)
    return np.asarray(X), np.asarray(y, dtype=int)

In [48]:
# Read raw sentences and numeric labels for the training and validation splits,
# using the same label mapping for both files.
X_train, y_train = load_data(label_to_idx, 'data/twitter_training.csv')
X_val, y_val = load_data(label_to_idx, 'data/twitter_validation.csv')

In [49]:
def sentence_to_indices(X, words_to_idx, maxLen):
    """
    Encode sentences as fixed-width rows of vocabulary indices.

    Each sentence is lower-cased and split on whitespace. The token at
    position j is written to column j as its index in `words_to_idx`;
    tokens missing from the vocabulary leave a 0 in their column. Tokens
    beyond `maxLen` are dropped and shorter sentences stay zero-padded.

    Arguments:
    X -- array of sentence strings, shape (m,)
    words_to_idx -- dict mapping word -> index
    maxLen -- number of columns in the output

    Returns:
    numpy float array of shape (m, maxLen)
    """
    n_sentences = X.shape[0]
    encoded = np.zeros((n_sentences, maxLen))
    for row in range(n_sentences):
        tokens = X[row].lower().strip().split()
        # Only the first maxLen tokens fit in the row.
        for col, token in enumerate(tokens[:maxLen]):
            if token in words_to_idx:
                encoded[row, col] = words_to_idx[token]
    return encoded

In [50]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Create a frozen Keras Embedding layer whose weights come from a word -> vector map.

    Row `idx` of the weight matrix is set to the vector of the word whose
    index is `idx`. The matrix has len(word_to_index) + 1 rows, so indices
    are expected to lie in 0..len(word_to_index); any row that no word maps
    to stays all-zero.

    Arguments:
    word_to_vec_map -- dict mapping word -> 1-D numpy vector (all of the same length)
    word_to_index -- dict mapping word -> integer row index

    Returns:
    embedding_layer -- built, non-trainable Keras Embedding layer holding the vectors

    Raises:
    ValueError -- if word_to_vec_map is empty (the vector size cannot be inferred)
    KeyError -- if a word in word_to_index has no entry in word_to_vec_map
    """
    if not word_to_vec_map:
        raise ValueError("word_to_vec_map is empty; cannot infer the embedding dimension")

    # One extra row so that the largest index is still a valid row.
    vocab_size = len(word_to_index) + 1
    # Read the vector length from any one entry without copying all the keys.
    emb_dim = next(iter(word_to_vec_map.values())).shape[0]

    emb_matrix = np.zeros((vocab_size, emb_dim))
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    embedding_layer = Embedding(input_dim = vocab_size, output_dim = emb_dim, trainable = False)

    # The layer must be built before its weights can be assigned.
    embedding_layer.build((None,))
    embedding_layer.set_weights([emb_matrix])

    return embedding_layer

In [None]:
# Turn both splits into (num_sentences, maxLen) matrices of vocabulary indices.
# maxLen was measured on the training set, so longer validation sentences are truncated.
X_train_indices = sentence_to_indices(X_train, word_to_idx, maxLen)
X_val_indices = sentence_to_indices(X_val, word_to_idx, maxLen)

In [None]:
def build_model(input_shape, words_to_vec_map, words_to_idx):
    """
    Assemble the sentiment model: frozen embedding, two LSTM layers, scalar tanh output.

    Arguments:
    input_shape -- shape of one input sample, passed straight to keras Input
    words_to_vec_map -- dict mapping word -> vector, used to fill the embedding layer
    words_to_idx -- dict mapping word -> index, used to fill the embedding layer

    Returns:
    model -- uncompiled Keras Model mapping index sequences to one value per sample
    """
    sentence_indices = Input(shape = input_shape)

    # Layers are listed in the order they are applied to the input.
    layer_stack = [
        pretrained_embedding_layer(words_to_vec_map, words_to_idx),
        LSTM(units=128, return_sequences=True),    # keeps the full sequence for the next LSTM
        Dropout(rate = 0.5),
        LSTM(units = 128, return_sequences=False),  # only the final state is passed on
        Dropout(rate = 0.5),
        Dense(units= 1),
        Activation('tanh'),
    ]

    output = sentence_indices
    for layer in layer_stack:
        output = layer(output)

    return Model(inputs = sentence_indices, outputs = output)

In [None]:
# Each input sample is a sequence of maxLen word indices.
model = build_model((maxLen, ), word_to_vec_map, word_to_idx)

In [None]:
# Train as a regression: mean squared error against the numeric label values.
# NOTE(review): 'accuracy' is a classification metric; with a single tanh output
# and targets taken from label_to_idx (-1, 0, 1) it may not measure what is
# intended — confirm or replace it.
model.compile(loss='mean_squared_error', optimizer= 'adam', metrics=['accuracy'])

In [None]:
# Fit on the index-encoded training sentences; the result of fit() is kept in `history`.
# NOTE(review): no validation data is passed here, so the validation split is
# only used by the evaluate call at the end of the notebook.
history = model.fit(X_train_indices, y_train, epochs = 50, batch_size = 32, shuffle=True)

In [None]:
# Write the trained model to disk; the .h5 extension presumably selects Keras' HDF5 format — confirm.
model.save('model_no_emb.h5')

In [None]:
# Compute the loss and the compiled metric on the validation split.
model.evaluate(X_val_indices, y_val)