In [1]:
import numpy as np
import pandas as pd
from tensorflow import keras
from keras.layers import Dense, LSTM, Embedding, Dropout, Activation, Input
from keras.models import Model
import matplotlib.pyplot as plt
import csv

2023-12-12 23:35:42.786542: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-12 23:35:42.833341: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-12-12 23:35:42.833382: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-12-12 23:35:42.834461: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-12-12 23:35:42.840644: I external/local_tsl/tsl/cuda/cudart_stub.cc:31] Could not find cuda drivers on your machine, GPU will not be used.
2023-12-12 23:35:42.841456: I tensorflow/core/platform/cpu_feature_guard.cc:1

In [2]:
# Load data
df_train = pd.read_csv('data/twitter_training.csv', header=None)
df_train.head()

Unnamed: 0,0,1,2,3
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
2,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
3,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
4,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...


In [3]:
label_to_idx = {'Irrelevant': 0, 'Negative': -1, 'Neutral': 0, 'Positive': 1}

In [4]:
# Find out number of different words in training set
words = set()
for sentence in df_train[3]:
    for word in str(sentence).split():
        words.add(word)
len(words)

82367

In [5]:
# Since the above approach doesn't seem to be promising, continue with embeddings from glove-50
def read_glove_file(filename = 'glove.6B.50d.txt'):
    with open(filename, 'r') as f:
        words = set()
        words_to_vec_map = dict()
        for line in f:
            # Remove extra white spaces and split the line
            li = line.strip().split()
            words.add(li[0])
            words_to_vec_map[li[0]] = np.array(li[1:], dtype=np.float64)
    idx_to_words = dict()
    words_to_idx = dict()
    i = 1
    for word in sorted(words):
        words_to_idx[word] = i
        idx_to_words[i] = word
        i += 1
    return words_to_vec_map, words_to_idx, idx_to_words

In [6]:
words_to_vec_map, words_to_idx, idx_to_words = read_glove_file()

In [7]:
maxLen = len(max(df_train[3], key=lambda x:len(str(x).strip().split())).strip().split())
maxLen

198

In [8]:
def load_data(label_to_idx, csv_file = 'data/twitter_training.csv'):
    X = []
    y = []
    with open(csv_file, 'r') as f:
        csvReader = csv.reader(f)
        for row in csvReader:
            X.append(row[3])
            y.append(label_to_idx[row[2]])
    X = np.asarray(X)
    y = np.asarray(y, dtype=int)
    return X, y

In [9]:
X_train, y_train = load_data(label_to_idx, 'data/twitter_training.csv')
X_val, y_val = load_data(label_to_idx, 'data/twitter_validation.csv')

In [10]:
def sentence_to_indices(X, words_to_idx ,maxLen):
    m = X.shape[0]
    X_out = np.zeros((m, maxLen))
    for i in range(m):
        li = X[i].lower().strip().split()
        j = 0
        for w in li:
            if (j >= maxLen): break
            if w in words_to_idx.keys():
                X_out[i,j] = words_to_idx[w]
            j += 1
    return X_out

In [11]:
def pretrained_embedding_layer(word_to_vec_map, word_to_index):
    """
    Creates a Keras Embedding() layer and loads in pre-trained GloVe 50-dimensional vectors.
    
    Arguments:
    word_to_vec_map -- dictionary mapping words to their GloVe vector representation.
    word_to_index -- dictionary mapping from words to their indices in the vocabulary (400,001 words)

    Returns:
    embedding_layer -- pretrained layer Keras instance
    """
    
    vocab_size = len(word_to_index) + 1              # adding 1 to fit Keras embedding (requirement)
    any_word = list(word_to_vec_map.keys())[0]
    emb_dim = word_to_vec_map[any_word].shape[0]    # define dimensionality of your GloVe word vectors (= 50)
      
    ### START CODE HERE ###
    # Step 1
    # Initialize the embedding matrix as a numpy array of zeros.
    # See instructions above to choose the correct shape.
    emb_matrix = np.zeros((vocab_size, emb_dim))
    
    # Step 2
    # Set each row "idx" of the embedding matrix to be 
    # the word vector representation of the idx'th word of the vocabulary
    for word, idx in word_to_index.items():
        emb_matrix[idx, :] = word_to_vec_map[word]

    # Step 3
    # Define Keras embedding layer with the correct input and output sizes
    # Make it non-trainable.
    embedding_layer = Embedding(input_dim = vocab_size, output_dim = emb_dim, trainable = False)
    ### END CODE HERE ###

    # Step 4 (already done for you; please do not modify)
    # Build the embedding layer, it is required before setting the weights of the embedding layer. 
    embedding_layer.build((None,)) # Do not modify the "None".  This line of code is complete as-is.
    
    # Set the weights of the embedding layer to the embedding matrix. Your layer is now pretrained.
    embedding_layer.set_weights([emb_matrix])
    
    return embedding_layer

In [12]:
X_train_indices = sentence_to_indices(X_train, words_to_idx, maxLen)
X_val_indices = sentence_to_indices(X_val, words_to_idx, maxLen)

In [13]:
def build_model(input_shape, words_to_vec_map, words_to_idx):
    sentence_indices = Input(shape = input_shape)

    embedding_layer = pretrained_embedding_layer(words_to_vec_map, words_to_idx)

    embeddings = embedding_layer(sentence_indices)

    X = LSTM(units=128, return_sequences=True)(embeddings)

    X = Dropout(rate = 0.5)(X)

    X = LSTM(units = 128, return_sequences=False)(X)

    X = Dropout(rate = 0.5)(X)

    X = Dense(units= 1)(X)

    X = Activation('tanh')(X)

    model = Model(inputs = sentence_indices, outputs = X)

    return model

In [32]:
model = build_model((maxLen, ), words_to_vec_map, words_to_idx)

In [33]:
model.compile(loss='mean_squared_error', optimizer= 'adam', metrics=['accuracy'])

In [36]:
history = model.fit(X_train_indices, y_train, epochs = 30, batch_size = 32, shuffle=True)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [37]:
model.evaluate(X_val_indices, y_val)



[0.056153979152441025, 0.699999988079071]

In [38]:
model.save("model_numbers.h5")

  saving_api.save_model(


In [41]:
y_pred_val = model.predict(X_val_indices)
for idx, res in enumerate(y_pred_val):
    temp = np.argmax(res)
    # print(idx, res)
    if (temp ==0): continue
    elif (temp == 1): print(f"Predicted 'Negative' for '{X_val[idx]}'")
    else: print(f"Predicted 'Positive' for '{X_val[idx]}'")

 1/32 [..............................] - ETA: 1s

