## Deep Learning Model - NN with Word Embeddings from GloVe
### Dataset: English

In [1]:
#Imports for model
from os import path
import numpy as np
import pandas as pd
import tensorflow as tf
import os
import csv
from tensorflow import keras

#Import GloVe model
from glove import Glove

# Report the TensorFlow build in use and whether it can see a GPU.
print("TensorFlow Version: "+tf.__version__)
gpu_device = tf.test.gpu_device_name()  # falsy when no GPU device is reported
if not gpu_device:
    print("GPU not found. Please install GPU version of TF if needed")
else:
    print('Default GPU Device:{}'.format(gpu_device))

TensorFlow Version: 2.3.1
Default GPU Device:/device:GPU:0


In [None]:
#Install import_ipynb to import other notebooks
!pip install import_ipynb

In [2]:
#Download and prepare the Pre-trained GloVe Word Embedding model
path_to_glove_zipfile = "../processed_files/glove.42B.300d.zip"
path_to_glove_file = "../processed_files/glove.42B.300d.txt"

if not path.exists(path_to_glove_file):
    if not path.exists(path_to_glove_zipfile):
        print("downloading glove .zip file...")
        !wget http://nlp.stanford.edu/data/glove.42B.300d.zip
    print("unzipping glove .zip file...")
    !unzip -q glove.42B.300d.zip

In [3]:
#Create instance of glove
# Glove is imported from the local `glove` module at the top of the notebook.
# The saved output of this cell ("Found 1917494 word vectors.") suggests the
# constructor loads the pre-trained vectors - presumably from the file prepared
# in the download cell; verify against the glove module.
glove = Glove()

Found 1917494 word vectors.


In [4]:
#Check some word vector representations
# Print the full embedding returned for one word...
print(glove.vector("decimal"))
# ...and only the shape of the embedding returned for a second, more technical token.
print(glove.vector("nhibernate").shape)

[ 6.0242e-01 -3.5931e-01 -6.5666e-01  1.6470e-01  2.2212e-01  3.2755e-01
 -9.8938e-01  1.3407e+00  3.3532e-02 -4.3492e-01 -1.1260e-01 -9.6771e-02
 -8.2175e-01  1.0123e+00 -6.2944e-01 -1.2833e-01  7.6772e-01 -2.9737e-01
  6.3013e-01 -5.2358e-01  2.1238e-01  7.7167e-02  5.0815e-01  4.8051e-03
  6.6603e-02  6.4908e-01  4.9159e-01 -4.5719e-01 -4.3848e-01 -5.1041e-01
 -3.9617e-01 -4.4244e-01  1.2044e+00  9.1132e-02 -3.6845e-01 -2.0362e-01
  1.5433e-01  6.5747e-01 -3.1456e-01  9.7153e-01 -6.3147e-01  1.0481e-02
 -4.7715e-01  4.7417e-01 -2.6940e-01 -4.5268e-01  2.1765e-01  1.5206e-01
  1.8309e-01 -1.6915e-01  2.3382e-02  8.2740e-01  3.9396e-01 -8.1216e-02
 -1.5340e-01  2.9491e-01  1.9455e-02 -1.7298e-01 -2.4993e-01  3.2447e-01
  8.3227e-01  7.6610e-02  1.7777e-01  2.8370e-01  1.8154e-01 -3.5773e-01
 -4.7704e-01  1.6308e-01 -5.6907e-02  3.7091e-01 -1.7129e-01  5.7642e-01
  6.3547e-01  2.0492e-01 -4.5779e-01 -9.6861e-02 -6.2884e-01  1.9092e-01
 -1.4184e-01 -1.6334e-01 -4.7154e-02 -8.3175e-02 -3

In [5]:
#Define the input data for training as batches
#For input file, the label is given by the "stars" column, which is the 3rd col
#Generator of batches
def batch_generator(train_df,batch_size,steps):
    """Endlessly yield batches read from a CSV file, cycling over ``steps`` batches.

    Args:
        train_df: Value forwarded to ``load_data`` as the CSV source.
        batch_size: Number of rows requested per batch.
        steps: Number of distinct batches in one cycle; after batch
            ``steps - 1`` the generator wraps around to batch 0.

    Yields:
        Whatever ``load_data`` returns for the current batch index.
    """
    batch_idx = 0
    while True:
        yield load_data(train_df, batch_idx, batch_size)
        # Advance to the next batch, wrapping to the first one at the end of a cycle.
        following = batch_idx + 1
        batch_idx = following if following < steps else 0

#Loads the requested batch given its index
def load_data(train_df,idx,batch_size):
    """Read one batch of rows from a CSV file and split it into features and labels.

    Args:
        train_df: CSV source handed straight to ``pd.read_csv`` (despite the
            name, it is not used as a DataFrame here).
        idx: Zero-based index of the batch to read.
        batch_size: Number of rows in a batch.

    Returns:
        Tuple ``(x, y)`` of NumPy arrays, where ``y`` is the first column of the
        batch and ``x`` holds every remaining column.

    NOTE(review): the comment above this cell says the label is the "stars"
    column in 3rd position, while this code takes column 0 - confirm the layout.
    """
    # For idx > 0 the skipped lines include the header, so the line just before
    # the batch is consumed as column names. Only positional access is used
    # below, so the returned values are unaffected by those names.
    lines_to_skip = idx * batch_size
    batch = pd.read_csv(train_df, nrows=batch_size, skiprows=lines_to_skip)
    labels = np.array(batch.iloc[:, 0])
    features = np.array(batch.iloc[:, 1:])
    return (features, labels)

In [19]:
#Sentence to sequence vectors
def convert_to_vec(sentence, max_sequence_length, vec_dim):
    """Turn a sentence into a fixed-size matrix of word vectors.

    The sentence is split on single spaces. The first ``max_sequence_length``
    tokens are looked up with ``glove.vector`` and written row by row; rows
    past the end of the sentence stay zero.

    Args:
        sentence: Text to embed.
        max_sequence_length: Number of rows in the result.
        vec_dim: Number of columns in the result; each looked-up vector is
            assigned into a row of this width.

    Returns:
        NumPy array of shape ``(max_sequence_length, vec_dim)``.
    """
    tokens = sentence.split(" ")
    embedded = np.zeros((max_sequence_length, vec_dim))
    # Slicing truncates long sentences; short ones simply leave trailing zero rows.
    for position, token in enumerate(tokens[:max_sequence_length]):
        embedded[position] = glove.vector(token)
    return embedded

#Read CSV and fill up given matrix
def read_and_parse(file_path, input_size=float("inf"), max_sequence_length=30, vec_dim=300):
    """Read a CSV file and embed its ``title`` column, collecting ``stars`` as labels.

    Args:
        file_path: Path of a UTF-8, comma-separated file with a header row
            containing at least the ``title`` and ``stars`` columns.
        input_size: Stop after this many rows; the default reads the whole file.
        max_sequence_length: Forwarded to ``convert_to_vec``.
        vec_dim: Forwarded to ``convert_to_vec``.

    Returns:
        Tuple ``(matrix, labels)`` of NumPy arrays: the stacked per-title
        matrices and the ``stars`` values converted to float.
    """
    vectors, stars = [], []
    with open(file_path, "r", encoding="utf-8") as handle:
        records = csv.DictReader(handle, delimiter=",")
        for rows_read, record in enumerate(records, start=1):
            vectors.append(convert_to_vec(record["title"], max_sequence_length, vec_dim))
            stars.append(float(record["stars"]))
            if rows_read == input_size:
                break
    return np.array(vectors), np.array(stars)

#Parameters
# Size of each embedded title: MAX_SEQUENCE_LENGTH rows of VEC_DIM values
MAX_SEQUENCE_LENGTH = 30
VEC_DIM = 300

# Row counts of the three splits (used by the shuffling cell)
INPUT_SIZE_TRAIN = 97528
INPUT_SIZE_TEST = 5418
INPUT_SIZE_VAL = 5418

# Location of the pre-processed CSV splits
INPUT_FILE_TRAIN = "../processed_files/english_train.csv"
INPUT_FILE_TEST = "../processed_files/english_test.csv"
INPUT_FILE_VAL = "../processed_files/english_val.csv"

#Read CSV and create input matrix for training
input_sequences_train, labels_train = read_and_parse(
    INPUT_FILE_TRAIN,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    vec_dim=VEC_DIM,
)

#Read CSV and create input matrix for testing
input_sequences_test, labels_test = read_and_parse(
    INPUT_FILE_TEST,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    vec_dim=VEC_DIM,
)

#Read CSV and create input matrix for validation
input_sequences_val, labels_val = read_and_parse(
    INPUT_FILE_VAL,
    max_sequence_length=MAX_SEQUENCE_LENGTH,
    vec_dim=VEC_DIM,
)


In [23]:
# Report the array shapes of the train and test splits.
print("input shape:",input_sequences_train.shape)
print("labels train:",labels_train.shape)
print("input shape:",input_sequences_test.shape)
# This line reports the test labels; it was previously mislabelled as train.
print("labels test:",labels_test.shape)
# Mean of the training labels, i.e. the share of positive examples if the
# labels are 0/1 - TODO confirm the label encoding.
print("trues over all:", labels_train.mean())

input shape: (97528, 30, 300)
labels train: (97528,)
input shape: (5418, 30, 300)
labels train: (5418,)
trues over all: 0.4639078008366828


In [20]:
def _shuffle_pair(sequences, labels, rng):
    """Return ``sequences`` and ``labels`` reordered by one shared random permutation.

    The permutation length is taken from the arrays themselves, so the result
    never depends on a separately maintained row-count constant.

    Raises:
        ValueError: If the two arrays do not have the same number of rows.
    """
    if len(sequences) != len(labels):
        raise ValueError("sequences and labels must have the same number of rows")
    order = rng.permutation(len(sequences))
    return sequences[order], labels[order]

# Fixed seed so that re-running the notebook produces the same ordering
_shuffle_rng = np.random.RandomState(42)

#Shuffle train data
input_sequences_train, labels_train = _shuffle_pair(input_sequences_train, labels_train, _shuffle_rng)

#Shuffle test data
input_sequences_test, labels_test = _shuffle_pair(input_sequences_test, labels_test, _shuffle_rng)

#Shuffle val data
input_sequences_val, labels_val = _shuffle_pair(input_sequences_val, labels_val, _shuffle_rng)

In [21]:
# Create model #1: stacked bidirectional LSTMs over pre-computed GloVe vectors
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import LSTM, Conv1D, MaxPooling1D, Flatten, Dense, Bidirectional, Input, Dropout
from tensorflow.keras import Model

# Shape reminder for LSTM layers:
# inputs: A 3D tensor with shape [batch, timesteps, feature].
# inputs = tf.random.normal([32, 10, 8])
# lstm = tf.keras.layers.LSTM(4)
# output = lstm(inputs)
# print(output.shape) = (32,4)

# Named model1_input (not `input`) so the Python builtin input() is not shadowed.
model1_input = Input(shape=(MAX_SEQUENCE_LENGTH,glove.dim))
# NOTE(review): per the Keras LSTM documentation the fused cuDNN kernel is only
# used with the default tanh activation; activation="relu" is kept here as in
# the original experiment - confirm it is intentional.
x = Bidirectional(LSTM(256, activation="relu", return_sequences=True))(model1_input)
x = Bidirectional(LSTM(256, activation="relu", return_sequences=True))(x)
x = Dropout(0.3)(x)
# The last recurrent layer returns only its final output, so x becomes 2-D.
x = Bidirectional(LSTM(256, activation="relu", return_sequences=False))(x)
x = Dropout(0.3)(x)
# Flatten leaves the shape unchanged here (the saved summary shows (None, 512)
# before and after); kept so the layer stack matches the original model.
x = Flatten()(x)
x = Dense(256 ,activation="relu")(x)
# Single sigmoid unit: one score in [0, 1] per title.
preds = Dense(1, activation="sigmoid")(x)

model1 = Model(model1_input, preds)
# NOTE(review): mean squared error on a sigmoid output is kept from the
# original; binary cross-entropy is the usual choice if labels are 0/1 - confirm.
model1.compile(loss='mean_squared_error',
              optimizer=tf.keras.optimizers.Adam(0.0003),
              metrics=["accuracy"])
model1.summary()


Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 30, 300)]         0         
_________________________________________________________________
bidirectional (Bidirectional (None, 30, 512)           1140736   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 30, 512)           1574912   
_________________________________________________________________
dropout (Dropout)            (None, 30, 512)           0         
_________________________________________________________________
bidirectional_2 (Bidirection (None, 512)               1574912   
_________________________________________________________________
dropout_1 (Dropout)          (None, 512)               0         
_________________________________________________________________
flatten (Flatten)            (None, 512)              

In [25]:
#Fit model
# Validate on the dedicated validation split. The test split is held out for
# the prediction cells further down, so it must not steer training decisions.
model1.fit(input_sequences_train, labels_train, validation_data=(input_sequences_val, labels_val), epochs=10, batch_size=128)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
119/762 [===>..........................] - ETA: 4:53 - loss: 0.1502 - accuracy: 0.7842

KeyboardInterrupt: 

In [None]:
# Hand-written titles used to eyeball the predictions of model1
questions = [
    "compressing / decompressing folders & files",
    "HOW TO decompress and compress files and folders",
    "how to load a specific version of an assembly",
    "how would one code test and set behavior without a special hardware instruction?",
    "can you debug a .net app with only the source code of one file?",
    "what columns generally make good indexes?",
    "why is there no generic synchronized queue in .net?",
]

for question in questions:
    # Embed the title, then add a leading batch axis so predict() gets a batch of one.
    input_sample = np.expand_dims(
        convert_to_vec(question, MAX_SEQUENCE_LENGTH, VEC_DIM),
        axis=0,
    )
    print(model1.predict(input_sample))



In [None]:
# Score the whole test split with model1
predictions = model1.predict(input_sequences_test)
# Hard class per example. The 0.5 cut-off is an assumption for the sigmoid
# output - TODO confirm the intended threshold.
y_hat_class = (predictions >= 0.5).astype(int)
# Share of examples predicted positive: a count divided by the number of
# examples (the original divided by the prediction array itself).
print(y_hat_class.sum() / len(y_hat_class))
# Show the first 50 true labels next to the raw model outputs
for y_hat,y  in zip(predictions[:50], labels_test[:50]):
    print(y,":", y_hat)

In [None]:
#create word to index dictionary: word->index
# index 0 is reserved for nonexistent (out-of-vocabulary) words
word_index = {
    word: position
    for position, word in enumerate(glove.embeddings.keys(), start=1)
}
cont = len(word_index)

#Processing of GloVe data into the embedding matrix to use in Keras
# Row i holds the vector of the word whose index is i; row 0 stays all zeros.
EMBEDDING_DIM = glove.dim
embedding_matrix = np.zeros((cont + 1, glove.dim))

for word, row in word_index.items():
    embedding_matrix[row] = glove.vector(word)

# delete glove dictionary to save memory RAM
#del glove

In [None]:
# Create the 1D-CNN model on top of a frozen GloVe embedding layer
# NOTE(review): this cell rebinds the name model1, replacing the LSTM model
# built earlier in the notebook - confirm that is intended.
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Input
from tensorflow.keras.layers import Conv1D, MaxPooling1D, GlobalMaxPooling1D, Flatten, Dense
from tensorflow.keras import Model

# Embedding lookup initialised with the GloVe matrix and kept fixed during training
embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,
                            trainable=False)

#del embedding_matrix

# The model takes integer word indices (see word_index), one row per title
sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
# The original stack (valid-padded Conv1D(5) / MaxPooling1D(5) pairs ending in
# MaxPooling1D(35)) was sized for much longer sequences: with 30 timesteps the
# length shrinks 30 -> 26 -> 5 -> 1 and the next pooling layer cannot be built.
# 'same' padding keeps the length through each convolution, pooling by 2
# shrinks it gently, and GlobalMaxPooling1D works for any remaining length.
x = Conv1D(16, 5, padding='same', activation='relu')(embedded_sequences)
x = MaxPooling1D(2)(x)
x = Conv1D(16, 5, padding='same', activation='relu')(x)
x = MaxPooling1D(2)(x)
x = Conv1D(16, 5, padding='same', activation='relu')(x)
x = GlobalMaxPooling1D()(x)  # global max pooling; output is already 2-D, so no Flatten is needed
x = Dense(128, activation='relu')(x)
# One sigmoid unit with binary cross-entropy: categorical cross-entropy over a
# single output unit carries no training signal. Assumes 0/1 labels, as with the
# sigmoid output of the LSTM model - TODO confirm.
preds = Dense(1, activation='sigmoid')(x)

model1 = Model(sequence_input, preds)
model1.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
# model1.fit(x_train, y_train, validation_data=(x_val, y_val),
#           epochs=2, batch_size=128)
