In [46]:
import os
import sys
import numpy as np
import keras
import pandas as pd
import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras_preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam


In [47]:
label_replacement = {
    'OFF': 0,
    'NOT': 1,
}

In [49]:
# Replace labels with numbers
train_labels = [label_replacement[label] for label in train_labels]
dev_labels = [label_replacement[label] for label in dev_labels]
test_labels = [label_replacement[label] for label in test_labels]

In [68]:
# Pre-Processing parameters
BATCH_SIZE = 32
MAX_WORDS = 10000
MAX_SEQ_LENGTH = 1000

# Model Parameters
EMBEDDING_DIM = 100
filter_sizes = [3,4,5]
num_filters = 256
embedding_dim = 100

# Drop out probabilities
drop = 0.5
batch_size = 32
epochs = 2


In [51]:
# Load dataset
df_train = pd.read_csv("train_preprocessed.csv")
df_test = pd.read_csv("test_preprocessed.csv")
df_val = pd.read_csv("val_preprocessed.csv")

df_train = df_train[['preprocessed_text', 'label']]
df_test = df_test[['preprocessed_text', 'label']]
df_val = df_val[['preprocessed_text', 'label']]

df_train = df_train[df_train.preprocessed_text.notna()]
df_val = df_val[df_train.preprocessed_text.notna()]
df_test = df_test[df_train.preprocessed_text.notna()]


  # This is added back by InteractiveShellApp.init_path()
  if sys.path[0] == '':


In [52]:
X_train = df_train['preprocessed_text'].values
X_dev = df_val['preprocessed_text'].values
X_test = df_test['preprocessed_text'].values

In [53]:
# map words to numbers for ref (tokenize)
tokenizer = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(X_train)

train_sequences = tokenizer.texts_to_sequences(X_train)
dev_sequences = tokenizer.texts_to_sequences(X_dev)

train_sequences = pad_sequences(train_sequences, maxlen=MAX_SEQ_LENGTH)
dev_sequences = pad_sequences(dev_sequences, maxlen=MAX_SEQ_LENGTH)

word_index = tokenizer.word_index
print(len(word_index))




17927


In [54]:
X_train = tf.convert_to_tensor(train_sequences, dtype=tf.int64)
y_train = tf.convert_to_tensor(train_labels)

X_dev = tf.convert_to_tensor(dev_sequences, dtype=tf.int64)
y_dev = tf.convert_to_tensor(dev_labels)


In [55]:
embeddings_index = {}
f = open("glove.6B.100d.txt")
for line in f:
  values = line.split()
  word = values[0]
  coeff = np.asarray(values[1:], dtype='float32')
  embeddings_index[word] = coeff
f.close()

print(len(embeddings_index))

400000


In [56]:
embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))
# initializing matrix (id -> vector)
for word, i in word_index.items():
  embedding_vector = embeddings_index.get(word)
  if embedding_vector is not None: 
    embedding_matrix[i] = embedding_vector

In [75]:
# Custom loss function
import keras.backend as K


def f1(y_true, y_pred):
  y_true = tf.cast(y_true, tf.float32)
  y_pred = tf.cast(y_pred, tf.float32)
  y_pred = K.round(y_pred)
  tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
  tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
  fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
  fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

  p = tp / (tp + fp + K.epsilon())
  r = tp / (tp + fn + K.epsilon())

  f1 = 2*p*r / (p+r+K.epsilon())
  f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
  return K.mean(f1)
  
  
def f1_loss(y_true, y_pred):
  y_true = tf.cast(y_true, tf.float32)
  y_pred = tf.cast(y_pred, tf.float32)
  tp = K.sum(K.cast(y_true*y_pred, 'float'), axis=0)
  tn = K.sum(K.cast((1-y_true)*(1-y_pred), 'float'), axis=0)
  fp = K.sum(K.cast((1-y_true)*y_pred, 'float'), axis=0)
  fn = K.sum(K.cast(y_true*(1-y_pred), 'float'), axis=0)

  p = tp / (tp + fp + K.epsilon())
  r = tp / (tp + fn + K.epsilon())

  f1 = 2*p*r / (p+r+K.epsilon())
  f1 = tf.where(tf.math.is_nan(f1), tf.zeros_like(f1), f1)
  return 1 - K.mean(f1)

In [58]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1,
                            EMBEDDING_DIM,
                            weights=[embedding_matrix],
                            input_length=MAX_SEQ_LENGTH,
                            trainable=False)

In [76]:
inputs = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
embedding = embedding_layer(inputs)

print(embedding.shape)
reshape = Reshape((MAX_SEQ_LENGTH,EMBEDDING_DIM,1))(embedding)
print(reshape.shape)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(MAX_SEQ_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQ_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(MAX_SEQ_LENGTH - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=1, activation='softmax')(dropout)

# this creates a model that includes
model = Model(inputs=inputs, outputs=output)

adam = Adam(learning_rate=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss=f1_loss, metrics=['accuracy', f1])
model.summary()


(None, 1000, 100)
(None, 1000, 100, 1)
Model: "model_5"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_6 (InputLayer)           [(None, 1000)]       0           []                               
                                                                                                  
 embedding (Embedding)          (None, 1000, 100)    1792800     ['input_6[0][0]']                
                                                                                                  
 reshape_5 (Reshape)            (None, 1000, 100, 1  0           ['embedding[5][0]']              
                                )                                                                 
                                                                                                  
 conv2d_15 (Conv2D)             (None, 998, 1, 256)  

In [77]:
# Training model

model.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, verbose=1, validation_data=(X_dev, y_dev))

Epoch 1/2
Epoch 2/2


<keras.callbacks.History at 0x7f61417ee5d0>

In [None]:
tokenized_test_text = tokenizer.texts_to_sequences(X_test)
tokenized_test_text = pad_sequences(tokenized_test_text, MAX_SEQ_LENGTH)
X_test = tf.convert_to_tensor(tokenized_test_text, dtype=tf.int64)

In [80]:
# Predictions
train_pred = model.predict(X_train)
test_pred = model.predict(X_test)
val_pred = model.predict(X_dev)

# Convert predictions to labels
train_pred = np.where(train_pred > 0.5, 1, 0)
test_pred = np.where(test_pred > 0.5, 1, 0)
val_pred = np.where(val_pred > 0.5, 1, 0)



In [83]:
computeAllScores(train_pred, val_pred, test_pred)

Accuracy Train:  0.6709780966767371
Accuracy Dev:  0.6544561933534743
Weighted F1 Train:  0.5388599732280435
Weighted F1 Dev:  0.5177688121805848
Macro F1 Train:  0.4015481100627154
Macro F1 Dev:  0.39557178726318193
Micro F1 Train:  0.6709780966767371
Micro F1 Dev:  0.6544561933534743
Weighted Recall Train:  0.6709780966767371
Weighted Recall Dev:  0.6544561933534743
Macro Recall Train:  0.5
Macro Recall Dev:  0.5
Micro Recall Train:  0.6709780966767371
Micro Recall Dev:  0.6544561933534743
Confusion Matrix Train: 
[[   0 3485]
 [   0 7107]]
Confusion Matrix Dev: 
[[   0  915]
 [   0 1733]]
