In [1]:
# Mount Google Drive inside the Colab runtime so later cells can read
# project files from it.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [2]:
# Make the project folder on the mounted Drive the working directory, so the
# relative paths used by later cells resolve against it.
import os

PROJECT_DIR = '/content/drive/My Drive/Colab Notebooks/BDA/Final Project'
os.chdir(PROJECT_DIR)

In [3]:
import re
import string
import numpy as np
import pandas as pd
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras import Sequential, Model
from tensorflow.keras.layers import Conv2D, MaxPool2D, GlobalAveragePooling2D
from tensorflow.keras.layers import Dense, Flatten, BatchNormalization, Activation, Dropout
from tensorflow.keras.layers import Conv1D, Embedding, GlobalAveragePooling1D 
from tensorflow.keras.optimizers import Adam, RMSprop

In [4]:
# Load the training split (one JSON record per line) into a DataFrame.
df = pd.read_json("data/train.jsonl", lines=True)

# Discard any column whose name contains "unnamed" (case-insensitive).
unnamed_columns = df.columns[df.columns.str.contains('unnamed', case=False)]
df.drop(columns=unnamed_columns, inplace=True)

df.head()

Unnamed: 0,id,img,label,text
0,42953,img/42953.png,0,its their character not their color that matters
1,23058,img/23058.png,0,don't be afraid to love again everyone is not ...
2,13894,img/13894.png,0,putting bows on your pet
3,37408,img/37408.png,0,i love everything and everybody! except for sq...
4,82403,img/82403.png,0,"everybody loves chocolate chip cookies, even h..."


In [5]:
# Remove the 'id' column; the remaining columns are kept as they are.
df = df.drop('id', axis=1)

In [6]:
# Work on an independent copy of `df` with every row containing a missing
# value removed, then confirm column-by-column that no nulls remain.
cleaned = df.dropna().copy()
cleaned.isna().any()

img      False
label    False
text     False
dtype: bool

In [7]:
# One-hot encode the 'label' column: one indicator column per distinct label.
target = pd.get_dummies(cleaned['label'])
target.head()

Unnamed: 0,0,1
0,1,0
1,1,0
2,1,0
3,1,0
4,1,0


In [8]:
def standardization(data):
    """Normalize a Series of text strings before tokenization.

    Each element is processed in this order:
      1. lowercased,
      2. stripped of digit runs,
      3. stripped of the literal substring ".com",
      4. stripped of every character in ``string.punctuation``.

    Parameters
    ----------
    data : pandas.Series
        Series whose elements are ``str``.

    Returns
    -------
    pandas.Series
        A new Series with the same index holding the cleaned strings.
    """
    # Build the punctuation-deletion table once instead of once per element.
    punctuation_table = str.maketrans('', '', string.punctuation)

    def _clean(text):
        text = text.lower()
        text = re.sub(r'\d+', '', text)
        # The dot must be escaped: an unescaped '.com' matches ANY character
        # followed by "com" and would corrupt ordinary words such as
        # "welcome" or "incoming".
        text = re.sub(r'\.com', '', text)
        return text.translate(punctuation_table)

    return data.apply(_clean)

# Replace the raw text column with its normalized form.
cleaned['text'] = standardization(cleaned['text'])

In [9]:
from tensorflow.keras.layers.experimental.preprocessing import TextVectorization
# Tokenizer settings: cap on vocabulary size, and the fixed number of token
# ids emitted per input string.
vocab_size = 10000
sequence_length = 50

# Layer that maps raw strings to integer token-id sequences.
vectorize_layer = TextVectorization(
    output_mode='int',
    max_tokens=vocab_size,
    output_sequence_length=sequence_length,
)

# Learn the vocabulary from the cleaned text column.
# NOTE(review): this adapts on every row of `cleaned`, including rows that the
# later train/test split holds out for testing.
text_ds = cleaned['text'].to_numpy()
text_tensor = tf.convert_to_tensor(text_ds)
vectorize_layer.adapt(text_tensor)

In [10]:
# Hold out 20% of the rows for evaluation.
# A fixed random_state makes the partition identical on every run; without it
# a re-run (or an evaluation in a fresh session against saved weights) would
# get a different "test" set that can overlap rows the model was trained on.
X_text_train, X_text_test, y_text_train, y_text_test = train_test_split(
    cleaned.text,
    target,
    test_size=0.2,
    random_state=42,
)

In [11]:
embedding_dim = 16
keras_layers = tf.keras.layers

# Raw strings in -> integer token ids -> one learned vector per token.
text_input = tf.keras.Input(shape=(None,), dtype=tf.string, name='text')
token_ids = vectorize_layer(text_input)
embedded = keras_layers.Embedding(vocab_size, embedding_dim, name="embedding")(token_ids)

# Bidirectional LSTM followed by a bidirectional GRU; both return the full
# sequence and each is followed by batch normalization.
lstm_block = keras_layers.Bidirectional(
    keras_layers.LSTM(512, activation='relu', return_sequences=True)
)
recurrent_out = keras_layers.BatchNormalization()(lstm_block(embedded))

gru_block = keras_layers.Bidirectional(
    keras_layers.GRU(512, activation='relu', return_sequences=True)
)
recurrent_out = keras_layers.BatchNormalization()(gru_block(recurrent_out))

# Two identical strided 1-D convolutions (128 filters, width 7, stride 3).
conv_out = keras_layers.Conv1D(
    128, 7, padding="valid", activation="relu", strides=3
)(recurrent_out)
conv_out = keras_layers.Conv1D(
    128, 7, padding="valid", activation="relu", strides=3
)(conv_out)

# Max-pool over the sequence, then a wide dense layer with dropout.
pooled = keras_layers.GlobalMaxPooling1D()(conv_out)
dense_out = keras_layers.Dense(2048, activation="relu")(pooled)
text_layers = keras_layers.Dropout(0.5)(dense_out)



In [12]:
# Two-unit softmax head named 'overall', attached to the text feature stack.
classifier_head = tf.keras.layers.Dense(2, activation='softmax', name='overall')
output = classifier_head(text_layers)

# Full model: raw text input -> class probabilities.
model = tf.keras.Model(inputs=text_input, outputs=output)

In [13]:
base_learning_rate = 0.001

# The 'overall' output layer already applies softmax, so the model emits
# probabilities. The loss must therefore be told NOT to expect logits;
# from_logits=True would apply softmax a second time and distort the loss
# and its gradients.
losses = {
    "overall": tf.keras.losses.CategoricalCrossentropy(from_logits=False)
}
# Weight of each named output's loss in the total.
lossWeights = {
    "overall": 1.0
}
# Metrics reported for each named output.
metric = {
    "overall": ['accuracy']
}

# `learning_rate` is the supported keyword; `lr` is a deprecated alias that
# newer Keras releases reject.
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=base_learning_rate),
              loss=losses,
              loss_weights=lossWeights,
              metrics=metric)

In [14]:
# Inputs and targets are keyed by the model's named input ('text') and named
# output ('overall').
train_inputs = {"text": X_text_train}
train_targets = {"overall": y_text_train}

# Train for 30 epochs in mini-batches of 32, printing per-epoch progress.
history = model.fit(
    x=train_inputs,
    y=train_targets,
    batch_size=32,
    epochs=30,
    verbose=1,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [19]:
# Persist the architecture as JSON and the trained weights separately.
model_json = model.to_json()
with open("LSTM_30.json", "w") as architecture_file:
    architecture_file.write(model_json)

model.save_weights("LSTM_30", save_format='tf')

In [None]:
# Score the model on the held-out split.
# NOTE(review): this rebinds `history`, replacing the object returned by
# model.fit in the training cell.
test_inputs = {"text": X_text_test}
test_targets = {"overall": y_text_test}

history = model.evaluate(
    x=test_inputs,
    y=test_targets,
    batch_size=32,
    verbose=1,
)