In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf

In [None]:
# Load the training set from a path relative to the notebook; later cells read
# its "TEXT" and "Label" columns.
data = pd.read_csv("archive/train.csv")

In [None]:
# Peek at the first five rows to check the columns that were loaded.
data.head(5)

In [None]:
# Drop the leading column (presumably an exported row index — confirm against the CSV).
# The drop is guarded and non-mutating so that re-running this cell can never
# strip "TEXT" or "Label", both of which later cells depend on.
if data.columns[0] not in ("TEXT", "Label"):
    data = data.drop(columns=data.columns[0])

In [None]:
# Show the frame after the first column has been removed.
data

In [None]:
# Load archive/Mapping.csv (presumably the label -> emoji lookup table; confirm
# its columns in the output below) and display it.
emoji_data = pd.read_csv("archive/Mapping.csv")
emoji_data

In [None]:
# Count missing values per column; clean_text below calls re.sub on every TEXT
# value, which would fail on a non-string (e.g. NaN) entry.
data.isnull().sum()

import re

## Text Cleaning Function


In [None]:
import re
def clean_text(text):
    """Normalise one raw text string for tokenisation.

    Removes, in this order: links beginning with ``http``, ``@mentions``,
    ``#hashtags`` (the whole tag word), and every run of characters other than
    ASCII letters, digits and spaces. The result is then lower-cased.
    Whitespace is not collapsed, so removed tokens can leave double spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, lower-cased string.
    """
    # Order matters: links, mentions and hashtags must be removed while their
    # marker characters (":", "/", "@", "#") are still present.
    for pattern in (r"http\S+", r"@\w+", r"#\w+", r'[^a-zA-Z0-9 ]+'):
        text = re.sub(pattern, "", text)
    return text.lower()
# Overwrite the TEXT column with its cleaned form. Re-running is harmless:
# already-cleaned text contains nothing the patterns in clean_text still match.
data["TEXT"] = data["TEXT"].apply(clean_text)
    

In [None]:
# Inspect the frame after text cleaning.
data

In [None]:
# Model input (cleaned text) and target (class label) columns.
X, Y = data["TEXT"], data["Label"]

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
)

## Tokenizing Text

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
tokenizer = Tokenizer()
# Build the vocabulary from the training split only, so nothing from the test
# texts leaks into the word index.
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
# The method is `texts_to_sequences`; `text_to_sequences` does not exist on
# Tokenizer and raised AttributeError.
X_test_seq = tokenizer.texts_to_sequences(X_test)
# Mapping word -> integer id used by the sequences above.
word2index = tokenizer.word_index


In [None]:
# Preview a handful of vocabulary entries rather than echoing the whole
# word -> id dict, which would flood the cell output.
dict(list(word2index.items())[:10])

## Word Embeddings

In [None]:
import gensim.downloader as api
# Load the pretrained "word2vec-google-news-300" vectors through gensim's
# downloader API (presumably fetched on first use and cached afterwards — see
# the gensim downloader docs). `wv` is indexed by word further down.
wv = api.load("word2vec-google-news-300")

## Padding Sequences

In [None]:
def get_maxlen(data):
    """Return the largest whitespace-token count among the rows of ``data["TEXT"]``.

    Args:
        data: DataFrame with a string ``"TEXT"`` column.

    Returns:
        int: Word count of the longest text, or 0 when the frame has no rows.
    """
    # Iterate the column directly: iterrows() builds a Series for every row,
    # which is needlessly slow for a scan over a single column.
    return max((len(text.split()) for text in data["TEXT"]), default=0)
# Longest text, in whitespace-separated tokens, over the full cleaned dataset.
# NOTE(review): the padding cell hard-codes max_length = 40 instead of using
# this value — confirm that is intentional.
max_len = get_maxlen(data)
print(max_len)

In [None]:
# Fixed sequence length fed to the model.
max_length = 40


def padding(seq):
    """Pad or cut every sequence in ``seq`` to ``max_length`` entries.

    Both padding and truncation are applied at the end of each sequence
    ("post").

    Args:
        seq: Integer sequences as produced by the tokenizer.

    Returns:
        The result of ``pad_sequences`` for ``seq``.
    """
    return pad_sequences(seq, maxlen=max_length, padding="post", truncating='post')


X_train_pad = padding(X_train_seq)
X_test_pad = padding(X_test_seq)

## Convert Labels to Categorical

In [None]:
from tensorflow.keras.utils import to_categorical
# One-hot encode each split's own labels. The original encoded the full `Y`
# for both splits, so the label arrays matched neither X_train_pad nor
# X_test_pad in length or row order.
# Labels are looked up through the split indices (train_test_split keeps the
# original row labels on X_train / X_test), which keeps this cell re-runnable
# even after Y_train / Y_test have been overwritten with arrays.
# Assumes the frame's index is unique, as with the default index from read_csv.
num_classes = int(Y.max()) + 1  # same width for both splits
Y_train = to_categorical(Y.loc[X_train.index], num_classes=num_classes)
Y_test = to_categorical(Y.loc[X_test.index], num_classes=num_classes)

In [None]:
# First row of the one-hot label matrix.
Y_train[0]

In [None]:
# X_train is a Series that kept the original (shuffled) row labels, so
# X_train[0] is a *label* lookup: it raises KeyError when row 0 landed in the
# test split and otherwise need not be the first training sample.
# .iloc[0] is positional and lines up with X_train_pad[0].
print(X_train.iloc[0])
print(X_train_pad[0], Y_train[0])

In [None]:
# Number of rows in the mapping table (presumably one per emoji class — confirm).
num_output = len(emoji_data)
num_output

## Building the LSTM Model

In [None]:
import numpy as np
# Take the embedding width from the pretrained vectors instead of hard-coding
# it: the "word2vec-google-news-300" vectors do not fit into 100-wide rows.
embed_size = wv.vector_size
# np.zeros takes the shape as one tuple; passed separately, the width was
# interpreted as the dtype argument and raised TypeError.
# The "+ 1" leaves row 0 for index 0, which pad_sequences uses as padding.
embedding_matrix = np.zeros((len(word2index) + 1, embed_size))
for word, i in word2index.items():
    # Words missing from the pretrained vocabulary keep their all-zero row;
    # indexing wv with an unknown word raises KeyError.
    if word in wv:
        embedding_matrix[i] = wv[word]

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
model = Sequential()
# Frozen embedding initialised from the pretrained matrix. Embedding has no
# `weight` keyword (the original call raised on it); a Constant initializer
# loads the matrix when the layer is built.
model.add(
    Embedding(
        input_dim=len(word2index) + 1,
        output_dim=embed_size,
        embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),
        trainable=False,
    )
)
# Two stacked bidirectional LSTMs; the first returns the full sequence so the
# second one has a sequence to consume.
model.add(Bidirectional(LSTM(units=512, return_sequences=True)))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(units=256)))
model.add(Dropout(0.3))
# Dense head narrowing down to the class distribution.
model.add(Dense(units=128, activation='relu'))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(units=32, activation='relu'))
# One softmax unit per one-hot label column, so the output width always
# matches Y_train instead of relying on a hard-coded class count.
model.add(Dense(units=Y_train.shape[1], activation='softmax'))
model.summary()

## Compiling Model

In [None]:
# Multi-class setup: one-hot targets with categorical cross-entropy, tracked by accuracy.
model.compile(
    optimizer='adam',
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

## Train the model

In [None]:
# Train for 10 epochs, holding out 20% of the training data for validation.
history = model.fit(
    X_train_pad,
    Y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.2,
    shuffle=True,
)

## Model Evaluation

In [None]:
# Score the trained model on the held-out test split; the two values are the
# loss and the accuracy metric configured in model.compile.
loss, accuracy = model.evaluate(X_test_pad, Y_test)