In [40]:
import pandas as pd
import numpy as np 

from nltk.tokenize import TweetTokenizer 

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout, Input 
from keras.callbacks import EarlyStopping

import pickle

In [9]:
# Read the three dataset splits (train / validation / test) from CSV files
# in the current working directory.
df_train, df_val, df_test = map(
    pd.read_csv,
    ("emoji_train.csv", "emoji_validation.csv", "emoji_test.csv"),
)

In [6]:
def tokenizeText(text):
    """Lower-case a tweet and tokenize it with NLTK's ``TweetTokenizer``.

    The tokenizer is created with ``preserve_case=False``,
    ``strip_handles=True`` and ``reduce_len=True``.

    Args:
        text: The raw tweet; must provide ``.lower()`` (i.e. be a ``str``).

    Returns:
        Whatever ``TweetTokenizer.tokenize`` returns for the lower-cased text.
    """
    tokenizer_options = dict(preserve_case=False, strip_handles=True, reduce_len=True)
    return TweetTokenizer(**tokenizer_options).tokenize(text.lower())

In [5]:
## Using Stanford's pre-trained GloVe Twitter vectors as word embeddings.
def _load_glove_vectors(path, dim):
    """Read a GloVe text file into a ``{token: float32 vector}`` dict.

    Each line is expected to be ``<token> <v1> ... <v_dim>`` separated by
    single spaces.

    Args:
        path: Path of the GloVe ``.txt`` file (read as UTF-8).
        dim: Number of vector components per line.

    Returns:
        dict mapping each token to a ``numpy`` array of shape ``(dim,)``.

    Raises:
        ValueError: if one of the last ``dim`` fields of a line is not a number.
    """
    vectors = {}
    with open(path, encoding="utf8") as f:
        for line in f:
            # Split from the right on literal spaces, at most `dim` times.
            # A bare `line.split()` splits on *any* Unicode whitespace, so a
            # token that is itself a whitespace-like character would vanish
            # and the first number would be taken as the word, leaving a
            # vector that is one component short.
            parts = line.rstrip().rsplit(" ", dim)
            if len(parts) != dim + 1:
                # Blank or truncated line: skip it instead of storing a
                # vector of the wrong length (or crashing on an empty line).
                continue
            vectors[parts[0]] = np.asarray(parts[1:], dtype="float32")
    return vectors


embedding_index = _load_glove_vectors("glove.twitter.27B.100d.txt", dim=100)

In [10]:
# Vocabulary creation.
# Every distinct token of the training tweets gets an integer id in order of
# first appearance. Ids start at 1, leaving 0 unassigned.
# maxLen records the token count of the longest training tweet.
maxLen = 0
vocab = {}
index = 1
for s in df_train["text"]:
    words = tokenizeText(s)
    maxLen = max(maxLen, len(words))
    for word in words:
        if word not in vocab:
            vocab[word] = index
            index += 1
vocabSize = len(vocab)
print(vocabSize)
print(maxLen)

61227
40


In [12]:
embedding_dim = 100  # the "100d" GloVe file provides 100 components per token
# One row per vocabulary id plus row 0 (ids start at 1). Row 0 and the rows of
# tokens that have no entry in embedding_index stay all-zero.
matrix_shape = (vocabSize + 1, embedding_dim)
embedding_matrix = np.zeros(matrix_shape)
progress = 0
for token, row in vocab.items():
    # Print a progress counter every 10000 tokens.
    if progress % 10000 == 0:
        print(progress)
    progress += 1
    pretrained = embedding_index.get(token)
    if pretrained is None:
        continue
    embedding_matrix[row] = pretrained

0
10000
20000
30000
40000
50000
60000


In [25]:
# training set
# Encode every tweet as a fixed-length row of vocabulary ids. Positions past
# the end of a tweet and tokens missing from `vocab` keep the value 0; tokens
# beyond maxLen are dropped.
X_train_temp = df_train.drop("label", axis=1).values
# Integer dtype: the values are lookup ids for the Embedding layer, so there is
# no reason to store them as float64.
X_train = np.zeros((len(X_train_temp), maxLen), dtype="int32")
for row, x in enumerate(X_train_temp):
    # x[0] is the first non-label column. tokenizeText lower-cases its input
    # itself, so no extra .lower() is needed here.
    words = tokenizeText(x[0])
    for j, word in enumerate(words[:maxLen]):
        X_train[row, j] = vocab.get(word, 0)
Y_train = df_train["label"].values

In [26]:
# validation set
# Encode every tweet as a fixed-length row of vocabulary ids. Positions past
# the end of a tweet and tokens missing from `vocab` keep the value 0; tokens
# beyond maxLen are dropped.
X_val_temp = df_val.drop("label", axis=1).values
# Integer dtype: the values are lookup ids for the Embedding layer, so there is
# no reason to store them as float64.
X_val = np.zeros((len(X_val_temp), maxLen), dtype="int32")
for row, x in enumerate(X_val_temp):
    # x[0] is the first non-label column. tokenizeText lower-cases its input
    # itself, so no extra .lower() is needed here.
    words = tokenizeText(x[0])
    for j, word in enumerate(words[:maxLen]):
        X_val[row, j] = vocab.get(word, 0)
Y_val = df_val["label"].values

In [27]:
# testing set
# Encode every tweet as a fixed-length row of vocabulary ids. Positions past
# the end of a tweet and tokens missing from `vocab` keep the value 0; tokens
# beyond maxLen are dropped.
X_test_temp = df_test.drop("label", axis=1).values
# Integer dtype: the values are lookup ids for the Embedding layer, so there is
# no reason to store them as float64.
X_test = np.zeros((len(X_test_temp), maxLen), dtype="int32")
for row, x in enumerate(X_test_temp):
    # x[0] is the first non-label column. tokenizeText lower-cases its input
    # itself, so no extra .lower() is needed here.
    words = tokenizeText(x[0])
    for j, word in enumerate(words[:maxLen]):
        X_test[row, j] = vocab.get(word, 0)
Y_test = df_test["label"].values

In [42]:

# Conv1D + bidirectional-LSTM classifier. The embedding layer starts from
# embedding_matrix and is fine-tuned during training (trainable=True). The
# final layer produces a softmax over 20 classes.
model = Sequential([
    Input(shape=(maxLen,)),
    Embedding(vocabSize+1, embedding_dim, weights=[embedding_matrix], trainable=True),
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(pool_size=2),
    Bidirectional(LSTM(128, dropout=0.3)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(20, activation='softmax'),
])

# Labels are passed as integer class ids, hence the *sparse* cross-entropy.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [43]:
# Stop once val_accuracy has gone 5 epochs without improving, then roll the
# model back to the weights of its best epoch.
early_stop = EarlyStopping(
    monitor='val_accuracy',
    patience=5,
    restore_best_weights=True,
)

fit_options = dict(
    validation_data=(X_val, Y_val),
    epochs=40,
    batch_size=64,
    callbacks=[early_stop],
)
history = model.fit(X_train, Y_train, **fit_options)

Epoch 1/40
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 57ms/step - accuracy: 0.3260 - loss: 2.2740 - val_accuracy: 0.2170 - val_loss: 2.5556
Epoch 2/40
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 54ms/step - accuracy: 0.4121 - loss: 1.9500 - val_accuracy: 0.2406 - val_loss: 2.4889
Epoch 3/40
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 55ms/step - accuracy: 0.4778 - loss: 1.7025 - val_accuracy: 0.2460 - val_loss: 2.5514
Epoch 4/40
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 54ms/step - accuracy: 0.5550 - loss: 1.4265 - val_accuracy: 0.2378 - val_loss: 2.7137
Epoch 5/40
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 55ms/step - accuracy: 0.6342 - loss: 1.1645 - val_accuracy: 0.2316 - val_loss: 2.9674
Epoch 6/40
[1m704/704[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 56ms/step - accuracy: 0.7009 - loss: 0.9505 - val_accuracy: 0.2232 - val_loss: 3.1892
Epoch 7/40
[1m7

In [44]:
# Run the trained model on the encoded test tweets. The model ends in a
# 20-unit softmax layer, so each row of Y_predicted holds 20 class scores.
Y_predicted = model.predict(X_test) 

[1m1563/1563[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 6ms/step


In [45]:
# Predicted class = position of the largest score in each row of Y_predicted.
# np.argmax returns the first maximum on ties, exactly like a left-to-right
# scan that only moves on a strictly larger value. .tolist() keeps Y_pred a
# plain list of Python ints.
Y_pred = np.argmax(Y_predicted, axis=1).tolist()

# Macro averaging weights every class equally. zero_division=0 scores an
# undefined per-class metric (e.g. a class that is never predicted) as 0
# without raising UndefinedMetricWarning; the resulting numbers are the same
# as with the default setting.
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average="macro", zero_division=0)
recall = recall_score(Y_test, Y_pred, average="macro", zero_division=0)
f1 = f1_score(Y_test, Y_pred, average="macro", zero_division=0)

print(accuracy)
print(precision)
print(recall)
print(f1)

0.40594
0.29914055182654364
0.2733060320537491
0.25849302166331134


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [46]:
# Persist the trained model in the native .keras format.
# NOTE(review): the file name says "31acc", but the test accuracy printed by
# the evaluation cell in this notebook is 0.40594 — confirm which run this
# artifact belongs to before relying on the name.
model.save("CONV1D_LSTM_31acc.keras") 

In [39]:
# Persist the token -> id mapping so text can be encoded with the same ids
# outside this notebook.
# NOTE(review): maxLen is not saved here; presumably any code that reloads the
# model also needs it to pad inputs to the same length — TODO confirm.
with open("vocab.pkl", "wb") as f:
    pickle.dump(vocab, f)

In [47]:
# Inspect the raw model output for the first test sample (one score per class).
y = Y_predicted[0] 
print(y)

[0.02401281 0.10496021 0.12028302 0.03118647 0.12815556 0.03263855
 0.07612061 0.06968606 0.02615641 0.02338566 0.05396041 0.01934358
 0.00295179 0.0222238  0.03106003 0.09249417 0.04347313 0.01306252
 0.05178379 0.03306139]


In [55]:
# Rank the classes of the inspected sample by predicted score, highest first.
# Y is a list of (score, class_id) pairs. reverse=True keeps the sort stable,
# so equal scores stay in ascending class-id order.
Y = sorted(
    ((score, label) for label, score in enumerate(y)),
    key=lambda pair: pair[0],
    reverse=True,
)
print(Y)
# Class ids of the three highest-scoring classes.
top3 = [label for score, label in Y[:3]]
print(top3)

[(np.float32(0.12815556), 4), (np.float32(0.12028302), 2), (np.float32(0.10496021), 1), (np.float32(0.092494175), 15), (np.float32(0.07612061), 6), (np.float32(0.069686055), 7), (np.float32(0.053960413), 10), (np.float32(0.051783793), 18), (np.float32(0.043473132), 16), (np.float32(0.03306139), 19), (np.float32(0.032638554), 5), (np.float32(0.031186465), 3), (np.float32(0.031060029), 14), (np.float32(0.026156412), 8), (np.float32(0.024012808), 0), (np.float32(0.023385655), 9), (np.float32(0.022223802), 13), (np.float32(0.01934358), 11), (np.float32(0.013062515), 17), (np.float32(0.0029517903), 12)]
[4, 2, 1]
