
# Emoji Prediction using LSTM Model



## Import Libraries

In [1]:
!unzip -uq '/content/Emoji.zip' -d '/content'

In [2]:
#!pip install keras
import numpy as np
import pandas as pd
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,LSTM,SimpleRNN,Embedding




# Data Pre-processing

In [3]:
data=pd.read_csv('/content/Train_data.csv', header=None)
data=data.drop(0)

data.head()


Unnamed: 0,0,1
1,A little throwback with my favourite person @ ...,0
2,glam on @user yesterday for #kcon makeup using...,7
3,Democracy Plaza in the wake of a stunning outc...,11
4,Then &amp; Now. VILO @ Walt Disney Magic Kingdom,0
5,Who never... @ A Galaxy Far Far Away,2


In [4]:
import re
import string  # Import string to use string.punctuation

def clean_text(text):
    text = str(text).lower()
    rm_mention = re.sub(r'@[A-Za-z0-9]+', '', text)                       # remove @mentions
    rm_rt = re.sub(r'RT[\s]+', '', rm_mention)                            # remove RT (with proper space handling)
    rm_links = re.sub(r'http\S+', '', rm_rt)                              # remove hyperlinks
    rm_nums = re.sub('[0-9]+', '', rm_links)                              # remove numbers
    rm_punc = [char for char in rm_nums if char not in string.punctuation] # remove punctuation
    rm_punc = ''.join(rm_punc)
    cleaned = rm_punc

    return cleaned


data[0] = data[0].apply(clean_text)



In [5]:
data.to_csv('/content/train.csv', index=False)

In [6]:
data=pd.read_csv('/content/train.csv', header=None)
data=data.drop(0)
data.head()

Unnamed: 0,0,1
1,a little throwback with my favourite person w...,0
2,glam on yesterday for kcon makeup using in f...,7
3,democracy plaza in the wake of a stunning outc...,11
4,then amp now vilo walt disney magic kingdom,0
5,who never a galaxy far far away,2


#Mapping of Emoji data to the given labels

In [7]:
map_data=pd.read_csv('/content/Mapping.csv', header=None)
map_data=map_data.drop(0)
map_data.head()

emoji_dict = {}
for index, row in map_data.iterrows():
    emoji_dict[str(row[1])] = row[0]
print(emoji_dict)



{'0': '❤️', '1': '😄', '2': '😂', '3': '💕', '4': '🔥', '5': '😊', '6': '😎', '7': '✨', '8': '💙', '9': '😘', '10': '📷', '11': '🇺🇸', '12': '☀️', '13': '💜', '14': '🙂', '15': '💯', '16': '😆', '17': '🎄', '18': '🏠', '19': '📸'}


In [8]:
# Assuming X and Y are columns from the DataFrame data
X = data[0].fillna('')  # Fill NaN values with empty strings
Y = data[1].fillna('')  # If Y also has NaN, you may want to handle that too

# Convert all entries in X and Y to strings (if necessary)
X = X.astype(str)
Y = Y.astype(str)

# Print the cleaned data to verify
print(X.head())
print(Y.head())


1    a little throwback with my favourite person  w...
2    glam on  yesterday for kcon makeup using  in f...
3    democracy plaza in the wake of a stunning outc...
4         then amp now vilo  walt disney magic kingdom
5                     who never  a galaxy far far away
Name: 0, dtype: object
1     0
2     7
3    11
4     0
5     2
Name: 1, dtype: object


#Embeddings

In [9]:
file=open('/content/glove.6B.100d.txt','r',encoding='utf8')

content= file.readlines()
file.close()

embeddings={}
for line in content:
  line=line.split()
  embeddings[line[0]]=np.array(line[1:],dtype=float)



In [10]:
import tensorflow
import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [11]:
tokenizer=Tokenizer()
tokenizer.fit_on_texts(X)
word2index=tokenizer.word_index
Xtoken=tokenizer.texts_to_sequences(X)



In [12]:
def get_max_len(data):
  maxi=0
  for sentense in data:
    siz=len(sentense)
    maxi=max(maxi,siz)
  return maxi

max_len=get_max_len(Xtoken)
print(max_len)

33


In [13]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

In [14]:
Xtrain=pad_sequences(Xtoken,max_len,padding='post', truncating='post')

In [15]:
Ytrain=to_categorical(Y)
print(Ytrain.shape)

(50000, 20)


#Model Training

In [16]:
embed_dim = 100
embedding_matrix = np.zeros((len(word2index) + 1, embed_dim))

for word, i in word2index.items():
    if word in embeddings:
        embed_vector = embeddings[word]
    else:
        embed_vector = np.random.randn(embed_dim)
    embedding_matrix[i] = embed_vector


In [17]:
embedding_matrix

array([[ 0.        ,  0.        ,  0.        , ...,  0.        ,
         0.        ,  0.        ],
       [-0.038194  , -0.24487   ,  0.72812   , ..., -0.1459    ,
         0.8278    ,  0.27062   ],
       [ 0.080273  , -0.10861   ,  0.72067   , ..., -0.34842   ,
         0.31466   ,  1.0087    ],
       ...,
       [ 0.04412444, -0.80622932, -0.3318888 , ...,  0.52828227,
         0.45870009,  1.10861013],
       [-0.24637768,  0.53167005,  1.3706176 , ..., -0.29922417,
        -2.9847395 ,  0.92366879],
       [-1.56411365,  0.91364875, -0.78404771, ..., -0.46735889,
         0.92572685,  1.13814462]])

In [18]:
from tensorflow.keras.layers import Dropout

model = Sequential([
    Embedding(input_dim=len(word2index)+1, output_dim=embed_dim, input_length=max_len, weights=[embedding_matrix], trainable=True),
    LSTM(units=32, return_sequences=True ),
    Dropout(0.5),
    LSTM(units=16 ),
    Dropout(0.5),
    Dense(20, activation='softmax')
])
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [19]:
print(Xtrain.shape)
print(Ytrain.shape)

(50000, 33)
(50000, 20)


In [20]:
model.fit(Xtrain,Ytrain,epochs=35)

Epoch 1/35
Epoch 2/35
Epoch 3/35
Epoch 4/35
Epoch 5/35
Epoch 6/35
Epoch 7/35
Epoch 8/35
Epoch 9/35
Epoch 10/35
Epoch 11/35
Epoch 12/35
Epoch 13/35
Epoch 14/35
Epoch 15/35
Epoch 16/35
Epoch 17/35
Epoch 18/35
Epoch 19/35
Epoch 20/35
Epoch 21/35
Epoch 22/35
Epoch 23/35
Epoch 24/35
Epoch 25/35
Epoch 26/35
Epoch 27/35
Epoch 28/35
Epoch 29/35
Epoch 30/35
Epoch 31/35
Epoch 32/35
Epoch 33/35
Epoch 34/35
Epoch 35/35


<keras.src.callbacks.History at 0x7f3f38120cd0>

**Accuracy of our model is 80.33 %**

In [21]:
model.save('emotion_model.h5')


  saving_api.save_model(


In [22]:
import pickle
# Save the tokenizer after training
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


#Model Testing

In [23]:
test=[
    "I love spending time with my family.",
    "I can't stop smiling when I see you.",
    "That joke was absolutely hilarious!",
    "Sending you a kiss to brighten your day.",
    "This pizza is absolutely delicious, it's so good!",
    "Feeling happy and content today.",
    "Looking cool in my new sunglasses.",
    "Everything feels magical and sparkly tonight.",
    "Sending you all my love and support.",
    "Blowing a kiss to the best friend ever.",
    "Let me take a quick selfie before we leave.",
    "Celebrating the Fourth of July with fireworks!",
    "It's such a sunny and beautiful day outside.",
    "Purple is my favorite color of all time.",
    "Feeling a little sad and down today.",
    "That answer is absolutely correct!",
    "I can't help but smile with joy!",
    "Can't wait to decorate the Christmas tree.",
    "It's so cozy being at home right now.",
    "Taking some photos for my travel blog.",

    "Love is all around us today.",
    "That was a fantastic concert, wasn't it?",
    "I just finished watching the funniest video.",
    "I'm so lucky to have you in my life.",
    "The new restaurant is on fire with its flavors.",
    "Today has been such a peaceful day.",
    "Got a new pair of shades and feeling cool.",
    "The stars are shining extra bright tonight.",
    "Hope you're having an amazing day, thinking of you.",
    "Saying goodbye with a little kiss.",
    "Capturing moments with my camera.",
    "Celebrating Independence Day with my friends.",
    "What a beautiful sunny morning to wake up to!",
    "I've always loved purple ever since I was a child.",
    "Feeling a bit under the weather today.",
    "You absolutely nailed that answer!",
    "The joy I feel right now is hard to describe.",
    "Decorating the house for Christmas is so much fun.",
    "It's the perfect day to relax at home.",
    "Taking pictures is my favorite hobby on trips."
]
test_seq=tokenizer.texts_to_sequences(test)

Xtest=pad_sequences(test_seq,max_len,padding='post', truncating='post')
ypred=model.predict(Xtest)
ypred=np.argmax(ypred,axis=1)
ypred



array([ 9,  0,  2,  7,  2,  7,  6,  4,  0,  9, 14, 11,  0,  3,  7,  2,  4,
       17,  4, 14,  0,  1,  2,  0,  6,  3, 15,  4, 13,  9, 18,  0,  7,  0,
        6,  0,  3, 17,  0,  5])

In [24]:
for i in range(len(ypred)):
  print(test[i], "  ", emoji_dict[str(ypred[i])])

I love spending time with my family.    😘
I can't stop smiling when I see you.    ❤️
That joke was absolutely hilarious!    😂
Sending you a kiss to brighten your day.    ✨
This pizza is absolutely delicious, it's so good!    😂
Feeling happy and content today.    ✨
Looking cool in my new sunglasses.    😎
Everything feels magical and sparkly tonight.    🔥
Sending you all my love and support.    ❤️
Blowing a kiss to the best friend ever.    😘
Let me take a quick selfie before we leave.    🙂
Celebrating the Fourth of July with fireworks!    🇺🇸
It's such a sunny and beautiful day outside.    ❤️
Purple is my favorite color of all time.    💕
Feeling a little sad and down today.    ✨
That answer is absolutely correct!    😂
I can't help but smile with joy!    🔥
Can't wait to decorate the Christmas tree.    🎄
It's so cozy being at home right now.    🔥
Taking some photos for my travel blog.    🙂
Love is all around us today.    ❤️
That was a fantastic concert, wasn't it?    😄
I just finished watch