<a href="https://colab.research.google.com/github/akhilmuvva/emoji-predictor/blob/main/emoji_predictor.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ReduceLROnPlateau, EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [2]:
data = pd.DataFrame({
    'text': [
        'I am so happy today', 'This is amazing', 'I love this so much', 'Feeling joyful', 'Such a wonderful day',
        'I feel so sad', 'This is terrible', 'Heartbroken today', 'Feeling down and out', 'Tears in my eyes',
        'Wow this is awesome', 'You are the best', 'So excited', 'Absolutely fantastic', 'Incredible work',
        'I am so angry', 'This is frustrating', 'Really annoyed', 'Furious right now', 'Mad about this situation',
        'Having a great time with friends', 'Enjoying life to the fullest', 'Pure bliss',
        'Feeling lonely tonight', 'It\'s a gloomy day', 'Depressed by the news',
        'Adore this song', 'Completely smitten', 'Cannot get enough of this',
        'Outraged by the injustice', 'This makes me fume', 'Irritated beyond belief',
        'Super thrilled about this', 'Life is beautiful', 'Over the moon',
        'So upset right now', 'Feeling miserable', 'Down in the dumps',
        'This is so cool', 'Really loving this moment', 'Totally enchanted',
        'This is infuriating', 'So mad at this', 'Completely ticked off'
    ],
    'emoji': ['😊', '😊', '😊', '😊', '😊',
              '😢', '😢', '😢', '😢', '😢',
              '😍', '😍', '😍', '😍', '😍',
              '😡', '😡', '😡', '😡', '😡',
              '😊', '😊', '😊',
              '😢', '😢', '😢',
              '😍', '😍', '😍',
              '😡', '😡', '😡',
              '😊', '😊', '😊',
              '😢', '😢', '😢',
              '😍', '😍', '😍',
              '😡', '😡', '😡']
})

In [3]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    return text

In [4]:
data['text'] = data['text'].apply(clean_text)

In [5]:
max_words = 5000
max_len = 15
embedding_dim = 100
epochs = 50
batch_size = 8

In [6]:
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(data['text'])
sequences = tokenizer.texts_to_sequences(data['text'])
X = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

In [7]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(data['emoji'])
y = to_categorical(y)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
def load_glove_embeddings(glove_file='glove.6B.100d.txt'):
    embeddings_index = {}
    try:
        with open(glove_file, encoding='utf-8') as f:
            for line in f:
                values = line.split()
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        print(f"Loaded {len(embeddings_index)} GloVe embeddings.")
    except FileNotFoundError:
        print(f"Error: GloVe file '{glove_file}' not found. Download from "
              "https://nlp.stanford.edu/projects/glove/ and place in directory.")

        embeddings_index = {word: np.random.rand(embedding_dim) for word in tokenizer.word_index.keys()}
    return embeddings_index


embeddings_index = load_glove_embeddings()
embedding_matrix = np.zeros((max_words, embedding_dim))
for word, i in tokenizer.word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector

Error: GloVe file 'glove.6B.100d.txt' not found. Download from https://nlp.stanford.edu/projects/glove/ and place in directory.


In [10]:
model = Sequential([
    Embedding(max_words, embedding_dim, input_length=max_len, weights=[embedding_matrix], trainable=True),
    LSTM(64, return_sequences=False),
    Dropout(0.5),
    Dense(32, activation='relu'),
    Dropout(0.5),
    Dense(len(label_encoder.classes_), activation='softmax')
])




In [11]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [12]:
print("\n--- Model Summary ---")
model.summary()
print("---------------------")


--- Model Summary ---


---------------------


In [13]:
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=0.00001, verbose=1)
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=1)

In [14]:
print("\n--- Model Training ---")
history = model.fit(
    X_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(X_test, y_test),
    callbacks=[reduce_lr, early_stopping],
    verbose=1
)
print("----------------------")


--- Model Training ---
Epoch 1/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 278ms/step - accuracy: 0.2437 - loss: 1.3958 - val_accuracy: 0.2222 - val_loss: 1.3880 - learning_rate: 0.0010
Epoch 2/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 28ms/step - accuracy: 0.3079 - loss: 1.3826 - val_accuracy: 0.2222 - val_loss: 1.3899 - learning_rate: 0.0010
Epoch 3/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.2836 - loss: 1.3909 - val_accuracy: 0.2222 - val_loss: 1.3885 - learning_rate: 0.0010
Epoch 4/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.2793 - loss: 1.3877 - val_accuracy: 0.2222 - val_loss: 1.3902 - learning_rate: 0.0010
Epoch 5/50
[1m5/5[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.2741 - loss: 1.3782 - val_accuracy: 0.2222 - val_loss: 1.3894 - learning_rate: 0.0010
Epoch 6/50
[1m1/5[0m [32m━━━━[0m[37m━━━━━━━━━━━━━━━━[

In [15]:
loss, accuracy = model.evaluate(X_test, y_test, verbose=0)
print(f"\n--- Model Evaluation ---")
print(f"Test Loss: {loss:.4f}")
print(f"Test Accuracy: {accuracy:.4f}")
print("------------------------")


--- Model Evaluation ---
Test Loss: 1.3880
Test Accuracy: 0.2222
------------------------


In [18]:
new_texts = [
    "I am super excited",

]

print("\n--- Predictions for New Texts ---")
new_texts = [clean_text(text) for text in new_texts]
new_sequences = tokenizer.texts_to_sequences(new_texts)
new_X = pad_sequences(new_sequences, maxlen=max_len, padding='post', truncating='post')
predictions = model.predict(new_X, verbose=0)
predicted_emojis = label_encoder.inverse_transform(np.argmax(predictions, axis=1))


--- Predictions for New Texts ---


In [19]:
for text, emoji in zip(new_texts, predicted_emojis):
    print(f"Text: '{text}' -> Predicted Emoji: {emoji}")
print("---------------------------------")

Text: 'i am super excited' -> Predicted Emoji: 😊
---------------------------------
