# Text emotion classification
Involves assigning an emotion to a given text by analyzing its context and identifying the underlying emotional tone.

Dataset: https://www.kaggle.com/datasets/praveengovi/emotions-dataset-for-nlp

Example Solution: https://thecleverprogrammer.com/2023/02/06/text-emotions-classification-using-python/

Hugging Face: https://huggingface.co/spaces/alperugurcan/text-emotion-detector

## 1. Data Preprocessing


In [61]:
import pandas as pd
import numpy as np
import seaborn as sns
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical

In [18]:
df= pd.read_csv("/kaggle/input/emotions-dataset-for-nlp/train.txt", sep=";", header=None)

In [20]:
df.columns = ["text","emotions"]

In [42]:
df.head()

Unnamed: 0,text,emotions
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [26]:
df.emotions.value_counts()

emotions
joy         5362
sadness     4666
anger       2159
fear        1937
love        1304
surprise     572
Name: count, dtype: int64

In [34]:
import neattext as nt
allwords = nt.TextFrame(
            " ".join(df["text"])
            )
allwords.describe()

Key      Value          
Length  : 1565532        
vowels  : 496902         
consonants: 761970         
stopwords: 176234         
punctuations: 0              
special_char: 0              
tokens(whitespace): 306661         
tokens(words): 306661         


In [41]:
allwords.length

1565532

In [49]:
texts = df["text"].tolist()  # Metinleri listeye çeviriyoruz
labels = df["emotions"].tolist()  # Etiketleri listeye çeviriyoruz

In [58]:
# Metin verisini tokenize ediyoruz
tokenizer = Tokenizer()  # Tokenizer nesnesi oluşturuyoruz
tokenizer.fit_on_texts(texts)  # Metinleri tokenize ediyoruz
sequences = tokenizer.texts_to_sequences(texts)  # Metinleri dizilere çeviriyoruz
max_length = max([len(seq) for seq in sequences])  # Maksimum dizi uzunluğunu buluyoruz
padded_sequences = pad_sequences(sequences, maxlen=max_length)  # Dizileri aynı uzunlukta olacak şekilde dolduruyoruz

In [59]:
# Etiketleri tamsayıya çeviriyoruz
label_encoder = LabelEncoder()  # LabelEncoder nesnesi oluşturuyoruz
labels = label_encoder.fit_transform(labels)  # Etiketleri tamsayıya çeviriyoruz

In [63]:
# Etiketleri One-hot encode ediyoruz
one_hot_labels = to_categorical(labels)  # Etiketleri One-hot encode ediyoruz

In [64]:
xtrain, xtest, ytrain, ytest = train_test_split(padded_sequences, one_hot_labels, test_size=0.2)

## 2. Model Building and Training

In [65]:
from keras.models import Sequential  # Keras'tan Sequential sınıfını içe aktarıyoruz
from keras.layers import Embedding, Flatten, Dense  # Keras'tan gerekli katmanları içe aktarıyoruz

In [66]:
model = Sequential()  # Sequential model oluşturuyoruz
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128, input_length=max_length))  # Gömme katmanı ekliyoruz
model.add(Flatten())  # Düzleştirme katmanı ekliyoruz
model.add(Dense(units=128, activation="relu"))  # Gizli katman ekliyoruz
model.add(Dense(units=len(one_hot_labels[0]), activation="softmax"))  # Çıkış katmanı ekliyoruz



In [67]:
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])  # Modeli derliyoruz
model.fit(xtrain, ytrain, epochs=10, batch_size=32, validation_data=(xtest, ytest))  # Modeli eğitiyoruz

Epoch 1/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 21ms/step - accuracy: 0.3835 - loss: 1.5197 - val_accuracy: 0.6866 - val_loss: 0.8897
Epoch 2/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.8536 - loss: 0.4668 - val_accuracy: 0.8188 - val_loss: 0.5626
Epoch 3/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.9823 - loss: 0.0740 - val_accuracy: 0.8106 - val_loss: 0.6192
Epoch 4/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.9954 - loss: 0.0264 - val_accuracy: 0.8172 - val_loss: 0.6241
Epoch 5/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 20ms/step - accuracy: 0.9968 - loss: 0.0154 - val_accuracy: 0.8200 - val_loss: 0.6485
Epoch 6/10
[1m400/400[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 19ms/step - accuracy: 0.9973 - loss: 0.0139 - val_accuracy: 0.8200 - val_loss: 0.6946
Epoch 7/10
[1m400/40

<keras.src.callbacks.history.History at 0x7b6c0da80b50>

## 3. Making Predictions

In [83]:
# Input text for prediction
input_text = "She smiling a lot"

In [84]:
# Preprocess the input text
input_sequence = tokenizer.texts_to_sequences([input_text])
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_length)
prediction = model.predict(padded_input_sequence)
predicted_label = label_encoder.inverse_transform([np.argmax(prediction[0])])
print("Predicted Label:",predicted_label[0])

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step
Predicted Label: joy


In [86]:
import pickle
model.save('model.h5')

with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

with open('label_encoder.pkl', 'wb') as handle:
    pickle.dump(label_encoder, handle, protocol=pickle.HIGHEST_PROTOCOL)