In [None]:
import pandas as pd

In [None]:
# Emotion label table.
#
# The ids follow the sorted order of the label names. The training labels in
# this notebook are produced by sklearn's LabelEncoder, which numbers classes
# in sorted order, so this table has to use the same numbering or a predicted
# class index would be translated into the wrong emotion.
emotions_dict = {
    "num": {
        0: "anger",
        1: "fear",
        2: "joy",
        3: "love",
        4: "sadness",
        5: "surprise",
    }
}

# Invert id -> name into name -> id and store the inverted table under "num".
tag_id_dict = {name: idx for idx, name in emotions_dict["num"].items()}

emotions_dict["num"] = tag_id_dict

In [None]:
# Folder holding train.txt / val.txt / test.txt. Kept in one place so the
# dataset can be relocated by editing a single line.
DATA_DIR = 'C:/Users/NEGIN COMPUTER/Downloads/archive'


def load_split(name):
    """Read ``<DATA_DIR>/<name>.txt`` (one ``text;label`` pair per line).

    Returns a DataFrame with the columns ``Text`` and ``label``.
    """
    return pd.read_csv(f'{DATA_DIR}/{name}.txt', names=['Text', 'label'], delimiter=';')


train = load_split('train')
val = load_split('val')
test = load_split('test')

# All three splits stacked, used for whole-corpus exploration. ignore_index
# gives the combined frame a fresh 0..n-1 index instead of three overlapping ones.
df = pd.concat([train, val, test], ignore_index=True)
df.head()

In [None]:
# Preview the first rows of the validation split.
val.head()

In [None]:
# (rows, columns) of the validation split.
val.shape

In [None]:
# How many training rows carry each label (shows class balance).
train.label.value_counts()

In [None]:
# Number of missing values per column of the training split.
train.isnull().sum()

In [None]:
import seaborn as sns
# Bar chart of the number of training rows per label.
# The column is named through data=/x= rather than passed positionally:
# current seaborn releases take `data` as the first positional parameter,
# so a bare Series there is not interpreted as the x variable.
sns.countplot(data=train, x="label")

In [None]:
# Number of training rows that repeat an earlier row in every column.
train.duplicated().sum()

In [None]:
# Remove every row that repeats an earlier row in all columns (the first
# occurrence stays), then renumber the remaining rows 0..n-1.
repeated_rows = train.duplicated()
train.drop(index=train.index[repeated_rows], inplace=True)
train.reset_index(drop=True, inplace=True)

In [None]:
# Remove rows whose Text already appeared earlier, whatever their label
# (the first occurrence stays), then renumber the remaining rows 0..n-1.
repeated_texts = train['Text'].duplicated()
train.drop(index=train.index[repeated_texts], inplace=True)
train.reset_index(drop=True, inplace=True)

In [None]:
from wordcloud import WordCloud
import nltk
import matplotlib.pyplot as plt
# One word cloud per emotion, built from every text carrying that label.

# The stop-word set is the same for every emotion, so build it once.
try:
    english_stopwords = set(nltk.corpus.stopwords.words("english"))
except LookupError:
    # The stop-word corpus is not installed yet: fetch it, then retry.
    nltk.download("stopwords")
    english_stopwords = set(nltk.corpus.stopwords.words("english"))

emotions = df['label'].unique()
for emotion in emotions:
    # All texts of this emotion joined into one long string.
    text = " ".join(df.loc[df['label'] == emotion, 'Text'])
    wordcloud = WordCloud(
        width=800,
        height=800,
        background_color='white',
        stopwords=english_stopwords,
        min_font_size=10,
    ).generate(text)

    fig = plt.figure(figsize=(4, 4), facecolor=None)
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.title(emotion)
    plt.tight_layout(pad=0)
    plt.show()
    # Release the figure so the loop does not accumulate open figures.
    plt.close(fig)

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def preprocess_data(train, test, val, max_length=None):
    """Tokenize and pad the three text splits using a vocabulary learned from ``train``.

    Args:
        train: Training texts. Either an iterable of strings (list, Series, ...)
            or a DataFrame that holds the strings in a ``Text`` column.
        test: Test texts, same accepted forms as ``train``.
        val: Validation texts, same accepted forms as ``train``.
        max_length: Length every sequence is padded or truncated to. When
            ``None``, the longest tokenized training text decides.

    Returns:
        Tuple ``(x_train, x_test, x_val, vocab_size, tokenizer)``: the three
        padded integer arrays, the vocabulary size (number of known words
        plus one, as index 0 is used for padding) and the fitted tokenizer.
    """
    def _texts(split):
        # Iterating a DataFrame yields its column names, not its rows, so a
        # DataFrame passed as-is would teach the tokenizer only the header
        # words. Pull out the text column in that case.
        if isinstance(split, pd.DataFrame):
            split = split["Text"]
        return list(split)

    train_texts = _texts(train)
    test_texts = _texts(test)
    val_texts = _texts(val)

    # The vocabulary comes from the training texts only.
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(train_texts)

    vocab_size = len(tokenizer.word_index) + 1
    print(f"Vocabulary size: {vocab_size}")

    train_sequences = tokenizer.texts_to_sequences(train_texts)
    test_sequences = tokenizer.texts_to_sequences(test_texts)
    val_sequences = tokenizer.texts_to_sequences(val_texts)

    if max_length is None:
        # Measure on the tokenizer's own output: its splitting rules are not
        # the same as str.split(), so a whitespace word count can be too short.
        max_length = max((len(sequence) for sequence in train_sequences), default=0)

    x_train = pad_sequences(train_sequences, maxlen=max_length, padding='post')
    x_test = pad_sequences(test_sequences, maxlen=max_length, padding='post')
    x_val = pad_sequences(val_sequences, maxlen=max_length, padding='post')

    print("x_train shape:", x_train.shape)
    print("x_test shape:", x_test.shape)
    print("x_val shape:", x_val.shape)

    return x_train, x_test, x_val, vocab_size, tokenizer

In [None]:
# Hand over the text column of each split. Iterating a whole DataFrame yields
# its column names, so passing the frames themselves would tokenize only the
# header words instead of the sentences.
x_train, x_test, x_val, VOCAB_SIZE, tokenizer = preprocess_data(
    train["Text"], test["Text"], val["Text"]
)

In [None]:
from sklearn.preprocessing import LabelEncoder

def label_preprocessing(data, encoder=None):
    """Convert the ``label`` column of ``data`` into a column vector of integer ids.

    Args:
        data: DataFrame with a ``label`` column.
        encoder: An already fitted ``LabelEncoder`` to reuse, so that several
            splits share one label -> id mapping. When ``None`` a new encoder
            is fitted on ``data`` itself.

    Returns:
        Array of shape ``(len(data), 1)`` holding the integer class ids.
    """
    labels = data["label"].to_list()
    if encoder is None:
        encoder = LabelEncoder().fit(labels)
    y = encoder.transform(labels)
    return y.reshape(-1, 1)


# One encoder for all splits, fitted on the training labels: every split then
# uses the same label -> id mapping, and the fitted encoder (with its
# ``classes_``) stays available to later cells.
encoder = LabelEncoder()
encoder.fit(train["label"].to_list())

y_train = label_preprocessing(train, encoder)
y_test = label_preprocessing(test, encoder)
y_val = label_preprocessing(val, encoder)

In [None]:
import tensorflow as tf

# One output unit per distinct emotion label in the training split.
NUM_CLASSES = train["label"].nunique()

# Embedding of every vocabulary index into 64 dimensions. No input_length is
# given: the sequence length is fixed by the Input layer below, which takes it
# from the padded training data, so a separate hard-coded length could only
# disagree with it.
embedding_layer = tf.keras.layers.Embedding(VOCAB_SIZE, 64)

inputs = tf.keras.Input(shape=(x_train.shape[1],))
x = embedding_layer(inputs)
# The LSTM returns its output at every time step; Flatten turns that
# (steps, 10) grid into one vector for the classifier layer.
x = tf.keras.layers.LSTM(10, return_sequences=True)(x)
x = tf.keras.layers.Flatten()(x)
output = tf.keras.layers.Dense(NUM_CLASSES, activation="softmax")(x)

model = tf.keras.Model(inputs, output)
model.summary()

In [None]:
# Configure training: Adam optimizer, accuracy as the reported metric, and
# the sparse variant of categorical cross-entropy, which takes integer class
# ids as targets rather than one-hot vectors.
model.compile(optimizer='adam',
             loss="sparse_categorical_crossentropy",
             metrics=["accuracy"])

In [None]:
# Train for 20 epochs in batches of 32 and evaluate on the validation split
# after each epoch; the returned object is kept in `history`.
history = model.fit(x_train, y_train, epochs=20, batch_size=32,
                    validation_data=(x_val, y_val))

In [None]:
# Loss and accuracy of the trained model on the test split.
model.evaluate(x_test, y_test)

In [None]:
class Predict:
    """Predict the emotion of a raw text with a trained model.

    Args:
        model: Trained classifier exposing ``predict`` and ``input_shape``.
        tokenizer: The tokenizer that was fitted on the training texts.
        emotions_dict: Optional ``{"num": mapping}`` table pairing emotion
            names with ids (either name -> id or id -> name). Only the names
            are used: class ``i`` is the ``i``-th name in sorted order,
            because the training labels in this notebook are encoded with
            sklearn's LabelEncoder, which numbers classes that way. Without a
            table, ``predict`` returns the bare class index.
        max_length: Length inputs are padded/truncated to. Defaults to the
            sequence length the model was built for (``model.input_shape[1]``).
    """

    def __init__(self, model, tokenizer, emotions_dict=None, max_length=None):
        self.model = model
        # Keep the fitted tokenizer: a freshly created one has an empty
        # vocabulary and would encode every text as an empty sequence.
        self.tokenizer = tokenizer
        self.emotions_dict = emotions_dict
        self.max_length = model.input_shape[1] if max_length is None else max_length
        self.labels = self._class_names(emotions_dict)

    @staticmethod
    def _class_names(emotions_dict):
        """Return the emotion names of the table in sorted order, or None without a table."""
        if not emotions_dict:
            return None
        # Whichever side of each pair is the string is the emotion name.
        names = [
            key if isinstance(key, str) else value
            for key, value in emotions_dict["num"].items()
        ]
        return sorted(names)

    def _decode(self, class_index):
        """Translate a class index into its emotion name (or return it unchanged without a table)."""
        if self.labels is None:
            return class_index
        return self.labels[class_index]

    def predict(self, txt):
        """Return the predicted emotion for the single text ``txt``."""
        sequences = self.tokenizer.texts_to_sequences([txt])
        # Pad on the same side ('post') and to the same length as the training data.
        padded = tf.keras.preprocessing.sequence.pad_sequences(
            sequences, maxlen=self.max_length, padding='post'
        )
        probabilities = self.model.predict(padded)
        # Index of the most probable class for the one text in the batch.
        predicted_label_index = int(probabilities[0].argmax())
        return self._decode(predicted_label_index)

# Bundle the trained model, the fitted tokenizer and the label table into one predictor.
predict = Predict(model, tokenizer, emotions_dict)

In [None]:
# Hand over the label table too, so the prediction can be reported as an
# emotion name, then try the predictor on a sample sentence.
predict = Predict(model, tokenizer, emotions_dict)
predict.predict("im so sad")