#### Required Libraries

In [2]:
import pandas as pd
import re
import string
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

#### Step 1: Load Datasets

In [3]:
# Read the pre-split train / validation / test CSV files into DataFrames.
# NOTE(review): these are absolute paths on a local Windows drive; the notebook
# only runs on a machine where G:\CAP_Guvi exists.
train_df, val_df, test_df = map(
    pd.read_csv,
    (
        "G:\\CAP_Guvi\\train_emotions.csv",
        "G:\\CAP_Guvi\\val_emotions.csv",
        "G:\\CAP_Guvi\\test_emotions.csv",
    ),
)

#### Step 2: Text Cleaning Function and Apply Text Cleaning

In [4]:
def clean_text(text):
    """Normalize one raw sentence for tokenization.

    The text is lowercased, then URLs, @mentions, #hashtags, punctuation and
    digits are deleted (in that order) and surrounding whitespace is trimmed.

    Notes:
        * Underscores survive, because the punctuation pattern keeps ``\\w``.
        * Deleted tokens leave their neighbouring spaces behind, so interior
          runs of whitespace are not collapsed.

    Args:
        text: The sentence to clean; must be a ``str``.

    Returns:
        The cleaned sentence.
    """
    # Order matters: URLs, mentions and hashtags must go before the generic
    # punctuation pass strips the ':', '/', '@' and '#' that identify them.
    removal_patterns = (
        r"http\S+",   # URLs
        r"@\w+",      # mentions
        r"#\w+",      # hashtags
        r"[^\w\s]",   # punctuation
        r"\d+",       # digits
    )
    cleaned = text.lower()
    for pattern in removal_patterns:
        cleaned = re.sub(pattern, "", cleaned)
    return cleaned.strip()

# Clean the 'sentence' column of every split in place.
# Missing sentences are replaced with "" before the string cast: astype(str)
# alone turns NaN into the literal text "nan", which would then be tokenized
# as if it were a real word.
for df in [train_df, val_df, test_df]:
    df['sentence'] = df['sentence'].fillna("").astype(str).apply(clean_text)

#### Step 3: Encode Labels

In [5]:
# Learn the emotion -> integer mapping from the training split, then reuse the
# same fitted encoder for the held-out splits so every split shares one mapping.
label_encoder = LabelEncoder()
train_df['label'] = label_encoder.fit_transform(train_df['emotion'])
for held_out_df in (val_df, test_df):
    held_out_df['label'] = label_encoder.transform(held_out_df['emotion'])


#### Step 4: Tokenization and Padding

In [6]:
# Build the vocabulary from the training sentences only; words that appear
# only in the validation/test splits are mapped to the "<OOV>" token.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(train_df['sentence'])

# Turn each split's sentences into lists of integer word indices.
X_train_seq, X_val_seq, X_test_seq = (
    tokenizer.texts_to_sequences(split_df['sentence'])
    for split_df in (train_df, val_df, test_df)
)

# Pad/truncate every split to the longest training sequence
# (a fixed length such as 100 would also work).
max_length = max(map(len, X_train_seq))
pad_options = dict(maxlen=max_length, padding='post', truncating='post')
X_train_pad = pad_sequences(X_train_seq, **pad_options)
X_val_pad = pad_sequences(X_val_seq, **pad_options)
X_test_pad = pad_sequences(X_test_seq, **pad_options)

# Integer class labels produced by the label encoder, as NumPy arrays.
y_train, y_val, y_test = (
    split_df['label'].values for split_df in (train_df, val_df, test_df)
)


#### Save Tokenizer and LabelEncoder

In [7]:
import pickle

# Persist the fitted tokenizer and label encoder to the working directory.
for artifact_path, artifact in (
    ("tokenizer.pkl", tokenizer),
    ("label_encoder.pkl", label_encoder),
):
    with open(artifact_path, "wb") as out_file:
        pickle.dump(artifact, out_file)


#### Model Architecture: 1D CNN + Global Max Pooling

In [9]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dropout, Dense

# Sizes derived from the fitted preprocessing objects.
# +1 because Keras Tokenizer word indices start at 1; index 0 is the padding value.
vocab_size = len(tokenizer.word_index) + 1
embedding_dim = 128
num_classes = len(label_encoder.classes_)

# Embedding -> 1D convolution -> global max pooling -> dropout -> dense
# -> softmax over the emotion classes.
model = Sequential([
    Embedding(
        input_dim=vocab_size,
        output_dim=embedding_dim,
        input_length=X_train_pad.shape[1],
    ),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dropout(0.5),
    Dense(64, activation='relu'),
    Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy because the labels are integer class ids,
# not one-hot vectors.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()


Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 101, 128)          6230144   
                                                                 
 conv1d_1 (Conv1D)           (None, 97, 128)           82048     
                                                                 
 global_max_pooling1d_1 (Gl  (None, 128)               0         
 obalMaxPooling1D)                                               
                                                                 
 dropout_1 (Dropout)         (None, 128)               0         
                                                                 
 dense_2 (Dense)             (None, 64)                8256      
                                                                 
 dense_3 (Dense)             (None, 5)                 325       
                                                      

#### Train the Model

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

# Stop training once validation loss has not improved for 3 consecutive epochs,
# and roll the model back to the weights of the best epoch.
early_stop = EarlyStopping(
    monitor='val_loss', patience=3, restore_best_weights=True, verbose=1
)

# Fit on the padded training sequences, evaluating on the validation split
# after every epoch.
fit_options = dict(
    epochs=10,
    batch_size=334,
    callbacks=[early_stop],
    verbose=1,
)
history = model.fit(
    X_train_pad,
    y_train,
    validation_data=(X_val_pad, y_val),
    **fit_options,
)


#### Save the trained model

In [11]:
# Write the trained model (architecture + weights) to an HDF5 file in the
# working directory.
model.save("emotion_cnn_model.h5")
# The status emoji was mis-encoded (UTF-8 bytes decoded as cp1252); restored here.
print("✅ Model saved as 'emotion_cnn_model.h5'")


✅ Model saved as 'emotion_cnn_model.h5'


  saving_api.save_model(


#### Classification Report of the Model

In [12]:
from sklearn.metrics import classification_report
import numpy as np

# Predict per-class probabilities for the test split.
y_pred_probs = model.predict(X_test_pad)

# The predicted class is the index of the highest probability in each row.
y_pred = np.argmax(y_pred_probs, axis=1)

# Build the per-class precision / recall / F1 report, labelling rows with the
# original emotion names instead of their integer codes.
target_names = label_encoder.classes_
report = classification_report(y_test, y_pred, target_names=target_names)
# The header emoji was mis-encoded (UTF-8 bytes decoded as cp1252); restored here.
print("📊 Classification Report:\n")
print(report)

# Optional: save the report to a text file. The encoding is set explicitly so
# the file does not depend on the platform's default code page.
with open("classification_report.txt", "w", encoding="utf-8") as f:
    f.write(report)


📊 Classification Report:

              precision    recall  f1-score   support

       anger       0.93      0.95      0.94     14830
        fear       0.93      0.95      0.94     12413
         joy       0.97      0.92      0.94     21460
        love       0.84      0.94      0.89      8639
         sad       0.97      0.95      0.96     18179

    accuracy                           0.94     75521
   macro avg       0.93      0.94      0.93     75521
weighted avg       0.94      0.94      0.94     75521



#### Convert the pickled tokenizer (tokenizer.pkl) to JSON, so it can be loaded without pickle in a more stable, version-independent format

In [2]:
import pickle
import json
from tensorflow.keras.preprocessing.text import tokenizer_from_json

# Step 1: Load the tokenizer from the .pkl file.
# SECURITY NOTE: pickle.load can execute arbitrary code; only load pickle files
# that were produced by a trusted source (here, presumably this notebook's own
# save step — confirm the file at this path is that one).
# NOTE(review): absolute, user-specific path; the notebook only runs where it exists.
with open("C:\\Users\\HP\\AppData\\Local\\Programs\\Python\\tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)

# Step 2: Serialize the tokenizer's configuration and vocabulary to a JSON string.
tokenizer_json = tokenizer.to_json()

# Step 3: Save the JSON string to a file next to the pickle.
with open("C:\\Users\\HP\\AppData\\Local\\Programs\\Python\\tokenizer.json", "w", encoding='utf-8') as f:
    f.write(tokenizer_json)

# The status emoji was mis-encoded (UTF-8 bytes decoded as cp1252); restored here.
print("✅ Tokenizer converted and saved as JSON.")


✅ Tokenizer converted and saved as JSON.
