In [None]:
import mlflow
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np

In [3]:
train_df=pd.read_csv(r"D:\projects\testing\notebooks\emotions\training.csv")
valid_df=pd.read_csv(r"D:\projects\testing\notebooks\emotions\validation.csv")
test_df=pd.read_csv(r"D:\projects\testing\notebooks\emotions\test.csv")

train_df.head()

Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned...,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplac...,2
4,i am feeling grouchy,3


sadness (0), joy (1), love (2), anger (3), fear (4)

In [4]:
train_df.isnull().sum()

text     0
label    0
dtype: int64

In [5]:
test_df.isnull().sum()

text     0
label    0
dtype: int64

Data Preprocessing

In [6]:
def lemmatization(text):
    """Lemmatizes the input text."""
    lemmatizer=WordNetLemmatizer()
    words=text.split()
    lemmatized_words=[]

    for word in words:
        word=lemmatizer.lemmatize(word)
        lemmatized_words.append(word)
    
    result_text=' '.join(lemmatized_words)
    return result_text

def lower_case(text):
    """Converts the input text to lower case."""
    return text.lower()

def remove_stop_words(text):
    """Removes stopwords from the input text."""
    stop_words=set(stopwords.words('english'))
    words=text.split()
    filtered_words=[]

    for word in words:
        if word not in stop_words:
            filtered_words.append(word)
    
    result_text=' '.join(filtered_words)
    return result_text

def removing_numbers(text):
    """Removes numbers from input_text."""
    filtered_words=[]
    words=text.split()

    for word in words:
        if not word.isdigit():
            filtered_words.append(word)
    result_text=' '.join(filtered_words)
    return result_text

def removing_urls(text):
    """Removes URLs from the text"""
    url_pattern = r'https?://\S+|www.\S+'
    result_text= re.sub(url_pattern, ' ', text)
    return result_text

def removing_punctuations(text):
    """Removes punctuation form text"""
    punctuation_pattern = f"[{re.escape(string.punctuation)}]"
    result_text = re.sub(punctuation_pattern, ' ', text)
    return result_text



In [7]:
def normalize_text(df):
    """Normalize the text data."""
    try:
        df['text'] = df['text'].apply(lower_case)
        df['text'] = df['text'].apply(remove_stop_words)
        df['text'] = df['text'].apply(removing_numbers)
        df['text'] = df['text'].apply(removing_punctuations)
        df['text'] = df['text'].apply(removing_urls)
        df['text'] = df['text'].apply(lemmatization)
        return df
    except Exception as e:
        print(f'Error during text normalization: {e}')
        raise

In [9]:
train_df= normalize_text(train_df)
valid_df= normalize_text(valid_df)

In [10]:
train_df.head()
valid_df.head()

Unnamed: 0,text,label
0,im feeling quite sad sorry ill snap soon,0
1,feel like still looking blank canvas blank pie...,0
2,feel like faithful servant,2
3,feeling cranky blue,3
4,treat feeling festive,1


In [11]:
train_df['label'].value_counts()

label
1    5362
0    4666
3    2159
4    1937
2    1304
5     572
Name: count, dtype: int64

In [12]:
test_df['label'].value_counts()

label
1    695
0    581
3    275
4    224
2    159
5     66
Name: count, dtype: int64

In [13]:
import dagshub

mlflow.set_tracking_uri('https://dagshub.com/harshitneverdebugs/testing.mlflow')
dagshub.init(repo_owner='harshitneverdebugs', repo_name='testing', mlflow=True)


mlflow.set_experiment("BiLSTM Baseline")

<Experiment: artifact_location='mlflow-artifacts:/1dd70d45af314efdafee614ecfdf3010', creation_time=1748203020719, experiment_id='0', last_update_time=1748203020719, lifecycle_stage='active', name='BiLSTM Baseline', tags={}>

In [18]:
import mlflow
import logging
import time
import numpy as np
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting MLflow BiLSTM run...")

with mlflow.start_run():
    start_time=time.time()
    try:
        MAX_FEATURES=10000
        MAX_LEN=100
        EMBEDDING_DIM=64

        mlflow.log_param("model", "BiLSTM")
        mlflow.log_param("max_features", MAX_FEATURES)
        mlflow.log_param("max_len", MAX_LEN)
        mlflow.log_param("embedding_dim", EMBEDDING_DIM)

        tokenizer=Tokenizer(num_words=MAX_FEATURES, oov_token="<OOV>")
        tokenizer.fit_on_texts(train_df["text"])

        X_train=pad_sequences(tokenizer.texts_to_sequences(train_df["text"]), maxlen=MAX_LEN)
        X_valid=pad_sequences(tokenizer.texts_to_sequences(valid_df["text"]), maxlen=MAX_LEN)
        X_test=pad_sequences(tokenizer.texts_to_sequences(test_df["text"]), maxlen=MAX_LEN)

        y_train=train_df["label"].values
        y_valid=valid_df["label"].values
        y_test=test_df["label"].values

        num_classes = len(np.unique(train_df["label"]))

        y_train_cat=to_categorical(y_train, num_classes)
        y_test_cat=to_categorical(y_test, num_classes)
        y_valid_cat=to_categorical(y_valid, num_classes)

        model=Sequential([
            Embedding(input_dim=MAX_FEATURES, output_dim=EMBEDDING_DIM, input_length=MAX_LEN),
            Bidirectional(LSTM(64, return_sequences=False)),
            Dropout(0.5),
            Dense(num_classes, activation="softmax")
        ])

        model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
        logging.info("Model compiled successfully.")

        model.fit(X_train, y_train_cat, validation_data=[X_valid, y_valid_cat], epochs=5, batch_size=64)
        logging.info("Model training completed.")

        y_pred_prob=model.predict(X_test)
        y_pred=np.argmax(y_pred_prob, axis=1)

        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred, average='weighted')
        recall = recall_score(y_test, y_pred, average='weighted')
        f1 = f1_score(y_test, y_pred, average='weighted')

        # Log metrics to MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1_score", f1)

        # Save model and log artifact
        model.save("bilstm_model.h5")
        mlflow.log_artifact("bilstm_model.h5")

        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")

        end_time = time.time()
        logging.info(f"Run completed in {end_time - start_time:.2f} seconds.")

    except Exception as e:
        logging.error(f"An error occurred: {e}", exc_info=True)

2025-05-28 21:15:44,331 - INFO - Starting MLflow BiLSTM run...
2025-05-28 21:15:46,758 - INFO - Model compiled successfully.


Epoch 1/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 55ms/step - accuracy: 0.3650 - loss: 1.5572 - val_accuracy: 0.7090 - val_loss: 0.8394
Epoch 2/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 42ms/step - accuracy: 0.7998 - loss: 0.6196 - val_accuracy: 0.8900 - val_loss: 0.3327
Epoch 3/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 43ms/step - accuracy: 0.9333 - loss: 0.2096 - val_accuracy: 0.9060 - val_loss: 0.2857
Epoch 4/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 44ms/step - accuracy: 0.9639 - loss: 0.1220 - val_accuracy: 0.9160 - val_loss: 0.2788
Epoch 5/5
[1m250/250[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 49ms/step - accuracy: 0.9759 - loss: 0.0781 - val_accuracy: 0.9130 - val_loss: 0.2792


2025-05-28 21:16:48,059 - INFO - Model training completed.


[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step


2025-05-28 21:16:54,732 - INFO - Accuracy: 0.7705
2025-05-28 21:16:54,733 - INFO - Precision: 0.838632516107728
2025-05-28 21:16:54,733 - INFO - Recall: 0.7705
2025-05-28 21:16:54,733 - INFO - F1 Score: 0.7768934188124293
2025-05-28 21:16:54,734 - INFO - Run completed in 70.01 seconds.
2025/05/28 21:16:55 INFO mlflow.tracking._tracking_service.client: 🏃 View run stylish-shad-811 at: https://dagshub.com/harshitneverdebugs/testing.mlflow/#/experiments/0/runs/160bd6449ec747c7826b431f20e67d4c.
2025/05/28 21:16:55 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://dagshub.com/harshitneverdebugs/testing.mlflow/#/experiments/0.
