<a href="https://colab.research.google.com/github/avikumart/LLM-GenAI-Transformers-Notebooks/blob/main/DeepLearningFiles/Emotions_classification_tensorflow_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV is read from there.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the combined dataset on the mounted Drive.
DATA_PATH = (
    "/content/drive/MyDrive/"
    "Omdena Bhutan Chapter - Leveraging AI to Combat Mental Health Problems/"
    "Tasks/Task 2/combined_data.csv"
)

# Raw frame; later cells derive `ndf` from it.
df = pd.read_csv(DATA_PATH)
df.head()

In [None]:
df.info()

In [None]:
df["mental_state"].value_counts()

In [None]:
df.isnull().sum()

In [None]:
# Drop every row that has a missing value. The explicit copy makes `ndf` an
# independent frame, so adding/dropping columns on it in later cells does not
# raise SettingWithCopyWarning and never touches `df`.
ndf = df.dropna().copy()

In [None]:
ndf.info()

In [None]:
ndf.isnull().sum()


In [None]:
# prompt: generate a text data cleaning function using nltk and regex
# Pre-process the text data using nltk and regex

import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
def _clean_text_resources():
    """Return the (stop-word set, stemmer, lemmatizer) triple, building it on first use.

    ``clean_text`` is applied once per row, so these NLTK objects are cached
    on this function instead of being rebuilt for every document.
    """
    cached = getattr(_clean_text_resources, "_cached", None)
    if cached is None:
        cached = (
            frozenset(stopwords.words('english')),
            PorterStemmer(),
            WordNetLemmatizer(),
        )
        _clean_text_resources._cached = cached
    return cached


def clean_text(text):
    """Normalise one raw document into a space-separated string of tokens.

    Steps, in order: strip URLs, replace every run of non-letters with a
    space, lowercase, drop English stop words, Porter-stem each remaining
    token, then pass the stem through the WordNet lemmatizer.

    Args:
        text: Raw document as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces (``""`` if nothing is left).
    """
    stop_words, stemmer, lemmatizer = _clean_text_resources()
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z]+', ' ', text)
    # Lowercase first so the stop-word lookup matches
    tokens = text.lower().split()
    # Single pass: filter stop words, then stem and lemmatize each survivor
    return ' '.join(
        lemmatizer.lemmatize(stemmer.stem(token))
        for token in tokens
        if token not in stop_words
    )

In [None]:
# Build a new frame with the cleaned column instead of assigning into the
# result of dropna(), which triggers SettingWithCopyWarning on pandas < 3.
ndf = ndf.assign(cleaned_text=ndf["text"].apply(clean_text))
ndf.head()

In [None]:
ndf.sample(10)

In [None]:
# Rebind instead of mutating in place, and tolerate a missing column so that
# re-running this cell does not fail with KeyError once "text" is gone.
ndf = ndf.drop(columns="text", errors="ignore")


In [None]:
ndf["mental_state"].value_counts()

## Model 1: Classification with balanced training batches (random under-sampling, not SMOTE)

In [None]:
# let's build keras model with the embeddings of the dataset
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Features are the cleaned documents; targets are one-hot mental-state classes.
text = ndf["cleaned_text"].values

# factorize gives each distinct state an integer id and returns the id -> name lookup
labels_encoded, labels_names = pd.factorize(ndf["mental_state"].values)
labels = to_categorical(labels_encoded, num_classes=len(labels_names))

# Word-level tokenizer with an explicit out-of-vocabulary token
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(text)
sequences = tokenizer.texts_to_sequences(text)

# One more than the number of indexed words (sized for the Embedding layer)
vocal_size = 1 + len(tokenizer.word_index)
# Pad every document at the end, up to the length of the longest one
max_length = max(map(len, sequences))
padded_sequences = pad_sequences(sequences, padding="post", maxlen=max_length)

In [None]:
# Train/test split, then balance the classes of the *training* batches only.
from imblearn.keras import BalancedBatchGenerator

X_train, X_test, y_train, y_test = train_test_split(padded_sequences, labels, test_size=0.2, random_state=42)

# NOTE: this is not SMOTE. With no `sampler` argument, BalancedBatchGenerator
# falls back to imbalanced-learn's default sampler (random under-sampling) and
# yields class-balanced batches of 32 examples drawn from the training set.
training_generator = BalancedBatchGenerator(X_train, y_train, batch_size=32, random_state=42)

# Stacked bidirectional LSTM classifier on top of a trainable embedding.
model = Sequential()
model.add(Embedding(vocal_size, 128, input_length=max_length))
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.2))
model.add(Bidirectional(LSTM(64)))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.2))
model.add(Dense(len(labels_names), activation="softmax"))

model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# The generator already produces batches, so `batch_size` must not be passed
# to fit(): Keras rejects it for generator / Sequence inputs.
# The held-out split is left untouched (imbalanced) for validation.
model.fit(training_generator, epochs=10, validation_data=(X_test, y_test))

- Model is overfitting on the training dataset.

## Model 2: Keras classification on the imbalanced data (no resampling)

In [None]:
# 80/20 split with a fixed seed; this model trains on the raw, imbalanced training set.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, labels, test_size=0.2, random_state=42
)

# modeling using keras tf apis
from tensorflow.keras.layers import GRU
from tensorflow.keras.layers import SimpleRNN

# Embedding -> BiLSTM -> BiGRU -> BiLSTM -> dense softmax head
num_classes = len(labels_names)
model = Sequential([
    Embedding(vocal_size, 128, input_length=max_length),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(GRU(64, return_sequences=True)),
    Dropout(0.2),
    Bidirectional(LSTM(64)),
    Dropout(0.2),
    Dense(64, activation="relu"),
    Dropout(0.2),
    Dense(num_classes, activation="softmax"),
])

model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Train directly on the arrays, validating on the held-out split after each epoch
model.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=10)

## Model 3: Multi-label classification using imbalanced data

In [None]:
# TODO: create the dataset with 2 labels: 1) emotion and 2) sentiment
# TODO: develop the training of the TensorFlow/Keras model without data balancing

## Model 4: Classifying text with bert-emotion using the Hugging Face library

In [None]:
import torch
from transformers import Trainer, TrainingArguments

ndf.head()

In [None]:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Hugging Face checkpoint id; reused below to load the tokenizer and the model.
model_id = "boltuix/bert-emotion"
# Ready-made inference pipeline around that checkpoint, used for a quick spot check.
emotion_detection = pipeline("sentiment-analysis", model=model_id)

In [None]:
# Spot-check the pretrained pipeline on the row whose index label is 13454.
sample_text = ndf.loc[13454, "cleaned_text"]
output = emotion_detection(sample_text)
output

In [None]:
from datasets import Dataset

# `ndf` still carries the row labels left over from dropna(); without
# preserve_index=False they would be stored as an extra `__index_level_0__` column.
dataset = Dataset.from_pandas(ndf, preserve_index=False)
dataset

In [None]:
# load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(model_id)

# The checkpoint's classification head was trained on its own emotion labels,
# while fine-tuning targets the `mental_state` classes of this dataset.
# Size the head for those classes; ids follow `unique()` order, the same order
# `label_map` uses when the integer labels are attached to the dataset.
state_names = [str(name) for name in ndf["mental_state"].unique()]
id2label = dict(enumerate(state_names))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_id,
    num_labels=len(state_names),
    id2label=id2label,
    label2id=label2id,
    # Re-initialise the head instead of failing when the class count differs
    ignore_mismatched_sizes=True,
)

In [None]:
# define tokenizer
def tokenizer_func(examples):
    """Tokenize a batch of ``cleaned_text`` strings, truncating to 512 tokens.

    No padding is applied here: the ``DataCollatorWithPadding`` given to the
    Trainer pads each batch to its own longest sequence, so padding every
    example to 512 tokens up front only wastes memory and compute.

    Args:
        examples: Batch mapping that contains a ``"cleaned_text"`` list.

    Returns:
        The tokenizer output for the batch (unpadded, truncated).
    """
    return tokenizer(examples["cleaned_text"], truncation=True, max_length=512)

tokenized_dataset = dataset.map(tokenizer_func, batched=True)

In [None]:
tokenized_dataset

In [None]:
# convert the input data into pytorch format
def pytorch_formate(example):
    """Return one example's ids and attention mask as tensors; ``labels`` is passed through."""
    tensor_fields = ("input_ids", "attention_mask")
    converted = {field: torch.tensor(example[field]) for field in tensor_fields}
    converted["labels"] = example["labels"]
    return converted


# Integer id for every mental state, numbered in the order `unique()` returns them.
label_map = {state: idx for idx, state in enumerate(ndf["mental_state"].unique())}


def _add_label_ids(batch):
    """Translate a batch of ``mental_state`` names into an integer ``labels`` column."""
    return {"labels": [label_map[state] for state in batch["mental_state"]]}


tokenized_dataset = tokenized_dataset.map(_add_label_ids, batched=True)

#tokenized_data = tokenized_dataset.map(pytorch_formate)

In [None]:
# set the training config and train the model
from transformers import DataCollatorWithPadding

# Pads each batch to the length of its longest sequence when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

training_args = TrainingArguments(
    output_dir="./bert_emotion_results",
    num_train_epochs=5,
    per_device_train_batch_size=2,
    logging_dir="./bert_emotion_logs",
    logging_steps=10,
    save_steps=100,
    # A checkpoint is written every 100 steps; keep only the two most recent
    # so a multi-epoch run at batch size 2 does not fill the disk.
    save_total_limit=2,
    eval_strategy="no",  # no evaluation set is passed to the Trainer below
    learning_rate=3e-5,
    report_to="none",  # Disable W&B auto-logging if not needed
)

# Initialize the Trainer on the full tokenized dataset
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
)

# Fine-tune the model
trainer.train()