Importing Necessary Libraries

In [None]:
import pandas as pd
import numpy as np

# For preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# For handling class imbalance
from sklearn.utils.class_weight import compute_class_weight

# For modeling
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# For hyperparameter tuning
from sklearn.model_selection import GridSearchCV

# For evaluation
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score, f1_score

# For handling warnings
# NOTE(review): the filter below silences every warning category for the rest
# of the session, so library warnings (deprecations, convergence problems,
# unused-parameter notices, pandas copy warnings) are not shown anywhere in
# this notebook.
import warnings
warnings.filterwarnings('ignore')



Loading the Dataset

In [None]:
# --------------------------------------------
# 1. Load the Dataset
# --------------------------------------------

# Read the annotated utterances (adjust the path to wherever the workbook lives)
df = pd.read_excel('/content/Emotions_DS.xlsx')

# --------------------------------------------
# 2. Replace emotion values
# --------------------------------------------

# Collapse the fine-grained labels into three classes:
#   -1, -2, -3 -> -1 (negative emotions)
#    1,  2,  9 ->  1 (positive emotions)
# Labels that are not listed (such as 0) are left untouched.
emotion_collapse_map = {-2: -1, -3: -1, 2: 1, 9: 1}
df['Emotion'] = df['Emotion'].replace(emotion_collapse_map)

# Continue with an independent copy of the relabelled frame
new_df = df.copy()

# Preview the relabelled data
print("New DataFrame after Emotion Replacement:")
print(new_df.head())

New DataFrame after Emotion Replacement:
      ID Type                                          Utterance Dialogue_Act  \
0  194_0    T  Hi. Alvina, how are you doing today? It's good...           gt   
1  194_1    P                                    I'm just tired.           gt   
2  194_2    T                                        just tired?          crq   
3  194_3    P                                               Yeah           cd   
4  194_4    T  you know, we did some pre visit planning with ...      gc, irq   

   Emotion  
0        0  
1       -1  
2        0  
3       -1  
4        0  


In [None]:
# Save the relabelled dataframe as an Excel workbook (not CSV) in the Colab
# workspace; the row index is not written.
updated_workbook_path = '/content/Emotions_DS_updated.xlsx'
new_df.to_excel(updated_workbook_path, index=False)

print("New DataFrame saved as Emotions_DS_updated.xlsx")

New DataFrame saved as Emotions_DS_updated.xlsx


Handling Class Imbalance with Class Weights

In [None]:
# --------------------------------------------
# 3. Handling Class Imbalance with Class Weights
# --------------------------------------------

# Name of the label column in new_df
target_column = 'Emotion'

# Distinct labels (in order of first appearance) and their 'balanced' weights,
# which grow as a class gets rarer.
# NOTE(review): the weights are derived from the full dataset, i.e. before the
# train/test split performed later in the notebook.
target_values = new_df[target_column]
classes = target_values.unique()
class_weights = compute_class_weight(class_weight='balanced', classes=classes, y=target_values)

# Label -> weight lookup, in the form estimators accept as class_weight
class_weight_dict = {label: weight for label, weight in zip(classes, class_weights)}
print("\nClass Weights:")
print(class_weight_dict)



Class Weights:
{0: 0.4267935578330893, -1: 1.6345794392523365, 1: 22.139240506329113}


In [None]:
# %pip (rather than !pip) installs into the environment of the running kernel
%pip install imbalanced-learn




In [None]:

# SMOTE is provided by imbalanced-learn (the package installed above), not by
# scikit-learn, so it has to be imported before it can be used.
from imblearn.over_sampling import SMOTE

# Initialize SMOTE; the fixed seed makes the synthetic samples reproducible
smote = SMOTE(random_state=42)

Feature Engineering

In [None]:
# --------------------------------------------
# 4. Feature Engineering
# --------------------------------------------

# Define feature columns
utterance_col = 'Utterance'       # free-text utterance
dialog_act_col = 'Dialogue_Act'   # dialogue-act tag(s), e.g. 'gt' or 'gc, irq'
type_col = 'Type'                 # single-letter code, e.g. 'T' or 'P'

# Separate features and target.
# An explicit copy makes X an independent frame, so later column assignments
# on X are unambiguous and do not trigger pandas' SettingWithCopyWarning.
X = new_df[[utterance_col, dialog_act_col, type_col]].copy()
y = new_df[target_column]





In [None]:
# Make sure every utterance is a string before it reaches the text vectorizer.
# A single astype(str) already covers boolean cells (True -> 'True'), so no
# separate per-value pass is needed.
# Re-binding X through assign() builds a new frame instead of writing into a
# column of a frame that was sliced out of new_df.
X = X.assign(**{utterance_col: X[utterance_col].astype(str)})

Train-Test Split

In [None]:
# Hold out 20% of the rows for testing. Stratifying on the label keeps the
# class proportions the same in both parts; the seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    stratify=y,
    random_state=42,
    test_size=0.2,
)

Building the Preprocessing Pipeline

In [None]:
# Categorical inputs are one-hot encoded; with handle_unknown='ignore' a
# category that was not seen during fitting does not raise at transform time.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
categorical_features = [dialog_act_col, type_col]

In [None]:
# Text input: TF-IDF over unigrams and bigrams, vocabulary capped at 5000 terms.
# The column is referenced by a plain string (not a list) so ColumnTransformer
# passes the vectorizer a 1-D sequence of documents.
text_transformer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
text_features = utterance_col

In [None]:
# Route each column group to its own transformer and concatenate the results
feature_transformers = [
    ('text', text_transformer, text_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

Building the Pipeline with Logistic Regression

In [None]:
# Initialize Logistic Regression with class weights
# NOTE(review): the class weights are applied on top of SMOTE oversampling, so
# the minority classes are compensated twice -- confirm this is intended.
log_reg = LogisticRegression(class_weight=class_weight_dict,
                             solver='liblinear',
                             max_iter=1000,
                             random_state=42)

# scikit-learn's Pipeline only accepts transformers (fit/transform) as
# intermediate steps and therefore rejects SMOTE, which exposes fit_resample.
# imbalanced-learn's Pipeline accepts samplers and applies them while fitting
# only, so prediction on new data is never resampled.
from imblearn.pipeline import Pipeline as ImbPipeline

# Create a pipeline that preprocesses the data, oversamples the minority
# classes, and then applies the classifier
pipeline = ImbPipeline(steps=[
    ('preprocessor', preprocessor),
    ('smote', smote),
    ('classifier', log_reg)
])

Hyperparameter Tuning using GridSearchCV

In [None]:
# Candidate settings for the classifier step: inverse regularisation
# strength C and the penalty type
param_grid = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty': ['l1', 'l2'],
}

# 5-fold cross-validated search scored by macro-averaged F1, using all cores
search_options = dict(cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(pipeline, param_grid, **search_options)

# Run the search on the training split and report the winner
print("\nStarting Hyperparameter Tuning...")
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Starting Hyperparameter Tuning...
Fitting 5 folds for each of 10 candidates, totalling 50 fits
Best Parameters: {'classifier__C': 10, 'classifier__penalty': 'l1'}
Best CV Score: 0.5783905315435491


Training the Final Model

In [None]:
# Retrieve the best estimator.
# GridSearchCV refits by default (refit=True): after the search it re-trains
# the best parameter combination on the whole of X_train / y_train, so
# best_estimator_ is already fitted and calling fit() again would only repeat
# that work.
best_model = grid_search.best_estimator_

# Display the fitted pipeline
best_model

Evaluating the Model

In [None]:
# Score the held-out test split
y_pred = best_model.predict(X_test)

# Accuracy plus macro-averaged precision / recall / F1 (every class counts
# equally); zero_division=0 scores a class with no predictions as 0.
macro_options = dict(average='macro', zero_division=0)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro_options)
recall = recall_score(y_test, y_pred, **macro_options)
f1 = f1_score(y_test, y_pred, **macro_options)

print("\nModel Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")


Model Evaluation Metrics:
Accuracy : 0.8467
Precision: 0.5617
Recall   : 0.5418
F1-Score : 0.5483


In [None]:
# Per-class precision, recall, F1 and support
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

# Confusion matrix: rows are true labels, columns are predicted labels
print("Confusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)


Classification Report:
              precision    recall  f1-score   support

          -1       0.66      0.65      0.65       214
           0       0.90      0.91      0.91       820
           1       0.12      0.06      0.08        16

    accuracy                           0.85      1050
   macro avg       0.56      0.54      0.55      1050
weighted avg       0.84      0.85      0.84      1050

Confusion Matrix:
[[139  73   2]
 [ 66 749   5]
 [  6   9   1]]


Creating DataFrame for Actual and Predicted Values

In [None]:

# Give the test rows a fresh 0..n-1 index so they line up positionally with
# the prediction array
X_test_reset = X_test.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)
y_pred_series = pd.Series(y_pred, name='Predicted')

# Side-by-side table: the three input columns, the true label and the prediction
feature_part = X_test_reset[[utterance_col, dialog_act_col, type_col]]
actual_part = y_test_reset.rename('Actual')
results_df = pd.concat([feature_part, actual_part, y_pred_series], axis=1)

# Show the last ten rows
print("\nFormatted Actual vs Predicted DataFrame:")
print(results_df.tail(10))



Formatted Actual vs Predicted DataFrame:
                                              Utterance Dialogue_Act Type  \
1040  I mean, I guess I could tell myself like I can...           cr    P   
1041                                               ....           cr    P   
1042             yes or no. Are you on any medications?           gc    T   
1043  Yeah. It makes sense. And, you know, when you ...           op    P   
1044                        Ohh I'm sorry to hear that.           ap    T   
1045  But if you know there's a a treatment team mee...           id    T   
1046  Um, I it's just a, it's how I meet people. So ...           id    P   
1047  Okay, let's look at it real quick. Bring it ov...           gc    T   
1048  Maybe something natural exercise, oh, heavy en...           gc    P   
1049  Okay, so you live with mom, dad and sister. wh...          ack    T   

      Actual  Predicted  
1040      -1         -1  
1041       0          0  
1042       0          0  
1043  

Creating DataFrame for Incorrect Predictions

In [None]:

# Keep only the rows where the prediction differs from the true label
is_wrong = results_df['Actual'].ne(results_df['Predicted'])
incorrect_predictions_df = results_df.loc[is_wrong]
print("\nIncorrect Predictions DataFrame:")
print(incorrect_predictions_df.head())


Incorrect Predictions DataFrame:
                                            Utterance Dialogue_Act Type  \
12  Exactly. Just so I take part with the social c...           cd    P   
30  Well, I guess I think initially that I would n...           od    P   
46  No, I guess I see what you're saying. I mean, ...           gc    P   
51  start naming Skittles, cherry Starburst basica...           id    P   
64  No, what has happened in previous online class...           on    P   

    Actual  Predicted  
12       0         -1  
30       0         -1  
46      -1          0  
51      -1          0  
64      -1          0  


Making Predictions on New Samples

In [None]:
# Two hand-written examples using the same three columns the model was trained on
sample_utterances = [
    "I'm not sure yet. But I'm looking into schools like in the area, so maybe, I don't know, maybe I'll leave home or something.",
    "What about at these parties are going to Are there other kids there who are drinking a lot?",
]
sample_dialogue_acts = ["id", "irq"]   # replace with actual dialog act labels
sample_types = ["P", "T"]              # replace with actual type labels

new_samples = pd.DataFrame({
    'Utterance': sample_utterances,
    'Dialogue_Act': sample_dialogue_acts,
    'Type': sample_types,
})

# Run the fitted pipeline on the new rows
new_predictions = best_model.predict(new_samples)

# Attach the predictions to a copy of the inputs
new_results = new_samples.assign(**{'Predicted Emotion': new_predictions})

print("\nNew Sample Predictions:")
print(new_results)


New Sample Predictions:
                                           Utterance Dialogue_Act Type  \
0  I'm not sure yet. But I'm looking into schools...           id    P   
1  What about at these parties are going to Are t...          irq    T   

   Predicted Emotion  
0                  0  
1                  0  


Saving the Results

In [None]:

# Write each result table to its own CSV file (row index omitted):
# all test predictions, the misclassified rows, and the new-sample predictions
csv_outputs = (
    (results_df, 'actual_vs_predicted.csv'),
    (incorrect_predictions_df, 'incorrect_predictions.csv'),
    (new_results, 'new_sample_predictions.csv'),
)
for frame, filename in csv_outputs:
    frame.to_csv(filename, index=False)

print("\nResults have been saved to CSV files.")


Results have been saved to CSV files.


Random Forest Classifier

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Random forest that reuses the per-class weights computed earlier to offset
# the class imbalance; seeded for reproducibility
rf_options = {'class_weight': class_weight_dict, 'random_state': 42}
rf_classifier = RandomForestClassifier(**rf_options)


In [None]:

# Chain the shared preprocessing with the random forest (no resampling step)
rf_steps = [
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier),
]
pipeline = Pipeline(steps=rf_steps)


In [None]:
# Candidate settings for the forest: number of trees, maximum tree depth
# (None = unlimited) and the minimum samples needed to split a node
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_samples_split': [2, 5, 10],
}

# 5-fold cross-validated search scored by macro-averaged F1, using all cores
search_options = dict(cv=5, scoring='f1_macro', n_jobs=-1, verbose=1)
grid_search = GridSearchCV(pipeline, param_grid, **search_options)

# Run the search on the training split and report the winner
print("\nStarting Hyperparameter Tuning...")
grid_search.fit(X_train, y_train)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Starting Hyperparameter Tuning...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 10, 'classifier__n_estimators': 200}
Best CV Score: 0.5531352175219751


In [None]:
# --------------------------------------------
# 8. Training the Final Model
# --------------------------------------------

# Retrieve the best estimator.
# GridSearchCV refits by default (refit=True): after the search it re-trains
# the best parameter combination on the whole of X_train / y_train, so
# best_estimator_ is already fitted and no further fit() call is needed.
best_model = grid_search.best_estimator_

# --------------------------------------------
# 9. Evaluating the Model
# --------------------------------------------

# Predict on the test set
y_pred = best_model.predict(X_test)

In [None]:

# Accuracy plus macro-averaged precision / recall / F1 (every class counts
# equally); zero_division=0 scores a class with no predictions as 0.
macro_options = dict(average='macro', zero_division=0)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, **macro_options)
recall = recall_score(y_test, y_pred, **macro_options)
f1 = f1_score(y_test, y_pred, **macro_options)

print("\nModel Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")



Model Evaluation Metrics:
Accuracy : 0.8714
Precision: 0.5353
Recall   : 0.5722
F1-Score : 0.5512


In [None]:
# Per-class precision, recall, F1 and support
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

# Confusion matrix: rows are true labels, columns are predicted labels
print("Confusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)


Classification Report:
              precision    recall  f1-score   support

          -1       0.67      0.81      0.73       214
           0       0.94      0.90      0.92       820
           1       0.00      0.00      0.00        16

    accuracy                           0.87      1050
   macro avg       0.54      0.57      0.55      1050
weighted avg       0.87      0.87      0.87      1050

Confusion Matrix:
[[174  40   0]
 [ 79 741   0]
 [  8   8   0]]


In [None]:
# Creating DataFrame for Actual and Predicted Values

In [None]:

# Give the test rows a fresh 0..n-1 index so they line up positionally with
# the prediction array
X_test_reset = X_test.reset_index(drop=True)
y_test_reset = y_test.reset_index(drop=True)
y_pred_series = pd.Series(y_pred, name='Predicted')

# Side-by-side table: the three input columns, the true label and the prediction
feature_part = X_test_reset[[utterance_col, dialog_act_col, type_col]]
actual_part = y_test_reset.rename('Actual')
results_df = pd.concat([feature_part, actual_part, y_pred_series], axis=1)

# Show the last ten rows
print("\nFormatted Actual vs Predicted DataFrame:")
print(results_df.tail(10))



Formatted Actual vs Predicted DataFrame:
                                              Utterance Dialogue_Act Type  \
1040  I mean, I guess I could tell myself like I can...           cr    P   
1041                                               ....           cr    P   
1042             yes or no. Are you on any medications?           gc    T   
1043  Yeah. It makes sense. And, you know, when you ...           op    P   
1044                        Ohh I'm sorry to hear that.           ap    T   
1045  But if you know there's a a treatment team mee...           id    T   
1046  Um, I it's just a, it's how I meet people. So ...           id    P   
1047  Okay, let's look at it real quick. Bring it ov...           gc    T   
1048  Maybe something natural exercise, oh, heavy en...           gc    P   
1049  Okay, so you live with mom, dad and sister. wh...          ack    T   

      Actual  Predicted  
1040      -1         -1  
1041       0          0  
1042       0          0  
1043  

In [None]:
# Two hand-written examples using the same three columns the model was trained on
sample_utterances = [
    "I'm not sure yet. But I'm looking into schools like in the area, so maybe, I don't know, maybe I'll leave home or something.",
    "What about at these parties are going to Are there other kids there who are drinking a lot?",
]
sample_dialogue_acts = ["id", "irq"]   # replace with actual dialog act labels
sample_types = ["P", "T"]              # replace with actual type labels

new_samples = pd.DataFrame({
    'Utterance': sample_utterances,
    'Dialogue_Act': sample_dialogue_acts,
    'Type': sample_types,
})

# Run the fitted pipeline on the new rows
new_predictions = best_model.predict(new_samples)

# Attach the predictions to a copy of the inputs
new_results = new_samples.assign(**{'Predicted Emotion': new_predictions})

print("\nNew Sample Predictions:")
print(new_results)


New Sample Predictions:
                                           Utterance Dialogue_Act Type  \
0  I'm not sure yet. But I'm looking into schools...           id    P   
1  What about at these parties are going to Are t...          irq    T   

   Predicted Emotion  
0                  0  
1                  0  


XGBoost Classifier

In [None]:
# For modeling
from xgboost import XGBClassifier


In [None]:
# Read the workbook again so this section starts from the labels as stored in
# the file (adjust the path to wherever the workbook lives)
raw_workbook_path = '/content/Emotions_DS.xlsx'
df = pd.read_excel(raw_workbook_path)

In [None]:
# Re-encode the labels as the integers 0, 1, 2:
#    0          -> 0 (neutral emotion)
#    1,  2,  9  -> 1 (positive emotions)
#   -1, -2, -3  -> 2 (negative emotions)
# NOTE: run this cell exactly once after loading. The mapping is not
# idempotent: applied a second time it would turn the already-encoded
# 2 (negative) into 1 (positive).
emotion_encoding = {-1: 2, -2: 2, -3: 2, 0: 0, 1: 1, 2: 1, 9: 1}
df['Emotion'] = df['Emotion'].replace(emotion_encoding)

# Create a new dataframe after replacements
new_df = df.copy()

# Display the first few rows of the new dataframe
print("New DataFrame after Emotion Replacement:")
print(new_df.head())


New DataFrame after Emotion Replacement:
      ID Type                                          Utterance Dialogue_Act  \
0  194_0    T  Hi. Alvina, how are you doing today? It's good...           gt   
1  194_1    P                                    I'm just tired.           gt   
2  194_2    T                                        just tired?          crq   
3  194_3    P                                               Yeah           cd   
4  194_4    T  you know, we did some pre visit planning with ...      gc, irq   

   Emotion  
0        0  
1        2  
2        0  
3        2  
4        0  


In [None]:
# Name of the label column in new_df
target_column = 'Emotion'

# Extract the classes and compute 'balanced' class weights for the
# re-encoded labels
classes = new_df[target_column].unique()
class_weights = compute_class_weight(class_weight='balanced',
                                     classes=classes,
                                     y=new_df[target_column])
class_weight_dict = dict(zip(classes, class_weights))
print("\nClass Weights:")
print(class_weight_dict)

# Rebuild the features, target and train/test split from the re-encoded
# frame. Without this step X_train / y_train / X_test / y_test would still be
# the objects created earlier from the -1/0/1 labels, which do not match the
# 0/1/2 encoding used in this section.
X = new_df[[utterance_col, dialog_act_col, type_col]].copy()
X[utterance_col] = X[utterance_col].astype(str)
y = new_df[target_column]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
                                                    random_state=42, stratify=y)



Class Weights:
{0: 0.4267935578330893, 2: 1.6345794392523365, 1: 22.139240506329113}


In [None]:
# Initialize XGBoost Classifier.
# scale_pos_weight is deliberately not set: it rebalances the positive class
# of a binary objective and does not weight the classes of a three-class
# problem. Per-class weighting for multi-class XGBoost has to be supplied as
# sample_weight when fit() is called.
xgb_classifier = XGBClassifier(random_state=42,
                               use_label_encoder=False,
                               eval_metric='mlogloss')

# Create a pipeline that first preprocesses the data and then applies the classifier
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', xgb_classifier)
])

In [None]:
# --------------------------------------------
# 7. Hyperparameter Tuning using GridSearchCV
# --------------------------------------------
from sklearn.utils.class_weight import compute_sample_weight

# Define hyperparameter grid.
# min_child_weight is XGBoost's own control on how small a leaf may get; the
# RandomForest-only name min_samples_split is not an XGBoost parameter and
# would only multiply the number of fits without changing any model.
param_grid = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [None, 10, 20, 30],
    'classifier__min_child_weight': [1, 5, 10]
}

# Initialize GridSearchCV
grid_search = GridSearchCV(pipeline, param_grid,
                           cv=5,
                           scoring='f1_macro',
                           n_jobs=-1,
                           verbose=1)

# Per-row 'balanced' weights computed from the training labels only, so rare
# classes count more during boosting. GridSearchCV slices the array to match
# each CV fold and reuses it for the final refit.
# NOTE: calling best_estimator_.fit(X_train, y_train) afterwards without these
# weights would retrain the model unweighted.
train_sample_weight = compute_sample_weight(class_weight='balanced', y=y_train)

# Fit GridSearchCV
print("\nStarting Hyperparameter Tuning...")
grid_search.fit(X_train, y_train, classifier__sample_weight=train_sample_weight)
print("Best Parameters:", grid_search.best_params_)
print("Best CV Score:", grid_search.best_score_)


Starting Hyperparameter Tuning...
Fitting 5 folds for each of 36 candidates, totalling 180 fits
Best Parameters: {'classifier__max_depth': None, 'classifier__min_samples_split': 2, 'classifier__n_estimators': 50}
Best CV Score: 0.5378695572235803


In [None]:
# --------------------------------------------
# 8. Training the Final Model
# --------------------------------------------

# Retrieve the best estimator.
# GridSearchCV refits by default (refit=True): after the search it re-trains
# the best parameter combination on the whole of X_train / y_train using the
# same fit arguments as the search, so best_estimator_ is already fitted.
# Fitting it again here would repeat that work and drop any fit-time
# arguments that were passed to the search.
best_model = grid_search.best_estimator_

# --------------------------------------------
# 9. Evaluating the Model
# --------------------------------------------

# Predict on the test set
y_pred = best_model.predict(X_test)

# Calculate evaluation metrics (macro average: every class counts equally)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='macro', zero_division=0)
recall = recall_score(y_test, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)

print("\nModel Evaluation Metrics:")
print(f"Accuracy : {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall   : {recall:.4f}")
print(f"F1-Score : {f1:.4f}")


Model Evaluation Metrics:
Accuracy : 0.8505
Precision: 0.5209
Recall   : 0.5138
F1-Score : 0.5168


In [None]:

# Per-class precision, recall, F1 and support
print("\nClassification Report:")
report_text = classification_report(y_test, y_pred, zero_division=0)
print(report_text)

# Confusion matrix: rows are true labels, columns are predicted labels
print("Confusion Matrix:")
confusion = confusion_matrix(y_test, y_pred)
print(confusion)


Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.93      0.91       820
           1       0.00      0.00      0.00        16
           2       0.67      0.61      0.64       214

    accuracy                           0.85      1050
   macro avg       0.52      0.51      0.52      1050
weighted avg       0.83      0.85      0.84      1050

Confusion Matrix:
[[762   1  57]
 [  8   0   8]
 [ 82   1 131]]
