In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.metrics import classification_report
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.utils.class_weight import compute_class_weight

In [None]:
# Load the dataset from the Excel workbook; the path is kept in one named place.
DATA_PATH = '/content/Emotions_DS.xlsx'
df = pd.read_excel(DATA_PATH)

In [None]:
# Step 1: Handle class imbalance
# The label to predict is read from the 'Emotion' column.
TARGET_COLUMN = 'Emotion'
y = df[TARGET_COLUMN]

In [None]:
# Compute one 'balanced' weight per distinct label found in y.
emotion_labels = y.unique()
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=emotion_labels,
    y=y,
)

# Pair each label with its weight (same order as emotion_labels).
class_weight_dict = {
    label: weight for label, weight in zip(emotion_labels, class_weights)
}

In [None]:
# Step 2: Convert categorical data to numerical format (One-Hot Encoding)
# The encoder is meant for the two categorical columns listed below.

categorical_features = [
    'Dialogue_Act',
    'Type',
]
one_hot_encoder = OneHotEncoder(
    handle_unknown='ignore',
)

In [None]:
# Step 3: TF-IDF vectorizer intended for the 'Utterance' text column.
# max_features caps the vectorizer at 5000 features.
TFIDF_MAX_FEATURES = 5000
tfidf_vectorizer = TfidfVectorizer(max_features=TFIDF_MAX_FEATURES)

In [None]:
def to_string(X):
    """Return a copy of ``X`` whose 'Utterance' column is cast to ``str``.

    The frame passed in is left untouched. Assigning into the argument in
    place would modify the caller's data (for example a train/test slice of a
    larger frame) and can trigger pandas' SettingWithCopyWarning.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame containing an 'Utterance' column.

    Returns
    -------
    pandas.DataFrame
        New frame with the same columns in the same order; only 'Utterance'
        is replaced by its string-cast version.

    Raises
    ------
    KeyError
        If ``X`` has no 'Utterance' column.
    """
    # assign() works on a copy, so the caller's frame is never modified.
    return X.assign(Utterance=X['Utterance'].astype(str))

In [None]:
# Step 4: Combine Features (TF-IDF + One-Hot Encoded Categorical Columns)
# Each entry below is (step name, transformer, column selection).

feature_transformers = [
    # TF-IDF on 'Utterance' (selected by a plain string, not a list)
    ('tfidf', tfidf_vectorizer, 'Utterance'),
    # One-Hot on the columns named in categorical_features
    ('onehot', one_hot_encoder, categorical_features),
]
preprocessor = ColumnTransformer(transformers=feature_transformers)

In [None]:
# Step 5: Random Forest for emotion prediction
# class_weight='balanced' makes the forest derive the class weights from the
# labels it is actually fitted on. A fixed weight dict computed from the full
# dataset would also count held-out rows, and its keys may not match the
# classes present in a given cross-validation training fold (rare labels).
rf_classifier = RandomForestClassifier(class_weight='balanced', random_state=42)

In [None]:
# Step 6: Hyperparameter search space for GridSearchCV
# 2 * 3 * 2 * 2 = 24 candidate combinations.
param_grid = dict(
    n_estimators=[100, 200],
    max_depth=[10, 20, None],
    min_samples_split=[2, 5],
    min_samples_leaf=[1, 2],
)

In [None]:
# Step 7: Build the Pipeline
# The grid search wraps the WHOLE pipeline, so the TF-IDF vectorizer and the
# one-hot encoder are re-fitted on the training part of every CV fold.
# Placing GridSearchCV inside the pipeline as its last step would fit the
# preprocessing once on all training rows, letting each validation fold leak
# into the features it is scored on.
emotion_pipeline = Pipeline(steps=[
    ('to_string', FunctionTransformer(to_string)),  # Convert text to strings
    ('preprocessor', preprocessor),
    ('classifier', rf_classifier)
])

# Route the forest's hyperparameters to the 'classifier' step of the pipeline.
pipeline_param_grid = {
    f'classifier__{name}': values for name, values in param_grid.items()
}

# The name `pipeline` is kept: the object still exposes fit() and predict().
pipeline = GridSearchCV(emotion_pipeline, pipeline_param_grid, cv=5, n_jobs=-1, verbose=1)

In [None]:
# Separate the predictors from the target.
# 'ID' is removed along with the label since it's not relevant for training.
X = df.drop(['Emotion', 'ID'], axis=1)
y = df['Emotion']

In [None]:
# Cast every 'Utterance' value to str so the column holds strings only.
X = X.assign(Utterance=X['Utterance'].astype(str))

In [None]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Train on the training split only; the test split stays unseen until evaluation.
pipeline.fit(X=X_train, y=y_train)

Fitting 5 folds for each of 24 candidates, totalling 120 fits




In [None]:
# Step 8: Evaluate the model on the held-out test split
y_pred = pipeline.predict(X_test)
# Some labels may never be predicted, which leaves their precision undefined.
# zero_division=0 reports those scores as 0 explicitly instead of emitting an
# UndefinedMetricWarning for every affected average.
print(classification_report(y_test, y_pred, zero_division=0))

              precision    recall  f1-score   support

          -3       0.20      0.08      0.11        26
          -2       0.00      0.00      0.00        39
          -1       0.47      0.65      0.54       147
           0       0.90      0.91      0.90       827
           1       0.00      0.00      0.00         9
           2       0.00      0.00      0.00         1
           9       0.00      0.00      0.00         1

    accuracy                           0.81      1050
   macro avg       0.22      0.23      0.22      1050
weighted avg       0.78      0.81      0.79      1050



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Put true and predicted labels side by side, keeping y_test's index.
results_df = y_test.to_frame(name='Actual').assign(Predicted=y_pred)
results_df.head(10)

Unnamed: 0,Actual,Predicted
1231,-1,0
4997,0,0
4407,-3,0
681,-1,0
626,0,0
8,0,0
157,0,0
23,-1,-1
1918,-1,-1
1752,0,0
