In [2]:
!pip install scikit-learn transformers datasets


Collecting datasets
  Downloading datasets-3.0.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
INFO: pip is looking at multiple versions of multiprocess to determine which version is compatible with other requirements. This could take a while.
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.1-py3-none-any.whl (471 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m471.6/471.6 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:

In [3]:
from google.colab import files
uploaded = files.upload()

Saving shakespeare-sentiment.csv to shakespeare-sentiment.csv


In [4]:
from google.colab import files
uploaded = files.upload()

Saving test_sent_emo.csv to test_sent_emo.csv


In [5]:
import pandas as pd

# Load datasets
shakespeare_sentiment = pd.read_csv('shakespeare-sentiment.csv')
test_sent_emo = pd.read_csv('test_sent_emo.csv')

# Display first few rows of each dataset
print(shakespeare_sentiment.head())
print(test_sent_emo.head())

                                                text  start    end sentiment  \
0  With no fees or minimums, banking with Capital...    330  10106  POSITIVE   
1  And with no overdraft fees, is it even a decis...  10218  14666   NEUTRAL   
2                       What's in your wallet terms?  14778  16302   NEUTRAL   
3                                             Apply.  16356  16766   NEUTRAL   
4  See capitalone.com Slash Bank Capital one NA m...  16868  24634  POSITIVE   

   confidence  speaker  
0    0.938870      NaN  
1    0.723502      NaN  
2    0.871523      NaN  
3    0.620469      NaN  
4    0.511470      NaN  
   Sr No.                                          Utterance Speaker  \
0       1  Why do all youre coffee mugs have numbers on ...    Mark   
1       2  Oh. Thats so Monica can keep track. That way ...  Rachel   
2       3                                       Y'know what?  Rachel   
3      19                     Come on, Lydia, you can do it.    Joey   
4      20

In [6]:
from sklearn.model_selection import train_test_split


In [7]:
# Print the column names of both datasets to identify the actual column names
print(shakespeare_sentiment.columns)
print(test_sent_emo.columns)


Index(['text', 'start', 'end', 'sentiment', 'confidence', 'speaker'], dtype='object')
Index(['Sr No.', 'Utterance', 'Speaker', 'Emotion', 'Sentiment', 'Dialogue_ID',
       'Utterance_ID', 'Season', 'Episode', 'StartTime', 'EndTime'],
      dtype='object')


In [8]:
def split_data(df, text_column, label_column):
    """
    Split dataset into train (80%), validation (10%), and test (10%) sets.

    Args:
    df (DataFrame): Input dataframe with text and labels.
    text_column (str): The column name for text data.
    label_column (str): The column name for labels (sentiment/emotion).

    Returns:
    X_train, X_val, X_test, y_train, y_val, y_test: Split data.
    """
    # Extract text and labels
    X = df[text_column]
    y = df[label_column]

    # First split: 80% train, 20% temp
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

    # Second split: 50% validation, 50% test from the 20% temp set
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

    return X_train, X_val, X_test, y_train, y_val, y_test

# Split Shakespeare Sentiment Dataset using 'text' and 'sentiment'
X_train_s, X_val_s, X_test_s, y_train_s, y_val_s, y_test_s = split_data(shakespeare_sentiment, 'text', 'sentiment')

# Split test_sent_emo Dataset using 'Utterance' and 'Sentiment'
X_train_e, X_val_e, X_test_e, y_train_e, y_val_e, y_test_e = split_data(test_sent_emo, 'Utterance', 'Sentiment')

# Check the split
print("Shakespeare Sentiment Train Size:", len(X_train_s))
print("Shakespeare Sentiment Validation Size:", len(X_val_s))
print("Shakespeare Sentiment Test Size:", len(X_test_s))

print("test_sent_emo Train Size:", len(X_train_e))
print("test_sent_emo Validation Size:", len(X_val_e))
print("test_sent_emo Test Size:", len(X_test_e))

Shakespeare Sentiment Train Size: 616
Shakespeare Sentiment Validation Size: 77
Shakespeare Sentiment Test Size: 77
test_sent_emo Train Size: 2088
test_sent_emo Validation Size: 261
test_sent_emo Test Size: 261


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [10]:
!pip install scikit-plot


Collecting scikit-plot
  Downloading scikit_plot-0.3.7-py3-none-any.whl.metadata (7.1 kB)
Downloading scikit_plot-0.3.7-py3-none-any.whl (33 kB)
Installing collected packages: scikit-plot
Successfully installed scikit-plot-0.3.7


In [11]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_auc_score, roc_curve, confusion_matrix, ConfusionMatrixDisplay
import numpy as np


In [12]:
def preprocess_tfidf(X_train, X_val, X_test):
    vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_val_tfidf = vectorizer.transform(X_val)
    X_test_tfidf = vectorizer.transform(X_test)
    return X_train_tfidf, X_val_tfidf, X_test_tfidf

In [19]:
vectorizer_s = TfidfVectorizer()
X_train_tfidf_s = vectorizer_s.fit_transform(X_train_s)  # Fit on train set
X_test_tfidf_s = vectorizer_s.transform(X_test_s)  # Transform test set using the same vectorizer


In [20]:
# For test_sent_emo Dataset
vectorizer_e = TfidfVectorizer()
X_train_tfidf_e = vectorizer_e.fit_transform(X_train_e)  # Fit on train set
X_test_tfidf_e = vectorizer_e.transform(X_test_e)

In [13]:
# Shakespeare Sentiment Dataset
X_train_tfidf_s, X_val_tfidf_s, X_test_tfidf_s = preprocess_tfidf(X_train_s, X_val_s, X_test_s)

# test_sent_emo Dataset
X_train_tfidf_e, X_val_tfidf_e, X_test_tfidf_e = preprocess_tfidf(X_train_e, X_val_e, X_test_e)


In [21]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Function to train and evaluate the model
def train_and_evaluate_model(model, X_train, y_train, X_val, y_val, X_test, y_test):
    model.fit(X_train, y_train)

    # Validation set evaluation
    y_val_pred = model.predict(X_val)
    print(f"Validation Accuracy: {accuracy_score(y_val, y_val_pred)}")
    print(f"Validation F1-score: {f1_score(y_val, y_val_pred, average='weighted')}")
    print(f"Validation Precision: {precision_score(y_val, y_val_pred, average='weighted')}")
    print(f"Validation Recall: {recall_score(y_val, y_val_pred, average='weighted')}")

    # Test set evaluation
    y_test_pred = model.predict(X_test)
    print(f"Test Accuracy: {accuracy_score(y_test, y_test_pred)}")
    print(f"Test F1-score: {f1_score(y_test, y_test_pred, average='weighted')}")
    print(f"Test Precision: {precision_score(y_test, y_test_pred, average='weighted')}")
    print(f"Test Recall: {recall_score(y_test, y_test_pred, average='weighted')}")
    print("\n")


In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier

# Initialize the models
naive_bayes = MultinomialNB()
linear_svc = LinearSVC()
logistic_regression = LogisticRegression(max_iter=1000)
sgd_classifier = SGDClassifier()


In [23]:
# Naive Bayes on Shakespeare Sentiment Dataset
print("Naive Bayes Model Results for Shakespeare Sentiment Dataset:")
train_and_evaluate_model(naive_bayes, X_train_tfidf_s, y_train_s, X_val_tfidf_s, y_val_s, X_test_tfidf_s, y_test_s)

# Linear SVC on Shakespeare Sentiment Dataset
print("Linear SVC Model Results for Shakespeare Sentiment Dataset:")
train_and_evaluate_model(linear_svc, X_train_tfidf_s, y_train_s, X_val_tfidf_s, y_val_s, X_test_tfidf_s, y_test_s)

# Logistic Regression on Shakespeare Sentiment Dataset
print("Logistic Regression Model Results for Shakespeare Sentiment Dataset:")
train_and_evaluate_model(logistic_regression, X_train_tfidf_s, y_train_s, X_val_tfidf_s, y_val_s, X_test_tfidf_s, y_test_s)

# SGD Classifier on Shakespeare Sentiment Dataset
print("SGD Classifier Model Results for Shakespeare Sentiment Dataset:")
train_and_evaluate_model(sgd_classifier, X_train_tfidf_s, y_train_s, X_val_tfidf_s, y_val_s, X_test_tfidf_s, y_test_s)


Naive Bayes Model Results for Shakespeare Sentiment Dataset:
Validation Accuracy: 0.6233766233766234
Validation F1-score: 0.5014489642588816
Validation Precision: 0.6261471861471861
Validation Recall: 0.6233766233766234
Test Accuracy: 0.7532467532467533
Test F1-score: 0.6581210996795412
Test Precision: 0.685064935064935
Test Recall: 0.7532467532467533


Linear SVC Model Results for Shakespeare Sentiment Dataset:
Validation Accuracy: 0.6493506493506493
Validation F1-score: 0.611594009192637
Validation Precision: 0.6138028638028638
Validation Recall: 0.6493506493506493
Test Accuracy: 0.7922077922077922
Test F1-score: 0.7288961038961039
Test Precision: 0.6920919456130724
Test Recall: 0.7922077922077922


Logistic Regression Model Results for Shakespeare Sentiment Dataset:
Validation Accuracy: 0.6363636363636364
Validation F1-score: 0.538762931983271
Validation Precision: 0.5811688311688312
Validation Recall: 0.6363636363636364
Test Accuracy: 0.7792207792207793
Test F1-score: 0.70413555909

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [17]:
# Naive Bayes on test_sent_emo Dataset
print("Naive Bayes Model Results for test_sent_emo Dataset:")
train_and_evaluate_model(naive_bayes, X_train_tfidf_e, y_train_e, X_val_tfidf_e, y_val_e, X_test_tfidf_e, y_test_e)

# Linear SVC on test_sent_emo Dataset
print("Linear SVC Model Results for test_sent_emo Dataset:")
train_and_evaluate_model(linear_svc, X_train_tfidf_e, y_train_e, X_val_tfidf_e, y_val_e, X_test_tfidf_e, y_test_e)

# Logistic Regression on test_sent_emo Dataset
print("Logistic Regression Model Results for test_sent_emo Dataset:")
train_and_evaluate_model(logistic_regression, X_train_tfidf_e, y_train_e, X_val_tfidf_e, y_val_e, X_test_tfidf_e, y_test_e)

# SGD Classifier on test_sent_emo Dataset
print("SGD Classifier Model Results for test_sent_emo Dataset:")
train_and_evaluate_model(sgd_classifier, X_train_tfidf_e, y_train_e, X_val_tfidf_e, y_val_e, X_test_tfidf_e, y_test_e)


Naive Bayes Model Results for test_sent_emo Dataset:
Validation Accuracy: 0.5440613026819924
Validation F1-score: 0.45021312246998973
Validation Precision: 0.5621485793899588
Validation Recall: 0.5440613026819924
Test Accuracy: 0.5517241379310345
Test F1-score: 0.47708998245602413
Test Precision: 0.6605641143219146
Test Recall: 0.5517241379310345


Linear SVC Model Results for test_sent_emo Dataset:
Validation Accuracy: 0.5478927203065134
Validation F1-score: 0.5404329481382222
Validation Precision: 0.5393722993815565
Validation Recall: 0.5478927203065134
Test Accuracy: 0.5095785440613027
Test F1-score: 0.5018737848095116
Test Precision: 0.5027362401266726
Test Recall: 0.5095785440613027


Logistic Regression Model Results for test_sent_emo Dataset:
Validation Accuracy: 0.5823754789272031
Validation F1-score: 0.5630544527933582
Validation Precision: 0.5772529626659538
Validation Recall: 0.5823754789272031
Test Accuracy: 0.5287356321839081
Test F1-score: 0.5101086712073247
Test Precisio