In [1]:
# Install every dependency with a single %pip call: the %pip magic installs into
# the environment of the running kernel, whereas a bare `!pip` may resolve to a
# different Python installation.
# openpyxl is the engine pandas uses for pd.read_excel() on .xlsx workbooks.
%pip install -q transformers torch scikit-learn pandas openpyxl



In [18]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from transformers import BartTokenizer, BartForConditionalGeneration
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

In [3]:
# Read the labelled utterances from the Excel workbook into a DataFrame
DATASET_PATH = '/content/5247-rows_3-Emotions_No-Type.xlsx'
data = pd.read_excel(DATASET_PATH)

In [5]:
# Keep only the three columns the pipeline works with; any other column is dropped
REQUIRED_COLUMNS = ['Utterance', 'Dialogue_Act', 'Emotion']
data = data.loc[:, REQUIRED_COLUMNS]

In [6]:
# Model input is the utterance text; the label to predict is the emotion
X, y = data['Utterance'], data['Emotion']

In [7]:
# One-hot encode the dialogue act: one 'Dialogue_Act_<value>' indicator column
# per distinct value found in the 'Dialogue_Act' column
dialogue_act = pd.get_dummies(data=data['Dialogue_Act'], prefix='Dialogue_Act')

In [9]:
# Convert any boolean values in the 'Utterance' column to strings before applying TF-IDF
X = data['Utterance'].astype(str)

# TF-IDF vectorization for 'Utterance' column
tfidf = TfidfVectorizer(max_features=5000)  # Adjust max_features as per requirement
X_utterance_tfidf = tfidf.fit_transform(X)

In [10]:
# Combine TF-IDF features and encoded 'Dialogue_Act' features
from scipy.sparse import hstack
X_combined = hstack((X_utterance_tfidf, dialogue_act))

In [11]:
# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

In [12]:
# Build a 100-tree random forest (fixed seed for repeatable runs) and fit it
# on the training split
model = RandomForestClassifier(random_state=42, n_estimators=100)
model.fit(X=X_train, y=y_train)

In [13]:
# Evaluation on the held-out test split
y_pred = model.predict(X_test)
# zero_division=0 keeps the reported 0.00 precision/F1 for a class that is never
# predicted, but stops scikit-learn from emitting an UndefinedMetricWarning
# for every such metric.
print("Classification Report:\n", classification_report(y_test, y_pred, zero_division=0))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

Classification Report:
               precision    recall  f1-score   support

          -1       0.69      0.39      0.50       212
           0       0.85      0.95      0.90       827
           1       0.00      0.00      0.00        11

    accuracy                           0.83      1050
   macro avg       0.51      0.45      0.47      1050
weighted avg       0.81      0.83      0.81      1050

Accuracy Score: 0.8304761904761905


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
# Load the pretrained BART tokenizer and seq2seq model used to summarize new samples;
# both come from the same checkpoint name
BART_CHECKPOINT = 'facebook/bart-large-cnn'
bart_tokenizer = BartTokenizer.from_pretrained(BART_CHECKPOINT)
bart_model = BartForConditionalGeneration.from_pretrained(BART_CHECKPOINT)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.58k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

In [15]:
# Summarization function for long utterances
def summarize_utterance(utterance, min_words=15):
    """Return a BART summary of ``utterance`` when it is long, else the text itself.

    Args:
        utterance: Text to (possibly) summarize. A non-string value (for example
            a boolean cell read from the spreadsheet) is converted with ``str()``
            instead of raising on ``.split()``.
        min_words: The text is summarized only when it contains more than this
            many whitespace-separated words. Defaults to 15.

    Returns:
        str: The decoded summary for long inputs, otherwise the input text.
    """
    text = utterance if isinstance(utterance, str) else str(utterance)

    # Short inputs are returned untouched; the model is not called for them.
    if len(text.split()) <= min_words:
        return text

    # Inputs longer than 1024 tokens are truncated by the tokenizer.
    inputs = bart_tokenizer(text, max_length=1024, truncation=True, return_tensors="pt")
    summary_ids = bart_model.generate(
        inputs['input_ids'],
        # Forward the tokenizer's mask explicitly rather than leaving generate() to infer it.
        attention_mask=inputs['attention_mask'],
        max_length=50,
        min_length=10,
        length_penalty=2.0,
        num_beams=4,
        early_stopping=True,
    )
    return bart_tokenizer.decode(summary_ids[0], skip_special_tokens=True)

In [19]:
# Function to predict emotion for new sample text
def predict_emotion(sample_text, dialogue_act_label=None):
    """Predict the emotion label of a new piece of text with the trained model.

    The text is passed through ``summarize_utterance``, vectorized with the
    fitted ``tfidf``, joined with a one-row dialogue-act indicator laid out like
    the training ``dialogue_act`` frame, and handed to ``model.predict``.

    Args:
        sample_text: The text to classify.
        dialogue_act_label: Optional dialogue act of the sample. When given, the
            matching ``Dialogue_Act_<label>`` indicator is set to 1, as it is for
            training rows. When ``None`` (default) every indicator stays 0, i.e.
            no dialogue act is assumed.

    Returns:
        The array returned by ``model.predict`` for this single sample.

    Raises:
        ValueError: If ``dialogue_act_label`` has no matching indicator column
            among the training dialogue-act columns.
    """
    # Summarize the text if it's too long
    summarized_text = summarize_utterance(sample_text)

    # Show both versions only when summarization actually changed the text;
    # this avoids repeating the length threshold that summarize_utterance owns.
    if summarized_text != sample_text:
        print("Original Text:", sample_text)
        print("Summarized Text:", summarized_text)

    # Transform text with TF-IDF
    text_tfidf = tfidf.transform([summarized_text])

    # One-row indicator frame with the same columns (and order) as in training
    dialogue_act_row = pd.DataFrame(0, index=[0], columns=dialogue_act.columns)
    if dialogue_act_label is not None:
        indicator_column = f"Dialogue_Act_{dialogue_act_label}"
        if indicator_column not in dialogue_act_row.columns:
            raise ValueError(
                f"Unknown dialogue act {dialogue_act_label!r}; "
                f"expected one matching the columns {list(dialogue_act.columns)}"
            )
        dialogue_act_row.loc[0, indicator_column] = 1

    # Combine features in the same order used for training: TF-IDF first, dialogue act second
    combined_features = hstack((text_tfidf, dialogue_act_row))

    # Predict emotion
    prediction = model.predict(combined_features)
    return prediction

In [23]:
# Demo: classify a long, multi-sentence sample (long enough to go through summarization)
sample_text = (
    "Lately, everything has been falling apart around me, and no matter how hard I try, I can’t seem to get ahead. "
    "Every time I make a bit of progress, something goes wrong, pulling me back to square one. "
    "It’s exhausting and demoralizing to constantly face setbacks, especially when it feels like nobody around me understands how hard I’m working. "
    "I feel so alone in all of this, and it’s starting to seem like there’s no way out of this cycle of failure."
)
predicted_emotion = predict_emotion(sample_text)
print("Predicted Emotion:", predicted_emotion)

Original Text: Lately, everything has been falling apart around me, and no matter how hard I try, I can’t seem to get ahead. Every time I make a bit of progress, something goes wrong, pulling me back to square one. It’s exhausting and demoralizing to constantly face setbacks, especially when it feels like nobody around me understands how hard I’m working. I feel so alone in all of this, and it’s starting to seem like there’s no way out of this cycle of failure.
Summarized Text: "It's exhausting and demoralizing to constantly face setbacks," she says. "I feel so alone in all of this, and it's starting to seem like there's no way out of this cycle of failure"
Predicted Emotion: [0]
