In [None]:
# Mount Google Drive at /content/drive so the dataset path used by the
# following cells resolves. `google.colab` is imported here, so this cell is
# meant to run inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Location of the input CSV inside the mounted Drive folder.
path_to_final_data = '/content/drive/MyDrive/Machine Learning Final Project/final_data.csv'

In [None]:
import pandas as pd
from transformers import pipeline

In [None]:
# Load the project dataset from Drive into a DataFrame.
final_df = pd.read_csv(path_to_final_data)

In [None]:
# Preview the first rows to sanity-check the load.
final_df.head()

In [None]:
# List the available columns; the 'plot' column is the text scored further down.
final_df.columns

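Different Types of Sentiment Analyzers
1. DistilBERT sentiment (positive/negative) — `distilbert-base-uncased-finetuned-sst-2-english`
2. DistilRoBERTa emotion (7 classes: anger, disgust, fear, joy, neutral, sadness, surprise) — `j-hartmann/emotion-english-distilroberta-base`
3. DistilBERT emotion (6 classes: sadness, joy, love, anger, fear, surprise) — `bhadresh-savani/distilbert-base-uncased-emotion`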

In [None]:
# DistilBERT sentiment analysis (positive/negative).
# Builds the Hugging Face `sentiment-analysis` pipeline once at cell run time;
# analyze_sentiment_truncate() below reuses this module-level object.
sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def analyze_sentiment_truncate(text, max_length=512):
    """Classify ``text`` as positive/negative using ``sentiment_analyzer``.

    Args:
        text: Text to classify. Any non-string value (for example the ``NaN``
            pandas stores for a missing cell, or ``None``) is treated as
            missing instead of raising.
        max_length: Number of leading *characters* of ``text`` handed to the
            pipeline.

    Returns:
        ``(label, score)`` of the first prediction returned by the pipeline,
        or ``(None, None)`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None, None

    truncated_text = text[:max_length]
    # A character slice does not bound the token count (every character can be
    # its own token and the tokenizer adds special tokens on top), so also ask
    # the tokenizer to truncate to the model's maximum input length.
    result = sentiment_analyzer(truncated_text, truncation=True)
    top_prediction = result[0]
    return top_prediction['label'], top_prediction['score']

In [None]:
# DistilRoBERTa emotion analysis.
# NOTE(review): the original comment said "28 emotions", but this checkpoint's
# model card lists 7 classes (anger, disgust, fear, joy, neutral, sadness,
# surprise) — confirm which model was intended.
# return_all_scores=True makes the pipeline return a score for every label;
# analyze_roberta_emotion() below relies on that nested output.
# NOTE(review): recent transformers releases deprecate `return_all_scores` in
# favour of `top_k=None`, which may change the output nesting — verify before
# switching.
emotion_analyzer = pipeline('text-classification',
                            model='j-hartmann/emotion-english-distilroberta-base',
                            return_all_scores=True)

def analyze_roberta_emotion(text, max_length=512):
    """Score ``text`` against every label of ``emotion_analyzer``.

    Args:
        text: Text to classify. Any non-string value (for example the ``NaN``
            pandas stores for a missing cell, or ``None``) is treated as
            missing instead of raising.
        max_length: Number of leading *characters* of ``text`` handed to the
            pipeline.

    Returns:
        ``(dominant_emotion, emotion_scores)`` where ``emotion_scores`` maps
        each label to its score and ``dominant_emotion`` is the label with the
        highest score. Returns ``(None, {})`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None, {}

    truncated_text = text[:max_length]
    # A character slice does not bound the token count (every character can be
    # its own token and the tokenizer adds special tokens on top), so also ask
    # the tokenizer to truncate to the model's maximum input length.
    emotions = emotion_analyzer(truncated_text, truncation=True)
    # The pipeline returns one list of {label, score} dicts per input text.
    emotion_scores = {item['label']: item['score'] for item in emotions[0]}
    dominant_emotion = max(emotion_scores, key=emotion_scores.get)
    return dominant_emotion, emotion_scores

In [None]:
# DistilBERT emotion analysis (English).
# NOTE(review): the original comment said "XLM-RoBERTa - Multi-Lingual Emotion
# Analysis (7 emotions)", but the checkpoint loaded here is
# `distilbert-base-uncased-emotion`, whose model card lists 6 classes
# (sadness, joy, love, anger, fear, surprise) — confirm which model was
# intended; the variable name still says "seven".
# return_all_scores=True makes the pipeline return a score for every label;
# analyze_emotion() below relies on that nested output.
seven_emotion_analyzer = pipeline('text-classification',
                                  model='bhadresh-savani/distilbert-base-uncased-emotion',
                                  return_all_scores=True)

def analyze_emotion(text, max_length=512):
    """Score ``text`` against every label of ``seven_emotion_analyzer``.

    Args:
        text: Text to classify. Any non-string value (for example the ``NaN``
            pandas stores for a missing cell, or ``None``) is treated as
            missing instead of raising.
        max_length: Number of leading *characters* of ``text`` handed to the
            pipeline.

    Returns:
        ``(dominant_emotion, emotion_scores)`` where ``emotion_scores`` maps
        each label to its score and ``dominant_emotion`` is the label with the
        highest score. Returns ``(None, {})`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return None, {}

    truncated_text = text[:max_length]
    # A character slice does not bound the token count (every character can be
    # its own token and the tokenizer adds special tokens on top), so also ask
    # the tokenizer to truncate to the model's maximum input length.
    emotions = seven_emotion_analyzer(truncated_text, truncation=True)
    # The pipeline returns one list of {label, score} dicts per input text.
    emotion_scores = {item['label']: item['score'] for item in emotions[0]}
    dominant_emotion = max(emotion_scores, key=emotion_scores.get)
    return dominant_emotion, emotion_scores

In [None]:
# Score every plot once; analyze_emotion returns (dominant_emotion, {label: score}).
plot_results = [analyze_emotion(plot) for plot in final_df['plot']]
dominant_emotions = [dominant for dominant, _ in plot_results]
emotion_scores = [scores for _, scores in plot_results]

# One column per emotion label. Reuse final_df's index so the column-wise
# concat below pairs rows by position even if final_df is not 0..n-1 indexed.
emotion_df = pd.json_normalize(emotion_scores)
emotion_df.index = final_df.index

# Drop columns left behind by an earlier run of this cell so re-running it
# replaces them instead of appending duplicate column names.
# NOTE(review): this also replaces any input column that happens to share a
# name with an emotion label — confirm the dataset has none.
derived_columns = ['Dominant_Emotion', *emotion_df.columns]
final_df = pd.concat(
    [
        final_df.drop(columns=derived_columns, errors='ignore')
                .assign(Dominant_Emotion=dominant_emotions),
        emotion_df,
    ],
    axis=1,
)

In [None]:
# Preview the enriched frame; a bare last expression uses the notebook's rich
# DataFrame display instead of print()'s plain-text rendering.
final_df.head()

In [None]:
# Persist the enriched frame as CSV; index=True also writes the row index as
# the first column of the file.
# NOTE(review): this relative path resolves against the notebook's working
# directory, not the mounted Drive folder the input was read from — confirm
# that is where the output is meant to live.
final_df.to_csv('final_df_with_emotion_scores.csv', index=True)