In [1]:
# Mount Google Drive into the Colab filesystem so the dataset under
# /content/drive can be read by the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# CSV location inside the mounted Drive (under the /content/drive mount point).
# NOTE(review): this is an absolute, account-specific path — adjust it when running elsewhere.
path_to_final_data = '/content/drive/MyDrive/Machine Learning Final Project/final_data.csv'

In [3]:
import pandas as pd
from transformers import pipeline

In [4]:
# Load the dataset from Drive into a DataFrame.
final_df = pd.read_csv(path_to_final_data)

In [5]:
# Preview the first rows to sanity-check the load.
final_df.head()

Unnamed: 0,title,audience_score,tomato_meter,rating,rating_contents,director,writer,box_office,distributor,sound_mix,belongs_to_collection,budget,id,runtime,release_year,cast,wiki_page,plot,genre,language
0,the in-laws,83.0,88.0,,,arthur hiller,andrew bergman,38200000.0,,,,9000000.0,,103.0,1979.0,"peter falk, alan arkin",https://en.wikipedia.org/wiki/the_in-laws_(197...,the daughter of mild-mannered manhattan dentis...,comedy,english
1,race the sun,41.0,22.0,,,charles t. kanganis,,1700000.0,,surround,,0.0,55731.0,100.0,1996.0,"halle berry, james belushi",https://en.wikipedia.org/wiki/race_the_sun,"a new science teacher, miss sandra beecher, (h...",drama,english
2,paul blart: mall cop,43.0,34.0,pg,"mild crude humor, language, some violence",steve carr,kevin james,183000000.0,"sony pictures releasing,",,,26000000.0,,91.0,2009.0,"jayma mays, keir o'donnell, bobby cannavale, s...",https://en.wikipedia.org/wiki/paul_blart:_mall...,"paul blart lives in west orange, new jersey wi...",comedy,english
3,not without my daughter,71.0,53.0,,,brian gilbert,"betty mahmoody,william hoffer",43000000.0,metro-goldwyn-mayer,surround,,13200000.0,9585.0,116.0,1991.0,"sally field, alfred molina, roshan seth",https://en.wikipedia.org/wiki/not_without_my_d...,"in 1984, an iranian physician, sayyed bozorg ""...",drama,english
4,rookie of the year,52.0,38.0,,,daniel stern,sam harper,56500000.0,20th century fox,,,10000000.0,21845.0,103.0,1993.0,"thomas ian nicholas, gary busey, dan hedaya",https://en.wikipedia.org/wiki/rookie_of_the_ye...,"henry rowengartner (nicholas), 12-year-old lit...","comedy, kids & family",english


In [6]:
# List the available columns; 'plot' is the text column scored further down.
final_df.columns

Index(['title', 'audience_score', 'tomato_meter', 'rating', 'rating_contents',
       'director', 'writer', 'box_office', 'distributor', 'sound_mix',
       'belongs_to_collection', 'budget', 'id', 'runtime', 'release_year',
       'cast', 'wiki_page', 'plot', 'genre', 'language'],
      dtype='object')

Different Types of Sentiment Analyzers
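1. DistilBERT sentiment (positive/negative): `distilbert-base-uncased-finetuned-sst-2-english`
2. DistilRoBERTa emotion classifier: `j-hartmann/emotion-english-distilroberta-base` (7 emotion classes per its model card, not 28)
3. DistilBERT emotion classifier: `bhadresh-savani/distilbert-base-uncased-emotion` (6 emotion classes per its model card; this is not a multilingual XLM-RoBERTa model)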

In [8]:
# DistilBERT sentiment analysis (positive/negative).
# The pipeline is built once here; analyze_sentiment_truncate() calls it per text.
sentiment_analyzer = pipeline('sentiment-analysis', model='distilbert-base-uncased-finetuned-sst-2-english')

def analyze_sentiment_truncate(text, max_length=512):
    """Classify ``text`` with the module-level ``sentiment_analyzer`` pipeline.

    Args:
        text: The text to classify.
        max_length: Maximum number of *tokens* passed to the model. Longer
            inputs are truncated by the tokenizer.

    Returns:
        ``(label, score)`` taken from the first prediction, or ``(None, nan)``
        when ``text`` is not a string (e.g. a missing value coming out of a
        pandas column as float NaN).
    """
    if not isinstance(text, str):
        # A missing plot arrives as float NaN and cannot be tokenized.
        return None, float('nan')

    # Truncate in the tokenizer rather than with text[:max_length]: the model
    # limit is counted in tokens, so slicing 512 *characters* discards most of
    # the input and still does not guarantee the token limit is respected.
    result = sentiment_analyzer(text, truncation=True, max_length=max_length)
    return result[0]['label'], result[0]['score']

Device set to use cuda:0


In [9]:
# DistilRoBERTa emotion analysis (see the model id below).
# NOTE(review): this cell was previously labelled "28 emotions"; the checkpoint's
# model card appears to list 7 classes — TODO confirm and keep the label in sync.
# return_all_scores=True asks for a score per label instead of only the top label.
emotion_analyzer = pipeline('text-classification',
                            model='j-hartmann/emotion-english-distilroberta-base',
                            return_all_scores=True)

def analyze_roberta_emotion(text, max_length=512):
    """Score ``text`` with the module-level ``emotion_analyzer`` pipeline.

    Args:
        text: The text to classify.
        max_length: Maximum number of *tokens* passed to the model. Longer
            inputs are truncated by the tokenizer.

    Returns:
        ``(dominant_emotion, emotion_scores)`` where ``emotion_scores`` maps
        each label to its score and ``dominant_emotion`` is the label with the
        highest score. Returns ``(None, {})`` when ``text`` is not a string
        (e.g. a missing value coming out of a pandas column as float NaN).
    """
    if not isinstance(text, str):
        # A missing plot arrives as float NaN and cannot be tokenized.
        return None, {}

    # Truncate in the tokenizer rather than with text[:max_length]: the model
    # limit is counted in tokens, so slicing 512 *characters* discards most of
    # the input and still does not guarantee the token limit is respected.
    emotions = emotion_analyzer(text, truncation=True, max_length=max_length)

    # emotions[0] holds one {'label', 'score'} entry per label for this text.
    emotion_scores = {item['label']: item['score'] for item in emotions[0]}
    dominant_emotion = max(emotion_scores, key=emotion_scores.get)
    return dominant_emotion, emotion_scores

config.json:   0%|          | 0.00/1.00k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/294 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/329M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Device set to use cuda:0


In [12]:
# DistilBERT emotion analysis (uncased checkpoint, see the model id below).
# NOTE(review): this cell was previously labelled "XLM-RoBERTa - Multi-Lingual
# Emotion Analysis (7 emotions)", which does not match the checkpoint actually
# loaded here; its model card appears to list 6 classes — TODO confirm.
# return_all_scores=True asks for a score per label instead of only the top label.
seven_emotion_analyzer = pipeline('text-classification',
                                  model='bhadresh-savani/distilbert-base-uncased-emotion',
                                  return_all_scores=True)

def analyze_emotion(text, max_length=512):
    """Score ``text`` with the module-level ``seven_emotion_analyzer`` pipeline.

    Args:
        text: The text to classify.
        max_length: Maximum number of *tokens* passed to the model. Longer
            inputs are truncated by the tokenizer.

    Returns:
        ``(dominant_emotion, emotion_scores)`` where ``emotion_scores`` maps
        each label to its score and ``dominant_emotion`` is the label with the
        highest score. Returns ``(None, {})`` when ``text`` is not a string
        (e.g. a missing value coming out of a pandas column as float NaN).
    """
    if not isinstance(text, str):
        # A missing plot arrives as float NaN and cannot be tokenized.
        return None, {}

    # Truncate in the tokenizer rather than with text[:max_length]: the model
    # limit is counted in tokens, so slicing 512 *characters* discards most of
    # the input and still does not guarantee the token limit is respected.
    emotions = seven_emotion_analyzer(text, truncation=True, max_length=max_length)

    # emotions[0] holds one {'label', 'score'} entry per label for this text.
    emotion_scores = {item['label']: item['score'] for item in emotions[0]}
    dominant_emotion = max(emotion_scores, key=emotion_scores.get)
    return dominant_emotion, emotion_scores

Device set to use cuda:0


In [13]:
# Score every plot; each result is a (dominant_emotion, {label: score}) pair.
plot_results = final_df['plot'].apply(analyze_emotion)
dominant_emotions, emotion_scores = zip(*plot_results)

# One column per emotion label. json_normalize produces a fresh 0..n-1 index,
# so re-label the rows with final_df's index: concat(axis=1) aligns on index
# labels, and this keeps the pairing positional even if final_df was filtered
# or re-indexed beforehand.
emotion_df = pd.json_normalize(emotion_scores)
emotion_df.index = final_df.index

# Drop the columns a previous run of this cell may have added, so re-executing
# the cell replaces them instead of appending duplicates. Note this also
# replaces any pre-existing column that shares a name with an emotion label.
previous_run_columns = ['Dominant_Emotion', *emotion_df.columns]
final_df = pd.concat(
    [
        final_df
        .drop(columns=previous_run_columns, errors='ignore')
        .assign(Dominant_Emotion=dominant_emotions),
        emotion_df,
    ],
    axis=1,
)

In [14]:
# Preview the first rows after the emotion columns were appended (plain-text form).
print(final_df.head())

                     title  audience_score  tomato_meter rating  \
0              the in-laws            83.0          88.0    NaN   
1             race the sun            41.0          22.0    NaN   
2     paul blart: mall cop            43.0          34.0     pg   
3  not without my daughter            71.0          53.0    NaN   
4       rookie of the year            52.0          38.0    NaN   

                             rating_contents             director  \
0                                        NaN        arthur hiller   
1                                        NaN  charles t. kanganis   
2  mild crude humor, language, some violence           steve carr   
3                                        NaN        brian gilbert   
4                                        NaN         daniel stern   

                          writer   box_office               distributor  \
0                 andrew bergman   38200000.0                       NaN   
1                            NaN

In [15]:
# Save the enriched frame to the current working directory.
# NOTE(review): index=True writes the row index as an extra, unnamed first column;
# a plain pd.read_csv of this file will surface it as 'Unnamed: 0'. Confirm that
# downstream readers expect it (otherwise index=False would match the input file).
final_df.to_csv('final_df_with_emotion_scores.csv', index=True)