# Detect Emotions in Text Using Multilingual BERT

## Preparation

### Install

In [1]:
!pip install torch
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


### Import

In [2]:
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler
from sklearn.metrics import f1_score
import pandas as pd
import os
from google.colab import drive
# Mount Google Drive at /content/drive; the input and output directories used
# at the end of this notebook live under /content/drive/MyDrive.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Download Model

In [3]:
# Single source of truth for the checkpoint, so the tokenizer and the model
# are always loaded from the same pre-trained weights.
MODEL_NAME = 'bert-base-multilingual-cased'

# Emotion labels. Position i in this list is paired with output score i of the
# classifier when predictions are turned into a {label: probability} mapping.
emotion_labels = ['joy', 'anger', 'fear', 'sadness']

tokenizer = BertTokenizer.from_pretrained(MODEL_NAME)

# NOTE(review): MODEL_NAME is the base pre-trained checkpoint, so the
# classification head is newly initialised (see the "were not initialized"
# warning printed by this cell). The scores are presumably not meaningful until
# the model is fine-tuned or fine-tuned weights are loaded -- confirm.
# The number of output classes is derived from the label list so the two
# cannot get out of sync.
model = BertForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(emotion_labels))

Some weights of the model checkpoint at bert-base-multilingual-cased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model ch

## Function Definition

### Predict Emotions Given a Row in a DataFrame

In [4]:
def predict_all_emotion(row):
    """Return the predicted probability of every emotion for one row.

    Args:
        row: Mapping-like object (e.g. a pandas Series) with a "Subtitle"
            entry holding the text to classify.

    Returns:
        dict mapping each entry of the module-level ``emotion_labels`` to the
        softmax probability the module-level ``model`` assigns to it.
    """
    text = row["Subtitle"]

    # Empty CSV cells are read by pandas as NaN (a float), which the tokenizer
    # rejects; treat them as empty text and stringify other non-string values.
    if not isinstance(text, str):
        text = "" if pd.isna(text) else str(text)

    # Tokenize without padding: a single sequence needs none, and calling the
    # tokenizer (instead of .encode) also yields the attention mask so the
    # model never attends to filler tokens.
    encoded = tokenizer(text, add_special_tokens=True, max_length=128, truncation=True, return_tensors='pt')

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**encoded)

    # outputs[0] holds the logits; take the single item of the batch.
    label_probs = torch.softmax(outputs[0], dim=1).numpy()[0]

    return {label: prob for label, prob in zip(emotion_labels, label_probs)}

### Analyze Text Emotions Given a CSV

In [5]:
def analyze_emotions_csv(file_path, file_out):
    """Score every row of a CSV and write the rows plus scores to a new CSV.

    The input is read as UTF-8. ``predict_all_emotion`` is called once per row
    and its values are stored, in the order returned, in the columns
    ``text_joy``, ``text_anger``, ``text_fear`` and ``text_sadness``. All other
    input columns are written through unchanged.

    Args:
        file_path: Path of the CSV to read.
        file_out: Path of the CSV to write (UTF-8 with BOM, no index column).
    """
    df = pd.read_csv(file_path, encoding="utf-8")

    score_columns = ['text_joy', 'text_anger', 'text_fear', 'text_sadness']

    # Collect the scores as plain rows and build the frame once. This avoids
    # creating one pandas Series per row, and avoids a temporary "emotions"
    # column that would overwrite (and then drop) an input column of that name.
    score_rows = [list(predict_all_emotion(row).values()) for _, row in df.iterrows()]
    scores = pd.DataFrame(score_rows, columns=score_columns, index=df.index)

    # Column-by-column assignment also works when the CSV has no data rows,
    # in which case only the (empty) score columns are added.
    for column in score_columns:
        df[column] = scores[column]

    df.to_csv(file_out, index=False, encoding="utf-8-sig")

### Analyze Text Emotions of CSV Files in Batch

In [6]:
def analyze_directory(directory_in, directory_out):
    """Run the emotion analysis on every CSV file of a directory.

    For each entry of ``directory_in`` whose name ends in ".csv" (case
    insensitive), ``analyze_emotions_csv`` writes the result to
    ``<name without extension>_text.csv`` inside ``directory_out``. Entries
    whose output file already exists are skipped, so an interrupted run can
    simply be started again.

    Args:
        directory_in: Directory containing the CSV files to analyze.
        directory_out: Directory receiving the results; created if missing.
    """
    # Make sure the destination exists before anything is written to it.
    os.makedirs(directory_out, exist_ok=True)

    for file_name in os.listdir(directory_in):
        # Anything that is not a CSV file is ignored.
        if not file_name.lower().endswith('.csv'):
            continue

        stem, _extension = os.path.splitext(file_name)
        source_csv = os.path.join(directory_in, file_name)
        file_out = os.path.join(directory_out, stem + '_text.csv')

        # Already produced by an earlier run: nothing to do.
        if os.path.exists(file_out):
            continue

        analyze_emotions_csv(source_csv, file_out)
        print(f"Processed {file_name}, saved output to {file_out}")

## Use of Function

In [7]:
# Folder with the CSV files to score, and folder that receives the
# "<name>_text.csv" results; both are located on the mounted Google Drive.
directory_in = r'/content/drive/MyDrive/Projects/tps/data/10. speech_meetings_aggregated'
directory_out = r'/content/drive/MyDrive/Projects/tps/data/11. speech_text'

# Score every CSV in directory_in; files whose output already exists in
# directory_out are skipped, so this cell is safe to re-run.
analyze_directory(directory_in, directory_out)

Processed 12_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/12_speech_text.csv
Processed 4_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/4_speech_text.csv
Processed 10_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/10_speech_text.csv
Processed 7_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/7_speech_text.csv
Processed 9_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/9_speech_text.csv
Processed 11_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/11_speech_text.csv
Processed 3_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/3_speech_text.csv
Processed 5_speech.csv, saved output to /content/drive/MyDrive/Projects/tps/data/11. speech_text/5_speech_text.csv
Processed 6_speech.csv, saved output to /content/drive/MyDrive/Projects/tp