# In [None]:  (Jupyter cell marker left over from the notebook export)
from transformers import AutoTokenizer, pipeline
import torch
import pandas as pd

# Use CUDA device 0 when one is available; otherwise fall back to -1,
# which the message below reports as CPU.
if torch.cuda.is_available():
    device = 0
else:
    device = -1
print(f"Using {'GPU' if device == 0 else 'CPU'}.")

# Instantiate the tokenizer and the text-classification pipeline from the
# same pretrained checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    "lrei/xlm-roberta-base-emolit-multilingual"
)
classifier = pipeline(
    "text-classification",
    model="lrei/xlm-roberta-base-emolit-multilingual",
    device=device,
)

def split_into_chunks(text, max_length=512):
    """Split ``text`` into decoded pieces of at most ``max_length - 2`` tokens.

    The text is encoded once with the module-level ``tokenizer`` (without
    special tokens), the resulting id sequence is cut into consecutive
    windows, and each window is decoded back into a string. Two positions per
    window are left free for special tokens that get added when a chunk is
    encoded again.

    Args:
        text: The string to split.
        max_length: Token budget per chunk, including the two reserved
            positions. Must be greater than 2.

    Returns:
        A list of chunk strings in their original order. The list is empty
        when ``text`` yields no tokens.

    Raises:
        ValueError: If ``max_length`` leaves no room for content tokens.
    """
    window = max_length - 2
    if window <= 0:
        # A zero step makes range() raise, and a negative step would yield an
        # empty range and silently drop the whole text; fail loudly instead.
        raise ValueError(
            f"max_length must be greater than 2, got {max_length}"
        )

    token_ids = tokenizer.encode(text, add_special_tokens=False)

    # NOTE(review): re-tokenizing a decoded chunk is not guaranteed to yield
    # exactly the same ids, so a chunk may come out slightly longer than
    # `window` tokens when it is encoded again - callers should truncate.
    return [
        tokenizer.decode(token_ids[start:start + window])
        for start in range(0, len(token_ids), window)
    ]

def detect_top_emotion(text):
    """Return the label predicted most often across the chunks of ``text``.

    The text is split with ``split_into_chunks``, every non-empty chunk is
    passed to the module-level ``classifier``, and the top label of each
    prediction is tallied.

    Args:
        text: The text to classify. Anything that is not a ``str`` (for
            example the NaN pandas uses for an empty CSV cell) is treated as
            missing.

    Returns:
        The most frequent label, or ``None`` when ``text`` is missing or
        produces no non-empty chunk. When several labels are tied, the one
        that was predicted first wins, so the result is the same on every
        run.
    """
    from collections import Counter

    # Missing cells are not strings and cannot be tokenized.
    if not isinstance(text, str):
        return None

    # Truncation guards against a decoded chunk that re-tokenizes to more
    # tokens than the chunker's 512-token budget.
    labels = [
        classifier(chunk, truncation=True, max_length=512)[0]['label']
        for chunk in split_into_chunks(text)
        if chunk
    ]
    if not labels:
        return None

    # most_common() orders equal counts by first occurrence, unlike iterating
    # a set, whose order for strings can change between interpreter runs.
    return Counter(labels).most_common(1)[0][0]

# Read the input CSV into a dataframe (adjust the path to your csv file)
df = pd.read_csv(
    '/kaggle/input/msc-thesis-dataset/fren_cleaned.csv/fren_cleaned.csv'
)

# Run the chunked classifier on every excerpt and keep the winning label
df['emolit_emotion'] = df['excerpt_value_cleaned'].map(detect_top_emotion)

# Lookup table from each fine-grained emotion label to a primary emotion,
# laid out one primary emotion per section.
primary_emotions_map = {
    # -> joy
    'calmness': 'joy',
    'serenity': 'joy',
    'joy': 'joy',
    'ecstasy': 'joy',
    'excitement': 'joy',
    'relief': 'joy',
    'admiration': 'joy',
    'pride': 'joy',
    'love': 'joy',
    'gratitude': 'joy',
    'amusement': 'joy',
    # -> sadness
    'despair': 'sadness',
    'pensiveness': 'sadness',
    'sadness': 'sadness',
    'disappointment': 'sadness',
    'grief': 'sadness',
    'nostalgia': 'sadness',
    'pain': 'sadness',
    'boredom': 'sadness',
    'guilt': 'sadness',
    # -> anticipation
    'desire': 'anticipation',
    'courage': 'anticipation',
    'curiosity': 'anticipation',
    'interest': 'anticipation',
    'anticipation': 'anticipation',
    'vigilance': 'anticipation',
    'optimism': 'anticipation',
    # -> fear
    'embarrassment': 'fear',
    'nervousness': 'fear',
    'apprehension': 'fear',
    'fear': 'fear',
    'terror': 'fear',
    'doubt': 'fear',
    # -> anger
    'greed': 'anger',
    'annoyance': 'anger',
    'frustration': 'anger',
    'anger': 'anger',
    'rage': 'anger',
    'envy': 'anger',
    # -> trust
    'approval': 'trust',
    'acceptance': 'trust',
    'trust': 'trust',
    'faith': 'trust',
    'caring': 'trust',
    # -> disgust
    'indifference': 'disgust',
    'disgust': 'disgust',
    'loathing': 'disgust',
    'disapproval': 'disgust',
    # -> surprise
    'surprise': 'surprise',
    'amazement': 'surprise',
}

def map_emotions(emotion):
    """Translate a fine-grained emotion label via ``primary_emotions_map``.

    Args:
        emotion: A label to look up, or ``None``/NaN for a row where no
            emotion was detected.

    Returns:
        The mapped primary emotion, or ``None`` when ``emotion`` is missing
        or has no entry in ``primary_emotions_map``. A label without an entry
        is also reported on stdout.
    """
    # A missing value means "nothing was detected", not "unknown label", so
    # it is passed through without the unmapped warning.
    # `emotion != emotion` is true only for NaN.
    if emotion is None or emotion != emotion:
        return None

    try:
        return primary_emotions_map[emotion]
    except KeyError:
        print(f"Unmapped emotion '{emotion}' found.")
        return None

# Translate every detected label through map_emotions into a new column
df['plutchik_emotion'] = df['emolit_emotion'].map(map_emotions)

# Write the annotated dataframe out without the index column
df.to_csv('/kaggle/working/final_data.csv', index=False)