In [None]:
import logging
from collections import Counter

import pandas as pd
import torch
from transformers import AutoTokenizer, pipeline

# Send every record (DEBUG and up) to diag.txt; filemode 'w' starts the file afresh.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.DEBUG,
    filemode='w',
    filename='diag.txt',
)

# The same checkpoint backs both the standalone tokenizer and the pipeline.
_checkpoint = "lrei/xlm-roberta-base-emolit-multilingual"
tokenizer = AutoTokenizer.from_pretrained(_checkpoint)

# Device index 0 when CUDA is available, otherwise -1.
_device_index = 0 if torch.cuda.is_available() else -1
classifier = pipeline("text-classification", model=_checkpoint, device=_device_index)

def split_into_chunks(text, max_length=510):
    """Split *text* into decoded pieces of at most *max_length* tokens each.

    The text is tokenized with the module-level ``tokenizer``, the token
    sequence is cut into consecutive windows of ``max_length`` tokens, and
    every window is converted to ids and decoded back into a string.

    Args:
        text: The text to split.
        max_length: Maximum number of tokens per window.

    Returns:
        A list of decoded strings, one per window; empty when the text
        produces no tokens.
    """
    pieces = tokenizer.tokenize(text)
    window_starts = range(0, len(pieces), max_length)
    return [
        tokenizer.decode(
            tokenizer.convert_tokens_to_ids(pieces[start:start + max_length])
        )
        for start in window_starts
    ]

def detect_top_emotion(text):
    """Return the emotion label predicted most often across the chunks of *text*.

    The text is split with ``split_into_chunks``; each non-empty chunk is
    fed to ``classifier.model`` and the label of its highest logit (looked
    up in ``classifier.model.config.id2label``) is recorded.  The label that
    occurs most often wins; on a tie the label seen first wins, so the
    result is reproducible between runs.

    Args:
        text: The text to classify.  Non-string values (for example the
            float NaN pandas uses for missing cells) are skipped.

    Returns:
        The winning label, or ``None`` when *text* is not a string, when no
        non-empty chunk was produced, or when processing raised.
    """
    if not isinstance(text, str):
        # Missing cells arrive as NaN/None.  Letting them reach the handler
        # below would make its ``text[:150]`` slice raise and abort the
        # caller's whole run instead of skipping one row.
        logging.warning("Skipping non-string input of type %s", type(text).__name__)
        return None

    try:
        emotions = []
        # Inference only: no_grad avoids building an autograd graph per chunk.
        with torch.no_grad():
            for chunk in split_into_chunks(text):
                if not chunk:
                    # A window can decode to an empty string; nothing to classify.
                    continue
                inputs = tokenizer.encode_plus(chunk, return_tensors="pt", padding='max_length', max_length=512, truncation=True)
                # Tensors must live on the same device as the model.
                inputs = {name: tensor.to(classifier.device) for name, tensor in inputs.items()}
                results = classifier.model(**inputs)
                top_emotion = results.logits.argmax(-1).item()
                emotions.append(classifier.model.config.id2label[top_emotion])

        if not emotions:
            return None
        # most_common keeps first-seen order among equal counts, unlike
        # iterating a set, whose order varies with string hashing.
        return Counter(emotions).most_common(1)[0][0]

    except Exception as e:
        # Best effort: one bad row is logged (with traceback) and skipped.
        logging.exception("Error processing text: %s... Error: %s", text[:150], e)
        return None

# Load the dataset, then store detect_top_emotion's label for every excerpt.
_input_csv = '/kaggle/input/msc-thesis-dataset/fren_cleaned.csv/fren_cleaned.csv'
df = pd.read_csv(_input_csv)

_excerpts = df['excerpt_value_cleaned']
df['emolit_emotion'] = _excerpts.apply(detect_top_emotion)

# Fine-grained emotion labels, grouped under the primary emotion each one
# collapses to.
_PRIMARY_TO_FINE = {
    'joy': (
        'calmness', 'serenity', 'joy', 'ecstasy', 'excitement', 'relief',
        'admiration', 'pride', 'love', 'gratitude', 'amusement',
    ),
    'sadness': (
        'despair', 'pensiveness', 'sadness', 'disappointment', 'grief',
        'nostalgia', 'pain', 'boredom', 'guilt',
    ),
    'anticipation': (
        'desire', 'courage', 'curiosity', 'interest', 'anticipation',
        'vigilance', 'optimism',
    ),
    'fear': (
        'embarrassment', 'nervousness', 'apprehension', 'fear', 'terror',
        'doubt',
    ),
    'anger': (
        'greed', 'annoyance', 'frustration', 'anger', 'rage', 'envy',
    ),
    'trust': (
        'approval', 'acceptance', 'trust', 'faith', 'caring',
    ),
    'disgust': (
        'indifference', 'disgust', 'loathing', 'disapproval',
    ),
    'surprise': (
        'surprise', 'amazement',
    ),
}

# Lookup table used by map_emotions: fine-grained label -> primary emotion.
primary_emotions_map = {
    fine: primary
    for primary, members in _PRIMARY_TO_FINE.items()
    for fine in members
}

def map_emotions(emotion):
    """Translate a fine-grained emotion label into its primary emotion.

    Looks *emotion* up in ``primary_emotions_map``.

    Args:
        emotion: The label to translate.

    Returns:
        The mapped primary emotion, or ``None`` when the label has no entry
        (in which case a notice is printed to stdout).
    """
    try:
        return primary_emotions_map[emotion]
    except KeyError:
        print(f"Unmapped emotion '{emotion}' found.")
        return None

# Add the primary-emotion column derived from the fine-grained labels.
_fine_labels = df['emolit_emotion']
df['plutchik_emotion'] = _fine_labels.apply(map_emotions)

# Write the enriched frame out, leaving the row index out of the file.
_output_csv = '/kaggle/working/final_data.csv'
df.to_csv(_output_csv, index=False)