In [12]:
import pandas as pd
from nrclex import NRCLex

In [16]:
# Source files: the raw dialogue lines and the emotion lookup table (dim_emolex)
dialogue_csv = "CSV_PostgreSQL/Raw_data/Dialogue.csv"
dim_emolex_csv = "CSV_PostgreSQL/dim_emolex.csv"

# The dialogue file is decoded as ISO-8859-1; the lookup table uses the reader's default
dialogue = pd.read_csv(dialogue_csv, encoding="ISO-8859-1")
dim_emolex = pd.read_csv(dim_emolex_csv)

# Peek at the first dialogue rows
dialogue.head()

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue
0,1,1,8,4,I should have known that you would be here...P...
1,2,1,8,7,"Good evening, Professor Dumbledore. Are the ru..."
2,3,1,8,4,"I'm afraid so, Professor. The good, and the bad."
3,4,1,8,7,And the boy?
4,5,1,8,4,Hagrid is bringing him.


In [17]:
# Function to analyze a single text using NRCLex
def analyze_sentiment(text):
    """Score one piece of text against the NRC emotion lexicon.

    Parameters
    ----------
    text : str
        The text to analyze. Any non-string value (for example ``None`` or
        the float ``NaN`` that pandas produces for an empty CSV cell) is
        treated as "no text".

    Returns
    -------
    tuple
        ``(raw_emotion_scores, affect_frequencies)`` as exposed by the
        ``NRCLex`` object built from ``text``, or a pair of empty dicts
        when ``text`` is not a string.
    """
    # Missing cells are not strings; hand back an empty result instead of
    # passing them to NRCLex, so callers that skip empty scores carry on.
    if not isinstance(text, str):
        return {}, {}

    text_object = NRCLex(text)
    return text_object.raw_emotion_scores, text_object.affect_frequencies

In [18]:
# Accumulate one record per (dialogue line, detected emotion) pair
dialogue_emotions = []

# Walk the ID and text columns in lockstep, one dialogue line at a time
for dialogue_id, text in zip(dialogue["Dialogue ID"], dialogue["Dialogue"]):
    sentiment_scores, affect_frequencies = analyze_sentiment(text)

    # A line with no detected emotion has empty scores, so this loop adds nothing for it
    for emotion, score in sentiment_scores.items():
        frequency = affect_frequencies.get(emotion)

        # Keep only emotions with a positive raw score and a reported frequency
        if not (score > 0.0) or frequency is None:
            continue

        record = {
            "id_dialogue": dialogue_id,
            "emotion": emotion,
            "score": score,
            "score_frequency": round(frequency, 3),
        }
        dialogue_emotions.append(record)

# Build the DataFrame once from the collected records
df_dialogue_emotions = pd.DataFrame(dialogue_emotions)
df_dialogue_emotions.head()

Unnamed: 0,id_dialogue,emotion,score,score_frequency
0,2,joy,1,0.333
1,2,positive,1,0.333
2,2,trust,1,0.333
3,3,fear,2,0.167
4,3,negative,2,0.167


In [19]:
# Build the emotion-name -> ID lookup by pairing the two dim_emolex columns
dim_emolex_dict = dict(zip(dim_emolex["emotion"], dim_emolex["id_emolex"]))
print(dim_emolex_dict)

# Translate each emotion label into its ID; labels absent from the lookup become NaN
emotion_labels = df_dialogue_emotions["emotion"]
df_dialogue_emotions["id_emolex"] = emotion_labels.map(dim_emolex_dict)
df_dialogue_emotions.head()

{'positive': 1, 'joy': 2, 'surprise': 3, 'trust': 4, 'negative': 5, 'anger': 6, 'fear': 7, 'disgust': 8, 'anticipation': 9, 'sadness': 10}


Unnamed: 0,id_dialogue,emotion,score,score_frequency,id_emolex
0,2,joy,1,0.333,2
1,2,positive,1,0.333,1
2,2,trust,1,0.333,4
3,3,fear,2,0.167,7
4,3,negative,2,0.167,5


In [20]:
# Drop any rows where the emotion could not be mapped to an ID.
# An unmapped label leaves NaN in id_emolex, which turns the whole column into
# floats; once those rows are removed, restore the integer dtype so the key is
# carried forward as 2 rather than 2.0.
df_dialogue_emotions = (
    df_dialogue_emotions
    .dropna(subset=["id_emolex"])
    .astype({"id_emolex": "int64"})
)

In [None]:
# Columns that make up the dialogue/emotion bridge table, in export order
bridge_columns = ["id_dialogue", "id_emolex", "score", "score_frequency"]
final_table = df_dialogue_emotions.loc[:, bridge_columns]

# Persist the bridge table without the pandas index column
final_table.to_csv(
    "CSV_PostgreSQL/CSV_final/bridge_dialogue_emolex.csv",
    index=False,
)

# Show the first rows as a sanity check
final_table.head()

Unnamed: 0,id_dialogue,id_emolex,score,score_frequency
0,2,2,1,0.333
1,2,1,1,0.333
2,2,4,1,0.333
3,3,7,2,0.167
4,3,5,2,0.167
