In [27]:
# Import libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Read the three input tables, decoding each file as latin1
dialogue = pd.read_csv(
    "CSV_PostgreSQL/Raw_data/Dialogue.csv",
    encoding='latin1',
)
fact_dialogue = pd.read_csv(
    'CSV_PostgreSQL/CSV_draft/fact_dialogue_draft.csv',
    encoding='latin1',
)
afinn_lexicon = pd.read_csv(
    'CSV_PostgreSQL/Afinn.csv',
    delimiter=',',
    encoding='latin1',
)

# Preview the top rows of the raw dialogue table
dialogue.head()

Unnamed: 0,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue
0,1,1,8,4,I should have known that you would be here...P...
1,2,1,8,7,"Good evening, Professor Dumbledore. Are the ru..."
2,3,1,8,4,"I'm afraid so, Professor. The good, and the bad."
3,4,1,8,7,And the boy?
4,5,1,8,4,Hagrid is bringing him.


In [28]:
# Build a word -> score lookup from the 'word' and 'value' columns of the AFINN lexicon
afinn_dict = {
    word: value
    for word, value in zip(afinn_lexicon['word'], afinn_lexicon['value'])
}

# Peek at the first 10 (word, score) pairs as a sanity check
list(afinn_dict.items())[:10]

[('abandon', -2),
 ('abandoned', -2),
 ('abandons', -2),
 ('abducted', -2),
 ('abduction', -2),
 ('abductions', -2),
 ('abhor', -3),
 ('abhorred', -3),
 ('abhorrent', -3),
 ('abhors', -3)]

In [29]:
# English stopwords, loaded once and kept as a frozenset so that each token
# lookup is a constant-time membership test instead of re-reading the
# stopword list for every single word.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Function to preprocess text: tokenize, convert to lowercase, remove non-alphabetic words and stopwords
def preprocess(text):
    """Tokenize ``text`` and keep only lowercase alphabetic non-stopwords.

    Parameters
    ----------
    text : str
        Raw text to clean. Any non-string value (for example the NaN that
        pandas uses for a missing cell) is treated as empty text.

    Returns
    -------
    list of str
        Lowercased tokens, in their original order, that consist solely of
        alphabetic characters and are not English stopwords. Empty when
        ``text`` is not a string.
    """
    # Calling .lower() on a non-string would raise AttributeError; treat
    # such values as having no words at all.
    if not isinstance(text, str):
        return []

    words = word_tokenize(text.lower())
    return [
        word
        for word in words
        if word.isalpha() and word not in ENGLISH_STOPWORDS
    ]


# Test the preprocessing function with a sample text
sample_text = "This is a sample dialogue to test the preprocessing function."
preprocess(sample_text)


['sample', 'dialogue', 'test', 'preprocessing', 'function']

In [30]:
# Score a piece of text with the AFINN lexicon
def calculate_sentiment(text):
    """Return the summed AFINN score of the preprocessed words in ``text``.

    The text is first passed through ``preprocess``; each resulting word is
    looked up in ``afinn_dict``, and words missing from it contribute 0.
    """
    total = 0
    for token in preprocess(text):
        total += afinn_dict.get(token, 0)
    return total


# Try the scorer on the sample sentence
calculate_sentiment(sample_text)


0

In [31]:
# Join the fact table to the raw dialogue rows: 'id_dialogue' on the left is
# matched against 'Dialogue ID' on the right (inner join, the pandas default)
merged = fact_dialogue.merge(
    dialogue,
    how='inner',
    left_on='id_dialogue',
    right_on='Dialogue ID',
)

# Preview the top rows of the joined table
merged.head()


Unnamed: 0,id_dialogue,id_character,id_scene,id_place,sentiment_afinn,dialogue_length,love_count,inserted_at,updated_at,Dialogue ID,Chapter ID,Place ID,Character ID,Dialogue
0,1,4,1,8,,10,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761,1,1,8,4,I should have known that you would be here...P...
1,2,7,1,8,,9,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761,2,1,8,7,"Good evening, Professor Dumbledore. Are the ru..."
2,3,4,1,8,,9,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761,3,1,8,4,"I'm afraid so, Professor. The good, and the bad."
3,4,7,1,8,,3,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761,4,1,8,7,And the boy?
4,5,4,1,8,,4,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761,5,1,8,4,Hagrid is bringing him.


In [32]:
# Score every dialogue line with AFINN and store it in 'sentiment_afinn'
merged['sentiment_afinn'] = merged['Dialogue'].map(calculate_sentiment)

# Remove the dialogue text and the columns brought in from the raw dialogue table
columns_to_drop = ['Dialogue', 'Dialogue ID', 'Chapter ID', 'Place ID', 'Character ID']
final_df = merged.drop(labels=columns_to_drop, axis='columns')

# Preview the result to check the new scores
final_df.head()

Unnamed: 0,id_dialogue,id_character,id_scene,id_place,sentiment_afinn,dialogue_length,love_count,inserted_at,updated_at
0,1,4,1,8,0,10,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761
1,2,7,1,8,3,9,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761
2,3,4,1,8,-2,9,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761
3,4,7,1,8,0,3,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761
4,5,4,1,8,0,4,0,2025-05-13 19:26:19.339761,2025-05-13 19:26:19.339761


In [33]:
# Write the finished fact table to CSV (UTF-8, without the index column)
final_df.to_csv(
    'CSV_PostgreSQL/CSV_final/fact_dialogue_final.csv',
    encoding='utf-8',
    index=False,
)