In [1]:
# Text preprocessing (standard library)
import re
import string

# Core libraries
import numpy as np
import pandas as pd

# NLP tools
import nltk
from nltk.corpus import stopwords
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# LLM support (OpenAI)
import openai


In [2]:
# Fetch the `punkt` and `stopwords` NLTK packages (presumably backing the
# `word_tokenize` and `stopwords` imports above). Each call is a no-op when the
# package is already up to date; the final call's return value is the cell output.
nltk.download("punkt")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Location of the survey CSV, relative to the notebook's working directory;
# change it here if the file lives somewhere else.
file_path = "data/metaphorValence.csv"
df = pd.read_csv(filepath_or_buffer=file_path)

# Show the first five rows as a quick sanity check of the load.
df.head(5)

Unnamed: 0,StartDate,EndDate,Status,IPAddress,Progress,Duration (in seconds),Finished,RecordedDate,ResponseId,RecipientLastName,...,language#1_3_2,language#1_3_3,language#1_4_1,language#1_4_2,language#1_4_3,language#1_5_1,language#1_5_2,language#1_5_3,Q36,random
0,11/10/21 15:58,11/10/21 16:03,IP Address,65.184.69.197,100,322,True,11/10/21 16:03,R_3lVJS9mk7o2Togh,,...,,,,,,,,,United States,652490
1,11/10/21 14:28,11/10/21 14:33,IP Address,108.34.215.20,100,323,True,11/10/21 14:33,R_2do9qyItqku7Nn3,,...,,,,,,,,,U.S.,634128
2,11/10/21 15:00,11/10/21 15:08,IP Address,198.14.21.54,100,501,True,11/10/21 15:08,R_3g106D74dycqvVm,,...,,,,,,,,,"connecticut, alaska, california",586913
3,11/10/21 16:51,11/10/21 16:54,IP Address,74.136.120.181,100,127,True,11/10/21 16:54,R_2xKIsTBgAkyH5vD,,...,,,,,,,,,USA,202736
4,11/10/21 15:12,11/10/21 15:14,IP Address,205.133.127.242,100,74,True,11/10/21 15:14,R_2zuywXmfYnpQC4n,,...,,,,,,,,,United States,541631


In [4]:
# Columns that are removed before analysis.
# NOTE: several entries use dotted spellings ("Duration..in.seconds.",
# "language.1_3_1") that look like R `make.names`-style names, whereas the header
# pandas reads from the CSV contains the raw spellings ("Duration (in seconds)",
# "language#1_3_1"). Matching is therefore done on both forms below; a plain
# `drop(..., errors="ignore")` silently skipped those columns.
# NOTE(review): "previousParticipatio" looks truncated — confirm against the CSV header.
columns_to_drop = [
    "Status", "RecordedDate", "RecipientLastName", "RecipientFirstName",
    "RecipientEmail", "ExternalReference", "DistributionChannel", "UserLanguage", "consent", "Finished",
    "language.1_3_1", "language.1_3_3", "language.1_3_2", "language.1_4_1", "language.1_4_2", "language.1_4_3",
    "language.1_5_1", "language.1_5_2", "language.1_5_3", "random", "immigrationStatus_5_TEXT",
    "immigrationStatus_5", "immigrationStatus_3", "immigrationStatus_4", "immigrationStatus_6",
    "immigrationStatus_8", "immigrationStatus_2", "language.1_2_1", "language.1_2_2", "language.1_2_3",
    "language.1_1_3", "politicalView_4_TEXT", "previousParticipatio", "gender_3_TEXT", "Progress",
    "StartDate", "EndDate", "IPAddress", "Duration..in.seconds.", "immigrationStatus_1"
]

# A column is dropped when its name is listed verbatim, or when replacing every
# character other than letters, digits, "." and "_" with "." yields a listed name
# (e.g. "Duration (in seconds)" -> "Duration..in.seconds.").
_drop_targets = set(columns_to_drop)
_matched_columns = [
    col for col in df.columns
    if col in _drop_targets
    or re.sub(r"[^0-9A-Za-z._]", ".", str(col)) in _drop_targets
]

# Only columns that actually exist are passed, so nothing is silently ignored and
# re-running the cell is harmless (the second run simply matches nothing).
df = df.drop(columns=_matched_columns)

In [5]:
# Store the grouping variables and the valence label as pandas categoricals.
categorical_cols = ["condition", "gender", "politicalView", "valence"]
for col in categorical_cols:
    df[col] = df[col].astype("category")

# Row filter, applied in one pass:
#   * valence must be present and must not be the literal "NA" or "mixed"
#     ("neutral" rows are retained);
#   * respondents whose politicalView or gender is "Other" are excluded.
keep_rows = (
    df["valence"].notna()
    & ~df["valence"].isin(["NA", "mixed"])
    & (df["politicalView"] != "Other")
    & (df["gender"] != "Other")
)
df = df[keep_rows]

# Remove category levels that no longer occur after filtering.
for col in categorical_cols:
    df[col] = df[col].cat.remove_unused_categories()

In [6]:
# Number of remaining rows per valence label after the filters above.
print(df.loc[:, "valence"].value_counts())

valence
positive    680
negative    167
neutral      28
Name: count, dtype: int64


In [34]:
# Download the VADER lexicon (a no-op when it is already up to date locally).
nltk.download('vader_lexicon')

# Reload the survey CSV, reusing the path configured in the loading cell so the
# location is defined in one place only.
# NOTE(review): this rebinds `df` to the freshly loaded data, so the column drops
# and row filters applied in the earlier cleaning cells do not carry over into the
# VADER comparison — confirm that this is intended.
df = pd.read_csv(file_path)

# Keep only rows whose open-ended answer is present and not blank/whitespace.
# `.copy()` makes the result an independent frame, so the column assignment below
# writes to this frame rather than to a slice of the loaded one.
has_answer = df["openQ"].notna() & df["openQ"].str.strip().ne("")
df = df.loc[has_answer].copy()

# Initialize the sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Score every answer and keep VADER's "compound" value from the returned dict.
df["vader_score"] = df["openQ"].map(lambda text: sia.polarity_scores(text)["compound"])

# Classify the sentiment
def classify_sentiment(score, threshold=0.05):
    """Convert a numeric sentiment score into a three-way label.

    Parameters
    ----------
    score : float
        The score to classify (the notebook passes VADER compound scores).
    threshold : float, default 0.05
        Half-width of the neutral band around zero. The default reproduces the
        previously hard-coded cut-offs of +0.05 and -0.05.

    Returns
    -------
    str
        "positive" when ``score >= threshold``, "negative" when
        ``score <= -threshold``, otherwise "neutral". A NaN score fails both
        comparisons and is therefore labelled "neutral".
    """
    if score >= threshold:
        return "positive"
    if score <= -threshold:
        return "negative"
    return "neutral"

# Label every response from its VADER compound score.
# NOTE: despite the column name, "sentiment_llm" holds VADER-derived labels, not
# LLM output; the name is kept because the comparison cell reads this column.
df["sentiment_llm"] = df["vader_score"].map(classify_sentiment)

# Preview the scored responses.
preview_cols = ["ResponseId", "openQ", "vader_score", "sentiment_llm"]
df[preview_cols].head()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/anachkhaidze/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


Unnamed: 0,ResponseId,openQ,vader_score,sentiment_llm
0,R_3lVJS9mk7o2Togh,Immigration has helped Addison economically. ...,-0.4939,negative
1,R_2do9qyItqku7Nn3,There is no context provided that would sugges...,-0.2359,negative
2,R_3g106D74dycqvVm,"the reason may be a good environment, good cli...",0.886,positive
3,R_2xKIsTBgAkyH5vD,local economy,0.0,neutral
4,R_2zuywXmfYnpQC4n,I do not know. I am tired. I just need the mon...,0.5727,positive


In [35]:
# Restrict to rows whose `valence` label is "positive" or "negative", then
# cross-tabulate that label (rows) against the VADER-derived label (columns).
is_binary_valence = df["valence"].isin(["positive", "negative"])
df_compare = df.loc[is_binary_valence].copy()
pd.crosstab(index=df_compare["valence"], columns=df_compare["sentiment_llm"])

sentiment_llm,negative,neutral,positive
valence,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
negative,78,31,58
positive,21,72,592
