In [None]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import seaborn as sns
from collections import Counter
import re
from nltk.sentiment import SentimentIntensityAnalyzer
import nltk
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# new_file.py
# GitHub Copilot
# Basic EDA + baseline sentiment using VADER on datasets/training/yelp_helpfulness_train.parquet (under data_dir)
# Saves augmented dataframe to data/derived/reviews_sentiment.parquet

import matplotlib.pyplot as plt

# Try to import repo helpers in src if present (optional)
# On failure the name `src` is bound to None instead of being left undefined.
# NOTE(review): nothing visible in this notebook reads `src` afterwards -- confirm it is still needed.
try:
    import src  # noqa: F401
except Exception:
    # Deliberately broad: any error raised while importing the optional package is ignored.
    src = None

# Load data
# The data directory can be overridden with the HELPFULLENS_DATA_DIR environment
# variable so the notebook runs on machines other than the original author's;
# the previous hard-coded location is kept as the default for backward compatibility.
data_dir = Path(
    os.environ.get(
        "HELPFULLENS_DATA_DIR",
        "/Users/alexandresepulvedadedietrich/Code/HelpfulLens/data",
    )
)
input_path = data_dir / "datasets/training/yelp_helpfulness_train.parquet"
if not input_path.exists():
    # Fail early with an actionable message instead of a bare parquet-reader error.
    raise FileNotFoundError(
        f"Training data not found at {input_path}. "
        "Set HELPFULLENS_DATA_DIR to the directory that contains 'datasets/'."
    )
df_clean = pd.read_parquet(input_path)


In [None]:
# Work on a reproducible random subset so the per-row scoring below stays fast.
# `n` is capped at the frame length because DataFrame.sample raises ValueError when
# asked for more rows than exist (without replacement). Only the sampled rows are
# copied, rather than the full frame before sampling.
SAMPLE_SIZE = 100_000
df = df_clean.sample(n=min(SAMPLE_SIZE, len(df_clean)), random_state=42).copy()

In [None]:

# Identify text column heuristically
def _pick_text_column(frame):
    """Return the name of the column most likely to hold free-form review text.

    Candidates are columns whose name contains "review", "text" or "comment".
    A column named exactly "text" wins (the later cells read ``df["text"]``);
    otherwise identifier-style columns (names ending in "_id", e.g. "review_id")
    are skipped so an ID column is not mistaken for the review body.

    Raises:
        ValueError: if no column name matches any keyword.
    """
    keywords = ("review", "text", "comment")
    candidates = [c for c in frame.columns if any(k in str(c).lower() for k in keywords)]
    if not candidates:
        raise ValueError(
            "No text-like column found in dataframe. Columns: " + ", ".join(map(str, frame.columns))
        )
    if "text" in candidates:
        return "text"
    non_id = [c for c in candidates if not str(c).lower().endswith("_id")]
    # Fall back to the first match when every candidate looks like an identifier.
    return (non_id or candidates)[0]


def _mean_word_length(s):
    """Average length of the ``\\w+`` tokens in ``s``; 0.0 when there are none."""
    words = re.findall(r"\w+", s)
    # Guard the empty case explicitly: np.mean([]) would yield NaN plus a RuntimeWarning.
    return sum(map(len, words)) / len(words) if words else 0.0


text_col = _pick_text_column(df)

# Basic EDA
print("Loaded:", input_path, "shape:", df.shape)
print("Text column used:", text_col)
print("\nSample rows:")
print(df[[text_col]].head(5))

print("\nMissing values per column:")
print(df.isna().sum())

# Text features
# Missing reviews become empty strings; a plain astype(str) would turn them into the
# literal text "nan" and report a length of 3 characters / 1 word.
df["_text"] = df[text_col].fillna("").astype(str)
df["_char_len"] = df["_text"].str.len()
df["_word_count"] = df["_text"].str.split().str.len()
df["_avg_word_len"] = df["_text"].map(_mean_word_length)

print("\nText length stats:")
print(df[["_char_len", "_word_count", "_avg_word_len"]].describe())


In [None]:
# Display the working frame as the cell output.
df

In [None]:

# Baseline sentiment: VADER
#
# `SentimentIntensityAnalyzer` is imported from both nltk.sentiment and vaderSentiment
# in the import cell; whichever import runs last owns the name (currently
# vaderSentiment). The vaderSentiment package bundles its own lexicon, so the NLTK
# `vader_lexicon` resource is fetched only when the NLTK implementation is the one in
# use. A failed download is reported instead of being silently ignored.
_vader_backend = SentimentIntensityAnalyzer.__module__
if _vader_backend.startswith("nltk"):
    try:
        nltk.data.find("sentiment/vader_lexicon.zip")
    except LookupError:
        # nltk.download returns False (rather than raising) when the fetch fails.
        if not nltk.download("vader_lexicon", quiet=True):
            raise RuntimeError("Could not download the NLTK 'vader_lexicon' resource.")
print("VADER implementation in use:", _vader_backend)

sia = SentimentIntensityAnalyzer()

def vader_scores(text):
    """Score one review with the module-level VADER analyzer ``sia``.

    The input is coerced with ``str()`` before scoring, so non-string values are
    scored as their string representation. Returns whatever
    ``sia.polarity_scores`` returns for that string.
    """
    as_string = str(text)
    return sia.polarity_scores(as_string)

# Score every review and attach one column per VADER score.
scores = df["text"].map(vader_scores)
scores_df = pd.DataFrame(scores.tolist())
# Drop score columns left over from a previous run of this cell first; otherwise a
# re-run appends duplicate columns and df["compound"] becomes a DataFrame, which
# breaks the labelling and correlation cells.
df = pd.concat(
    [
        df.drop(columns=scores_df.columns, errors="ignore").reset_index(drop=True),
        scores_df,
    ],
    axis=1,
)

# Label mapping (standard VADER thresholds)
def vader_label(compound):
    """Map a VADER compound score to a sentiment label.

    Returns "positive" for scores >= 0.05, "negative" for scores <= -0.05 and
    "neutral" for everything else (including NaN, for which both comparisons
    are false).
    """
    return (
        "positive"
        if compound >= 0.05
        else ("negative" if compound <= -0.05 else "neutral")
    )

# Attach a categorical sentiment label to each row from its compound score.
df["_vader_label"] = df["compound"].map(vader_label)

label_counts = df["_vader_label"].value_counts()
print("\nVADER label distribution:")
print(label_counts)

# Bar chart of the label shares (percent of rows), in a fixed left-to-right order.
label_order = ["positive", "neutral", "negative"]
sns.set(style="whitegrid")
plt.figure(figsize=(6, 4))
ax = sns.countplot(data=df, x="_vader_label", order=label_order, stat="percent")
ax.set_title("Baseline VADER sentiment distribution")
plt.tight_layout()
plt.show()

# Top tokens (simple frequency)
def tokenize(text):
    """Lower-case ``text`` and return its word tokens of two or more characters.

    A token is a maximal run of ``\\w`` characters; single-character runs are
    dropped. ``text`` must provide ``.lower()`` (i.e. be a string).
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"\w{2,}", lowered)]

# Count token frequencies over every non-missing review, in row order.
all_tokens = Counter()
for review_text in df["text"].dropna():
    all_tokens.update(tokenize(review_text))

# Keep the 30 most frequent tokens; only the first 20 are printed.
top_tokens = all_tokens.most_common(30)
print("\nTop tokens:", top_tokens[:20])

# Persist the augmented frame as parquet (row index omitted).
# The output directory is relative to the current working directory and is
# created, including parents, when it does not exist yet.
out_dir = Path("../data/derived")
out_path = out_dir / "reviews_sentiment.parquet"
out_path.parent.mkdir(parents=True, exist_ok=True)
df.to_parquet(out_path, index=False)
print("\nSaved augmented dataframe to:", out_path)

In [None]:
# Correlation of 'useful_rate_smoothed' with each VADER score column.
vader_metrics = ["neg", "neu", "pos", "compound"]
corr_matrix = df[["useful_rate_smoothed"] + vader_metrics].corr()
# Keep only the target's row, restricted to the VADER columns.
correlations = corr_matrix.loc["useful_rate_smoothed", vader_metrics]
print("Correlation between 'useful_rate_smoothed' and VADER metrics:")
print(correlations)

In [None]:
# Eyeball reviews that VADER labelled negative.
# Only a bounded preview is printed: dumping every negative review from the sample
# floods the cell output and bloats the saved notebook. Raise the cap to see more.
MAX_NEGATIVE_EXAMPLES = 20
negative_texts = df.loc[df["_vader_label"] == "negative", "text"]
print(
    f"Showing {min(MAX_NEGATIVE_EXAMPLES, len(negative_texts))} "
    f"of {len(negative_texts)} negative reviews:"
)
for text in negative_texts.head(MAX_NEGATIVE_EXAMPLES):
    print(text)