In [26]:
import pandas as pd, numpy as np
from scipy import stats

# Load the article table that already carries headline/body sentiment scores,
# then show its dimensions and the first eight column labels as a sanity check.
df = pd.read_csv(filepath_or_buffer="../data/full_articles_with_sentiment.csv")
print(df.shape, df.columns[:8])


(1299, 8) Index(['id', 'headline', 'body_text', 'body_text_summary', 'headline_clean',
       'body_clean', 'sent_head', 'sent_body'],
      dtype='object')


In [27]:
# Force both sentiment columns to a numeric dtype. With errors="coerce" any
# entry that cannot be parsed becomes NaN; count those so that data loss is
# reported instead of passing silently into the later statistics.
for col in ["sent_head", "sent_body"]:
    n_missing_before = df[col].isna().sum()
    df[col] = pd.to_numeric(df[col], errors="coerce")
    # Only values that were present before but are NaN now were coerced.
    n_coerced = df[col].isna().sum() - n_missing_before
    if n_coerced:
        print(f"Warning: {n_coerced} non-numeric value(s) in '{col}' were coerced to NaN")

In [28]:
# Gap = headline sentiment minus body sentiment, so a negative gap means the
# headline scored lower than the body of the same article.
df["sent_gap"] = df["sent_head"].sub(df["sent_body"])

# Summary statistics of the gap (count, mean, std, quartiles, extremes).
gap_desc = df["sent_gap"].describe()
print(gap_desc)

count    1299.000000
mean       -0.190442
std         0.356281
min        -1.599326
25%        -0.415766
50%        -0.188103
75%         0.027984
max         1.224850
Name: sent_gap, dtype: float64


In [29]:
# Percentage of articles by the sign of the headline-minus-body gap.
# The mean of a boolean mask is the fraction of True entries.
gap = df["sent_gap"]
more_neg = 100 * gap.lt(0).mean()
more_pos = 100 * gap.gt(0).mean()
same = 100 * gap.eq(0).mean()

print(
    f"Headline more NEGATIVE than body: {more_neg:.1f}%",
    f"Headline more POSITIVE than body: {more_pos:.1f}%",
    f"Exact tie:                       {same:.1f}%",
    sep="\n",
)


Headline more NEGATIVE than body: 72.4%
Headline more POSITIVE than body: 27.6%
Exact tie:                       0.0%


In [30]:
# Test whether the headline-minus-body gap differs from zero, once with a
# parametric test (one-sample t-test against a mean of 0) and once with a
# non-parametric one (Wilcoxon signed-rank test on the gaps).
#
# Missing gaps are dropped first: ttest_1samp propagates NaN by default, so a
# single NaN would turn the reported statistic and p-value into NaN.
gap_valid = df["sent_gap"].dropna()
n_dropped = len(df) - len(gap_valid)
if n_dropped:
    print(f"Note: excluded {n_dropped} row(s) with a missing gap from the tests")

t_stat, t_p  = stats.ttest_1samp(gap_valid, 0)
w_stat, w_p  = stats.wilcoxon(gap_valid)

print(f"t‑test p‑value:       {t_p:.3g}")
print(f"Wilcoxon p‑value:     {w_p:.3g}")

t‑test p‑value:       6.12e-73
Wilcoxon p‑value:     2.67e-71


In [31]:
TOP_N = 3  # change if you want more
PREVIEW_CHARS = 400  # how many characters of the body to show per article


def _print_extremes(frame, n, largest, preview_chars=PREVIEW_CHARS):
    """Print headline, body preview and gap for the n most extreme rows.

    Parameters
    ----------
    frame : DataFrame with "headline", "body_text" and "sent_gap" columns.
    n : number of rows to print.
    largest : if True use the n largest gaps, otherwise the n smallest.
    preview_chars : maximum number of body characters to print per row.
    """
    pick = frame.nlargest if largest else frame.nsmallest
    for _, row in pick(n, "sent_gap").iterrows():
        print("-" * 80)
        print("Headline:", row["headline"])
        body = row["body_text"]
        # A missing body is not a string (presumably NaN after read_csv) and
        # cannot be sliced; show an empty preview instead of raising TypeError.
        preview = body[:preview_chars] if isinstance(body, str) else ""
        print("Body ⬇︎"); print(preview, "…")
        print(f"Gap = {row['sent_gap']:.3f}")


# Smallest gaps: headline scored far below the body.
print("\n=== Most NEGATIVE headlines relative to body ===")
_print_extremes(df, TOP_N, largest=False)

# Largest gaps: headline scored far above the body.
print("\n=== Most POSITIVE headlines relative to body ===")
_print_extremes(df, TOP_N, largest=True)



=== Most NEGATIVE headlines relative to body ===
--------------------------------------------------------------------------------
Headline: Roseanne Barr slammed for tweet calling out 'racist anti-Semitic bigotry' after synagogue shooting
Body ⬇︎
in the meantime, we welcome your feedback to help us enhance the experience. our goal is to create a safe and engaging place for users to connect over interests and passions. in order to improve our community experience, we are temporarily suspending the article …
Gap = -1.599
--------------------------------------------------------------------------------
Headline: A Republican Governor Shoots Himself in the Foot
Body ⬇︎
the first state to recognize same-sex civil unions and the first to experiment with a single-payer health-care system. two vermont politicians. wells, vt. for a small state, vermont has a way of getting noticed. it's the first time that a single-payer health care system has been recognized in recent years …
Gap = -1.507
----

In [32]:
# Write the frame, including the sent_gap column, to a new CSV.
# index=False keeps the row index out of the file.
OUT = "../data/full_articles_with_gap.csv"
df.to_csv(path_or_buf=OUT, index=False)
print("Saved with gap column →", OUT)

Saved with gap column → ../data/full_articles_with_gap.csv
