In [None]:
import pandas as pd, numpy as np
from scipy import stats

# Read the article table from the sentiment CSV, then echo its
# dimensions and first eight column labels as a quick sanity check.
df = pd.read_csv(
    "../data/full_articles_with_sentiment.csv",
)
shape, leading_cols = df.shape, df.columns[:8]
print(shape, leading_cols)


(1299, 11) Index(['id', 'url', 'headline_text', 'body_text', 'Q2 Focus', 'Q3 Theme1',
       'Q3 Theme2', 'headline_clean'],
      dtype='object')


In [3]:
# Force the headline and body sentiment scores to a numeric dtype.
# errors="coerce" turns any unparseable entry into NaN instead of raising.
for col in ("sent_head", "sent_body"):
    df[col] = df[col].pipe(pd.to_numeric, errors="coerce")

In [5]:
# Gap = headline score minus body score. A negative gap means the headline
# scores lower (more negative) than the body of the same article.
df["sent_gap"] = df["sent_head"].sub(df["sent_body"])

# Summary statistics (count, mean, spread, quartiles) of the gap.
gap_desc = df["sent_gap"].describe()
print(gap_desc)

count    1299.000000
mean       -0.143557
std         0.432373
min        -1.859123
25%        -0.400784
50%        -0.109622
75%         0.096962
max         1.517315
Name: sent_gap, dtype: float64


In [6]:
# Share of articles (in percent) by the sign of the headline-minus-body gap.
# The mean of a boolean mask is the fraction of True values.
more_neg = 100 * df["sent_gap"].lt(0).mean()
more_pos = 100 * df["sent_gap"].gt(0).mean()
same     = 100 * df["sent_gap"].eq(0).mean()

print(
    f"Headline more NEGATIVE than body: {more_neg:.1f}%",
    f"Headline more POSITIVE than body: {more_pos:.1f}%",
    f"Exact tie:                       {same:.1f}%",
    sep="\n",
)


Headline more NEGATIVE than body: 64.1%
Headline more POSITIVE than body: 35.9%
Exact tie:                       0.0%


In [7]:
# Test whether the headline-minus-body gap is centred on zero, using both a
# parametric (one-sample t) and a non-parametric (Wilcoxon signed-rank) test.
#
# The sentiment columns were coerced with errors="coerce", so sent_gap may
# contain NaN. Both scipy tests propagate NaN by default, which would turn the
# p-values into nan; drop missing gaps explicitly before testing.
gap_valid = df["sent_gap"].dropna()

t_stat, t_p  = stats.ttest_1samp(gap_valid, 0)
w_stat, w_p  = stats.wilcoxon(gap_valid)

print(f"t‑test p‑value:       {t_p:.3g}")
print(f"Wilcoxon p‑value:     {w_p:.3g}")

t‑test p‑value:       2.22e-31
Wilcoxon p‑value:     4.39e-31


In [8]:
TOP_N = 3  # change if you want more


def _print_gap_examples(rows, title, max_chars=400):
    """Print headline, body excerpt and gap for every row of ``rows``.

    Parameters
    ----------
    rows : pandas.DataFrame
        Rows to display; must provide ``headline_text``, ``body_text`` and
        ``sent_gap`` columns.
    title : str
        Text shown inside the ``=== ... ===`` banner above the examples.
    max_chars : int, default 400
        Maximum number of body characters to print per article.
    """
    print(f"\n=== {title} ===")
    for _, row in rows.iterrows():
        body = row["body_text"]
        # A missing body comes back from read_csv as a float NaN, which
        # cannot be sliced; show an empty excerpt instead of raising.
        excerpt = body[:max_chars] if isinstance(body, str) else ""
        print("-" * 80)
        print("Headline:", row["headline_text"])
        print("Body ⬇︎"); print(excerpt, "…")
        print(f"Gap = {row['sent_gap']:.3f}")


# Smallest gaps: headline scores furthest below its body.
_print_gap_examples(
    df.nsmallest(TOP_N, "sent_gap"),
    "Most NEGATIVE headlines relative to body",
)

# Largest gaps: headline scores furthest above its body.
_print_gap_examples(
    df.nlargest(TOP_N, "sent_gap"),
    "Most POSITIVE headlines relative to body",
)



=== Most NEGATIVE headlines relative to body ===
--------------------------------------------------------------------------------
Headline: One Big Loser on Black Friday? Gun Sales
Body ⬇︎
the black friday holiday shopping weekend is one of the best since the recession, possibly rivaling the boom days of the mid 2000s. the most products ordered worldwide than any other day. amazon customers ordered more than 18 million toys and 13 million fashion items on black friday and cyber monday combined …
Gap = -1.859
--------------------------------------------------------------------------------
Headline: NRA supporters are blowing up Yeti coolers. Yeti says it's all a big mistake.
Body ⬇︎
yeti is unwavering in our belief in and commitment to the constitution of the united states and its second amendment. yeti founded more than 10 years ago with a passion for the outdoors, and over the course of our history we have actively and supported hunters, anglers and the broader outdoor community …
Ga

In [None]:
# Persist the frame, now including the sent_gap column, without the index.
# The path previously ended in ".csv.csv" (duplicated extension); use a
# single ".csv" so downstream readers find the file under the expected name.
OUT = "../data/full_articles_with_gap.csv"
df.to_csv(OUT, index=False)
print("Saved with gap column →", OUT)

Saved with gap column → ../data/gvfc_with_gap.csv
