In [2]:
import html
import re

import pandas as pd

# Load the input CSV and echo its (rows, columns) as a quick sanity check.
df = pd.read_csv(filepath_or_buffer="../data/clean_gvfc_sentiment.csv")
print(df.shape)


(1299, 7)


In [3]:
# Pre-compiled patterns used by light_clean(). All of them are applied to
# already-lowercased text, so only lowercase forms need to match.
URL_RE   = re.compile(r'https?://\S+|www\.\S+')
EMAIL_RE = re.compile(r'\S+@\S+')
# Entity-shaped leftovers (named, decimal or hex) that html.unescape() could
# not decode, e.g. a made-up name or the residue of double-escaped markup.
HTML_RE  = re.compile(r'&(?:#x[0-9a-f]+|#[0-9]+|[a-z][a-z0-9]*);')

def light_clean(text: str) -> str:
    """Return a lightly normalised copy of ``text``.

    Steps, in order:
      1. decode HTML character references (``&amp;`` -> ``&``, ``&#39;`` -> ``'``,
         ``&nbsp;`` -> non-breaking space),
      2. lowercase,
      3. remove URLs,
      4. remove e-mail-like tokens,
      5. replace any entity-shaped leftovers with a space,
      6. collapse every whitespace run to a single space and strip the ends.

    Args:
        text: The raw string. Any non-``str`` value (``None``, ``NaN``, numbers)
            is treated as missing.

    Returns:
        The cleaned string, or ``""`` when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""
    # Decode before anything else: entity names are case-sensitive, and decoding
    # "&nbsp;" first turns it into whitespace so it ends a neighbouring URL
    # instead of being swallowed as part of it.
    text = html.unescape(text)                # 1) decode HTML entities
    text = text.lower()                       # 2) lowercase
    text = URL_RE.sub('', text)               # 3) drop URLs
    text = EMAIL_RE.sub('', text)             # 4) drop e-mails
    # Substitute a space, not '', so words on either side are not glued together.
    text = HTML_RE.sub(' ', text)             # 5) drop undecodable entity leftovers
    text = re.sub(r'\s+', ' ', text).strip()  # 6) collapse whitespace (incl. nbsp)
    return text


In [4]:
# Derive a cleaned companion column for each raw text column.
for raw_col, clean_col in (('headline_text', 'headline_clean'),
                           ('body_text', 'body_clean')):
    df[clean_col] = df[raw_col].apply(light_clean)


In [5]:
# Spot-check one reproducibly sampled row: raw text next to its cleaned version.
pair = df.sample(n=1, random_state=42).iloc[0]

print("RAW HEADLINE →", pair['headline_text'])
print("CLEANED      →", pair['headline_clean'], end="\n\n")  # blank line before the body

# Bodies are long, so only the first 160 characters of each are shown.
print("RAW BODY (first 160 chars) →", pair['body_text'][:160])
print("CLEANED      →", pair['body_clean'][:160])

RAW HEADLINE → Cincinnati Reds to hold Moment of Silence for Fifth Third Center shooting victims
CLEANED      → cincinnati reds to hold moment of silence for fifth third center shooting victims

RAW BODY (first 160 chars) → the reds plan to honor victims of the mass shooting at great american ball parkagainst the san diego padres. the reds announced on thursday that they will honor
CLEANED      → the reds plan to honor victims of the mass shooting at great american ball parkagainst the san diego padres. the reds announced on thursday that they will honor


In [6]:
# Write the current frame to a new CSV (row index omitted) and confirm the path.
CLEAN_V2 = "../data/clean_gvfc_sentiment_v2.csv"
df.to_csv(path_or_buf=CLEAN_V2, index=False)
print("Saved →", CLEAN_V2)

Saved → ../data/clean_gvfc_sentiment_v2.csv
