In [None]:
# ---------------------------------------------
# Step 1: Load libraries & dataset
# ---------------------------------------------
import pandas as pd
import numpy as np
import re
from tqdm import tqdm
from urllib.parse import urlparse
from google.colab import drive

# Read the project parquet file straight from its public bucket URL
DATA_URL = 'https://storage.googleapis.com/msca-bdp-data-open/news_final_project/news_final_project.parquet'
df = pd.read_parquet(DATA_URL, engine='pyarrow')

# Report the (rows, columns) shape as a quick sanity check
print(f"Loaded: {df.shape}")

Loaded: (200760, 5)


In [None]:
# Mount Google Drive at /content/drive (uses `drive` imported from google.colab);
# the final CSV export writes to a path under this mount point.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Lowercase both free-text columns (missing values stay missing)
for col in ('text', 'title'):
    df[col] = df[col].str.lower()

In [None]:
# ---------------------------------------------
# Step 2: Drop clearly bad rows (nulls / short text)
# ---------------------------------------------
# Keep only rows that have both a text and a title.
# .copy() detaches the filtered frame from the original so the column
# assignments below write to an independent object instead of a slice
# (avoids pandas' SettingWithCopyWarning / ambiguous chained assignment).
df = df[df['text'].notna() & df['title'].notna()].copy()

# Character counts used by the length filters below
df['text_len'] = df['text'].str.len()
df['title_len'] = df['title'].str.len()

# Remove short or empty articles (text < 800 chars or title < 10 chars)
df = df[(df['text_len'] >= 800) & (df['title_len'] >= 10)]
print(f"After removing too-short text/title: {df.shape}")

# Remove very long outlier articles (>35K chars)
df = df[df['text_len'] <= 35000]
print(f"After removing too-long text: {df.shape}")

# Remove overly long titles (>200 chars)
df = df[df['title_len'] <= 200]
print(f"After removing too-long titles: {df.shape}")

After removing too-short text/title: (199308, 7)
After removing too-long text: (197502, 7)
After removing too-long titles: (196524, 7)


In [None]:
# ---------------------------------------------
# Step 3: Drop duplicate titles
# ---------------------------------------------
# Keep the first article for every distinct title, discard later repeats
repeated_title = df.duplicated(subset=['title'])
df = df[~repeated_title]
print(f"After dropping duplicate titles: {df.shape}")

After dropping duplicate titles: (155347, 7)


In [None]:
# ---------------------------------------------
# Step 4: Flag and filter spammy sources
# ---------------------------------------------

# --- Spam domains (image hosts, low-quality content farms) ---
spam_domains = [
    'shutterstock.com', 'rawpixel.com', 'adobe.com', 'gettyimages.com', 'depositphotos.com',
    'dreamstime.com', '123rf.com', 'canva.com', 'pixabay.com', 'buzzfeed.com', 'slideshare.net'
]

def is_spam_domain(url):
    """Return True when the URL's host is one of ``spam_domains`` or a subdomain of it.

    The check is done on the parsed hostname rather than on the raw URL
    string, so a listed domain that merely appears in the path/query, or a
    different host that happens to end with the same characters
    (e.g. ``notadobe.com``), is not flagged.

    Args:
        url: Any value; it is converted with ``str()`` and lowercased.

    Returns:
        bool: True for a spam host, False otherwise (including unparseable input).
    """
    raw = str(url).strip().lower()
    try:
        parsed = urlparse(raw)
        # urlparse only fills in the host when a scheme or a leading '//' is
        # present; retry scheme-less inputs such as 'example.com/page'.
        if not parsed.netloc:
            parsed = urlparse('//' + raw)
        host = (parsed.hostname or '').rstrip('.')
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): treat as not spam
        return False
    return any(host == domain or host.endswith('.' + domain) for domain in spam_domains)

# Boolean flag per row: does the article URL point at a known spam domain?
df['spam_domain'] = df['url'].map(is_spam_domain)

# --- Spam TLDs ---
spam_tlds = ['.ga', '.cf', '.tk', '.ml', '.biz', '.info']

def has_spam_tld(url):
    """Return True when the URL's hostname ends with one of ``spam_tlds``.

    The TLD is checked on the parsed hostname only. Testing the raw URL
    string would look at the end of the path instead of the host, so real
    spam hosts with a path would be missed and paths that happen to end in
    e.g. ``.info`` would be flagged.

    Args:
        url: Any value; it is converted with ``str()`` and lowercased.

    Returns:
        bool: True for a spam TLD, False otherwise (including unparseable input).
    """
    raw = str(url).strip().lower()
    try:
        parsed = urlparse(raw)
        # urlparse only fills in the host when a scheme or a leading '//' is
        # present; retry scheme-less inputs such as 'example.tk/page'.
        if not parsed.netloc:
            parsed = urlparse('//' + raw)
        # .hostname drops any port / userinfo; strip a trailing root dot
        host = (parsed.hostname or '').rstrip('.')
    except ValueError:
        # Malformed URL (e.g. broken IPv6 literal): treat as not spam
        return False
    return host.endswith(tuple(spam_tlds))

# Boolean flag per row: does the article URL use a spam-prone TLD?
df['spam_tld'] = df['url'].map(has_spam_tld)

# Filter out spam: a row is dropped when either flag is set
is_spam = df['spam_domain'] | df['spam_tld']
df = df[~is_spam]
print(f"After spam URL filtering: {df.shape}")

After spam URL filtering: (147260, 9)


In [None]:
# ---------------------------------------------
# Step 5: Filter to AI-related articles
# ---------------------------------------------

ai_keywords = [
    # General AI terms
    'ai', 'artificial intelligence', 'machine learning', 'deep learning', 'neural network',
    'natural language processing', 'nlp', 'computer vision', 'supervised learning',
    'unsupervised learning', 'reinforcement learning', 'transfer learning',
    'large language model', 'llm', 'language model',

    # Generative AI and modern models
    'generative', 'generative ai', 'diffusion model', 'gpt', 'chatgpt', 'openai',
    'bard', 'gemini', 'claude', 'mistral', 'llama', 'mixtral', 'stable diffusion',
    'dall-e', 'image generation', 'text-to-image', 'text-to-video', 'sora',

    # Automation and productivity
    'automation', 'robotic process automation', 'rpa', 'autonomous', 'autonomous vehicles',
    'automated workflow', 'intelligent automation', 'process automation',

    # Applied AI domains
    'predictive analytics', 'forecasting model', 'personalized recommendation',
    'computer vision', 'facial recognition', 'object detection', 'voice recognition',
    'speech-to-text', 'text-to-speech',

    # Company and platform mentions
    'openai', 'anthropic', 'mistral', 'meta ai', 'google ai', 'microsoft ai', 'aws ai',
    'deepmind', 'huggingface', 'cohere', 'nvidia ai', 'ibm watson',

    # Business impact & terms
    'ai transformation', 'ai-powered', 'ai assistant', 'intelligent system',
    'ai integration', 'ai adoption', 'ai deployment', 'data-driven', 'data science',
    'intelligent analytics', 'ai use case', 'enterprise ai', 'ai in healthcare',
    'ai in finance', 'ai in retail', 'ai in education', 'ai in manufacturing',

    # Safety, ethics, and policy
    'ai alignment', 'ai regulation', 'ai ethics', 'responsible ai', 'ai safety',
    'hallucination', 'bias in ai', 'trustworthy ai', 'explainable ai', 'xai',

    # Programming / tooling references
    'transformer model', 'attention mechanism', 'fine-tuning', 'prompt engineering',
    'embedding model', 'vector search', 'retrieval augmented generation', 'rag',
    'few-shot learning', 'zero-shot learning'
]

# Compile keyword pattern.
# The alternation is anchored with word boundaries so that short keywords are
# matched as whole words only: without them 'ai' matches "said"/"again" and
# 'rag' matches "storage", which lets virtually every article through.
# A trailing optional 's' keeps simple plurals ("language models", "llms").
# dict.fromkeys drops the repeated entries in the list while keeping order.
# The group is non-capturing so pandas' str.contains does not warn about groups.
pattern_ai = (
    r'\b(?:'
    + '|'.join(map(re.escape, dict.fromkeys(ai_keywords)))
    + r')s?\b'
)

# Keep only the rows whose full text matches at least one AI keyword
ai_mask = df['text'].str.contains(pattern_ai, case=False, na=False)
df = df[ai_mask]

print(f"After AI keyword filtering: {df.shape}")

After AI keyword filtering: (147247, 9)


In [None]:
# Drop metadata and helper columns that are no longer needed downstream.
# Reassigning the result (instead of inplace=True) avoids mutating a frame that
# was produced by boolean filtering, which can trigger pandas'
# chained-assignment warning, and keeps the step consistent with the
# `df = ...` style used by the other cells.
df = df.drop(columns=['language', 'spam_domain', 'spam_tld', 'url', 'text_len', 'title_len'])

In [None]:
# ---------------------------------------------
# Step 6: Quick cleaning + spam phrase removal + non-ASCII removal
# ---------------------------------------------

def quick_clean(df):
    """Normalize the ``title`` and ``text`` columns of ``df``.

    For each of the two columns the values are cast to ``str`` and then:
    HTML tags and URLs are replaced by a space, boilerplate phrases
    (credits, cookie banners, calls to action, stock-photo sites) are removed,
    non-ASCII characters are dropped, whitespace is collapsed and trimmed,
    and the result is lowercased.

    Args:
        df: DataFrame with ``title`` and ``text`` columns. The columns are
            overwritten on the object passed in; pass a copy to keep the
            original values.

    Returns:
        The same DataFrame object, with both columns cleaned.
    """
    url_pattern = r'https?://\S+|www\.\S+'
    html_pattern = r'<[^>]+>'

    spammy_phrases = [
        'image by', 'photo by', 'photographer', 'image credit', 'copyright',
        'all rights reserved', 'stock photo', 'premium photo', 'advertisement',
        'advertising', 'subscribe', 'cookie policy', 'cookie consent',
        'privacy policy', 'terms of service', 'contact us', 'sign up',
        'login', 'log in', 'download', 'click here', 'sponsored', 'newsletter',
        'buy now', 'shop now', 'freepik', 'shutterstock', 'adobe stock',
        'rawpixel', 'depositphotos', 'dreamstime'
    ]

    # Word boundaries make the phrases match as whole words only; as plain
    # substrings 'log in' would eat part of "blog in" and 'sign up' part of
    # "design up", corrupting ordinary sentences.
    spam_pattern = re.compile(
        r'\b(?:' + '|'.join(map(re.escape, spammy_phrases)) + r')\b',
        flags=re.IGNORECASE,
    )

    for col in ['title', 'text']:
        # Convert to string in case of missing or non-string values
        s = df[col].astype(str)

        # Remove HTML tags
        s = s.str.replace(html_pattern, ' ', regex=True)
        # Remove URLs
        s = s.str.replace(url_pattern, ' ', regex=True)
        # Remove spammy phrases (replace with a space so neighbours never fuse)
        s = s.str.replace(spam_pattern, ' ', regex=True)
        # Remove non-ASCII characters. This runs before the whitespace steps
        # because dropping a character such as an em dash can leave a double
        # space or a leading/trailing space behind.
        s = s.str.encode('ascii', errors='ignore').str.decode('ascii')
        # Collapse whitespace
        s = s.str.replace(r'\s+', ' ', regex=True)
        # Strip leading/trailing spaces
        s = s.str.strip()
        # Lowercase
        s = s.str.lower()

        df[col] = s

    return df

# Apply the cleaning
def batch_quick_clean(df, batch_size=10000):
    """Run ``quick_clean`` over ``df`` in row batches, showing a tqdm progress bar.

    Each batch is a copy of a contiguous row slice, so the frame passed in is
    not modified.

    Args:
        df: DataFrame to clean; must satisfy the requirements of ``quick_clean``.
        batch_size: Number of rows per batch; must be a positive integer.

    Returns:
        A new DataFrame with the cleaned batches concatenated in their original
        order and a fresh 0..n-1 index.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    # pd.concat raises on an empty list, so handle a frame without rows directly
    if len(df) == 0:
        return quick_clean(df.copy()).reset_index(drop=True)

    cleaned_batches = [
        quick_clean(df.iloc[start:start + batch_size].copy())
        for start in tqdm(range(0, len(df), batch_size))
    ]
    return pd.concat(cleaned_batches, ignore_index=True)

# Clean the whole frame in batches (default batch size) and rebind `df` to the
# concatenated result, whose index is renumbered from 0.
df = batch_quick_clean(df)

100%|██████████| 15/15 [18:27<00:00, 73.81s/it]


In [None]:
# Persist the cleaned articles as CSV on the mounted Google Drive,
# leaving the DataFrame index out of the file
output_path = '/content/drive/MyDrive/cleaned_ai_articles.csv'
df.to_csv(path_or_buf=output_path, index=False)