# 2.2 Clean Article Data
This notebook cleans the article data in preparation for modeling. The ultimate goal is to match cleaned articles to the products based on their provided data.

In [5]:
import json
import pandas as pd
import re
import logging
from datetime import datetime

In [6]:
# Set up logging: emit records at INFO level and above, each formatted as
# "<timestamp> - <level name> - <message>".
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def clean_text(text):
    """Normalize a raw scraped text field into a clean single-line string.

    Steps, in order: collapse whitespace, strip URLs and e-mail addresses,
    drop common web boilerplate phrases (whole words only), cap runs of
    dots/dashes at three characters, then collapse whitespace again.

    Args:
        text: Raw field value. Non-string values are converted with ``str()``.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is falsy or NaN.
    """
    if not text or pd.isna(text):
        return ""

    # Convert to string if not already
    text = str(text)

    # Collapse whitespace up front so the multi-word phrases below still match
    # when the source had line breaks or repeated spaces between the words.
    text = re.sub(r'\s+', ' ', text).strip()

    # Remove URLs and e-mail addresses BEFORE the boilerplate keywords.
    # Otherwise "newsletter@example.com" loses its local part first and the
    # leftover "@example.com" no longer matches the e-mail pattern.
    text = re.sub(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', '', text)
    text = re.sub(r'\S+@\S+', '', text)

    # Remove common web artifacts. The \b anchors restrict matches to whole
    # words so that e.g. "subscribers" is not truncated to "rs".
    text = re.sub(r'\b(?:Cookie Policy|Privacy Policy|Terms of Service)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(?:Subscribe|Newsletter|Advertisement)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\b(?:Share this article|Follow us on)\b', '', text, flags=re.IGNORECASE)

    # Remove excessive punctuation
    text = re.sub(r'[.]{3,}', '...', text)
    text = re.sub(r'[-]{3,}', '---', text)

    # Each removal above can leave a doubled space behind; collapse once more.
    return re.sub(r'\s+', ' ', text).strip()

def clean_article_data(article):
    """Return a cleaned copy of a single article record.

    The ``title``, ``source`` and ``text`` values are passed through
    ``clean_text``; ``date`` and ``link`` are copied as-is. Any key absent
    from ``article`` falls back to an empty string.
    """
    return {
        # Free-text fields are normalized.
        'title': clean_text(article.get('title', '')),
        'source': clean_text(article.get('source', '')),
        # Date and link are carried over untouched.
        'date': article.get('date', ''),
        'link': article.get('link', ''),
        'text': clean_text(article.get('text', '')),
    }

def clean_article_dataset(raw_data):
    """Clean every article in ``raw_data`` via ``clean_article_data``.

    An article that raises while being cleaned is logged and skipped, so one
    bad record does not abort the run. A progress line is logged after every
    50th article.

    Returns:
        list | None: The cleaned article dicts in input order, or ``None``
        (after logging an error) when ``raw_data`` is empty or ``None``.
    """
    if raw_data:
        results = []
        # 1-based position so log messages count articles the way a reader would.
        for position, record in enumerate(raw_data, start=1):
            try:
                results.append(clean_article_data(record))
                if position % 50 == 0:
                    logging.info(f"Cleaned {position}/{len(raw_data)} articles")
            except Exception as exc:
                logging.error(f"Error cleaning article {position}: {str(exc)}")
        return results

    logging.error("No data provided for cleaning")
    return None

def load_raw_data(filepath='./intermediate_data/Scraped_Article_Raw_Data.json'):
    """Load the raw scraped articles from a JSON file.

    Args:
        filepath: Location of the JSON file to read. Defaults to the path
            this function previously had hard-coded, so calls without
            arguments behave as before.

    Returns:
        The parsed JSON content, or ``None`` when the file does not exist or
        cannot be read/parsed (the reason is logged).
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            raw_data = json.load(f)
        logging.info(f"Loaded {len(raw_data)} articles from file")
        return raw_data
    except FileNotFoundError:
        logging.error("Raw data file not found. Please ensure data collection has been completed.")
        return None
    except Exception as e:
        # Covers malformed JSON, permission errors, and similar failures.
        logging.error(f"Error loading raw data: {str(e)}")
        return None

def save_cleaned_data(cleaned_data, filepath='./intermediate_data/Cleaned_Article_Data.json'):
    """Write the cleaned articles to a JSON file.

    Args:
        cleaned_data: The cleaned article records to serialize.
        filepath: Destination file. Defaults to the path this function
            previously had hard-coded, so existing calls behave as before.

    Returns:
        bool: ``True`` when the file was written, ``False`` when there was
        nothing to save or serialization/writing failed (the reason is logged).
    """
    if not cleaned_data:
        logging.error("No cleaned data to save")
        return False

    try:
        # Serialize fully BEFORE opening the file: opening with 'w' truncates
        # it, so a serialization error part-way through a streaming dump
        # would destroy a previously saved, valid output file.
        payload = json.dumps(cleaned_data, indent=2, ensure_ascii=False)
        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(payload)
        return True
    except Exception as e:
        logging.error(f"Error saving cleaned data: {str(e)}")
        return False


### Load, Clean, and Save Article Data

In [7]:
# Run the pipeline: load the raw articles, clean them, and persist the result.
raw_article_data = load_raw_data()

if not raw_article_data:
    print("No raw data available for cleaning")
else:
    cleaned_article_data = clean_article_dataset(raw_article_data)

    # Saving is attempted only when cleaning produced at least one article.
    print(
        "Data cleaned and saved successfully"
        if cleaned_article_data and save_cleaned_data(cleaned_article_data)
        else "Failed to clean or save data"
    )

2025-06-07 21:50:46,886 - INFO - Loaded 30 articles from file


Data cleaned and saved successfully
