# Data cleaning
### Prerequisites

In [1]:
import pandas as pd
import re
import spacy

In [2]:
# Load spaCy's small English pipeline; `nlp` is called by the text-cleaning functions
nlp = spacy.load("en_core_web_sm")

### Load the datasets

In [3]:
# Read the article data from CSV into a DataFrame
df_article = pd.read_csv("vogue_article.csv")

In [None]:
# Read the comment data from CSV into a DataFrame
df_comments = pd.read_csv("youtube_comments.csv")

### Preprocessing
#### *Part I: The Vogue article*

In [4]:
# Parse the publication timestamp and keep only its calendar date (time dropped)
df_article['pub_date'] = pd.to_datetime(df_article['pub_date']).dt.date

In [5]:
# Function to clean the text in the article
def clean_article_text(text):
    """Clean and lemmatise article text while keeping sentence boundaries.

    Steps: strip URLs, collapse whitespace, run the spaCy pipeline ``nlp``
    and replace each token by its lemma. Punctuation is dropped except for
    the sentence-ending marks ``.``, ``!`` and ``?``. Stopwords are kept.

    Args:
        text: Raw article text. Missing values (``None``/``NaN``) yield an
            empty string; other non-string values are converted with ``str``.

    Returns:
        str: The kept lemmas and sentence-ending marks joined by single spaces.
    """
    if not isinstance(text, str):
        # str(NaN) would otherwise be lemmatised as the literal word "nan"
        if pd.isna(text):
            return ""
        text = str(text)

    # Remove URLs *before* collapsing whitespace: deleting a URL leaves two
    # adjacent spaces, which spaCy would emit as a separate whitespace token
    # and which would then leak extra spaces into the joined output.
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    sentence_end = ('.', '!', '?')
    lemmas = []

    for token in nlp(text):
        if token.text in sentence_end:
            # Keep sentence-ending punctuation verbatim
            lemmas.append(token.text)
        elif not token.is_punct:
            lemmas.append(token.lemma_)  # lemmatization
        # Note: stopwords are not removed

    return " ".join(lemmas)

# Write the cleaned text to a new column; the original "text" column is left as is
df_article["clean_text"] = df_article["text"].apply(clean_article_text)

#### *Part II: YouTube comments*

In [None]:
# Parse both timestamp columns and keep only the calendar date (time dropped)
for date_col in ('video_date', 'comment_date'):
    df_comments[date_col] = pd.to_datetime(df_comments[date_col]).dt.date

In [None]:
def clean_text_spacy(text):
    """Normalise a comment and return its non-stopword lemmas.

    The text is lower-cased, a leading ``m:ss``/``mm:ss`` timestamp is
    removed, newlines and URLs are stripped, every character that is not an
    ASCII letter, digit or whitespace is deleted, and whitespace is
    collapsed. The result is run through the spaCy pipeline ``nlp``; tokens
    flagged as stopwords are dropped and the rest are replaced by their lemma.

    Args:
        text: Raw comment. Missing values (``None``/``NaN``) yield an empty
            string; other non-string values are converted with ``str``.

    Returns:
        str: The remaining lemmas joined by single spaces.
    """
    if not isinstance(text, str):
        # Without this guard a missing comment would become the token "nan"
        if pd.isna(text):
            return ""
        text = str(text)

    text = text.lower()
    text = re.sub(r'^\d{1,2}:\d{2}\s*', '', text)   # Remove timestamps at the start (e.g., "0:23")
    text = re.sub(r'\n', ' ', text)                 # remove newlines
    text = re.sub(r'http\S+', '', text)             # remove URLs
    text = re.sub(r'[^a-z0-9\s]', '', text)         # remove punctuation/special chars (text is already lower-case)
    text = re.sub(r'\s+', ' ', text).strip()        # remove extra spaces

    # Drop stopwords and lemmatise what is left
    tokens = [token.lemma_ for token in nlp(text) if not token.is_stop]

    return " ".join(tokens)

# Write the cleaned comment to a new column; the original 'comment' column is left as is
df_comments['clean_comment'] = df_comments['comment'].apply(clean_text_spacy)

### Descriptive analysis
#### *Part I: The Vogue article*

In [6]:
# Display the article frame, including the new clean_text column
df_article

Unnamed: 0,platform,article_url,title,pub_date,author,text,clean_text
0,Vogue,https://www.vogue.com/article/is-having-a-boyf...,Is Having a Boyfriend Embarrassing Now?,2025-10-29,Chanté Joseph,If someone so much as says “my boyf–” on socia...,if someone so much as say my boyf on social me...


#### *Part II: YouTube comments*

In [None]:
# Gather the summary figures first, then report them
n_videos = df_comments['video_id'].nunique()   # distinct videos
n_comments = len(df_comments)                  # one row per comment
first_day = df_comments['comment_date'].min()
last_day = df_comments['comment_date'].max()

print(f"Number of videos: {n_videos}")
print(f"Total comments: {n_comments}")
print(f"Comment date range: {first_day} to {last_day}")

In [None]:
# List the column names of the comments frame
df_comments.columns

In [None]:
# Optional display settings for reading whole comments instead of truncated ones:
#   pd.set_option("display.max_colwidth", None)  -> turn ON full text
#   pd.reset_option("display.max_colwidth")      -> turn OFF full text (restore default)
df_comments['clean_comment'].head()

### Save to CSV

In [7]:
# Save the article frame (with clean_text) to CSV; index=False omits the row index
df_article.to_csv("vogue_article_clean.csv", index=False)

In [None]:
# Save the comments frame (with clean_comment) to CSV; index=False omits the row index
df_comments.to_csv("youtube_comments_clean.csv", index=False)