# 🧹 F1 YouTube Data Cleaning

## Notebook 02: Data Cleaning & Preprocessing

This notebook cleans and preprocesses the raw YouTube data:
- Handle missing values
- Parse dates and durations
- Clean text for NLP analysis
- Remove duplicates
- Validate data quality

In [1]:
# Setup and imports
import sys
from pathlib import Path

# Put the parent of the current working directory at the front of sys.path so
# the `src` package imported below can be resolved. This must run before the
# `from src import ...` lines.
# NOTE(review): presumably the notebook is launched from a subfolder of the
# repo root -- confirm.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including pandas deprecation and chained-assignment warnings.
warnings.filterwarnings('ignore')

# Project-local modules: `config` supplies the file paths used below and
# `utils` the parsing / cleaning / metric helpers.
# NOTE(review): the saved output of this cell shows
# "ModuleNotFoundError: No module named 'vaderSentiment'" -- presumably a
# dependency pulled in by `src`; install it before running.
from src import config
from src import utils

print("‚úÖ Imports successful!")

  from pandas.core import (


ModuleNotFoundError: No module named 'vaderSentiment'

## 1. Load Raw Data

In [None]:
# Read the raw video and comment exports from the paths defined in src.config
videos_raw = pd.read_csv(config.VIDEOS_CSV)
comments_raw = pd.read_csv(config.COMMENTS_CSV)

# Report how many rows were loaded and which columns each frame carries
print(
    f"üìä Loaded {len(videos_raw)} videos and {len(comments_raw)} comments",
    f"\nüé¨ Videos columns: {list(videos_raw.columns)}",
    f"üí¨ Comments columns: {list(comments_raw.columns)}",
    sep="\n",
)

## 2. Check Data Quality

In [None]:
# Per-column count of missing values, followed by the row total, for each raw frame
print("üîç Missing Values Analysis")
print("=" * 50)

for _heading, _frame in (
    ("\nüé¨ VIDEOS:", videos_raw),
    ("\nüí¨ COMMENTS:", comments_raw),
):
    print(_heading)
    print(_frame.isna().sum())
    print(f"\nTotal rows: {len(_frame)}")

In [None]:
# Count rows whose ID repeats an earlier row (the first occurrence is not counted)
print("üîç Duplicate Analysis")
print("=" * 50)

video_dupes = videos_raw.duplicated(subset=['video_id'])
print(f"Duplicate videos: {video_dupes.sum()}")

comment_dupes = comments_raw.duplicated(subset=['comment_id'])
print(f"Duplicate comments: {comment_dupes.sum()}")

## 3. Clean Videos DataFrame

In [None]:
# Clean videos
# Work on a copy so `videos_raw` stays untouched and this cell can be re-run.
videos = videos_raw.copy()

# Remove duplicates: keep the first row seen for each video_id
videos = videos.drop_duplicates(subset=['video_id'], keep='first')

# Parse duration to seconds
# NOTE(review): the input format is whatever utils.parse_duration expects
# (presumably the YouTube API's ISO-8601 duration string) -- confirm.
videos['duration_seconds'] = videos['duration'].apply(utils.parse_duration)

# Extract temporal features derived from the publish timestamp column
videos = utils.extract_temporal_features(videos, 'published_at')

# Fill missing titles and descriptions so utils.clean_text always receives a
# string; a missing value would otherwise reach it as float NaN. This matches
# how the comment text columns are filled before cleaning.
videos['title'] = videos['title'].fillna('')
videos['description'] = videos['description'].fillna('')

# Clean title and description text
videos['title_clean'] = videos['title'].apply(utils.clean_text)
videos['description_clean'] = videos['description'].apply(utils.clean_text)

print(f"‚úÖ Cleaned videos: {len(videos)} rows")
videos.info()

## 4. Clean Comments DataFrame

In [None]:
# Clean comments
# Minimum length (in characters of the cleaned text) a comment needs to be kept.
MIN_COMMENT_LENGTH = 3

# Work on a copy so `comments_raw` stays untouched and this cell can be re-run.
comments = comments_raw.copy()

# Remove duplicates: keep the first row seen for each comment_id
comments = comments.drop_duplicates(subset=['comment_id'], keep='first')

# Fill missing text so the cleaning step below always receives a string
comments['text_original'] = comments['text_original'].fillna('')
comments['text_display'] = comments['text_display'].fillna('')

# Extract temporal features derived from the publish timestamp column
comments = utils.extract_temporal_features(comments, 'published_at')

# Clean text for analysis
comments['text_clean'] = comments['text_original'].apply(utils.clean_text)

# Remove very short comments (likely spam or just emojis).
# `.copy()` makes the filtered result an independent frame, so adding columns
# to `comments` later does not trigger chained-assignment warnings (which this
# notebook's global warning filter would hide anyway).
comments['text_length'] = comments['text_clean'].str.len()
comments = comments[comments['text_length'] >= MIN_COMMENT_LENGTH].copy()

print(f"‚úÖ Cleaned comments: {len(comments)} rows")
comments.info()

## 5. Calculate Video Metrics

In [None]:
# Derive per-video engagement rate and controversy index, one row at a time,
# using the metric helpers from src.utils.

def _row_engagement_rate(video_row):
    """Engagement rate for one video row, from its view, like and comment counts."""
    return utils.calculate_engagement_rate(
        video_row['view_count'],
        video_row['like_count'],
        video_row['comment_count'],
    )


def _row_controversy_index(video_row):
    """Controversy index for one video row, from its comment and like counts."""
    return utils.calculate_controversy_index(
        video_row['comment_count'],
        video_row['like_count'],
    )


videos['engagement_rate'] = videos.apply(_row_engagement_rate, axis=1)
videos['controversy_index'] = videos.apply(_row_controversy_index, axis=1)

# Summary of the new metric columns
print(
    "üìä Video Metrics Calculated:",
    f"  Avg Engagement Rate: {videos['engagement_rate'].mean():.4f}%",
    f"  Max Engagement Rate: {videos['engagement_rate'].max():.4f}%",
    f"  Avg Controversy Index: {videos['controversy_index'].mean():.4f}",
    sep="\n",
)

## 6. Save Clean Data

In [None]:
# Save cleaned data
# Create the output directory (and any missing parents); no error if it exists.
config.PROCESSED_DATA_DIR.mkdir(parents=True, exist_ok=True)

videos.to_csv(config.CLEAN_DATASET_CSV, index=False)
print(f"‚úÖ Saved clean videos to: {config.CLEAN_DATASET_CSV}")

# Save comments with video info merged.
# Build the destination once so the write and the log message cannot drift apart.
comments_clean_csv = config.PROCESSED_DATA_DIR / 'comments_clean.csv'

# Left join keeps every cleaned comment, including those whose video_id has no
# match in `videos`. If a merged column name already exists on the comment
# side, the comment column keeps its name and the video column gets `_video`.
# `validate='many_to_one'` makes pandas raise if `video_id` is not unique in
# `videos`, instead of silently duplicating comment rows.
comments_with_video = comments.merge(
    videos[['video_id', 'title', 'view_count']],
    on='video_id',
    how='left',
    suffixes=('', '_video'),
    validate='many_to_one',
)
comments_with_video.to_csv(comments_clean_csv, index=False)
print(f"‚úÖ Saved clean comments to: {comments_clean_csv}")

In [None]:
# Final status line plus a pointer to the next notebook in the pipeline
print(
    "\n‚úÖ Cleaning notebook complete!",
    "‚û°Ô∏è Next: Run 03_feature_eng.ipynb for feature engineering",
    sep="\n",
)