# 🔧 F1 YouTube Feature Engineering

## Notebook 03: Feature Engineering & Analytics Preparation

This notebook creates all features needed for descriptive analytics:
- **Sentiment Analysis**: VADER scores for all comments
- **Driver Detection**: Which drivers are mentioned in each comment/video
- **Team Detection**: Which teams are mentioned
- **Rivalry Detection**: Identify comments mentioning rival pairs
- **Topic Classification**: Categorize content themes *(planned; no cell in this notebook implements it yet)*

In [None]:
# Setup and imports
import sys
from pathlib import Path

# Put the parent of the current working directory first on the import path so
# that the `src` package imported below can be resolved. This has to run before
# the `from src import ...` lines.
# NOTE(review): presumably the notebook is launched from a subfolder of the
# project root (so cwd().parent is the root) — confirm.
sys.path.insert(0, str(Path.cwd().parent))

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
import warnings
# NOTE(review): this suppresses every warning category for the rest of the
# kernel session, not just noisy ones; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Project-local modules: `config` provides the input/output paths and `utils`
# provides the sentiment and entity-detection helpers called in later cells.
from src import config
from src import utils

print("‚úÖ Imports successful!")

## 1. Load Cleaned Data

In [None]:
# Read the cleaned video and comment tables from the locations defined in `config`
videos = pd.read_csv(config.CLEAN_DATASET_CSV)

clean_comments_path = config.PROCESSED_DATA_DIR / 'comments_clean.csv'
comments = pd.read_csv(clean_comments_path)

# Report how many rows each table contains
n_videos, n_comments = len(videos), len(comments)
print(f"üìä Loaded {n_videos} videos and {n_comments} comments")

## 2. Sentiment Analysis

Analyze sentiment for all comments using VADER (optimized for social media text).

In [None]:
# Score every comment's sentiment with the project helper
print("üé≠ Analyzing sentiment for all comments...")

# Sentiment is computed on `text_original`; missing values become empty strings
comment_texts = comments['text_original'].fillna('').tolist()
sentiment_scores = utils.batch_sentiment_analysis(comment_texts, show_progress=True)

# Attach the helper's `compound` and `label` outputs to the comments table
comments['sentiment_compound'] = sentiment_scores['compound']
comments['sentiment_label'] = sentiment_scores['label']

# Report the per-label counts, then the mean compound score
print("\nüìä Sentiment Distribution:")
label_counts = comments['sentiment_label'].value_counts()
print(label_counts)
mean_compound = comments['sentiment_compound'].mean()
print(f"\nAverage sentiment score: {mean_compound:.3f}")

## 3. Driver Detection

Identify which F1 drivers are mentioned in comments and video titles.

In [None]:
# Detect drivers in comments
print("üèéÔ∏è Detecting drivers mentioned in comments...")

# Non-string cells (e.g. NaN read from the CSV) are mapped to an empty list so
# that `len` can be applied to every value of the column below.
comments['drivers_mentioned'] = comments['text_original'].apply(
    lambda x: utils.detect_drivers(x) if isinstance(x, str) else []
)

# Convert list to string for CSV storage
comments['drivers_mentioned_str'] = comments['drivers_mentioned'].apply(utils.list_to_string)

# Count how many comments mention at least one driver
comments_with_drivers = comments[comments['drivers_mentioned'].apply(len) > 0]

# Guard the share calculation: with an empty comments table the plain division
# would raise ZeroDivisionError and stop the notebook before anything is saved.
driver_comment_pct = len(comments_with_drivers) / len(comments) * 100 if len(comments) else 0.0
print(f"\n‚úÖ {len(comments_with_drivers)} comments ({driver_comment_pct:.1f}%) mention at least one driver")

In [None]:
# Detect drivers in video titles
print("üèéÔ∏è Detecting drivers mentioned in video titles...")

# Non-string titles (e.g. NaN) are mapped to an empty list so that `len` can be
# applied to every value of the column below.
videos['drivers_in_title'] = videos['title'].apply(
    lambda x: utils.detect_drivers(x) if isinstance(x, str) else []
)
# String form of the list column, kept for CSV storage
videos['drivers_in_title_str'] = videos['drivers_in_title'].apply(utils.list_to_string)

# Keep the videos whose title mentions at least one driver
videos_with_drivers = videos[videos['drivers_in_title'].apply(len) > 0]

# Guard the share calculation: with an empty videos table the plain division
# would raise ZeroDivisionError and stop the notebook before anything is saved.
driver_title_pct = len(videos_with_drivers) / len(videos) * 100 if len(videos) else 0.0
print(f"\n‚úÖ {len(videos_with_drivers)} videos ({driver_title_pct:.1f}%) mention a driver in title")

## 4. Team Detection

In [None]:
# Detect teams in comments
print("üèÅ Detecting teams mentioned in comments...")

# Non-string cells (e.g. NaN read from the CSV) are mapped to an empty list so
# that `len` can be applied to every value of the column below.
comments['teams_mentioned'] = comments['text_original'].apply(
    lambda x: utils.detect_teams(x) if isinstance(x, str) else []
)
# String form of the list column, kept for CSV storage
comments['teams_mentioned_str'] = comments['teams_mentioned'].apply(utils.list_to_string)

# Keep the comments that mention at least one team
comments_with_teams = comments[comments['teams_mentioned'].apply(len) > 0]

# Guard the share calculation: with an empty comments table the plain division
# would raise ZeroDivisionError and stop the notebook before anything is saved.
team_comment_pct = len(comments_with_teams) / len(comments) * 100 if len(comments) else 0.0
print(f"\n‚úÖ {len(comments_with_teams)} comments ({team_comment_pct:.1f}%) mention at least one team")

## 5. Rivalry Detection

Identify comments that mention both drivers from a rivalry pair.

In [None]:
# Flag comments that reference a rivalry pair
print("‚öîÔ∏è Detecting rivalry mentions in comments...")


def _join_rivalries(pairs):
    """Render detected pairs as 'AvsB|CvsD'; a falsy value yields ''."""
    if not pairs:
        return ''
    return '|'.join(f"{pair[0]}vs{pair[1]}" for pair in pairs)


# Non-string cells are mapped to an empty list instead of being passed to the detector
comments['rivalries_detected'] = comments['text_original'].apply(
    lambda text: utils.detect_rivalries(text) if isinstance(text, str) else []
)

# Flatten each list of pairs into a single string for storage
comments['rivalries_detected_str'] = comments['rivalries_detected'].apply(_join_rivalries)

# Keep the comments with at least one detected pair
has_rivalry = comments['rivalries_detected'].apply(len) > 0
comments_with_rivalries = comments[has_rivalry]
print(f"\n‚úÖ {len(comments_with_rivalries)} comments mention rivalry pairs")

## 6. Save Feature-Engineered Data

In [None]:
# Build the frames to persist: the list-valued working columns are dropped and
# their string counterparts are kept. `errors='ignore'` skips any listed column
# that is not present.
comment_list_columns = ['drivers_mentioned', 'teams_mentioned', 'rivalries_detected']
comments_final = comments.drop(columns=comment_list_columns, errors='ignore')
videos_final = videos.drop(columns=['drivers_in_title'], errors='ignore')

# Write both tables to CSV without the index column
videos_final.to_csv(config.FEATURES_CSV, index=False)
comments_features_path = config.PROCESSED_DATA_DIR / 'comments_with_features.csv'
comments_final.to_csv(comments_features_path, index=False)

# Confirm where each file was written
print(f"‚úÖ Saved feature-engineered videos to: {config.FEATURES_CSV}")
print(f"‚úÖ Saved feature-engineered comments to: {comments_features_path}")

In [None]:
# Print a recap of the columns added to each saved table
comment_feature_notes = (
    "   - sentiment_compound: VADER compound score (-1 to +1)",
    "   - sentiment_label: positive/neutral/negative",
    "   - drivers_mentioned_str: pipe-separated driver keys",
    "   - teams_mentioned_str: pipe-separated team keys",
    "   - rivalries_detected_str: detected rivalry pairs",
)

print("\nüìä FEATURE ENGINEERING SUMMARY")
print("=" * 60)

# Video table: row count and its single new column
print(f"\nüé¨ VIDEOS: {len(videos_final)} rows")
print("   New features: drivers_in_title_str")

# Comment table: row count followed by one line per new column
print(f"\nüí¨ COMMENTS: {len(comments_final)} rows")
print("   New features:")
for note in comment_feature_notes:
    print(note)

print("\n‚úÖ Feature engineering complete!")
print("‚û°Ô∏è Next: Run 04_eda_vis.ipynb for analysis and visualizations")