# Data Processing & Feature Engineering Demo

This notebook demonstrates how to clean data and create features for machine learning.

## What you'll learn:
- Data cleaning and validation
- Feature engineering for game predictions
- Feature engineering for player predictions
- Creating train/validation/test datasets

In [None]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Put the parent directory on the import path so the `src.*` imports below
# resolve (assumes the notebook is run from a folder directly under the
# project root — TODO confirm).
sys.path.append(os.path.abspath('..'))

from src.data_processing.cleaning import DataCleaner
from src.data_processing.game_features import GameFeatureEngineer
from src.data_processing.player_features import PlayerFeatureEngineer
from src.data_processing.dataset_builder import DatasetBuilder
from src.utils.data_loader import load_games_as_dataframe, load_player_stats_as_dataframe

# Plot styling and inline rendering for every figure in this notebook.
# NOTE(review): the 'seaborn-v0_8' style name only exists in newer matplotlib
# releases (older ones call it 'seaborn') — confirm against the pinned version.
plt.style.use('seaborn-v0_8')
%matplotlib inline

## 1. Data Cleaning

First, let's create some sample raw data (including a deliberately invalid score) and clean it.

In [None]:
# Hand-written games table for the cleaning demo.
# Row 3 carries a negative home score (-5) on purpose so the validator has
# something to flag; row 4 is the only game whose status is not 'Final'.
sample_columns = {
    'id': list(range(1, 6)),
    'date': pd.date_range('2023-10-01', periods=5),
    'home_team_id': list(range(1, 6)),
    'visitor_team_id': list(range(6, 11)),
    'home_team_score': [105, 110, -5, 98, 120],
    'visitor_team_score': [100, 105, 95, 102, 115],
    'status': ['Final', 'Final', 'Final', 'In Progress', 'Final'],
}
sample_games = pd.DataFrame(sample_columns)

print("Raw game data:", sample_games, sep="\n")

### Validate the data

In [None]:
# Run the validator over the raw sample and print the three report fields
# this demo cares about.
cleaner = DataCleaner()
report = cleaner.validate_game_data(sample_games)

print("Validation Report:")
report_fields = (
    ("Total rows", 'total_rows'),
    ("Issues", 'issues'),
    ("Warnings", 'warnings'),
)
for field_label, field_key in report_fields:
    print(f"{field_label}: {report[field_key]}")

### Clean the data

In [None]:
# Clean the sample and report how many rows were dropped, measured as the
# row-count difference between the input and the cleaned output.
clean_games = cleaner.clean_game_data(sample_games)

print("Cleaned game data:", clean_games, sep="\n")

n_removed = len(sample_games) - len(clean_games)
print(f"\nRemoved {n_removed} invalid rows")

## 2. Game Feature Engineering

Now let's create features for predicting game outcomes.

In [None]:
# Generate 50 synthetic finished games (one per calendar day) for the
# feature-engineering demo. The global NumPy RNG is seeded so the cell
# produces the same table every time it is run.
np.random.seed(42)

dates = pd.date_range('2023-10-01', periods=50, freq='D')
game_data = []

for i, date in enumerate(dates):
    # Draw order (home team, visitor team, home score, visitor score) is
    # kept fixed so the random stream is consumed predictably.
    home_team_id = np.random.choice([1, 2, 3, 4, 5])
    visitor_team_id = np.random.choice([6, 7, 8, 9, 10])
    home_team_score = np.random.randint(90, 120)  # high bound is exclusive
    visitor_team_score = np.random.randint(90, 120)

    # A 'Final' game with equal scores has no winner, which would make a
    # win/loss target such as `home_win` ambiguous. Redraw until they differ.
    while visitor_team_score == home_team_score:
        visitor_team_score = np.random.randint(90, 120)

    game_data.append({
        'id': i,
        'date': date,
        'home_team_id': home_team_id,
        'visitor_team_id': visitor_team_id,
        'home_team_score': home_team_score,
        'visitor_team_score': visitor_team_score,
        'status': 'Final'
    })

games_df = pd.DataFrame(game_data)
print(f"Created {len(games_df)} sample games")

### Create game features

In [None]:
# Build the game-level feature table from the synthetic games and list
# the columns it contains.
engineer = GameFeatureEngineer()
game_features = engineer.create_game_features(games_df)

feature_columns = game_features.columns.tolist()
print(f"Created {len(feature_columns)} features")
print("\nFeature columns:")
print(feature_columns)

print("\nFirst few rows:")
game_features.head()

### Visualize features

In [None]:
# Histogram of four engineered columns on a 2x2 grid.
# Each entry is (column in game_features, panel title); panels are filled
# row by row, so the order here is top-left, top-right, bottom-left,
# bottom-right.
histogram_panels = [
    ('home_win_pct', 'Home Team Win Percentage'),
    ('away_win_pct', 'Away Team Win Percentage'),
    ('home_rest_days', 'Home Team Rest Days'),
    ('home_streak', 'Home Team Win Streak'),
]

fig, axes = plt.subplots(2, 2, figsize=(12, 8))

for panel, (column, panel_title) in zip(axes.flat, histogram_panels):
    panel.hist(game_features[column], bins=20)
    panel.set_title(panel_title)

plt.tight_layout()
plt.show()

## 3. Player Feature Engineering

In [None]:
# Create sample player stats: 5 players x 30 consecutive days = 150 rows.
#
# A dedicated, seeded generator is used instead of the global NumPy RNG so
# this cell yields the same table whether it is run on its own, re-run, or
# run after the earlier cells.
rng = np.random.default_rng(42)


def _draw(low, high):
    """Return one Python int drawn uniformly from [low, high)."""
    return int(rng.integers(low, high))


dates = pd.date_range('2023-10-01', periods=30)
player_stats = []

for player_id in range(1, 6):
    for date in dates:
        # Draw attempts first and cap makes by attempts, so a row can never
        # show more made shots than attempted ones (which would produce
        # shooting percentages above 100%).
        fga = _draw(10, 25)
        fgm = _draw(5, min(15, fga + 1))
        fg3a = _draw(3, 10)
        fg3m = _draw(1, min(5, fg3a + 1))
        fta = _draw(4, 12)
        ftm = _draw(3, min(10, fta + 1))

        # NOTE: 'pts' is still drawn independently of the shooting columns,
        # so it does not add up from made shots; acceptable for a demo.
        player_stats.append({
            'player_id': player_id,
            'game_date': date,
            'pts': _draw(10, 35),
            'ast': _draw(2, 10),
            'reb': _draw(3, 12),
            'stl': _draw(0, 4),
            'blk': _draw(0, 3),
            'fgm': fgm,
            'fga': fga,
            'fg3m': fg3m,
            'fg3a': fg3a,
            'ftm': ftm,
            'fta': fta,
            'min': f"{_draw(20, 40)}:00",
            'turnover': _draw(1, 5)
        })

player_df = pd.DataFrame(player_stats)
print(f"Created {len(player_df)} player stat records")

### Create player features

In [None]:
# Build the per-player feature table and preview the rows for player 1.
player_engineer = PlayerFeatureEngineer()
player_features = player_engineer.create_player_features(player_df)

n_player_features = len(player_features.columns)
print(f"Created {n_player_features} features")
print("\nSample features for one player:")

is_player_1 = player_features['player_id'] == 1
player_features[is_player_1].head()

### Visualize player trends

In [None]:
# Overlay player 1's per-game points with the 5- and 10-game rolling columns.
# Each entry is (column in player_features, keyword arguments for plt.plot).
player_1 = player_features[player_features['player_id'] == 1]
trend_lines = [
    ('pts', {'label': 'Actual Points', 'alpha': 0.5}),
    ('pts_rolling_5', {'label': '5-Game Average'}),
    ('pts_rolling_10', {'label': '10-Game Average'}),
]

plt.figure(figsize=(12, 6))
for column, line_kwargs in trend_lines:
    plt.plot(player_1['game_date'], player_1[column], **line_kwargs)

plt.xlabel('Date')
plt.ylabel('Points')
plt.title('Player 1: Points with Rolling Averages')
plt.legend()
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 4. Creating Train/Validation/Test Datasets

In [None]:
builder = DatasetBuilder()

# Columns handed to the builder as `exclude_columns`.
# NOTE(review): the raw games frame names these 'id', 'visitor_team_id',
# 'home_team_score' and 'visitor_team_score'. Confirm that
# create_game_features renames them to the names below; otherwise the raw
# scores would stay in the feature set and leak the 'home_win' target.
non_feature_columns = ['game_id', 'home_team_id', 'away_team_id', 'home_score', 'away_score']

# Build the game-prediction dataset with a time-based split and scaling on.
dataset = builder.create_dataset(
    df=game_features,
    target_column='home_win',
    date_column='date',
    split_method='time',
    scale_features=True,
    exclude_columns=non_feature_columns,
)

print("Dataset created!")
split_sizes = (
    ("Training", 'X_train'),
    ("Validation", 'X_val'),
    ("Test", 'X_test'),
)
for split_label, split_key in split_sizes:
    print(f"{split_label} samples: {len(dataset[split_key])}")
print(f"\nFeatures: {len(dataset['feature_names'])}")

### Generate dataset report

In [None]:
# Summarise the dataset: every entry of the report's 'summary' mapping,
# followed by the mean of the target in each split.
report = builder.generate_dataset_report(dataset)

print("Dataset Report:")
print("\nSummary:")
for key, value in report['summary'].items():
    print(f"  {key}: {value}")

print("\nTarget Distribution:")
target_stats = report['target_distribution']
for split_label, split_key in (("Train", 'train'), ("Val", 'val'), ("Test", 'test')):
    print(f"  {split_label} mean: {target_stats[split_key]['mean']:.3f}")

### Save the dataset

In [None]:
# Persist the dataset under a name and version for the model-training step.
# NOTE(review): the path in the message below is hard-coded; presumably it
# mirrors where save_dataset writes — confirm against DatasetBuilder.
dataset_name = 'game_predictions'
dataset_version = 'v1'
builder.save_dataset(dataset, name=dataset_name, version=dataset_version)
print("Dataset saved to data/processed/game_predictions/v1/")

## Summary

You've learned how to:
- ✅ Clean and validate data
- ✅ Engineer features for game predictions
- ✅ Engineer features for player predictions
- ✅ Create train/validation/test splits
- ✅ Scale features properly
- ✅ Save datasets for model training

Next steps:
- Start building machine learning models!
- Experiment with different feature combinations
- Try different scaling methods