In [None]:
# Install required libraries
!pip install pandas numpy scikit-learn scipy matplotlib seaborn -q


## 1. Data Loading and Preparation

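This cell loads the MotoGP data from the first available source: the pickle cache, then the CSV export, then a `motogp_full` variable already defined in the running kernel.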


In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Folder that holds the data cache and the saved model artifacts;
# it is created on first run and reused afterwards.
DATA_DIR = Path("ml_data")
os.makedirs(DATA_DIR, exist_ok=True)

def _save_cache(data, cache_file):
    """Pickle ``data`` to ``cache_file``; a failed write is reported but never fatal."""
    try:
        Path(cache_file).parent.mkdir(parents=True, exist_ok=True)
        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)
    except (OSError, pickle.PicklingError, TypeError, AttributeError) as exc:
        # The data is already in memory; losing the cache only costs a reload later.
        print(f"   Could not write cache {cache_file}: {exc!r}")
        return
    print(f"   Data cached: {cache_file}")


def load_motogp_data(data_dir=None, csv_path=None):
    """
    Load the MotoGP dataset from the first source that is available.

    Sources are tried in this order:
      1. the pickle cache ``<data_dir>/motogp_full_cache.pkl``;
      2. the CSV export at ``csv_path`` (a 'RaceDate' column, if present,
         is parsed to datetime);
      3. a ``motogp_full`` variable in the running IPython kernel namespace.

    Data obtained from source 2 or 3 is written back to the pickle cache.
    An unreadable cache is reported and skipped rather than raised.

    Args:
        data_dir: Directory holding the pickle cache. Defaults to the
            notebook-level ``DATA_DIR``.
        csv_path: CSV file to read. Defaults to
            ``trends_data/motogp_full_data.csv``.

    Returns:
        The loaded data, or None when no source could be read.
    """
    cache_dir = DATA_DIR if data_dir is None else Path(data_dir)
    cache_file = cache_dir / "motogp_full_cache.pkl"
    if csv_path is None:
        csv_file = Path("trends_data") / "motogp_full_data.csv"
    else:
        csv_file = Path(csv_path)

    # 1) Pickle cache
    if cache_file.exists():
        print("‚úÖ Loading cached MotoGP data...")
        try:
            # SECURITY: pickle.load can execute arbitrary code. Only load cache
            # files written by this notebook, never files from untrusted sources.
            with open(cache_file, 'rb') as f:
                data = pickle.load(f)
        except Exception as exc:
            # Unpickling can fail with many exception types (UnpicklingError,
            # EOFError, AttributeError, ImportError, ...); treat all of them as
            # "cache unusable" and fall through to the next source.
            print(f"   Cache unreadable ({exc!r}); trying other sources...")
        else:
            print(f"   Data loaded: {len(data)} rows, {len(data.columns)} columns")
            print(f"   Columns: {list(data.columns[:10])}...")
            return data

    # 2) CSV export
    if csv_file.exists():
        print("‚úÖ Loading MotoGP data from CSV...")
        data = pd.read_csv(csv_file)
        if 'RaceDate' in data.columns:
            data['RaceDate'] = pd.to_datetime(data['RaceDate'])
        print(f"   Data loaded: {len(data)} rows")
        _save_cache(data, cache_file)
        return data

    # 3) `motogp_full` defined by the main notebook in the same kernel
    import sys
    shell = None
    # Only touch IPython when it is already loaded, i.e. we run inside a kernel.
    if 'ipykernel' in sys.modules or 'IPython' in sys.modules:
        try:
            from IPython import get_ipython
            shell = get_ipython()
        except ImportError:
            shell = None

    if shell is not None and 'motogp_full' in shell.user_ns:
        print("‚úÖ Loading motogp_full from main notebook...")
        try:
            data = shell.user_ns['motogp_full'].copy()
        except Exception as exc:
            # Best effort: report why the kernel variable was unusable instead
            # of hiding the failure, then fall through to the "not found" message.
            print(f"   Could not use motogp_full from the kernel: {exc!r}")
        else:
            print(f"   Data loaded: {len(data)} rows")
            _save_cache(data, cache_file)
            return data

    print("‚ö†Ô∏è  No cached data found.")
    print("   Please run the main analysis notebook first to create motogp_full data.")
    print("   Or load the data manually in the cell below.")
    return None

# Fetch the dataset from whichever source is available
motogp_data = load_motogp_data()

# Print a short overview (shape, numbered column list, head) when loading succeeded
if motogp_data is not None:
    column_names = list(motogp_data.columns)
    print("\nüìä Data Preview:")
    print(f"   Shape: {motogp_data.shape}")
    print(f"   Columns ({len(column_names)}):")
    for position, column_name in enumerate(column_names, start=1):
        print(f"     {position}. {column_name}")
    print("\n   First 5 rows:")
    print(motogp_data.head())


### Data Preparation and Feature Engineering

Target variable: a three-class label derived from **Relative Increase %**
- **Low**: < 0% (decrease)
- **Medium**: 0% to < 50% (no change or moderate increase)
- **High**: ≥ 50% (high increase)


In [None]:
def prepare_ml_data(df):
    """
    Prepare MotoGP data for ML.

    Rows are kept only when the target column and every feature column that
    is present in ``df`` hold finite values (no NaN, no +/-inf). The target
    'Relative Increase %' is then binned into three classes:
        Low    : value < 0
        Medium : 0 <= value < 50
        High   : value >= 50

    Args:
        df: motogp_full DataFrame. Must contain 'Relative Increase %';
            feature columns that are missing from it are skipped.

    Returns:
        X: Feature matrix (one column per entry of feature_names)
        y_encoded: Integer class labels produced by the LabelEncoder
        feature_names: Names of the feature columns actually used
        le: Fitted LabelEncoder (maps integer labels back to class names)

    Raises:
        KeyError: if ``df`` has no 'Relative Increase %' column.
        ValueError: if no row survives the cleaning step.
    """
    target_col = 'Relative Increase %'
    if target_col not in df.columns:
        raise KeyError(f"Required column missing: {target_col!r}")

    # Candidate features.
    # NOTE(review): 'Search Before' and 'Trend Difference' look like they may be
    # inputs to 'Relative Increase %'. If the target is computed from them they
    # leak the label into the features -- confirm against the data pipeline.
    feature_cols = [
        'Popularity',
        'Career wins_num',
        'Career podiums_num',
        'Championships_num',
        'Years_active_len',
        'Search Before',
        'Trend Difference'
    ]

    # Use only available columns
    available_features = [col for col in feature_cols if col in df.columns]

    # Clean the target and *all* used features in one pass: infinities are
    # mapped to NaN so a single dropna removes every unusable row.
    required = [target_col] + available_features
    data = df[required].replace([np.inf, -np.inf], np.nan).dropna()
    if data.empty:
        raise ValueError("No rows left after removing missing/infinite values.")

    # Categorize target variable (every value is finite at this point)
    increase = data[target_col].to_numpy()
    labels = np.where(increase < 0, 'Low',
                      np.where(increase < 50, 'Medium', 'High')).astype(object)

    X = data[available_features].values

    # Label encoding
    le = LabelEncoder()
    y_encoded = le.fit_transform(labels)

    print(f"‚úÖ Data prepared:")
    print(f"   Number of samples: {len(X)}")
    print(f"   Number of features: {X.shape[1]}")
    print(f"   Features: {available_features}")
    print(f"   Class distribution:")
    for label, count in zip(le.classes_, np.bincount(y_encoded)):
        print(f"     {label}: {count}")

    return X, y_encoded, available_features, le

# Build features/labels, split and scale -- only possible once the data is loaded
if motogp_data is None:
    print("‚ö†Ô∏è  Data not loaded. Please load the data first.")
else:
    X, y, feature_names, label_encoder = prepare_ml_data(motogp_data)

    # Hold out 20% of the rows for testing, keeping the class proportions
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )

    # Learn the scaling statistics from the training rows only,
    # then apply the same transform to both partitions
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("\n‚úÖ Train/Test split completed:")
    print(f"   Train: {len(X_train)} samples")
    print(f"   Test: {len(X_test)} samples")


## 2. Random Forest Classifier

Main classifier model - Predicts MotoGP fan reaction categories.


In [None]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, X_test, y_train, y_test, 
                        n_estimators=100, max_depth=None, random_state=42):
    """
    Train a Random Forest classifier, print its evaluation and save it.

    Besides its arguments, this function reads the notebook-level variables
    ``feature_names``, ``label_encoder``, ``scaler`` and ``DATA_DIR`` that the
    earlier cells define; they must exist before it is called.

    Args:
        X_train, X_test: Training and test features
        y_train, y_test: Training and test labels (integer-encoded)
        n_estimators: Number of trees
        max_depth: Maximum depth (None = unlimited)
        random_state: Random seed

    Returns:
        model: Trained RandomForestClassifier
        feature_importance: DataFrame with columns 'feature' and 'importance',
            sorted by importance in descending order
    """
    print("üå≤ Training Random Forest...")
    print(f"   Parameters: n_estimators={n_estimators}, max_depth={max_depth}")
    
    # Create model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,
        class_weight='balanced'  # Handle class imbalance
    )
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluate
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    
    print(f"\n‚úÖ Model trained!")
    print(f"   Train Accuracy: {train_acc:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")
    
    # Feature importances
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nüìä Feature Importances:")
    print(feature_importance.to_string(index=False))
    
    # Classification report.
    # Pass the full label set explicitly: without `labels`, the report raises
    # when a class is absent from both y_test and the predictions, because the
    # number of target_names no longer matches the classes it finds.
    class_ids = np.arange(len(label_encoder.classes_))
    print(f"\nüìä Classification Report (Test):")
    print(classification_report(y_test, y_pred_test,
                                labels=class_ids,
                                target_names=label_encoder.classes_,
                                zero_division=0))
    
    # Save model together with the preprocessing objects needed at inference time
    model_file = DATA_DIR / "random_forest_model.pkl"
    with open(model_file, 'wb') as f:
        pickle.dump({
            'model': model,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'feature_names': feature_names
        }, f)
    print(f"\nüíæ Model saved: {model_file}")
    
    return model, feature_importance

# Fit the Random Forest once the scaled train/test matrices exist in this kernel
if not {'X_train_scaled', 'X_test_scaled'} <= set(locals()):
    print("‚ö†Ô∏è  Data not ready. Please run the data preparation cell first.")
else:
    rf_model, feature_importance = train_random_forest(
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        n_estimators=100,
        max_depth=10,
    )


## 3. Decision Tree Classifier

Baseline model - Used for explanation and comparison in the report.


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

def train_decision_tree(X_train, X_test, y_train, y_test,
                       max_depth=5, min_samples_split=10, random_state=42):
    """
    Train a Decision Tree classifier (baseline), print its evaluation,
    optionally plot it, and save it.

    Besides its arguments, this function reads the notebook-level variables
    ``feature_names``, ``label_encoder``, ``scaler`` and ``DATA_DIR`` that the
    earlier cells define; they must exist before it is called.

    Args:
        X_train, X_test: Training and test features
        y_train, y_test: Training and test labels (integer-encoded)
        max_depth: Maximum depth (None = unlimited; the tree is then not plotted)
        min_samples_split: Minimum samples for split
        random_state: Random seed

    Returns:
        model: Trained DecisionTreeClassifier
    """
    print("üå≥ Training Decision Tree...")
    print(f"   Parameters: max_depth={max_depth}, min_samples_split={min_samples_split}")
    
    # Create model
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
        class_weight='balanced'
    )
    
    # Train
    model.fit(X_train, y_train)
    
    # Predict
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluate
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    
    print(f"\n‚úÖ Model trained!")
    print(f"   Train Accuracy: {train_acc:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")
    
    # Classification report.
    # Pass the full label set explicitly: without `labels`, the report raises
    # when a class is absent from both y_test and the predictions, because the
    # number of target_names no longer matches the classes it finds.
    class_ids = np.arange(len(label_encoder.classes_))
    print(f"\nüìä Classification Report (Test):")
    print(classification_report(y_test, y_pred_test,
                                labels=class_ids,
                                target_names=label_encoder.classes_,
                                zero_division=0))
    
    # Tree visualization (for small depth).
    # `max_depth=None` is a valid setting, so guard the comparison against it.
    if max_depth is not None and max_depth <= 5:
        # Name exactly the classes the tree was fitted on, in its own order, and
        # hand them over as a plain list (some scikit-learn releases reject arrays).
        class_names = [str(name) for name in
                       label_encoder.inverse_transform(model.classes_)]
        plt.figure(figsize=(20, 10))
        plot_tree(model, 
                  feature_names=list(feature_names),
                  class_names=class_names,
                  filled=True,
                  rounded=True,
                  fontsize=10)
        plt.title("Decision Tree Visualization", fontsize=16)
        plt.tight_layout()
        plt.savefig(DATA_DIR / "decision_tree_visualization.png", dpi=300, bbox_inches='tight')
        print(f"\nüìä Tree visualization saved: {DATA_DIR / 'decision_tree_visualization.png'}")
        plt.show()
    
    # Save model together with the preprocessing objects needed at inference time
    model_file = DATA_DIR / "decision_tree_model.pkl"
    with open(model_file, 'wb') as f:
        pickle.dump({
            'model': model,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'feature_names': feature_names
        }, f)
    print(f"\nüíæ Model saved: {model_file}")
    
    return model

# Fit the baseline Decision Tree once the scaled train/test matrices exist in this kernel
if not {'X_train_scaled', 'X_test_scaled'} <= set(locals()):
    print("‚ö†Ô∏è  Data not ready. Please run the data preparation cell first.")
else:
    dt_model = train_decision_tree(
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        max_depth=5,
        min_samples_split=10,
    )


## 4. Model Comparison

Performance comparison between Random Forest and Decision Tree models


In [None]:
def compare_models(rf_model, dt_model, X_test, y_test):
    """
    Evaluate both fitted models on the same test set and print a side-by-side report.

    Prints the test accuracy of each model, their absolute accuracy gap and
    one confusion matrix per model.

    Args:
        rf_model: Fitted Random Forest model
        dt_model: Fitted Decision Tree model
        X_test: Test features
        y_test: Test labels

    Returns:
        DataFrame with one row per model and the columns 'Model', 'Accuracy'
        and 'Difference' (own accuracy minus the other model's accuracy).
    """
    banner = "=" * 60

    # Predictions, Random Forest first
    forest_pred, tree_pred = (m.predict(X_test) for m in (rf_model, dt_model))

    # Accuracy of each model on the held-out rows
    forest_acc = accuracy_score(y_test, forest_pred)
    tree_acc = accuracy_score(y_test, tree_pred)

    print(banner)
    print("MODEL COMPARISON")
    print(banner)
    print("\nüìä Test Accuracy:")
    print(f"   Random Forest: {forest_acc:.4f}")
    print(f"   Decision Tree: {tree_acc:.4f}")
    print(f"   Difference: {abs(forest_acc - tree_acc):.4f}")

    # One confusion matrix per model
    print("\nüìä Confusion Matrices:")
    for title, predictions in (("Random Forest", forest_pred),
                               ("Decision Tree", tree_pred)):
        print(f"\n{title}:")
        print(confusion_matrix(y_test, predictions))

    # Summary table
    rows = {
        'Model': ['Random Forest', 'Decision Tree'],
        'Accuracy': [forest_acc, tree_acc],
        'Difference': [forest_acc - tree_acc, tree_acc - forest_acc],
    }
    comparison = pd.DataFrame(rows)

    print("\nüìä Comparison Table:")
    print(comparison.to_string(index=False))

    return comparison

# Run the comparison only when both fitted models exist in this kernel
if not {'rf_model', 'dt_model'} <= set(locals()):
    print("‚ö†Ô∏è  Both models are not trained. Please train the models first.")
else:
    comparison_df = compare_models(rf_model, dt_model, X_test_scaled, y_test)
