In [None]:
# Gerekli k√ºt√ºphaneleri y√ºkle
!pip install pandas numpy scikit-learn scipy matplotlib seaborn -q


## 1. Veri Yükleme ve Hazırlama

Bu hücre, MotoGP verilerini yükler veya kaydedilmiş veriyi kullanır.


In [None]:
import os
import pickle
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import warnings
warnings.filterwarnings('ignore')

# Data directory: holds the pickle cache and the saved model artifacts
DATA_DIR = Path("ml_data")
DATA_DIR.mkdir(exist_ok=True)

def _write_cache(data, cache_file):
    """Pickle *data* to *cache_file* on a best-effort basis; a failure only warns."""
    try:
        with open(cache_file, 'wb') as f:
            pickle.dump(data, f)
    except (OSError, pickle.PicklingError) as exc:
        # The cache is only an optimisation: failing to write it must not
        # make the caller lose data that was loaded successfully.
        print(f"   Warning: could not write cache {cache_file}: {exc}")
        return
    print(f"   Veri kaydedildi: {cache_file}")


def load_motogp_data():
    """
    Load the MotoGP dataset, trying three sources in order.

    1. The pickle cache ``DATA_DIR / "motogp_full_cache.pkl"``.
    2. The CSV file ``trends_data/motogp_full_data.csv`` (the ``RaceDate``
       column is parsed to datetime when present).
    3. A ``motogp_full`` variable in the running IPython kernel namespace.

    Data obtained from source 2 or 3 is written back to the pickle cache.
    An unreadable cache or a failing kernel lookup is reported and the next
    source is tried.

    Returns:
        The loaded data, or None when no source is available.
    """
    cache_file = DATA_DIR / "motogp_full_cache.pkl"

    # 1) Cached pickle.
    # NOTE(security): pickle.load can execute arbitrary code. Only load a
    # cache file that this notebook wrote itself.
    if cache_file.exists():
        print("‚úÖ Kaydedilmi≈ü MotoGP verisi y√ºkleniyor...")
        try:
            with open(cache_file, 'rb') as f:
                data = pickle.load(f)
        except (OSError, EOFError, pickle.UnpicklingError,
                AttributeError, ImportError, IndexError) as exc:
            # A truncated or incompatible cache should not block the other sources.
            print(f"   Warning: cache {cache_file} is unreadable ({exc}); "
                  "trying other sources.")
        else:
            print(f"   Veri y√ºklendi: {len(data)} satƒ±r, {len(data.columns)} kolon")
            print(f"   Kolonlar: {list(data.columns[:10])}...")
            return data

    # 2) CSV file (if it exists).
    csv_file = Path("trends_data") / "motogp_full_data.csv"
    if csv_file.exists():
        print("‚úÖ CSV dosyasƒ±ndan MotoGP verisi y√ºkleniyor...")
        data = pd.read_csv(csv_file)
        if 'RaceDate' in data.columns:
            data['RaceDate'] = pd.to_datetime(data['RaceDate'])
        print(f"   Veri y√ºklendi: {len(data)} satƒ±r")
        _write_cache(data, cache_file)
        return data

    # 3) `motogp_full` variable in the IPython kernel namespace.
    import sys
    if 'ipykernel' in sys.modules or 'IPython' in sys.modules:
        try:
            from IPython import get_ipython
            ipython = get_ipython()
            if ipython is not None and 'motogp_full' in ipython.user_ns:
                print("‚úÖ Ana notebook'tan motogp_full deƒüi≈ükeni y√ºkleniyor...")
                data = ipython.user_ns['motogp_full'].copy()
                print(f"   Veri y√ºklendi: {len(data)} satƒ±r")
                _write_cache(data, cache_file)
                return data
        except Exception as exc:
            # Best-effort source: report the problem instead of hiding it,
            # then fall through to the "not found" message below.
            print(f"   Warning: could not read motogp_full from the kernel namespace: {exc}")

    print("‚ö†Ô∏è  Kaydedilmi≈ü veri bulunamadƒ±.")
    print("   L√ºtfen √∂nce ana analiz notebook'unu √ßalƒ±≈ütƒ±rƒ±p motogp_full verisini olu≈üturun.")
    print("   Veya a≈üaƒüƒ±daki h√ºcrede veriyi manuel olarak y√ºkleyin.")
    return None

# Load the dataset (None when no source was found)
motogp_data = load_motogp_data()

# Print a short preview once the data is available
if motogp_data is not None:
    print("\nüìä Veri √ñnizleme:")
    print(f"   Shape: {motogp_data.shape}")
    print(f"   Kolonlar ({len(motogp_data.columns)}):")
    # One numbered line per column, numbering starts at 1
    print(
        "".join(
            f"     {position}. {column_name}\n"
            for position, column_name in enumerate(motogp_data.columns, start=1)
        ),
        end="",
    )
    print("\n   ƒ∞lk 5 satƒ±r:")
    print(motogp_data.head())


### Veriyi Hazırlama ve Özellik Mühendisliği

Hedef değişken: **Relative Increase %** kategorilerine göre sınıflandırma
- **Low**: < 0% (azalma)
- **Medium**: 0% – 50% (0 dahil, 50 hariç; değişim yok veya orta artış)
- **High**: ≥ 50% (yüksek artış)


In [None]:
def prepare_ml_data(df):
    """
    Prepare the MotoGP DataFrame for classification.

    The target is derived from ``'Relative Increase %'``:
    ``Low`` (value < 0), ``Medium`` (0 <= value < 50), ``High`` (value >= 50).
    Rows are dropped when the target is missing or not finite, or when any
    of the feature columns actually used contains a missing value.

    Args:
        df: motogp_full DataFrame. Must contain ``'Relative Increase %'``;
            expected feature columns that are absent are skipped.

    Returns:
        X: Feature matrix, one column per entry of ``feature_names``
        y: Integer-encoded target labels
        feature_names: Names of the feature columns actually used
        le: Fitted LabelEncoder (maps the integer codes back to category names)

    Raises:
        KeyError: If ``'Relative Increase %'`` is not a column of ``df``.
        ValueError: If none of the expected feature columns is present.
    """
    target_col = 'Relative Increase %'
    if target_col not in df.columns:
        raise KeyError(f"Required column missing: {target_col!r}")

    # Candidate features.
    # NOTE(review): 'Search Before' and 'Trend Difference' look like the
    # quantities the target percentage is computed from; if so they leak the
    # target into the features -- confirm against the upstream notebook.
    feature_cols = [
        'Popularity',
        'Career wins_num',
        'Career podiums_num',
        'Championships_num',
        'Years_active_len',
        'Search Before',
        'Trend Difference'
    ]

    # Use only the columns that exist in this DataFrame
    available_features = [col for col in feature_cols if col in df.columns]
    if not available_features:
        raise ValueError(
            f"None of the expected feature columns are present: {feature_cols}"
        )

    # Drop rows with NaN in the target or in ANY feature that will be used,
    # so that no NaN reaches the feature matrix.
    data = df.dropna(subset=[target_col] + available_features)

    # Drop infinite targets; .copy() so the column assignment below works on
    # an independent frame and never touches the caller's DataFrame.
    data = data[np.isfinite(data[target_col])].copy()

    # Bin the (now always finite) target into the three categories
    def categorize_increase(increase):
        if increase < 0:
            return 'Low'
        if increase < 50:
            return 'Medium'
        return 'High'

    data['target_category'] = data[target_col].apply(categorize_increase)

    X = data[available_features].values
    y = data['target_category'].values

    # Label encoding (category name -> integer code)
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    print(f"‚úÖ Veri hazƒ±rlandƒ±:")
    print(f"   √ñrnek sayƒ±sƒ±: {len(X)}")
    print(f"   √ñzellik sayƒ±sƒ±: {X.shape[1]}")
    print(f"   √ñzellikler: {available_features}")
    print(f"   Sƒ±nƒ±f daƒüƒ±lƒ±mƒ±:")
    for label, count in zip(le.classes_, np.bincount(y_encoded)):
        print(f"     {label}: {count}")

    return X, y_encoded, available_features, le

# Prepare, split and scale the data (only when motogp_data was loaded)
if motogp_data is None:
    print("‚ö†Ô∏è  Veri y√ºklenemedi. L√ºtfen √∂nce veriyi y√ºkleyin.")
else:
    X, y, feature_names, label_encoder = prepare_ml_data(motogp_data)

    # Stratified hold-out split: 20% of the samples go to the test set
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, stratify=y, random_state=42, test_size=0.2
    )

    # Scaling: statistics are fitted on the training part only and then
    # applied to both parts
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    print("\n‚úÖ Train/Test split yapƒ±ldƒ±:")
    print(f"   Train: {X_train.shape[0]} √∂rnek")
    print(f"   Test: {X_test.shape[0]} √∂rnek")


## 2. Random Forest Classifier

Ana sınıflandırıcı model - MotoGP fan reaction kategorilerini tahmin eder.


In [None]:
from sklearn.ensemble import RandomForestClassifier

def train_random_forest(X_train, X_test, y_train, y_test, 
                        n_estimators=100, max_depth=None, random_state=42):
    """
    Train, evaluate and save a Random Forest classifier.

    Besides its arguments, this function reads the notebook-level names
    ``feature_names``, ``label_encoder``, ``scaler`` and ``DATA_DIR``, so the
    data-preparation cell must have been run first.

    Args:
        X_train, X_test: Training and test features
        y_train, y_test: Training and test labels (integer-encoded)
        n_estimators: Number of trees
        max_depth: Maximum tree depth
        random_state: Random seed

    Returns:
        model: The fitted RandomForestClassifier
        feature_importance: DataFrame with columns ``feature`` and
            ``importance``, sorted by descending importance
    """
    print("üå≤ Random Forest eƒüitiliyor...")
    print(f"   Parametreler: n_estimators={n_estimators}, max_depth={max_depth}")
    
    # Build the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
        n_jobs=-1,
        class_weight='balanced'  # compensate for class imbalance
    )
    
    # Fit
    model.fit(X_train, y_train)
    
    # Predict
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluate
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    
    print(f"\n‚úÖ Model eƒüitildi!")
    print(f"   Train Accuracy: {train_acc:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")
    
    # Feature importances
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': model.feature_importances_
    }).sort_values('importance', ascending=False)
    
    print(f"\nüìä √ñzellik √ñnemleri:")
    print(feature_importance.to_string(index=False))
    
    # Classification report.
    # LabelEncoder codes are 0..n_classes-1; passing them explicitly keeps
    # `target_names` aligned even when a class is absent from both y_test
    # and the predictions (otherwise classification_report raises).
    class_names = label_encoder.classes_
    print(f"\nüìä Classification Report (Test):")
    print(classification_report(y_test, y_pred_test,
                                labels=list(range(len(class_names))),
                                target_names=class_names))
    
    # Save the model together with the preprocessing objects it needs
    model_file = DATA_DIR / "random_forest_model.pkl"
    with open(model_file, 'wb') as f:
        pickle.dump({
            'model': model,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'feature_names': feature_names
        }, f)
    print(f"\nüíæ Model kaydedildi: {model_file}")
    
    return model, feature_importance

# Train the Random Forest only when the scaled splits already exist
if 'X_train_scaled' not in locals() or 'X_test_scaled' not in locals():
    print("‚ö†Ô∏è  Veri hazƒ±r deƒüil. L√ºtfen √∂nce veri hazƒ±rlama h√ºcresini √ßalƒ±≈ütƒ±rƒ±n.")
else:
    rf_model, feature_importance = train_random_forest(
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        max_depth=10,
        n_estimators=100,
    )


## 3. Decision Tree Classifier

Baseline model - Raporda açıklama ve karşılaştırma için kullanılır.


In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

def train_decision_tree(X_train, X_test, y_train, y_test,
                       max_depth=5, min_samples_split=10, random_state=42):
    """
    Train, evaluate and save a Decision Tree classifier (baseline).

    Besides its arguments, this function reads the notebook-level names
    ``feature_names``, ``label_encoder``, ``scaler`` and ``DATA_DIR``, so the
    data-preparation cell must have been run first.

    Args:
        X_train, X_test: Training and test features
        y_train, y_test: Training and test labels (integer-encoded)
        max_depth: Maximum tree depth (None means unlimited; the tree plot
            is then skipped)
        min_samples_split: Minimum number of samples required to split a node
        random_state: Random seed

    Returns:
        model: The fitted DecisionTreeClassifier
    """
    print("üå≥ Decision Tree eƒüitiliyor...")
    print(f"   Parametreler: max_depth={max_depth}, min_samples_split={min_samples_split}")
    
    # Build the model
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=random_state,
        class_weight='balanced'
    )
    
    # Fit
    model.fit(X_train, y_train)
    
    # Predict
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)
    
    # Evaluate
    train_acc = accuracy_score(y_train, y_pred_train)
    test_acc = accuracy_score(y_test, y_pred_test)
    
    print(f"\n‚úÖ Model eƒüitildi!")
    print(f"   Train Accuracy: {train_acc:.4f}")
    print(f"   Test Accuracy: {test_acc:.4f}")
    
    # Plain list of str: used for the report and for plot_tree, which is
    # safest with list inputs across scikit-learn versions.
    class_names = [str(name) for name in label_encoder.classes_]
    
    # Classification report.
    # Explicit labels keep `target_names` aligned even when a class is
    # absent from both y_test and the predictions.
    print(f"\nüìä Classification Report (Test):")
    print(classification_report(y_test, y_pred_test,
                                labels=list(range(len(class_names))),
                                target_names=class_names))
    
    # Tree visualisation (shallow trees only).
    # `max_depth=None` is valid for the classifier but cannot be compared
    # with an int, so it is excluded explicitly.
    if max_depth is not None and max_depth <= 5:
        plt.figure(figsize=(20, 10))
        plot_tree(model, 
                  feature_names=list(feature_names),
                  class_names=class_names,
                  filled=True,
                  rounded=True,
                  fontsize=10)
        plt.title("Decision Tree Visualization", fontsize=16)
        plt.tight_layout()
        plt.savefig(DATA_DIR / "decision_tree_visualization.png", dpi=300, bbox_inches='tight')
        print(f"\nüìä Aƒüa√ß g√∂rselle≈ütirmesi kaydedildi: {DATA_DIR / 'decision_tree_visualization.png'}")
        plt.show()
    
    # Save the model together with the preprocessing objects it needs
    model_file = DATA_DIR / "decision_tree_model.pkl"
    with open(model_file, 'wb') as f:
        pickle.dump({
            'model': model,
            'scaler': scaler,
            'label_encoder': label_encoder,
            'feature_names': feature_names
        }, f)
    print(f"\nüíæ Model kaydedildi: {model_file}")
    
    return model

# Train the Decision Tree only when the scaled splits already exist
if 'X_train_scaled' not in locals() or 'X_test_scaled' not in locals():
    print("‚ö†Ô∏è  Veri hazƒ±r deƒüil. L√ºtfen √∂nce veri hazƒ±rlama h√ºcresini √ßalƒ±≈ütƒ±rƒ±n.")
else:
    dt_model = train_decision_tree(
        X_train_scaled,
        X_test_scaled,
        y_train,
        y_test,
        min_samples_split=10,
        max_depth=5,
    )


## 4. Model Karşılaştırması

Random Forest ve Decision Tree modellerinin performans karşılaştırması


In [None]:
def compare_models(rf_model, dt_model, X_test, y_test):
    """
    Compare the two fitted models on the same test set.

    Prints the test accuracy of each model, their confusion matrices and a
    summary table.

    Args:
        rf_model: Fitted Random Forest model
        dt_model: Fitted Decision Tree model
        X_test: Test features (same preprocessing as used for training)
        y_test: Test labels

    Returns:
        comparison: DataFrame with columns ``Model``, ``Accuracy`` and
            ``Difference`` (this model's accuracy minus the other model's)
    """
    # Predictions
    rf_pred = rf_model.predict(X_test)
    dt_pred = dt_model.predict(X_test)
    
    # Accuracy
    rf_acc = accuracy_score(y_test, rf_pred)
    dt_acc = accuracy_score(y_test, dt_pred)
    
    print("=" * 60)
    print("MODEL KAR≈ûILA≈ûTIRMASI")
    print("=" * 60)
    print(f"\nüìä Test Accuracy:")
    print(f"   Random Forest: {rf_acc:.4f}")
    print(f"   Decision Tree: {dt_acc:.4f}")
    print(f"   Fark: {abs(rf_acc - dt_acc):.4f}")
    
    # Confusion matrices.
    # Without explicit labels each matrix is built from the classes seen in
    # its own y_true/y_pred pair, so the two could differ in shape and row
    # order. One shared label list makes them directly comparable.
    shared_labels = sorted(set(y_test) | set(rf_pred) | set(dt_pred))
    print(f"\nüìä Confusion Matrices:")
    print(f"\nRandom Forest:")
    print(confusion_matrix(y_test, rf_pred, labels=shared_labels))
    print(f"\nDecision Tree:")
    print(confusion_matrix(y_test, dt_pred, labels=shared_labels))
    
    # Comparison table
    comparison = pd.DataFrame({
        'Model': ['Random Forest', 'Decision Tree'],
        'Accuracy': [rf_acc, dt_acc],
        'Difference': [rf_acc - dt_acc, dt_acc - rf_acc]
    })
    
    print(f"\nüìä Kar≈üƒ±la≈ütƒ±rma Tablosu:")
    print(comparison.to_string(index=False))
    
    return comparison

# Compare the models only when both have been trained
if 'rf_model' not in locals() or 'dt_model' not in locals():
    print("‚ö†Ô∏è  Her iki model de eƒüitilmedi. L√ºtfen √∂nce modelleri eƒüitin.")
else:
    comparison_df = compare_models(rf_model, dt_model, X_test_scaled, y_test)
