In [32]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, log_loss, roc_auc_score



In [33]:

def clean_data(df):
    # Handle missing values
    df = df.fillna(0)
    
    # Ensure numerical columns are of the right type
    numerical_columns = ['FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
    for col in numerical_columns:
        df[col] = pd.to_numeric(df[col], errors='coerce').fillna(0)
    
    return df




In [34]:
def load_and_preprocess_data(file_paths):
   # Load all datasets
    dataframes = [pd.read_csv(file_path) for file_path in file_paths]

    # Combine all datasets
    combined_data = pd.concat(dataframes, ignore_index=True)
    combined_data = clean_data(combined_data)
    
    # Select relevant features for prediction
    selected_features = ['FTHG', 'FTAG', 'HS', 'AS', 'HST', 'AST', 'HC', 'AC', 'HF', 'AF', 'HY', 'AY', 'HR', 'AR']
    
    # Create the target variable (match result)
    # FTR (Full Time Result) is already in the dataset as 'H', 'D', or 'A'
    
    # Prepare features
    X = combined_data[selected_features]
    
    # Prepare target
    y = combined_data['FTR']
    
    # Encode the target variable
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)
    
    return X, y_encoded, le.classes_


In [None]:

def train_model(X, y):
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    
    # Initialize and train the model
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    
    # Make predictions
    y_pred = model.predict(X_test)
    y_proba = model.predict_proba(X_test)
    
    return model, X_test, y_test, y_pred, y_proba

In [None]:
def evaluate_model(y_test, y_pred, y_proba):
    accuracy = accuracy_score(y_test, y_pred)
    log_loss_value = log_loss(y_test, y_proba)
    roc_score = roc_auc_score(y_test, y_proba, multi_class='ovr')
    
    return accuracy, log_loss_value, roc_score


In [None]:

def get_feature_importance(model, feature_names):
    importances = model.feature_importances_
    feature_importance = pd.DataFrame({
        'feature': feature_names,
        'importance': importances
    })
    return feature_importance.sort_values('importance', ascending=False)



In [None]:
def main():
    # Define file paths
    file_paths = ['csv/2020-2021.csv', 'csv/2021-2022.csv', 'csv/final_dataset.csv']
    
    # Load and preprocess data
    X, y, classes = load_and_preprocess_data(file_paths)
    
    # Train model
    model, X_test, y_test, y_pred, y_proba = train_model(X, y)
    
    # Evaluate model
    accuracy, log_loss_value, roc_score = evaluate_model(y_test, y_pred, y_proba)
    
    # Get feature importance
    feature_importance = get_feature_importance(model, X.columns)
    
    # Print results
    print(f"Model Accuracy: {accuracy:.4f}")
    print(f"Log Loss: {log_loss_value:.4f}")
    print(f"ROC-AUC Score: {roc_score:.4f}")
    print("\nTop 5 Most Important Features:")
    print(feature_importance.head())
    
    return model, classes


In [35]:

if __name__ == "__main__":
    model, classes = main()

Model Accuracy: 0.9974
Log Loss: 0.0278
ROC-AUC Score: 1.0000

Top 5 Most Important Features:
  feature  importance
0    FTHG    0.527675
1    FTAG    0.338460
9      AF    0.022121
8      HF    0.021167
5     AST    0.019101
