# XGBoost Model

In [7]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Train an XGBoost classifier on the 2012-2023 file and evaluate it on the
# 2024 file, predicting the 'won' column from every other column.
train_data = pd.read_csv('../../../data/season-data/2012_to_2023_data.csv')
test_data = pd.read_csv('../../../data/season-data/combined_2024.csv')

# Record where the training rows end *before* the names are rebound below, so
# the split back into train/test does not depend on statement order.
n_train_rows = len(train_data)

# Stack both sets so the feature engineering below is applied once and the
# one-hot columns produced by get_dummies are the same for both slices.
combined_data = pd.concat([train_data, test_data], sort=False)

# Parse the date column a single time, then expand it into numeric
# year/month/day features and drop the original column.
game_dates = pd.to_datetime(combined_data['date'])
combined_data['year'] = game_dates.dt.year
combined_data['month'] = game_dates.dt.month
combined_data['day'] = game_dates.dt.day
combined_data = combined_data.drop(columns='date')

# One-hot encode the categorical team identifiers.
combined_data = pd.get_dummies(combined_data, columns=['team', 'team_opp'])

# Split the combined data back into the original train and test datasets.
# Positional slicing is used because concat keeps each file's own row labels.
train_data = combined_data.iloc[:n_train_rows]
test_data = combined_data.iloc[n_train_rows:]

# NOTE(review): every column except 'won' is used as a feature. The output
# recorded for this cell reports ~99.5% accuracy, which may indicate that some
# feature encodes the game result itself (target leakage) -- TODO confirm
# against the CSV schema.
X_train = train_data.drop(columns='won')
Y_train = train_data['won']

X_test = test_data.drop(columns='won')
Y_test = test_data['won']

# Prepare and fit the model.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    max_depth=3,
    learning_rate=0.1,
    n_estimators=100,
)
model.fit(X_train, Y_train)

# Predict on the test set and report overall accuracy.
y_pred = model.predict(X_test)
accuracy = accuracy_score(Y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Detailed per-class precision/recall/F1 report.
print(classification_report(Y_test, y_pred))

Accuracy: 99.55%
              precision    recall  f1-score   support

         0.0       0.99      1.00      1.00      1228
         1.0       1.00      0.99      1.00      1228

    accuracy                           1.00      2456
   macro avg       1.00      1.00      1.00      2456
weighted avg       1.00      1.00      1.00      2456

