Importing pandas, NumPy and joblib

In [2]:
import pandas as pd
import numpy as np
import joblib

Loading training set into dataframe called df

In [3]:
# Read the raw training CSV into `df`. low_memory=False makes pandas parse the
# file in a single pass instead of in chunks, which avoids mixed-dtype columns.
df = pd.read_csv(
    '../data/raw/train.csv',
    low_memory=False,
)

In [24]:
# Feature columns fed to the model. dporpag and dunksmiss-dunksmade are not used
# because they are derived from porpag and dunksmade respectively.
rc = [
    'twoPM',
    'porpag',
    'dunksmade',
]

Splitting data into features and target

In [25]:
# Model inputs are the columns listed in `rc`; the label is the 'drafted' column.
features, target = df[rc], df['drafted']

In [26]:
# Sanity check: both objects must have the same number of rows.
print(*(part.shape for part in (features, target)))

(56091, 3) (56091,)


In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for validation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [29]:
from sklearn.impute import SimpleImputer

In [30]:
# Fill missing feature values with the per-column mean. The means are learned
# from the training split only and then reused for the validation split, so no
# validation statistics leak into preprocessing.
imputer = SimpleImputer(strategy='mean')
imputer.fit(X_train)
X_train_imputed, X_val_imputed = (
    imputer.transform(split) for split in (X_train, X_val)
)

In [31]:
from sklearn.preprocessing import StandardScaler

In [32]:
# Standardise the imputed features. The scaler is fitted on the training split
# only; the validation split is transformed with those training statistics.
scaler = StandardScaler()
scaler.fit(X_train_imputed)
X_train_scaled = scaler.transform(X_train_imputed)
X_val_scaled = scaler.transform(X_val_imputed)

Training the model

In [33]:
from xgboost import XGBClassifier

Grid search over gamma and alpha for the best validation AUROC

In [34]:
# Candidate values tried for each of the two hyperparameters (the same grid is
# used for both, but as two independent lists).
gamma_values = [0, 0.001, 0.01, 0.1, 1, 10]
alpha_values = list(gamma_values)

# Running best of the search: highest AUROC seen so far and the pair that produced it.
best_auc, best_gamma, best_alpha = 0, None, None

In [35]:
from sklearn.metrics import roc_auc_score

In [36]:
# Exhaustive search over every (gamma, alpha) pair: fit one XGBoost classifier
# per pair on the training split and keep the pair with the highest AUROC on
# the validation split.
#
# The running best is (re)initialised here so the cell is idempotent. Without
# this, re-running the cell after changing the data or features would compare
# new scores against a stale best_auc left in the kernel and could keep stale
# best_gamma / best_alpha values.
best_auc, best_gamma, best_alpha = 0, None, None

for gamma in gamma_values:
    for alpha in alpha_values:
        # Model training with regularization (XGBoost)
        xgb = XGBClassifier(gamma=gamma, alpha=alpha)
        xgb.fit(X_train_scaled, y_train)

        # Model evaluation: AUROC needs scores, not hard labels, so take the
        # probability column for the second class (presumably drafted == 1).
        y_pred_prob = xgb.predict_proba(X_val_scaled)[:, 1]
        roc_auc = roc_auc_score(y_val, y_pred_prob)

        # Strict '>' means the first pair reached wins on ties.
        if roc_auc > best_auc:
            best_auc, best_gamma, best_alpha = roc_auc, gamma, alpha

# Report what was selected so the chosen hyperparameters are visible in the output.
print(f'Best gamma={best_gamma}, alpha={best_alpha}, validation AUROC={best_auc:.4f}')

Training final model with best parameters

In [37]:
# Refit a fresh classifier on the training split using the hyperparameter pair
# selected by the grid search.
best_params = {'gamma': best_gamma, 'alpha': best_alpha}
final_xgb = XGBClassifier(**best_params)
final_xgb.fit(X_train_scaled, y_train)

In [38]:
# Model evaluation (XGBoost): AUROC of the refit model on the validation split.
# NOTE(review): this is the same split the hyperparameters were selected on, so
# the figure is likely optimistic as an estimate of unseen-data performance.
val_proba = final_xgb.predict_proba(X_val_scaled)
y_pred_prob = val_proba[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'XGBoost AUROC: {roc_auc:.4f}')

XGBoost AUROC: 0.9741


In [43]:
# Read the raw test CSV (the rows to score) with the same parsing option as the
# training file.
test_data = pd.read_csv(
    '../data/raw/test.csv',
    low_memory=False,
)

In [44]:
# Score the test set with exactly the preprocessing the model was trained on.
#
# Fixes relative to the previous version of this cell:
#   * the fitted imputer is now applied before scaling; it was skipped, so NaNs
#     in the test features reached a model that only ever saw mean-imputed data;
#   * the feature columns are selected directly instead of one-hot encoding the
#     whole frame and discarding almost all of it;
#   * a feature column missing from the test file now raises instead of being
#     silently replaced by an all-zero column.
feature_columns = list(features.columns)
missing_columns = [col for col in feature_columns if col not in test_data.columns]
if missing_columns:
    raise KeyError(f'test data is missing feature columns: {missing_columns}')

# Same columns, same order as the training features.
test_data_processed = test_data[feature_columns]

# Impute first, then scale: the same order and fitted objects used for training.
test_data_imputed = imputer.transform(test_data_processed)
test_data_scaled = scaler.transform(test_data_imputed)

# Column 1 of predict_proba is the probability of the second class
# (presumably drafted == 1).
predictions = final_xgb.predict_proba(test_data_scaled)[:, 1]



In [45]:
# Attach the predicted probabilities to the test frame and preview the two
# columns that make up the submission.
submission_columns = ['player_id', 'drafted']
test_data['drafted'] = predictions
print(test_data[submission_columns])

                                 player_id   drafted
0     cf302b4d-84f7-4124-a25d-a75eed31978b  0.000414
1     f91837cd-4f49-4b70-963d-aeb82c6ce3da  0.000414
2     53ec2a29-1e7d-4c6d-86d7-d60d02af8916  0.001214
3     32402798-471c-4a54-8cb4-29cd95199014  0.000414
4     73b960f9-27b8-4431-9d23-a760e9bbc360  0.004950
...                                    ...       ...
4965  a25ee55f-02a3-4f8e-8194-a5f427e14e7c  0.000414
4966  d0d9f45e-7b01-44b3-8d40-514ec338611d  0.000414
4967  f8df22c4-1602-4fab-896d-8820951aae2f  0.000414
4968  b791c69a-f769-4163-afda-051a6fd20a9d  0.000414
4969  18b51f5d-4746-4121-88fd-c8d0a1399130  0.000414

[4970 rows x 2 columns]


In [46]:
# Write the id/probability pairs to CSV without the pandas row index.
test_data[['player_id', 'drafted']].to_csv(
    'predictions_xgb_limited_features_imputed.csv',
    index=False,
)