Loading and exploring dataset

In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports all loaded modules
# before each cell executes, so edits to imported source files take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

Importing pandas and numpy

In [2]:
import pandas as pd
import numpy as np

Loading training set into dataframe called df

In [3]:
# Read the raw training CSV. low_memory=False makes pandas parse the file in
# one pass instead of internal chunks, which avoids mixed-dtype column inference.
df = pd.read_csv(
    '../data/raw/train.csv',
    low_memory=False,
)

Removing rows that contain null values

In [4]:
# Discard every row that has at least one missing value in any column.
df = df.dropna(axis=0, how='any')

Splitting data into features and target

In [5]:
# The label to predict.
target = df['drafted']
# Everything except the row identifier and the label is used as a model input.
features = df.drop(columns=['player_id', 'drafted'])

One-hot encoding the categorical features

In [6]:
# Expand object/string/category columns into indicator columns; columns of
# other dtypes are passed through unchanged.
features = pd.get_dummies(data=features)

In [7]:
# Sanity check: the encoded feature matrix and the label vector must have the same row count.
print(f'{features.shape} {target.shape}')

(1023, 250) (1023,)


In [8]:
from sklearn.model_selection import train_test_split

In [9]:
# Hold out 20% of the rows for validation; the fixed seed keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [10]:
from sklearn.preprocessing import StandardScaler

In [11]:
# Learn the per-column mean and standard deviation from the training split only,
# then apply that same transformation to both splits so the validation rows do
# not influence the scaling statistics.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [14]:
from catboost import CatBoostClassifier

Training the model

In [15]:
# Model training (CatBoost)
# 150 boosting iterations, a fixed seed for repeatable fits, and verbose=0 to
# silence the per-iteration training log.
cb = CatBoostClassifier(
    n_estimators=150,
    random_state=8,
    verbose=0,
)
# Kept as the cell's final expression so the fitted estimator is echoed as the cell output.
cb.fit(X_train_scaled, y_train)

<catboost.core.CatBoostClassifier at 0x1fdaa92eed0>

In [16]:
from sklearn.metrics import roc_auc_score

In [17]:
# Model evaluation (CatBoost)
# Score the held-out validation split with the fitted CatBoost model `cb`.
# predict_proba returns one column per class; column 1 is used as the score for
# the positive class (assumes `drafted` is encoded 0/1 — TODO confirm).
y_pred_prob = cb.predict_proba(X_val_scaled)[:, 1]
# AUROC is computed from the probabilities rather than hard labels, so it is
# independent of any decision threshold.
roc_auc = roc_auc_score(y_val, y_pred_prob)
print(f'Cat Boost AUROC: {roc_auc:.4f}')

Cat Boost AUROC: 0.8991


Generating predictions on the test set

In [18]:
# Read the raw test CSV with the same parsing option used for the training file.
test_data = pd.read_csv(
    '../data/raw/test.csv',
    low_memory=False,
)

In [19]:
# Apply the training-time preprocessing to the test set, then score it.
#
# Drop the identifier (and a previously attached `drafted` prediction column, if
# this cell is re-run) before encoding, mirroring the training feature selection.
# One-hot encoding the string-valued `player_id` column would otherwise build one
# indicator column per player, all of which the reindex below throws away.
test_data_processed = pd.get_dummies(
    test_data.drop(columns=['player_id', 'drafted'], errors='ignore')
)
# Align to the training feature columns: indicator columns not seen in training
# are dropped, and training columns missing from the test set are added as zeros.
test_data_processed = test_data_processed.reindex(columns=features.columns, fill_value=0)
# Reuse the scaler fitted on the training split; it must not be refitted on test data.
test_data_scaled = scaler.transform(test_data_processed)
# Column 1 of predict_proba is taken as the probability of the positive class
# (assumes `drafted` is encoded 0/1 — TODO confirm).
predictions = cb.predict_proba(test_data_scaled)[:, 1]

In [20]:
# Store the predicted probabilities on the test frame and preview the two
# columns that make up the submission.
test_data['drafted'] = predictions
print(test_data.loc[:, ['player_id', 'drafted']])

                                 player_id   drafted
0     cf302b4d-84f7-4124-a25d-a75eed31978b  0.156081
1     f91837cd-4f49-4b70-963d-aeb82c6ce3da  0.135288
2     53ec2a29-1e7d-4c6d-86d7-d60d02af8916  0.363793
3     32402798-471c-4a54-8cb4-29cd95199014  0.231430
4     73b960f9-27b8-4431-9d23-a760e9bbc360  0.603604
...                                    ...       ...
4965  a25ee55f-02a3-4f8e-8194-a5f427e14e7c  0.079086
4966  d0d9f45e-7b01-44b3-8d40-514ec338611d  0.012200
4967  f8df22c4-1602-4fab-896d-8820951aae2f  0.014382
4968  b791c69a-f769-4163-afda-051a6fd20a9d  0.014207
4969  18b51f5d-4746-4121-88fd-c8d0a1399130  0.064051

[4970 rows x 2 columns]


In [21]:
# Shape of the test frame after the prediction column has been attached.
print(f'{test_data.shape}')

(4970, 64)


Saving to csv

In [22]:
# Write only the identifier and its predicted probability, without the row index.
test_data.loc[:, ['player_id', 'drafted']].to_csv('predictions_cat.csv', index=False)