In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import stats

from f1score import score
import kaggle_metric_utilities

In [3]:
# Read both competition files; only the training frame is cleaned in this cell.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Discard incomplete rows, then derive two extra features from the raw columns.
train_df = train_df.dropna().assign(
    ShootingEfficiency=lambda frame: frame['FG_PCT'] * frame['FG3_PCT'],
    TotalAttempts=lambda frame: frame['FGA'] + frame['FG3A'] + frame['FTA'],
)

# Outlier filter: keep a row only if every numeric column (the derived ones
# included) has an absolute column-wise z-score below 3.
numeric_part = train_df.select_dtypes(include=[np.number])
z_scores = np.abs(stats.zscore(numeric_part))
inlier_mask = (z_scores < 3).all(axis=1)
train_df = train_df[inlier_mask]

# Feature matrix = every remaining column except the target and the columns
# listed here; the target is label-encoded to integers.
excluded_columns = ['position', 'SEASON_ID', 'TEAM_ID', 'PLAYER_AGE', 'MIN']
X = train_df.drop(columns=excluded_columns)
y = train_df['position']
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# 80/20 hold-out split; the scaler is fitted on the training part only and
# then applied to both parts.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)

# Shared 10-fold stratified splitter used by the hyper-parameter searches.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [4]:
# Hyper-parameter search for a decision tree: tree depth, minimum split size
# and cost-complexity pruning strength, scored by weighted F1 on the `skf` folds.
dt = DecisionTreeClassifier(random_state=42)
param_grid_dt = dict(
    max_depth=[5, 7, 10, 15],
    min_samples_split=[10, 15, 20, 25, 30],
    ccp_alpha=[0.0, 0.01, 0.05, 0.1],  # pruning parameter
)
grid_dt = GridSearchCV(
    estimator=dt,
    param_grid=param_grid_dt,
    scoring='f1_weighted',
    cv=skf,
).fit(X_train, y_train)

# Report the winning configuration and its mean cross-validated score.
best_params, best_cv_score = grid_dt.best_params_, grid_dt.best_score_
print(f'Decision Tree Best parameters: {best_params}, Best Score: {best_cv_score}')

# Score the refitted best estimator on the held-out validation split.
best_dt_model = grid_dt.best_estimator_
y_val_pred = best_dt_model.predict(X_val)
val_f1_score = f1_score(y_val, y_val_pred, average='weighted')
print(f'Validation Weighted F1 Score: {val_f1_score}')

Decision Tree Best parameters: {'ccp_alpha': 0.0, 'max_depth': 10, 'min_samples_split': 20}, Best Score: 0.5453909596190348
Validation Weighted F1 Score: 0.5609789559901469


In [5]:
# Evaluate each model on the validation split and store its weighted F1 score
# in `f1_scores`, keyed by model name.
#
# The per-model value is bound to `model_f1`, not `score`: the imports cell
# already binds `score` (`from f1score import score`), and rebinding that name
# to a float here would silently replace the imported object for every cell
# executed afterwards.
models = {'Decision Tree': best_dt_model}
f1_scores = {}

for name, model in models.items():
    y_pred = model.predict(X_val)
    model_f1 = f1_score(y_val, y_pred, average='weighted')
    f1_scores[name] = model_f1
    print(f'{name} Weighted F1 Score: {model_f1}')
    # print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))


# Optional comparison chart; it reads `f1_scores` as a name -> score mapping,
# which is why the scores are collected in a dict above.
# plt.figure(figsize=(10, 6))
# plt.bar(f1_scores.keys(), f1_scores.values(), color=['blue', 'green', 'orange'])
# plt.title('Weighted F1 Score Comparison of Classifiers')
# plt.xlabel('Classifier')
# plt.ylabel('Weighted F1 Score')
# plt.ylim(0, 1)
# plt.show()


Decision Tree Weighted F1 Score: 0.5609789559901469


In [7]:
# Build the test feature matrix the same way the training matrix was built,
# predict with the tuned tree, and write the submission CSV.
#
# The model was fitted on two derived columns (ShootingEfficiency and
# TotalAttempts). They have to be derived for the test rows too; otherwise the
# reindex below creates them as all-zero columns and the tree is evaluated on
# values it never saw for those features. `assign` returns a new frame, so
# `test_df` itself is left unmodified and the cell can be re-run safely.
# NOTE(review): assumes test.csv carries the same raw stat columns as
# train.csv (FG_PCT, FG3_PCT, FGA, FG3A, FTA) -- a KeyError here means it does not.
X_test = test_df.assign(
    ShootingEfficiency=lambda frame: frame['FG_PCT'] * frame['FG3_PCT'],
    TotalAttempts=lambda frame: frame['FGA'] + frame['FG3A'] + frame['FTA'],
).drop(['ID', 'SEASON_ID', 'TEAM_ID', 'PLAYER_AGE'], axis=1)

# Make any remaining train/test schema mismatch visible instead of letting the
# zero-fill hide it.
missing_columns = X.columns.difference(X_test.columns)
if len(missing_columns) > 0:
    print(f'Warning: test data lacks columns {list(missing_columns)}; filling them with 0.')

# Align to the training column set and order, then apply the scaler that was
# fitted on the training split.
X_test = X_test.reindex(columns=X.columns, fill_value=0)
X_test = scaler.transform(X_test)

name = 'Decision Tree'
model = best_dt_model

# Predict encoded labels and map them back to the original position names.
y_test_pred = model.predict(X_test)
y_test_pred_decoded = label_encoder.inverse_transform(y_test_pred)
submission_df = pd.DataFrame({'ID': test_df['ID'], 'position': y_test_pred_decoded})
submission_file_path = f'submission_{name}.csv'
submission_df.to_csv(submission_file_path, index=False)
print(f"{name} Done.")


Decision Tree Done.
