In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import stats
from sklearn.manifold import TSNE
from sklearn.impute import KNNImputer

In [5]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Drop missing data
# train_df.dropna(inplace=True)

# Drop outliers
# z_scores = np.abs(stats.zscore(train_df.select_dtypes(include=[np.number])))
# train_df = train_df[(z_scores < 3).all(axis=1)]

# train_df['ShootingEfficiency'] = train_df['FG_PCT'] * train_df['FG3_PCT']
# test_df['ShootingEfficiency'] = test_df['FG_PCT'] * test_df['FG3_PCT']

# train_df['TotalAttempts'] = train_df['FGA'] + train_df['FG3A'] + train_df['FTA']
# test_df['TotalAttempts'] = test_df['FGA'] + test_df['FG3A'] + test_df['FTA']

X = train_df.drop(['position', 'SEASON_ID', 'TEAM_ID', 'GS', 'GP', 'MIN'], axis=1)
y = train_df['position']
test_df = test_df.drop(['ID', 'SEASON_ID', 'TEAM_ID'], axis=1)

# fill missing data
imputer = KNNImputer(n_neighbors=5)
imputer.fit(X.dropna())
X = imputer.transform(X)

column_name = ["PLAYER_AGE", "FGM", "FGA", "FG_PCT", "FG3M", "FG3A", "FG3_PCT", "FTM", "FTA", "FT_PCT", "OREB", "DREB", "REB", "AST", "STL", "BLK", "TOV", "PF", "PTS"]
X = pd.DataFrame(X, columns=column_name)

removed_X_set = []
# Delete each one columns
for column in X.columns:
    removed_X_set.append(X.drop([column], axis=1))
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

X_train_set = []
X_val_set = []
y_train_set = []
y_val_set = []

for removed_X in removed_X_set:
    X_train, X_val, y_train, y_val = train_test_split(removed_X, y_encoded, test_size=0.1, random_state=42)
    X_train_set.append(X_train)
    X_val_set.append(X_val)
    y_train_set.append(y_train)
    y_val_set.append(y_val)

# Base
model = KNeighborsClassifier(n_neighbors=11, weights='distance', algorithm='auto')
X_train, X_val, y_train, y_val = train_test_split(removed_X, y_encoded, test_size=0.1, random_state=42)
model = model.fit(X_train, y_train)
y_pred = model.predict(X_val)
f1_base = f1_score(y_val, y_pred, average='weighted')
print(f"Base f1 score: {f1_base}")

# Model
for i in range(len(X_train_set)):
    model = KNeighborsClassifier(n_neighbors=11, weights='distance', algorithm='auto')
    model = model.fit(X_train_set[i], y_train_set[i])
    # f1 score
    y_pred = model.predict(X_val_set[i])
    f1 = f1_score(y_val_set[i], y_pred, average='weighted')
    print(f"removed column: {X.columns[i]}, f1 score: {f1}")
    print(f"affect: {f1_base - f1}")





#/////////////////////////////////////

# # TSNE
# tsne = TSNE(n_components=3, random_state=42)

# X_tsne = tsne.fit_transform(X)
# X_tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])
# X = pd.DataFrame(X, columns=train_df.drop(['position', 'SEASON_ID', 'TEAM_ID', 'GS', 'GP', 'MIN'], axis=1).columns)
# X = pd.concat([X, X_tsne_df], axis=1)

# test_df_tsne = tsne.fit_transform(test_df)
# test_df_tsne_df = pd.DataFrame(test_df_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])
# test_df = pd.DataFrame(test_df, columns=test_df.columns)
# test_df = pd.concat([test_df, test_df_tsne_df], axis=1)

# standardize data
# scaler = StandardScaler()
# X = scaler.fit_transform(X)

# minmax scale data
# scaler = MinMaxScaler()
# X = scaler.fit_transform(X)

# # encode target
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)

# # split data into training and validation sets
# X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, test_size=0.1, random_state=42)

# # Model
# model = KNeighborsClassifier(n_neighbors=12, weights='distance', algorithm='auto')
# model = model.fit(X_train, y_train)
# # f1 score
# y_pred = model.predict(X_val)
# f1 = f1_score(y_val, y_pred, average='weighted')
# print(f"f1 score: {f1}")

Base f1 score: 0.5943354349756315
removed column: PLAYER_AGE, f1 score: 0.5819354269210621
affect: 0.012400008054569311
removed column: FGM, f1 score: 0.5939056065448889
affect: 0.0004298284307425737
removed column: FGA, f1 score: 0.5915727799604601
affect: 0.002762655015171367
removed column: FG_PCT, f1 score: 0.5849631731843324
affect: 0.00937226179129902
removed column: FG3M, f1 score: 0.5873370253930995
affect: 0.00699840958253195
removed column: FG3A, f1 score: 0.5908894068164683
affect: 0.0034460281591631725
removed column: FG3_PCT, f1 score: 0.5844142002177929
affect: 0.00992123475783857
removed column: FTM, f1 score: 0.5887627604379239
affect: 0.0055726745377076
removed column: FTA, f1 score: 0.5940233015103851
affect: 0.0003121334652463892
removed column: FT_PCT, f1 score: 0.585056624460217
affect: 0.00927881051541446
removed column: OREB, f1 score: 0.5834252194310912
affect: 0.010910215544540303
removed column: DREB, f1 score: 0.5865288800265492
affect: 0.007806554949082223
r

# OUTPUT
Base f1 score: 0.5943354349756315
removed column: PLAYER_AGE, f1 score: 0.5819354269210621
affect: 0.012400008054569311
removed column: FGM, f1 score: 0.5939056065448889
affect: 0.0004298284307425737
removed column: FGA, f1 score: 0.5915727799604601
affect: 0.002762655015171367
removed column: FG_PCT, f1 score: 0.5849631731843324
affect: 0.00937226179129902
removed column: FG3M, f1 score: 0.5873370253930995
affect: 0.00699840958253195
removed column: FG3A, f1 score: 0.5908894068164683
affect: 0.0034460281591631725
removed column: FG3_PCT, f1 score: 0.5844142002177929
affect: 0.00992123475783857
removed column: FTM, f1 score: 0.5887627604379239
affect: 0.0055726745377076
removed column: FTA, f1 score: 0.5940233015103851
affect: 0.0003121334652463892
removed column: FT_PCT, f1 score: 0.585056624460217
affect: 0.00927881051541446
removed column: OREB, f1 score: 0.5834252194310912
affect: 0.010910215544540303
removed column: DREB, f1 score: 0.5865288800265492
affect: 0.007806554949082223
removed column: REB, f1 score: 0.587928044013725
affect: 0.006407390961906456
removed column: AST, f1 score: 0.5738186943130961
affect: 0.020516740662535393
removed column: STL, f1 score: 0.5936371253841272
affect: 0.0006983095915042981
removed column: BLK, f1 score: 0.5836534348110107
affect: 0.01068200016462073
removed column: TOV, f1 score: 0.5924477384675677
affect: 0.001887696508063752
removed column: PF, f1 score: 0.5913229130347208
affect: 0.0030125219409106174
removed column: PTS, f1 score: 0.5943354349756315
affect: 0.0
FTM, FTA, STL, PTS -> let's remove!

In [3]:
X_test = test_df
# X_test = scaler.transform(X_test)
y_test = model.predict(X_test)
y_test = label_encoder.inverse_transform(y_test)

result_df = pd.DataFrame({'ID': np.arange(1, len(y_test) + 1), 'position': y_test})
result_df.to_csv('result.csv', index=False)