In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import stats
from sklearn.manifold import TSNE
from sklearn.impute import KNNImputer

In [11]:
def encode_categorical(X, y):
    """Tune a KNN regressor on ``(X, y)`` and return its hold-out MSE.

    Despite its name, this function performs no categorical encoding and no
    feature scaling: it only splits the data, grid-searches a
    ``KNeighborsRegressor`` and scores the winning model.

    Steps:
      1. Hold out 10% of the rows as a validation set (fixed seed 42).
      2. Grid-search ``n_neighbors``, ``weights`` and ``algorithm`` with
         5-fold shuffled cross-validation, scored by negative MSE.
      3. Print the best parameters and the best cross-validation score.
      4. Predict the validation set with the refitted best model, print the
         mean squared error and return it.

    Parameters:
        X: feature table passed straight to ``train_test_split``.
        y: regression target aligned with ``X``.

    Returns:
        The mean squared error of the best model on the validation split.
    """
    # Fixed-seed 90/10 split so that repeated calls are comparable.
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    # Shuffled 5-fold CV used inside the hyper-parameter search.
    folds = KFold(n_splits=5, shuffle=True, random_state=42)

    # Odd neighbour counts from 3 to 25, both weighting schemes and every
    # neighbour-search backend.
    search_space = {
        'n_neighbors': list(range(3, 27, 2)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    }

    grid_search = GridSearchCV(
        KNeighborsRegressor(),
        search_space,
        cv=folds,
        scoring='neg_mean_squared_error',
        n_jobs=-1,
    )
    grid_search.fit(X_fit, y_fit)
    print(grid_search.best_params_)
    # Negated MSE: values closer to zero are better.
    print(grid_search.best_score_)

    # Score the refitted best estimator on the untouched validation rows.
    residuals = y_holdout - grid_search.predict(X_holdout)
    loss = np.mean(residuals ** 2)
    print(f"loss: {loss}")
    return loss




# !!!!!!!!!!!!!!!
# model = KNeighborsRegressor(n_neighbors=9, weights='distance', algorithm='auto')
# model.fit(X_train, y_train)
# # MSE loss
# y_pred = model.predict(X_val)
# loss = np.mean((y_val - y_pred) ** 2)
# print(f"loss: {loss}")
# !!!!!!!!!!!!!!! 25382


In [12]:
# Load the raw training and test tables.
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Discard the columns that are not used as model inputs.
train_df = train_df.drop(columns=['SEASON_ID', 'TEAM_ID', 'position'])
test_df = test_df.drop(columns=['SEASON_ID', 'TEAM_ID', 'ID'])

# Keep only the training rows that have no missing values.
train_df = train_df.dropna()

# Disabled experiment: z-score outlier removal.
# z_scores = np.abs(stats.zscore(train_df))
# train_df = train_df[(z_scores < 3).all(axis=1)]

# Disabled experiment: engineered features.
# train_df['ShootingEfficiency'] = train_df['FG_PCT'] * train_df['FG3_PCT']
# test_df['ShootingEfficiency'] = test_df['FG_PCT'] * test_df['FG3_PCT']
# train_df['TotalAttempts'] = train_df['FGA'] + train_df['FG3A'] + train_df['FTA']
# test_df['TotalAttempts'] = test_df['FGA'] + test_df['FG3A'] + test_df['FTA']

# 'MIN' is the regression target; every remaining column is a feature.
y = train_df['MIN']
X = train_df.drop(columns=['MIN'])

# Columns to ablate one at a time.
column_name = [
    'PLAYER_AGE',
    'FGM', 'FGA', 'FG_PCT',
    'FG3M', 'FG3A', 'FG3_PCT',
    'FTM', 'FTA', 'FT_PCT',
    'OREB', 'DREB', 'REB',
    'AST', 'STL', 'BLK', 'TOV', 'PF', 'PTS',
]

# Baseline validation loss with every feature present.
loss = encode_categorical(X, y)

# Leave-one-feature-out study: drop each column in turn, re-tune the model
# and print the baseline loss minus the reduced-feature loss (a negative
# value means the loss went up once the column was removed).
removed_X_set = []
for dropped_column in column_name:
    removed_X = X.drop(columns=dropped_column)
    removed_X_set.append(removed_X)
    print(f"removed column: {dropped_column}")
    rloss = encode_categorical(removed_X, y)

    print(f"affect: {loss - rloss}")

# # TSNE
# tsne = TSNE(n_components=3, random_state=42)

# X_tsne = tsne.fit_transform(X)
# X_tsne_df = pd.DataFrame(X_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])
# X = pd.concat([X.reset_index(drop=True), X_tsne_df], axis=1)

# test_df_tsne = tsne.fit_transform(test_df)
# test_df_tsne_df = pd.DataFrame(test_df_tsne, columns=['TSNE1', 'TSNE2', 'TSNE3'])
# test_df = pd.concat([test_df.reset_index(drop=True), test_df_tsne_df], axis=1)

{'algorithm': 'auto', 'n_neighbors': 13, 'weights': 'distance'}
-28868.932323744364
loss: 25704.258039678738
removed column: PLAYER_AGE
{'algorithm': 'auto', 'n_neighbors': 13, 'weights': 'distance'}
-28870.0409577437
loss: 25710.57093568315
affect: -6.312896004412323
removed column: FGM
{'algorithm': 'auto', 'n_neighbors': 17, 'weights': 'distance'}
-28626.98837264187
loss: 25716.2958435171
affect: -12.03780383836056
removed column: FGA
{'algorithm': 'auto', 'n_neighbors': 17, 'weights': 'distance'}
-29357.875703493824
loss: 27431.124150188607
affect: -1726.866110509869
removed column: FG_PCT
{'algorithm': 'ball_tree', 'n_neighbors': 13, 'weights': 'distance'}
-28868.95622711868
loss: 25704.32172810254
affect: -0.06368842380106798
removed column: FG3M
{'algorithm': 'auto', 'n_neighbors': 17, 'weights': 'distance'}
-28752.983726404782
loss: 26164.396557934182
affect: -460.13851825544407
removed column: FG3A
{'algorithm': 'auto', 'n_neighbors': 19, 'weights': 'distance'}
-28513.78271899

In [46]:
# Tune and fit the final model at notebook scope. The `grid_search` object
# created inside encode_categorical() is local to that function, so relying on
# it here raises NameError after a kernel restart. The search below uses the
# same grid and CV scheme, but is fitted on the full training set (X, y);
# GridSearchCV refits the best configuration on all rows by default.
grid_search = GridSearchCV(
    KNeighborsRegressor(),
    {
        'n_neighbors': list(range(3, 27, 2)),
        'weights': ['uniform', 'distance'],
        'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    },
    cv=KFold(n_splits=5, shuffle=True, random_state=42),
    scoring='neg_mean_squared_error',
    n_jobs=-1,
)
grid_search.fit(X, y)

# Predict the target for every row of the prepared test table.
X_test = test_df
y_test = grid_search.predict(X_test)

# Write the submission file with 1-based sequential IDs.
# NOTE(review): the model is trained on the 'MIN' target, yet the predictions
# are stored under a 'position' header -- confirm the expected column name.
result_df = pd.DataFrame({'ID': np.arange(1, len(y_test) + 1), 'position': y_test})
result_df.to_csv('result.csv', index=False)