In the regression task, your goal is to predict the minutes played in a year. (Column “MIN” in csv file)

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import stats
from sklearn.manifold import TSNE

In [12]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Drop the 'id' column
train_df = train_df.drop(columns=['SEASON_ID', 'TEAM_ID', 'position'])
test_df = test_df.drop(columns=['SEASON_ID', 'TEAM_ID', 'ID'])

# Delete rows with missing values
train_df = train_df.dropna()

z_scores = np.abs(stats.zscore(train_df))
train_df = train_df[(z_scores < 3).all(axis=1)]

# Split the data into features and target
X = train_df.drop(columns='MIN')
y = train_df['MIN']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [13]:
# kNN Regression
knn_reg = KNeighborsRegressor()

param_grid = {
    'n_neighbors': np.arange(1, 21),
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute']
}

knn_reg_cv = RandomizedSearchCV(knn_reg, param_grid, cv=skf, n_iter=100, n_jobs=-1, random_state=42, scoring='neg_mean_squared_error')
knn_reg_cv.fit(X_train, y_train)
knn_reg = knn_reg_cv.best_estimator_
print('kNN Regression')
print('Best parameters:', knn_reg_cv.best_params_)
print('Best score:', knn_reg_cv.best_score_)
print('Test score:', knn_reg_cv.score(X_test, y_test))





kNN Regression
Best parameters: {'weights': 'distance', 'n_neighbors': np.int64(18), 'algorithm': 'kd_tree'}
Best score: -26381.890391991677
Test score: -25212.11420564825


In [14]:
# test the model with test_df
# save csv ['INDEX', 'MIN']

X_test = test_df
X_test = scaler.transform(X_test)
y_test = knn_reg.predict(X_test)

result_df = pd.DataFrame({'ID': np.arange(1, len(y_test) + 1), 'MIN': y_test})
result_df.to_csv('result.csv', index=False)
