In the regression task, your goal is to predict the minutes played in a year. (Column “MIN” in csv file)

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from scipy import stats
from sklearn.manifold import TSNE

In [None]:
train_df = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')

# Drop the 'id' column
train_df = train_df.drop(columns=['SEASON_ID', 'TEAM_ID', 'position'])
test_df = test_df.drop(columns=['SEASON_ID', 'TEAM_ID', 'ID'])

# Delete rows with missing values
train_df = train_df.dropna()

z_scores = np.abs(stats.zscore(train_df))
train_df = train_df[(z_scores < 3).all(axis=1)]

# Split the data into features and target
X = train_df.drop(columns='MIN')
y = train_df['MIN']

# Standardize the features
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [17]:
# Regression Tree
reg_tree = DecisionTreeRegressor(random_state=42)

# Grid search
param_grid = {
    'max_depth': np.arange(1, 20),
    'min_samples_split': np.arange(2, 10),
    'min_samples_leaf': np.arange(1, 10),
    'max_features': [None]
}
grid_search = GridSearchCV(reg_tree, param_grid, cv=skf, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)
print('Best parameters:', grid_search.best_params_)
print('Best score:', grid_search.best_score_)
reg_tree = grid_search.best_estimator_

# # Randomized search
# param_dist = {
#     'max_depth': np.arange(1, 100),
#     'min_samples_split': np.arange(2, 100),
#     'min_samples_leaf': np.arange(1, 100)
# }
# random_search = RandomizedSearchCV(
#     reg_tree, param_distributions=param_dist, 
#     n_iter=50, cv=skf, scoring='neg_mean_absolute_error', 
#     random_state=42, n_jobs=-1
# )
# random_search.fit(X_train, y_train)
# print('Best parameters:', random_search.best_params_)
# print('Best score:', random_search.best_score_)
# reg_tree = random_search.best_estimator_

# Cross-validation
scores = cross_val_score(reg_tree, X_train, y_train, cv=skf, scoring='neg_mean_squared_error')
print('Cross-validation scores:', scores)
print('Mean score:', scores.mean())


# Best parameters: {'max_depth': np.int64(13), 'max_features': None, 'min_samples_leaf': np.int64(16), 'min_samples_split': np.int64(2)}



Best parameters: {'max_depth': np.int64(8), 'max_features': None, 'min_samples_leaf': np.int64(9), 'min_samples_split': np.int64(2)}
Best score: -35876.48795709949




Cross-validation scores: [-32666.84851956 -37271.97861888 -35089.25050814 -37731.44218041
 -36622.91995851]
Mean score: -35876.48795709949


In [15]:
# test the model with test_df
# save csv ['INDEX', 'MIN']

X_test = test_df
X_test = scaler.transform(X_test)
y_test = reg_tree.predict(X_test)


result_df = pd.DataFrame({'ID': np.arange(1, len(y_test) + 1), 'MIN': y_test})
result_df.to_csv('result.csv', index=False)
