In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier,XGBRFClassifier, XGBRegressor, XGBRFRegressor
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from importlib import reload
from pactools.grid_search import GridSearchCVProgressBar
import sys

# --- Load ---------------------------------------------------------------------
# Directory that holds the competition CSVs read by this cell.
DATA_DIR = '2024-introduction-to-data-analysis-hw-5-1'

training_data = pd.read_csv(f'{DATA_DIR}/train.csv')
testing_data = pd.read_csv(f'{DATA_DIR}/test.csv')

# --- Clean the training frame -------------------------------------------------
# Built as a chain (no inplace=True) so re-running the cell always starts from
# the freshly loaded CSV and yields the same frame.
#   1. rows without a "position" label cannot be used for supervised training;
#   2. every remaining missing value is imputed with 0;
#   3. columns that are not used as features are removed.
training_data = (
    training_data
    .dropna(subset=["position"])
    .fillna(0)
    .drop(columns=["SEASON_ID", "TEAM_ID", 'GP', 'GS', 'MIN'])
)

# --- Encode the target --------------------------------------------------------
# The fitted encoder is kept so integer predictions can be mapped back to the
# original position labels with label_encoder.inverse_transform().
label_encoder = LabelEncoder()
training_data = training_data.assign(
    position=label_encoder.fit_transform(training_data["position"])
)

X = training_data.drop(columns=["position"])
y = training_data["position"]

# --- Prepare the test frame with the SAME treatment as the training frame -----
# Missing values are imputed with 0 exactly like the training features, so the
# model never sees NaNs at prediction time that it did not see during fitting.
testing_data = testing_data.drop(columns=["ID", "SEASON_ID", "TEAM_ID"]).fillna(0)

# The train and test CSVs are trimmed with different hard-coded column lists.
# Verify the test frame actually provides every training feature, then select
# them in training order: scikit-learn requires identical feature names in the
# identical order at predict time.
missing_features = X.columns.difference(testing_data.columns)
if not missing_features.empty:
    raise ValueError(
        f"test.csv is missing feature columns used for training: {list(missing_features)}"
    )
testing_data = testing_data[X.columns]

# Seed shared by the estimator and the search so that the sampled candidates,
# the fitted forest and therefore the final submission are reproducible.
SEED = 42

# Hyper-parameter distributions sampled by RandomizedSearchCV.
# The grid contains 4 * 5 * 3 * 3 * 2 = 360 combinations; n_iter below samples
# 100 of them.
params = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [10, 20, 30, 40, 50],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'bootstrap': [True, False]
}

# random_state on the forest fixes bootstrapping / feature sub-sampling;
# random_state on the search fixes which candidates are drawn from `params`.
model = RandomForestClassifier(random_state=SEED)
search = RandomizedSearchCV(
    model,
    params,
    n_iter=100,
    cv=5,          # 5-fold cross-validation per candidate
    verbose=1,
    n_jobs=-1,     # use every available core
    random_state=SEED,
)
search.fit(X, y)
print(search.best_params_)

# With the default refit=True the best candidate has already been re-fitted on
# all of (X, y), so it can be used for prediction directly.
model = search.best_estimator_

# The "ID" column was dropped from testing_data before prediction, so the test
# CSV is read again to recover the identifiers for the submission file.
temp = pd.read_csv('2024-introduction-to-data-analysis-hw-5-1/test.csv')

# Predict encoded classes, then map them back to the original position labels.
y_pred = model.predict(testing_data)

# Two-column submission: one predicted position per test ID.
submission_df = pd.DataFrame(
    {
        "ID": temp["ID"],
        "position": label_encoder.inverse_transform(y_pred),
    }
)
print(submission_df)
submission_df.to_csv("test_submission.csv", index=False)

Fitting 5 folds for each of 100 candidates, totalling 500 fits
{'n_estimators': 300, 'min_samples_split': 5, 'min_samples_leaf': 1, 'max_depth': 40, 'bootstrap': False}
        ID        position
0        1         Forward
1        2           Guard
2        3           Guard
3        4          Center
4        5          Center
...    ...             ...
1995  1996         Forward
1996  1997         Forward
1997  1998         Forward
1998  1999  Forward-Center
1999  2000           Guard

[2000 rows x 2 columns]
