# Setup
You can use any models or methods you have learned in class (Decision Tree, Ensembles, SVM, etc.) *except kNN   
In the classification task, we will use Weighted F1 score for the evaluation metric.


## Versions
> 3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]   
> numpy: 2.1.0   
> pandas: 2.2.2   
> matplotlib.pyplot: 3.9.2   
> sklearn: 1.5.2   
> scipy: 1.14.1   
> seaborn: 0.13.2   
> xgboost: 2.1.2   


### Import modules

In [37]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from importlib import reload
import sys

### Settings

In [38]:
reload(sys.modules['settings'])
from settings import *

### Output log

In [39]:
if LOG:
    log_file = open(LOG_FILE, 'w')

### Import CSV data, Preprocessing #1

In [40]:
training_data = pd.read_csv(DATA_PATH + "\\train.csv")
testing_data = pd.read_csv(DATA_PATH + "\\test.csv")

# Label encoding
label_encoder = LabelEncoder()
training_data["position"] = label_encoder.fit_transform(training_data["position"])

# Drop unnecessary columns
drop_col = []
for col in training_data.columns:
    if col not in testing_data.columns:
        drop_col.append(col)
drop_col.remove("position")
training_data.drop(columns=drop_col, inplace=True)
training_data.drop(columns=["SEASON_ID", "TEAM_ID"], inplace=True)
testing_data.drop(columns=["ID", "SEASON_ID", "TEAM_ID"], inplace=True)

# Process missing values
if METHOD_PROCESSING_MISSING_VALUES == "drop":
    training_data.dropna(inplace=True)
elif METHOD_PROCESSING_MISSING_VALUES == "mean":
    training_data.fillna(training_data.mean(), inplace=True)
elif METHOD_PROCESSING_MISSING_VALUES == "med":
    training_data.fillna(training_data.median(), inplace=True)
elif METHOD_PROCESSING_MISSING_VALUES == "mode":
    training_data.fillna(training_data.mode(), inplace=True)
else:
    raise("Invalid method for processing missing values")

# Divide data into X and y
X = training_data.drop(columns=["position"])
y = training_data["position"]

# Feature scaling
if SCALER == None:
    pass
elif SCALER == "std":
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    testing_data = pd.DataFrame(scaler.transform(testing_data), columns=testing_data.columns)
elif SCALER == "minmax":
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    testing_data = pd.DataFrame(scaler.transform(testing_data), columns=testing_data.columns)
else:
    raise("Invalid scaler")

# TSNE or PCA
if ADD_TSNE[0] == True:
    tsne = TSNE(n_components=ADD_TSNE[1])
    X_tsne = tsne.fit_transform(X)
    X = pd.concat([X, pd.DataFrame(X_tsne)], axis=1)
if ADD_PCA[0] == True:
    pca = PCA(n_components=ADD_PCA[1])
    X_pca = pca.fit_transform(X)
    X = pd.concat([X, pd.DataFrame(X_pca)], axis=1)

# Info. of data
print(drop_col)
print(len(X.columns))
training_data

# logging
if LOG:
    try: 
        log_file.write("Number of columns: " + str(len(X.columns)) + "\n")
        log_file.write("Number of rows: " + str(len(X)) + "\n")
        log_file.write("Columns: ")
        for col in X.columns:
            log_file.write(col + " ")
        log_file.write("\n")
    except IOError:
        print("logger is not working")

['GP', 'GS', 'MIN']
19


### EDA (Heatmap)

In [41]:
## Output image was saved as "Corr_heatmap.png" in WORKSPACE folder.

# plt.figure(figsize=(15,15))
# figure = sns.heatmap(training_data.corr(), annot=True, fmt=".2f")

# plt.show()

### Feature Selection

In [42]:
# Remove unrelated feature using RFE.
if 0 > NUM_SELECTED_FEATURES or NUM_SELECTED_FEATURES > len(X.columns):
    raise("Invalid number of selected features")
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=NUM_SELECTED_FEATURES)
rfe.fit(X, y)

print("Selected features: ", end="")
for i in range(X.shape[1]):
    if rfe.support_[i]:
        print(X.columns[i], end=", ")
print()

X_selected = rfe.transform(X)
testing_data_selected = rfe.transform(testing_data)

# Logging
if LOG:
    try:
        log_file.write("Number of selected features: " + str(NUM_SELECTED_FEATURES) + "\n")
        log_file.write("Selected features: ")
        for i in range(X.shape[1]):
            if rfe.support_[i]:
                log_file.write(X.columns[i] + ", ")
        log_file.write("\n")
    except IOError:
        print("logger is not working")


Selected features: PLAYER_AGE, FGA, FG_PCT, FG3A, FTA, FT_PCT, OREB, DREB, REB, AST, STL, BLK, PF, PTS, 


### XGBClassifier

In [None]:
model = XGBClassifier()
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
# params = {
#     # eta, num_boost_around, min_child_weight, max_depth, gamma, sub_sample, colsample_bytree, lambda, alpha, scale_pos_weight
#     "max_depth": [3, 5, 7, 9, 11],
# }

params = {
    'min_child_weight': [1, 5, 10],
    'gamma': [0.5, 1, 1.5, 2, 5],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'max_depth': [3, 4, 5]
}
# Note: https://www.kaggle.com/code/tilii7/hyperparameter-grid-search-with-xgboost

if LOG:
    try:
        log_file.write("Model: " + str(type(model)) + "\n")
    except IOError:
        print("logger is not working")

# Validation (Not implemented)
if METHOD_VALIDATION == "kf":
    grid = GridSearchCV(estimator=model, param_grid=params, cv=kf, n_jobs=-1, verbose=1)
    grid.fit(X_selected, y)
    print(grid.best_params_)
    print(grid.best_score_)
    if LOG:
        try:
            log_file.write("Best parameters: " + str(grid.best_params_) + "\n")
            log_file.write("Best score: " + str(grid.best_score_) + "\n")
        except IOError:
            print("logger is not working")


elif METHOD_VALIDATION == "skf":
    grid = GridSearchCV(estimator=model, param_grid=params, cv=skf, n_jobs=-1, verbose=1)
    grid.fit(X_selected, y)
    print(grid.best_params_)
    print(grid.best_score_)
    if LOG:
        try:
            log_file.write("Best parameters: " + str(grid.best_params_) + "\n")
            log_file.write("Best score: " + str(grid.best_score_) + "\n")
        except IOError:
            print("logger is not working")


elif METHOD_VALIDATION == None:
    X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    print(f1_score(y_val, y_pred, average="weighted"))
    print(classification_report(y_val, y_pred))
    if LOG:
        try:
            log_file.write("F1 score: " + str(f1_score(y_val, y_pred, average="weighted")) + "\n")
            log_file.write("Classification report: \n" + classification_report(y_val, y_pred) + "\n")
        except IOError:
            print("logger is not working")


else:
    raise("Invalid method for validation")


print(type(model))



Fitting 5 folds for each of 405 candidates, totalling 2025 fits


2 fits failed out of a total of 2025.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
2 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\carot\AppData\Local\Programs\Python\Python310\lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\carot\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\core.py", line 726, in inner_f
    return func(**kwargs)
  File "c:\Users\carot\AppData\Local\Programs\Python\Python310\lib\site-packages\xgboost\sklearn.py", line 1512, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "c:\Users\carot\AppData\Local\Programs\Python\Python310\lib

{'colsample_bytree': 0.6, 'gamma': 1, 'max_depth': 3, 'min_child_weight': 5, 'subsample': 0.6}
0.6449134743432922
<class 'xgboost.sklearn.XGBClassifier'>


### Training new model with selected model, hyper parameters.

In [47]:
def for_submission():
    model = XGBClassifier(colsample_bytree=0.6, gamma=1, max_depth=3, min_child_weight=5, subsample=0.6)
    model.fit(X_selected, y)

    y_pred = model.predict(testing_data_selected)

    submission_df = pd.DataFrame()
    temp = pd.read_csv(DATA_PATH + "\\test.csv")
    submission_df["ID"] = temp["ID"]
    
    submission_df["position"] = label_encoder.inverse_transform(y_pred)

    print(submission_df)
    submission_df.to_csv("submission.csv", index=False)

for_submission()


        ID position
0        1  Forward
1        2    Guard
2        3    Guard
3        4   Center
4        5   Center
...    ...      ...
1995  1996    Guard
1996  1997    Guard
1997  1998  Forward
1998  1999   Center
1999  2000    Guard

[2000 rows x 2 columns]



---
### Close log.

In [45]:
try:
    log_file.close()
except IOError:
    print("logger is not working")