### Logistics summary
You can use any model or method covered in class (Decision Tree, Ensembles, SVM, etc.) **except kNN**.  
For the classification task, the evaluation metric is the weighted F1 score.

### Versions
> python: 3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]   
> numpy: 2.1.0   
> pandas: 2.2.2   
> matplotlib.pyplot: 3.9.2   
> sklearn: 1.5.2   
> scipy: 1.14.1   
> seaborn: 0.13.2   
> xgboost: 2.1.2   


### Import modules

In [162]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

### Settings

In [163]:
from importlib import reload
import sys

# Reload BEFORE the star-import. `from settings import *` copies the module's
# current attribute values into this namespace, and a later reload() does not
# rebind those copies, so reloading afterwards would leave stale values here
# until the cell was run a second time.
if 'settings' in sys.modules:
    reload(sys.modules['settings'])
from settings import *

<module 'settings' from 'd:\\Workspace\\AI_homework_5\\Classification\\settings.py'>

### Output log

In [164]:
# Open the run log for writing (truncating any previous content), but only
# when logging is switched on in the settings.
if LOG:
    log_file = open(file=LOG_FILE, mode='w')

### Import CSV data, Preprocessing #1

In [165]:
# Load the raw train/test tables. Path joins with the platform's separator, so
# the cell is not tied to Windows-style "\\" paths.
data_dir = Path(DATA_PATH)
training_data = pd.read_csv(data_dir / "train.csv")
testing_data = pd.read_csv(data_dir / "test.csv")

# Label encoding: replace the "position" labels with integer codes. The fitted
# encoder is kept so predictions can be mapped back to the original labels.
label_encoder = LabelEncoder()
training_data["position"] = label_encoder.fit_transform(training_data["position"])

# Drop unnecessary columns: anything absent from the test set (other than the
# target itself) cannot be used at prediction time.
drop_col = [
    col for col in training_data.columns
    if col not in testing_data.columns and col != "position"
]
training_data = training_data.drop(columns=drop_col)
training_data = training_data.drop(columns=["SEASON_ID", "TEAM_ID"])
testing_data = testing_data.drop(columns=["ID", "SEASON_ID", "TEAM_ID"])

# Process missing values. The fill statistics are always computed on the
# training data.
if METHOD_PROCESSING_MISSING_VALUES == "drop":
    fill_values = None
elif METHOD_PROCESSING_MISSING_VALUES == "mean":
    fill_values = training_data.mean()
elif METHOD_PROCESSING_MISSING_VALUES == "med":
    fill_values = training_data.median()
elif METHOD_PROCESSING_MISSING_VALUES == "mode":
    # mode() returns a DataFrame (one row per tied mode). Passing that frame to
    # fillna aligns on the row index and only touches the first row(s), so take
    # the first mode of each column as a Series instead.
    fill_values = training_data.mode().iloc[0]
else:
    # Raising a plain string is itself a TypeError; raise a real exception.
    raise ValueError("Invalid method for processing missing values")

if fill_values is None:
    # Rows can only be dropped from the training set; test rows are left as-is.
    training_data = training_data.dropna()
else:
    training_data = training_data.fillna(fill_values)
    # Impute the test set with the same training statistics so both sets go
    # through identical preprocessing (keys not present in the test set, such
    # as "position", are ignored by fillna).
    testing_data = testing_data.fillna(fill_values)

# Divide data into X and y
X = training_data.drop(columns=["position"])
y = training_data["position"]

# TSNE or PCA: optionally append embedding components as extra features.
# The component frames reuse X's index (so concat stays row-aligned even after
# dropna left gaps in the index) and get unique string column names (integer
# labels would collide between TSNE and PCA and mix label types).
# NOTE(review): testing_data does not receive these extra columns here, so it
# will not match X's columns when either flag is on; TSNE has no transform(),
# so confirm how the test set is meant to be embedded.
if ADD_TSNE[0]:
    tsne = TSNE(n_components=ADD_TSNE[1])
    X_tsne = tsne.fit_transform(X)
    tsne_frame = pd.DataFrame(
        X_tsne,
        index=X.index,
        columns=[f"TSNE_{i}" for i in range(X_tsne.shape[1])],
    )
    X = pd.concat([X, tsne_frame], axis=1)
if ADD_PCA[0]:
    pca = PCA(n_components=ADD_PCA[1])
    X_pca = pca.fit_transform(X)
    pca_frame = pd.DataFrame(
        X_pca,
        index=X.index,
        columns=[f"PCA_{i}" for i in range(X_pca.shape[1])],
    )
    X = pd.concat([X, pca_frame], axis=1)

# Info. of data
print(drop_col)
print(len(X.columns))

# Record the shape and column names of the feature matrix in the run log.
if LOG:
    try:
        log_file.write("Number of columns: " + str(len(X.columns)) + "\n")
        log_file.write("Number of rows: " + str(len(X)) + "\n")
        # str() keeps this working when a column label is not a string
        # (e.g. integer labels); `col + " "` would raise TypeError there.
        log_file.write("Columns: " + "".join(str(col) + " " for col in X.columns) + "\n")
    except IOError:
        print("logger is not working")

['GP', 'GS', 'MIN']
19


### EDA (Heatmap)

In [166]:
## Output image was saved as "Corr_heatmap.png" in WORKSPACE folder.

# plt.figure(figsize=(15,15))
# figure = sns.heatmap(training_data.corr(), annot=True, fmt=".2f")

# plt.show()

### Feature Selection

In [167]:
# RFE needs a positive feature count that does not exceed the number of
# available columns (0 is rejected by RFE itself, so refuse it up front).
if not 1 <= NUM_SELECTED_FEATURES <= len(X.columns):
    # Raising a plain string is itself a TypeError; raise a real exception.
    raise ValueError("Invalid number of selected features")

# Recursive feature elimination driven by a decision tree. The tree is seeded
# so the selected feature set is the same on every run.
rfe = RFE(
    estimator=DecisionTreeClassifier(random_state=42),
    n_features_to_select=NUM_SELECTED_FEATURES,
)
rfe.fit(X, y)

# rfe.support_ is a boolean mask over X's columns marking the kept features.
selected_features = X.columns[rfe.support_]
print("Selected features: " + "".join(f"{name}, " for name in selected_features))

# Apply the same column selection to the training features and the test set.
X_selected = rfe.transform(X)
testing_data_selected = rfe.transform(testing_data)

# Write the requested feature count and the names RFE kept to the run log.
if LOG:
    try:
        log_file.write("Number of selected features: " + str(NUM_SELECTED_FEATURES) + "\n")
        kept_names = [name for name, keep in zip(X.columns, rfe.support_) if keep]
        log_file.write("Selected features: ")
        for name in kept_names:
            log_file.write(name + ", ")
        log_file.write("\n")
    except IOError:
        print("logger is not working")


Selected features: PLAYER_AGE, FGA, FG_PCT, FG3A, FTM, FT_PCT, OREB, DREB, REB, AST, STL, BLK, TOV, PF, 


### Model Selection

In [168]:
# Hold out 20% of the selected-feature training data for validation.
X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)

#MODEL_TYPE = "dt" # "dt", "en-rf", "en-xgbst", "svm"

# Map each supported MODEL_TYPE key to its estimator class; only the chosen
# one is instantiated.
model_classes = {
    "dt": DecisionTreeClassifier,
    "en-rf": RandomForestClassifier,
    "en-xgbst": XGBClassifier,
    "svm": SVC,
}
model_class = model_classes.get(MODEL_TYPE)
if model_class is None:
    # Raising a plain string is itself a TypeError; raise a real exception.
    raise ValueError("Invalid model type")
model = model_class()

print(type(model))
model.fit(X_train, y_train)

y_pred = model.predict(X_val)

# Compute the metrics once and reuse them for both the console and the log.
# zero_division=0 reports the same 0.00 as the default for classes that are
# never predicted, without emitting an UndefinedMetricWarning on every call.
val_f1 = f1_score(y_val, y_pred, average="weighted")
val_report = classification_report(y_val, y_pred, zero_division=0)
print(val_f1)
print(val_report)

# Baseline of Decision Tree: 0.50~0.52
# Baseline of Random Forest: 0.59

if LOG:
    try:
        log_file.write("Model: " + str(type(model)) + "\n")
        log_file.write("F1 score: " + str(val_f1) + "\n")
        log_file.write("Classification report: \n" + val_report + "\n")
    except IOError:
        print("logger is not working")

<class 'xgboost.sklearn.XGBClassifier'>
0.6158843005912215
              precision    recall  f1-score   support

           0       0.58      0.66      0.62       308
           1       0.43      0.10      0.17        98
           2       0.57      0.73      0.64       748
           3       0.23      0.07      0.11       134
           4       0.36      0.07      0.12        55
           5       0.80      0.87      0.83       884
           6       0.34      0.11      0.17       149
           7       0.00      0.00      0.00        15

    accuracy                           0.65      2391
   macro avg       0.42      0.33      0.33      2391
weighted avg       0.61      0.65      0.62      2391



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Train the final model with the selected model and hyperparameters

In [173]:
def for_submission(output_path="submission.csv"):
    """Fit an XGBClassifier on all selected training data and write the submission file.

    The model is trained on the full ``X_selected`` / ``y`` (no validation
    hold-out), predicts ``testing_data_selected``, and the integer predictions
    are mapped back to the original position labels with ``label_encoder``.
    The resulting table (``ID``, ``position``) is printed and saved as CSV.

    Args:
        output_path: Destination of the CSV file. Defaults to
            ``"submission.csv"`` in the current working directory.
    """
    model = XGBClassifier()
    model.fit(X_selected, y)

    y_pred = model.predict(testing_data_selected)

    # The ID column was dropped from testing_data during preprocessing, so read
    # just that column back from the raw file. Path joins with the platform's
    # separator instead of a hard-coded Windows "\\".
    test_ids = pd.read_csv(Path(DATA_PATH) / "test.csv", usecols=["ID"])["ID"]

    submission_df = pd.DataFrame({
        "ID": test_ids,
        "position": label_encoder.inverse_transform(y_pred),
    })

    print(submission_df)
    submission_df.to_csv(output_path, index=False)

for_submission()


        ID        position
0        1         Forward
1        2           Guard
2        3           Guard
3        4          Center
4        5          Center
...    ...             ...
1995  1996           Guard
1996  1997           Guard
1997  1998         Forward
1998  1999  Forward-Center
1999  2000           Guard

[2000 rows x 2 columns]



---
### Close log.

In [170]:
# log_file is only created when LOG is enabled; without this guard the cell
# raises an uncaught NameError whenever logging is turned off.
if LOG:
    try:
        log_file.close()
    except IOError:
        print("logger is not working")