# Setup
You can use any models or methods you have learned in class (Decision Tree, Ensembles, SVM, etc.) *except kNN   
In the regression task, MSE


## Versions
> 3.10.6 (tags/v3.10.6:9c7b4bd, Aug  1 2022, 21:53:49) [MSC v.1932 64 bit (AMD64)]   
> numpy: 2.1.0   
> pandas: 2.2.2   
> matplotlib.pyplot: 3.9.2   
> sklearn: 1.5.2   
> scipy: 1.14.1   
> seaborn: 0.13.2   
> xgboost: 2.1.2   


## Import modules

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from sklearn.preprocessing import StandardScaler, LabelEncoder, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from xgboost import XGBClassifier,XGBRFClassifier, XGBRegressor, XGBRFRegressor
from sklearn.feature_selection import RFE
from sklearn.svm import SVC
from importlib import reload
from pactools.grid_search import GridSearchCVProgressBar
import sys

## Settings

In [43]:
from settings import *
reload(sys.modules['settings'])
from settings import *
reload(sys.modules['settings'])

  WORKSPACE = "D:\Workspace\Dataanalysis_homework_5\Regression"


<module 'settings' from 'd:\\Workspace\\Dataanalysis_homework_5\\Regression\\settings.py'>

## Output log

In [44]:
if LOG:
    log_file = open(LOG_FILE, 'w')

  log_file = open(LOG_FILE, 'w')


# Data Preprocessing

In [45]:
training_data = pd.read_csv(DATA_PATH + "\\train.csv")
testing_data = pd.read_csv(DATA_PATH + "\\test.csv")

# Drop unnecessary columns
drop_col = []
for col in training_data.columns:
    if col not in testing_data.columns:
        drop_col.append(col)
drop_col.remove("MIN")
training_data.drop(columns=drop_col, inplace=True)
training_data.drop(columns=["SEASON_ID", "TEAM_ID"], inplace=True)
testing_data.drop(columns=["ID", "SEASON_ID", "TEAM_ID"], inplace=True)

# Process missing values
if METHOD_PROCESSING_MISSING_VALUES == "drop":
    training_data.dropna(inplace=True)
elif METHOD_PROCESSING_MISSING_VALUES == "mean":
    training_data.fillna(training_data.mean(), inplace=True)
elif METHOD_PROCESSING_MISSING_VALUES == "med":
    training_data.fillna(training_data.median(), inplace=True)
elif METHOD_PROCESSING_MISSING_VALUES == "mode":
    training_data.fillna(training_data.mode(), inplace=True)
else:
    raise("Invalid method for processing missing values")

# Divide data into X and y
X = training_data.drop(columns=["MIN"])
y = training_data["MIN"]

# Feature scaling
if SCALER == None:
    pass
elif SCALER == "std":
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    testing_data = pd.DataFrame(scaler.transform(testing_data), columns=testing_data.columns)
elif SCALER == "minmax":
    scaler = MinMaxScaler()
    X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns)
    testing_data = pd.DataFrame(scaler.transform(testing_data), columns=testing_data.columns)
else:
    raise("Invalid scaler")

if LOG:
    try:
        log_file.write("Scaler: " + str(scaler) + "\n")
    except IOError:
        print("logger is not working")

# TSNE or PCA
if ADD_TSNE[0] == True:
    tsne = TSNE(n_components=ADD_TSNE[1])
    X_tsne = tsne.fit_transform(X)
    X = pd.concat([X, pd.DataFrame(X_tsne)], axis=1)
if ADD_PCA[0] == True:
    pca = PCA(n_components=ADD_PCA[1])
    X_pca = pca.fit_transform(X)
    X = pd.concat([X, pd.DataFrame(X_pca)], axis=1)

# Info. of data
print(drop_col)
print(len(X.columns))
training_data

# logging
if LOG:
    try: 
        log_file.write("Number of columns: " + str(len(X.columns)) + "\n")
        log_file.write("Number of rows: " + str(len(X)) + "\n")
        log_file.write("Columns: ")
        for col in X.columns:
            log_file.write(str(col) + " ")
        log_file.write("\n")
    except IOError:
        print("logger is not working")

['position']
19


### Feature Selection

In [46]:
# Remove unrelated feature using RFE.
if 0 > NUM_SELECTED_FEATURES or NUM_SELECTED_FEATURES > len(X.columns):
    raise("Invalid number of selected features")
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=NUM_SELECTED_FEATURES)
rfe.fit(X, y)

print("Selected features: ", end="")
for i in range(X.shape[1]):
    if rfe.support_[i]:
        print(X.columns[i], end=", ")
print()

X_selected = rfe.transform(X)
# testing_data_selected = rfe.transform(testing_data)
testing_data_selected = testing_data[X.columns[rfe.support_]]

# Logging
if LOG:
    try:
        log_file.write("Number of selected features: " + str(NUM_SELECTED_FEATURES) + "\n")
        log_file.write("Selected features: ")
        for i in range(X.shape[1]):
            if rfe.support_[i]:
                log_file.write(X.columns[i] + ", ")
        log_file.write("\n")
    except IOError:
        print("logger is not working")

# Selected features: PLAYER_AGE, FGM, FGA, FG_PCT, FTM, FTA, FT_PCT, OREB, REB, AST, STL, TOV, PF, PTS, 

Selected features: PLAYER_AGE, FGA, FG_PCT, FTM, FTA, FT_PCT, OREB, REB, AST, STL, BLK, TOV, PF, PTS, 


### Validation

In [47]:
kf = KFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)
skf = StratifiedKFold(n_splits=NUM_FOLDS, shuffle=True, random_state=42)

# Model Selection

In [48]:
model = None
params = None
if METHOD_MODEL == "xgb":
    model = XGBRegressor()
    params = XGBOOST_PARAMS
elif METHOD_MODEL == "rf":
    model = RandomForestRegressor()
    params = RANDOM_FOREST_PARAMS
elif METHOD_MODEL == "svc":
    model = SVC()
    params = SVC_PARAMS
elif METHOD_MODEL == 'xgbrf':
    model = XGBRFRegressor()
    params = XGBOOST_PARAMS
else:
    raise("Invalid model")

if LOG:
    try:
        log_file.write("Model: " + METHOD_MODEL + "\n")
        log_file.write("Validation: " + METHOD_VALIDATION + "\n")
        log_file.write("Search: " + METHOD_SEARCH + "\n")
        log_file.write("Parameters: " + str(params) + "\n")
    except IOError:
        print("logger is not working")

if METHOD_VALIDATION == "kf":
    if METHOD_SEARCH == "grid":
        grid = GridSearchCVProgressBar(estimator=model, param_grid=params, cv=kf, n_jobs=-1, verbose=10, scoring="neg_mean_squared_error")
    elif METHOD_SEARCH == "random":
        grid = RandomizedSearchCV(estimator=model, param_distributions=params, cv=kf, n_jobs=-1, verbose=10, scoring="neg_mean_squared_error")
    else:
        raise("Invalid method for search")
    grid.fit(X_selected, y)
    print(grid.best_params_)
    print(grid.best_score_)
    if LOG:
        try:
            log_file.write("Best parameters: " + str(grid.best_params_) + "\n")
            log_file.write("Best score: " + str(grid.best_score_) + "\n")
        except IOError:
            print("logger is not working")


elif METHOD_VALIDATION == "skf":
    if METHOD_SEARCH == "grid":
        grid = GridSearchCV(estimator=model, param_grid=params, cv=skf, n_jobs=-1, verbose=10, scoring="neg_mean_squared_error")
    elif METHOD_SEARCH == "random":
        grid = RandomizedSearchCV(estimator=model, param_distributions=params, cv=skf, n_jobs=-1, verbose=10, scoring="neg_mean_squared_error")
    else:
        raise("Invalid method for search")
    grid.fit(X_selected, y)
    print(grid.best_params_)
    print(grid.best_score_)
    if LOG:
        try:
            log_file.write("Best parameters: " + str(grid.best_params_) + "\n")
            log_file.write("Best score: " + str(grid.best_score_) + "\n")
        except IOError:
            print("logger is not working")

elif METHOD_VALIDATION == None:
    X_train, X_val, y_train, y_val = train_test_split(X_selected, y, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    # MSE
    print(np.sqrt(np.mean((y_pred - y_val) ** 2)))
    if LOG:
        try:
            log_file.write("MSE: " + str(np.sqrt(np.mean((y_pred - y_val) ** 2))) + "\n")
        except IOError:
            print("logger is not working")


else:
    raise("Invalid method for validation")

best_model = grid.best_estimator_


Fitting 5 folds for each of 10 candidates, totalling 50 fits




{'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
-30753.01228983947


# Making Submissions

In [49]:
model = best_model
y_pred = model.predict(testing_data_selected)
submission_df = pd.DataFrame()
temp = pd.read_csv(DATA_PATH + "\\test.csv")
submission_df["ID"] = temp["ID"]
submission_df["MIN"] = y_pred
print(submission_df)
submission_df.to_csv("submission.csv", index=False)



        ID        MIN
0        1    42.2725
1        2  2355.9725
2        3  1646.2925
3        4   451.5825
4        5   129.4050
...    ...        ...
1995  1996  2525.1150
1996  1997  1118.0350
1997  1998  1007.0100
1998  1999  1089.7775
1999  2000   381.6375

[2000 rows x 2 columns]



---
### Close log.

In [50]:
try:
    log_file.close()
except IOError:
    print("logger is not working")