## README

- 各資料集的 **features** 數量不一，從 4 到 97 個都有
- Function: Select_best_model
  - add XGBClassifier = 0.819854
  ```python
  # = 0.785959
  # GridSearchCV Don't run it on Colab, it's too slow
  # RandomForestClassifier
  param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
  }
  ```
- 使用 KNeighborsClassifier
  - k = 1, auc = 0.0514096
  - k = 3, auc = 0.224457
  - k = 5, auc = 0.149482
  - k = 7, auc = 0.362344
  - k = 9, auc = 0.42899
  - k = 11, auc = 0.471535
  - k = 13, auc = 0.524804
  - k = 15, auc = 0.524758
  - k = 17, auc = 0.553534
  - k = 19, auc = 0.566106
  - 窮舉不知道是不是好方法：即使 public leaderboard 表現好，競賽最終的 private leaderboard 也可能不好
- 基於 RandomForestClassifier
  - 對於 Categorical features 的資料處理不好
  ```python
  # = 0.67557
  RandomForestClassifier(n_estimators=100, random_state=42)
  ```
  ```python
  # = 0.759853
  # GridSearchCV Don't run it on Colab, it's too slow
  param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
  }
  ```
  ```python
  # = 0.740953
  # GridSearchCV Don't run it on Colab, it's too slow
  param_grid = {
      'n_estimators': [100, 200, 300],
      'max_depth': [10, 20, 30, None],
      'min_samples_split': [2, 5, 10],
      'min_samples_leaf': [1, 2, 4],
      'max_features': ['sqrt', 'log2']
  }
  ```
  ```python
  # = 0.667257
  # GridSearchCV Don't run it on Colab, it's too slow
  param_grid = {
      'n_estimators': [100, 200, 300],
      'max_depth': [10, 20, 30, None],
      'min_samples_split': [2, 5, 10],
      'min_samples_leaf': [1, 2, 4],
      'max_features': ['sqrt', 'log2', None]
  }
  ```
- 使用 GradientBoostingClassifier
  - n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42, auc = 0.491197

In [None]:
!pip install scikit-learn pandas



In [None]:
# Google Colab only: mount Google Drive at /content/drive so the dataset
# folders stored on Drive can be read through the local file system.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.utils.class_weight import compute_class_weight

# Read All Dataset CSV

In [None]:
# Load X_train / y_train / X_test for every dataset folder found under
# drive_path. The three lists stay index-aligned with dataset_names.
dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# Google Drive location (Colab). For a local run, point this at
# "./Competition_data" instead.
drive_path = "./drive/MyDrive/Colab Notebooks/Competition_data"

for folder_name in os.listdir(drive_path):
  dataset_names.append(folder_name)
  dataset_dir = f"{drive_path}/{folder_name}"
  # Read the three CSV files of this dataset in a fixed order.
  for frames, csv_name in (
      (X_trains, "X_train.csv"),
      (y_trains, "y_train.csv"),
      (X_tests, "X_test.csv"),
  ):
    frames.append(pd.read_csv(f"{dataset_dir}/{csv_name}", header=0))

## Data Preprocessing & Feature Engineering

In [None]:
## your code here
def preprocess_data(X_train, X_test):
  """
  Standardize the float columns of a train/test feature pair.

  Columns whose dtype is float64 are treated as numerical and are scaled
  with a StandardScaler that is fitted on X_train only; the same fitted
  scaler is then applied to X_test. Every other column (for example int64
  columns, which this notebook regards as categorical) is returned as is.

  Args:
    X_train: training features (pandas DataFrame).
    X_test: test features; must contain the float64 columns of X_train.

  Returns:
    (X_train, X_test): processed copies. The DataFrames passed in are not
    modified.
  """
  # Work on copies so the caller's raw frames are never overwritten.
  X_train = X_train.copy()
  X_test = X_test.copy()

  numeric_features = X_train.select_dtypes(include=['float64']).columns

  # Only build and fit a scaler when there is something to scale.
  if len(numeric_features) > 0:
    scaler = StandardScaler()
    X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
    X_test[numeric_features] = scaler.transform(X_test[numeric_features])

  return X_train, X_test

# Run preprocess_data on every dataset; both result lists stay index-aligned
# with dataset_names.
processed_pairs = [
  preprocess_data(X_trains[dataset_idx], X_tests[dataset_idx])
  for dataset_idx in range(len(dataset_names))
]
processed_X_trains = [train_part for train_part, _ in processed_pairs]
processed_X_tests = [test_part for _, test_part in processed_pairs]


In [None]:
def select_best_model(X_train, y_train):
  """
  Fit several candidate classifiers and keep the one with the best hold-out AUC.

  The data is split 80/20 (stratified, random_state=42). Every candidate is
  fitted on the 80% part and scored with ROC AUC on the 20% part, using the
  probability in column 1 of predict_proba. The winner is then refitted on
  all rows so it can be used for inference.

  Args:
    X_train: feature matrix.
    y_train: labels; a one-column DataFrame or any 1-D array-like.

  Returns:
    (best_model, best_model_name, auc): the winning estimator refitted on
    all of X_train, its key in the candidate dict, and its hold-out AUC.
    The AUC is the one measured BEFORE the final refit, so the hold-out
    rows were unseen when it was computed.
  """
  # Estimators expect a 1-D target; y_train may arrive as a one-column frame.
  y_train = np.ravel(y_train)

  models = {
    'SVM': SVC(probability=True, random_state=42),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(random_state=42),
    'XGB': XGBClassifier(eval_metric='logloss', random_state=42),
  }

  X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(
      X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
  )

  best_model = None
  # Start below any possible AUC so the first candidate is always accepted,
  # even if it scores exactly 0.
  best_auc = float('-inf')
  best_model_name = ""

  for model_name, model in models.items():
    model.fit(X_train_split, y_train_split)
    y_pred_prob = model.predict_proba(X_test_split)[:, 1]
    auc = roc_auc_score(y_test_split, y_pred_prob)

    if auc > best_auc:
      best_auc = auc
      best_model = model
      best_model_name = model_name

  # Refit on every labelled row for inference. Do not re-score afterwards:
  # the hold-out rows are now part of the training data, so a new AUC on
  # them would be optimistically biased.
  best_model.fit(X_train, y_train)

  return best_model, best_model_name, best_auc


## train test split & build Model
You can select an appropriate model and perform corresponding hyperparameter tuning.

## Main


In [None]:
# Main_1
# For every dataset: tune a random forest with grid search on a stratified
# 80% split, score it on the remaining 20%, then keep whichever of
# {tuned forest, select_best_model winner} reports the higher AUC.
models = []
dataset_aucs = []

# The search space is the same for every dataset, so build it once.
param_grid = {
  'n_estimators': [100, 200],
  'max_depth': [10, 20],
  'min_samples_split': [2, 5, 10],
  'min_samples_leaf': [1, 2, 4],
  'max_features': ['sqrt', 'log2'],
}

for X_all, y_all in zip(processed_X_trains, y_trains):
  tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(
      X_all, y_all, test_size=0.2, random_state=42, stratify=y_all
  )

  # Select hyperparameters by ROC AUC, the metric the models are compared
  # on below. Without `scoring`, a classifier grid search ranks by accuracy.
  grid_search = GridSearchCV(
      estimator=RandomForestClassifier(random_state=42),
      param_grid=param_grid,
      scoring='roc_auc',
      cv=5,
      n_jobs=-1,
      verbose=1,
  )
  grid_search.fit(tmp_X_train, tmp_y_train.squeeze())

  model = grid_search.best_estimator_
  tmp_y_prob = model.predict_proba(tmp_X_test)[:, 1]
  tmp_auc = roc_auc_score(tmp_y_test, tmp_y_prob)

  # Now use select_best_model to see if there is a better model
  best_model, best_model_name, auc = select_best_model(X_all, y_all)

  # Keep the candidate with the higher reported AUC.
  if auc > tmp_auc:
    model, tmp_auc = best_model, auc

  # Append both results together so the two lists always stay aligned,
  # even if the loop is interrupted part-way through.
  models.append(model)
  dataset_aucs.append(tmp_auc)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Fitting 

KeyboardInterrupt: 

## Inference Model and Save result

In [None]:
# Main_2
# Phase 1: predict the column-1 class probability for every dataset's test set.
y_predicts = [
  pd.DataFrame(
      models[dataset_idx].predict_proba(processed_X_tests[dataset_idx])[:, 1],
      columns=['y_predict_proba'],
  )
  for dataset_idx in range(len(dataset_names))
]

# Phase 2: write each prediction frame next to its dataset's input files.
# (For a local run the target would be ./Competition_data/<dataset>/y_predict.csv.)
for idx, dataset_name in enumerate(dataset_names):
  y_predicts[idx].to_csv(f'{drive_path}/{dataset_name}/y_predict.csv', index=False, header=True)

In [None]:
# Report the final AUC recorded for each dataset, one line per dataset.
for position, score in enumerate(dataset_aucs):
  name = dataset_names[position]
  print(f"The fianl AUC of {name} : {score:.4f}")

The fianl AUC of Dataset_18 : 1.0000
The fianl AUC of Dataset_14 : 0.9133
The fianl AUC of Dataset_10 : 0.7322
The fianl AUC of Dataset_15 : 0.6621
The fianl AUC of Dataset_12 : 0.7548
The fianl AUC of Dataset_1 : 0.6842
The fianl AUC of Dataset_11 : 0.7429
The fianl AUC of Dataset_17 : 0.9407
The fianl AUC of Dataset_16 : 0.9967
The fianl AUC of Dataset_13 : 0.8633
The fianl AUC of Dataset_26 : 0.7346
The fianl AUC of Dataset_19 : 0.9890
The fianl AUC of Dataset_25 : 0.8148
The fianl AUC of Dataset_22 : 0.7810
The fianl AUC of Dataset_21 : 0.8957
The fianl AUC of Dataset_2 : 0.9967
The fianl AUC of Dataset_23 : 0.8812
The fianl AUC of Dataset_24 : 0.5970
The fianl AUC of Dataset_27 : 1.0000
The fianl AUC of Dataset_20 : 0.9061
The fianl AUC of Dataset_30 : 0.8246
The fianl AUC of Dataset_29 : 0.8416
The fianl AUC of Dataset_32 : 0.8215
The fianl AUC of Dataset_28 : 0.8444
The fianl AUC of Dataset_3 : 0.5833
The fianl AUC of Dataset_36 : 0.9469
The fianl AUC of Dataset_34 : 0.8129
The 

## 好像沒什麼用的想法

### Find KNN the best k


In [None]:
# For each dataset, pick the k in 1..20 with the best mean 5-fold
# cross-validated accuracy. Results are index-aligned with dataset_names.
best_k_values = []
k_values = range(1, 21)

# Search on the preprocessed features, i.e. the same frames the final KNN
# model is fitted on.
for X, y in zip(processed_X_trains, y_trains):
  y = y.squeeze()  # make sure y is 1-D
  cv_scores = [
    cross_val_score(
        KNeighborsClassifier(n_neighbors=k), X, y, cv=5, scoring='accuracy'
    ).mean()
    for k in k_values
  ]

  # cross_val_score can report NaN for a candidate whose fold fails
  # (default error_score=np.nan), e.g. when k does not fit a small dataset.
  # np.argmax would pick such a NaN as the "maximum", so skip NaNs instead.
  # np.nanargmax raises ValueError if every candidate failed.
  best_k = k_values[np.nanargmax(cv_scores)]
  best_k_values.append(best_k)


In [None]:
# Show the selected k of every dataset, in the order the datasets were loaded.
print(best_k_values)

[1, 13, 12, 16, 12, 7, 4, 14, 1, 17, 6, 3, 5, 5, 4, 1, 11, 16, 12, 11, 17, 2, 10, 10, 3, 10, 18, 2, 4, 9, 15, 9, 9, 2, 10, 1, 4, 7, 1, 18, 2, 2, 3, 7, 13, 13, 3, 7, 16]


In [None]:
# best k
# Fit a KNN with each dataset's selected k, store the predicted column-1
# class probabilities for the test set, and estimate the AUC on a stratified
# 20% hold-out of the training data.
y_predicts = []
dataset_aucs = []

for X_train, y_train, X_test, best_k in zip(
    processed_X_trains, y_trains, processed_X_tests, best_k_values
):
  y_train = y_train.squeeze()  # make sure y is 1-D

  # Final model: trained on every labelled row, used for the test predictions.
  knn = KNeighborsClassifier(n_neighbors=best_k)
  knn.fit(X_train, y_train)

  # Keep probabilities rather than hard labels: AUC is a ranking metric, and
  # this matches the 'y_predict_proba' column written by the inference cell.
  y_pred_proba = knn.predict_proba(X_test)[:, 1]
  y_predicts.append(pd.DataFrame(y_pred_proba, columns=['y_predict_proba']))

  # Hold-out AUC estimate. A separate estimator is used so the final model
  # above stays fitted on the full training set. The split is stratified,
  # like the other splits in this notebook, so both classes appear in the
  # validation part.
  tmp_X_train, tmp_X_val, tmp_y_train, tmp_y_val = train_test_split(
      X_train, y_train, test_size=0.2, random_state=42, stratify=y_train
  )
  val_knn = KNeighborsClassifier(n_neighbors=best_k)
  val_knn.fit(tmp_X_train, tmp_y_train)
  y_val_proba = val_knn.predict_proba(tmp_X_val)[:, 1]
  auc = roc_auc_score(tmp_y_val, y_val_proba)
  dataset_aucs.append(auc)

數據集 Dataset_18 的 AUC 為: 0.7500
數據集 Dataset_14 的 AUC 為: 0.9133
數據集 Dataset_10 的 AUC 為: 0.7219
數據集 Dataset_15 的 AUC 為: 0.6625
數據集 Dataset_12 的 AUC 為: 0.7690
數據集 Dataset_1 的 AUC 為: 0.6306
數據集 Dataset_11 的 AUC 為: 0.3857
數據集 Dataset_17 的 AUC 為: 0.9370
數據集 Dataset_16 的 AUC 為: 0.9474
數據集 Dataset_13 的 AUC 為: 0.8300
數據集 Dataset_26 的 AUC 為: 0.7500
數據集 Dataset_19 的 AUC 為: 0.9947
數據集 Dataset_25 的 AUC 為: 0.8519
數據集 Dataset_22 的 AUC 為: 0.7941
數據集 Dataset_21 的 AUC 為: 0.8677
數據集 Dataset_2 的 AUC 為: 0.9474
數據集 Dataset_23 的 AUC 為: 0.8816
數據集 Dataset_24 的 AUC 為: 0.5927
數據集 Dataset_27 的 AUC 為: 1.0000
數據集 Dataset_20 的 AUC 為: 0.8909
數據集 Dataset_30 的 AUC 為: 0.8092
數據集 Dataset_29 的 AUC 為: 0.8519
數據集 Dataset_32 的 AUC 為: 0.7413
數據集 Dataset_28 的 AUC 為: 0.8487
數據集 Dataset_3 的 AUC 為: 0.5417
數據集 Dataset_36 的 AUC 為: 0.9560
數據集 Dataset_34 的 AUC 為: 0.8114
數據集 Dataset_31 的 AUC 為: 0.5455
數據集 Dataset_33 的 AUC 為: 1.0000
數據集 Dataset_35 的 AUC 為: 0.8255
數據集 Dataset_38 的 AUC 為: 0.7665
數據集 Dataset_40 的 AUC 為: 0.8255
數據集 Dataset