## README

- The number of **features** per dataset ranges from 4 to 97.
- With KNeighborsClassifier: score = 0.224457
- With RandomForestClassifier: score = 0.67557
- With GradientBoostingClassifier: score = 0.491197
  - n_estimators=100
  - learning_rate=0.1
  - max_depth=3
  - random_state=42

In [1]:
!pip install scikit-learn pandas



In [2]:
# Google Colab only: mount Google Drive at /content/drive so that files stored
# on Drive become readable through the local filesystem.
# NOTE: the data-loading cell uses the relative path "./drive/MyDrive/...",
# which points at this mount only while the working directory is /content.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [13]:
import os
import csv
import pandas as pd
import numpy as np

In [27]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier # RandomForestClassifier, GradientBoostingClassifier
# from sklearn.metrics import accuracy_score, f1_score, precision_score
from sklearn.metrics import roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Read All Dataset CSV

In [4]:
# Load every dataset folder under drive_path. Each folder is expected to hold
# X_train.csv, y_train.csv and X_test.csv (first row is the header).
# The four lists are index-aligned: position i always refers to dataset_names[i].
drive_path = "./drive/MyDrive/Colab Notebooks/Competition_data" # only for use google colab (drive)
# For a local run, point drive_path at "./Competition_data" instead.

dataset_names = []
X_trains = []
y_trains = []
X_tests = []

# os.listdir returns entries in arbitrary order; sorting makes index i refer
# to the same dataset on every run.
for folder_name in sorted(os.listdir(drive_path)):
  # Skip stray files and hidden folders (e.g. .DS_Store, .ipynb_checkpoints):
  # they contain no CSVs and would make read_csv fail.
  if folder_name.startswith(".") or not os.path.isdir(os.path.join(drive_path, folder_name)):
    continue
  dataset_names.append(folder_name)
  X_trains.append(pd.read_csv(f"{drive_path}/{folder_name}/X_train.csv",header=0))
  y_trains.append(pd.read_csv(f"{drive_path}/{folder_name}/y_train.csv",header=0))
  X_tests.append(pd.read_csv(f"{drive_path}/{folder_name}/X_test.csv",header=0))

## Data Preprocessing & Feature Engineering

In [16]:
## your code here
def preprocess_data(X_train, X_test):
  """
  Standardize the float64 columns of a train/test pair.

  Columns of X_train with dtype float64 are treated as numerical features.
  A StandardScaler is fitted on those columns of X_train only and then
  applied to the same columns of X_test, so no test-set statistics are used
  for fitting. All other columns (e.g. int64) are left untouched; no
  categorical encoding is performed.

  NOTE: both frames are modified IN PLACE and the same objects are returned.
  Callers that need the raw values afterwards must pass copies.

  Parameters:
    X_train: pandas DataFrame of training features.
    X_test: pandas DataFrame of test features with the same columns.

  Returns:
    (X_train, X_test): the same two DataFrame objects, scaled where applicable.
  """
  numeric_features = X_train.select_dtypes(include=['float64']).columns

  # Nothing to scale: return the inputs as they are, without building a scaler.
  if len(numeric_features) == 0:
    return X_train, X_test

  scaler = StandardScaler()
  X_train[numeric_features] = scaler.fit_transform(X_train[numeric_features])
  X_test[numeric_features] = scaler.transform(X_test[numeric_features])

  return X_train, X_test


## train test split & build Model
You can select an appropriate model and perform corresponding hyperparameter tuning.

## Main


In [28]:
# Train one model per dataset and keep its hold-out ROC AUC.
# models[i] and val_aucs[i] both correspond to dataset_names[i].
models = []
val_aucs = []
for i in range(len(dataset_names)):
  # read data
  X_train, X_test = X_trains[i], X_tests[i]

  # Data Preprocessing (scales X_trains[i] / X_tests[i] in place, so the
  # inference cell later sees the scaled X_tests[i]).
  X_train, X_test = preprocess_data(X_train, X_test)

  # Hold out 20% of the training rows to estimate generalization.
  tmp_X_train, tmp_X_test, tmp_y_train, tmp_y_test = train_test_split(X_train, y_trains[i], test_size=0.2, random_state=42)

  # Alternatives that were tried:
  # model = KNeighborsClassifier(n_neighbors=3)
  # model = RandomForestClassifier(n_estimators=100, random_state=42)
  model = GradientBoostingClassifier(
      n_estimators=100, # number of trees
      learning_rate=0.1,
      max_depth=3,
      random_state=42  # random seed
      )
  # squeeze(): y is read as a one-column DataFrame; fit expects a 1-D target.
  model.fit(tmp_X_train, tmp_y_train.squeeze())

  tmp_y_prob = model.predict_proba(tmp_X_test)[:, 1]
  # ROC AUC is undefined when the hold-out labels contain a single class;
  # record NaN for that dataset instead of aborting the whole loop.
  if np.unique(np.ravel(tmp_y_test)).size < 2:
    auc = float("nan")
  else:
    auc = roc_auc_score(tmp_y_test, tmp_y_prob)

  val_aucs.append(auc)
  models.append(model)

# Report the validation result, which was previously computed and discarded.
if val_aucs:
  print(f"mean validation AUC over {len(val_aucs)} datasets: {np.nanmean(val_aucs):.6f}")


## Inference Model

In [29]:
# Score each dataset's X_test with the model trained for that dataset and keep
# column 1 of predict_proba as a one-column frame named 'y_predict_proba'.
# y_predicts[k] corresponds to dataset_names[k].
y_predicts = [
  pd.DataFrame({'y_predict_proba': models[k].predict_proba(X_tests[k])[:, 1]})
  for k in range(len(dataset_names))
]


## Save result

In [30]:
# Write each dataset's predictions next to its input files as y_predict.csv
# (header row kept, index column omitted).
# For a local run the target would be ./Competition_data/<dataset>/y_predict.csv.
for position in range(len(dataset_names)):
  dataset_name = dataset_names[position]
  predictions = y_predicts[position]
  predictions.to_csv(f'{drive_path}/{dataset_name}/y_predict.csv', index=False, header=True)  # only for use google colab (drive)