# Drive Mount

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Library Definition

In [3]:
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RandomizedSearchCV

# Data Upload

In [4]:
# preprocess type
PCA = 1
FEATURE_REDUC = 2
UNDER_SAMPLE = 3
HYPERPARAMETER_TUNNING = False

preprocess_type = PCA

In [5]:

if preprocess_type == PCA:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_pca.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess_pca.csv"

elif preprocess_type == FEATURE_REDUC:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_feat_reduc.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess_feat_reduc.csv"

elif UNDER_SAMPLE:
  X_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_model_preprocess_undersampled.csv"
  Y_model_file_path = "/content/drive/Shareddrives/Intro-data-science/data/Y_model_undersampled.csv"
  X_exam_file_path = "/content/drive/Shareddrives/Intro-data-science/data/X_test_preprocess.csv"


In [6]:
X_model_all = pd.read_csv(X_model_file_path, index_col = 0)

In [7]:
Y_model_all = pd.read_csv(Y_model_file_path)

# Model 훈련

In [8]:
model = RandomForestClassifier(random_state = 100)

In [9]:
X_train, X_valid, Y_train, Y_valid = train_test_split(X_model_all, Y_model_all, test_size= 0.2, shuffle = True, random_state= 100)

model.fit(X_train, Y_train)

Yhat = model.predict(X_valid)
roc_auc_valid_score = roc_auc_score(Y_valid, Yhat)
cv_valid_score = cross_val_score(model, X_model_all, Y_model_all, scoring = 'roc_auc', cv = 5)

print(f"roc_auc_valid_score: {roc_auc_valid_score}")
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_valid_score.mean(), cv_valid_score.std()))

  model.fit(X_train, Y_train)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


roc_auc_valid_score: 0.5422910789433879
0.86 accuracy with a standard deviation of 0.00


# Hyperparameter Tuning

In [10]:
model.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 100,
 'verbose': 0,
 'warm_start': False}

In [11]:
if HYPERPARAMETER_TUNNING:
  params = {
    "learning_rate" : [0.08, 0.09, 0.1, 0.11, 0.12],
    "min_child_samples" : list(np.arange(15, 25, 1)),
    "n_estimators" : list(np.arange(80, 120, 10)),
  }

  random_search = RandomizedSearchCV(
    model,
    param_distributions = params, # 파라미터 입력
    n_iter = 50,   # random search 탐색 횟수
    cv = 5,        # cv 검증을 위한 분할 검증 횟수
    scoring = 'roc_auc',  # 오차 평가방법
    verbose = 0,     # 진행상황
    random_state = 100
  )

  random_search.fit(X_model_all, Y_model_all)

  print(f"params: {random_search.best_params_}")
  print(f"estimator: {random_search.best_estimator_}")

  best_model = random_search.best_estimator_

else: 
  best_model = model

In [12]:
best_model.fit(X_train, Y_train)

Yhat = best_model.predict(X_valid)
roc_auc_valid_score = roc_auc_score(Y_valid, Yhat)
cv_valid_score = cross_val_score(best_model, X_model_all, Y_model_all, scoring = 'roc_auc', cv = 5)

print(f"roc_auc_valid_score: {roc_auc_valid_score}")
print("%0.2f accuracy with a standard deviation of %0.2f" % (cv_valid_score.mean(), cv_valid_score.std()))

  best_model.fit(X_train, Y_train)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


KeyboardInterrupt: ignored

# Probability Predication

In [None]:
X_exam = pd.read_csv(X_exam_file_path, index_col = 0)

In [None]:
Y_exam_prob_numpy = best_model.predict_proba(X_exam)
Y_exam_prob = pd.DataFrame(Y_exam_prob_numpy[:, 1])

Y_exam_prob.columns = ['business_prob']

In [None]:
Y_exam_prob.head()

In [None]:
Y_exam_prob[Y_exam_prob["business_prob"] >= 0.5]

In [None]:
Y_exam_prob.to_csv("/content/drive/Shareddrives/Intro-data-science/data/Y_exam_prob.csv")