# Mount Google Drive

In [1]:
from google.colab import drive
drive.mount('/content/drive')

root = '/content/drive/MyDrive/Colab Notebooks/FINAL PROJECT ZENIUS/Dataset/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Import Module

In [2]:
# import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle

In [3]:
# import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingRegressor

from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_curve, auc, classification_report
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

# Import Data Train and Test

In [4]:
# tabel setting
pd.set_option('display.max_columns',None)
pd.set_option('display.max_rows',None)

In [5]:
# load data
X_train = pd.read_csv(root+'X_train.csv')
X_test = pd.read_csv(root+'X_test.csv')
y_train = pd.read_csv(root+'y_train.csv')
y_test = pd.read_csv(root+'y_test.csv')

# Training

## Logistic Regression Model

In [None]:
# Logistik Regression
lr_model = LogisticRegression(solver='lbfgs')

### Hyperparameter Tuning

In [None]:
param_grid = [
    {
        'penalty' : ['l2', 'none'],
        'max_iter' : [2500, 5000, 10000, 20000]
    }
]

In [None]:
lr_Grid = GridSearchCV(estimator=lr_model, param_grid = param_grid, cv = 3, verbose=True, n_jobs=-1, scoring="roc_auc")

In [None]:
lr_Grid.fit(X_train,y_train)

Fitting 3 folds for each of 8 candidates, totalling 24 fits




GridSearchCV(cv=3, estimator=LogisticRegression(), n_jobs=-1,
             param_grid=[{'max_iter': [2500, 5000, 10000, 20000],
                          'penalty': ['l2', 'none']}],
             scoring='roc_auc', verbose=True)

In [None]:
lr_opt = lr_Grid.best_estimator_

### Train Model

In [None]:
lr_opt.fit(X_train,y_train)

LogisticRegression(max_iter=2500, penalty='none')

## KNN Model

In [None]:
# KNN
knn_model = KNeighborsClassifier(n_neighbors=7)

### Hyperparameter Tuning

In [None]:
param_grid = {
    'n_neighbors': [7, 9, 11, 13, 15],
    'weights': ["uniform", "distance"],
    'leaf_size': [30, 40, 50],
    'metric': ["euclidean", "manhattan", "minkowski"],
}

In [None]:
knn_Grid = GridSearchCV(estimator = knn_model, param_grid = param_grid, cv = 3, scoring="recall")

In [None]:
knn_Grid.fit(X_train, y_train)

KeyboardInterrupt: ignored

In [None]:
knn_opt = knn_Grid.best_estimator_

### Train Model

In [None]:
knn_opt.fit(X_train,y_train)

## Random Forest Model

In [6]:
# Random Forest
rf_model = RandomForestClassifier(random_state=42, n_jobs=-1)

### Hyperparameter Tuning

In [7]:
n_estimators = [100, 120, 150, 200]
max_features = ['auto', 'sqrt']
criterion = ["gini", "entropy", "log_loss"]
max_depth = [4, 8, 16, 32]
bootstrap = [True, False]

In [8]:
param_grid = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'criterion': criterion,
    'bootstrap': bootstrap
}

In [9]:
rf_Grid = GridSearchCV(estimator = rf_model, param_grid = param_grid, cv = 3, verbose=2, n_jobs = 4, scoring="roc_auc")

In [10]:
rf_Grid.fit(X_train.values,y_train.values.ravel())

Fitting 3 folds for each of 192 candidates, totalling 576 fits


192 fits failed out of a total of 576.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
96 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.8/dist-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/usr/local/lib/python3.8/dist-packages/joblib/parallel.py", line 1098, in __call__
    self.retrieve()
  File "/usr/local/lib/python3.8/dist-packages/joblib/parallel.py", line 975, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "/usr/lib/python3.8/multiprocessing/poo

GridSearchCV(cv=3, estimator=RandomForestClassifier(n_jobs=-1, random_state=42),
             n_jobs=4,
             param_grid={'bootstrap': [True, False],
                         'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': [4, 8, 16, 32],
                         'max_features': ['auto', 'sqrt'],
                         'n_estimators': [100, 120, 150, 200]},
             scoring='roc_auc', verbose=2)

In [11]:
rf_opt = rf_Grid.best_estimator_

### Train Model

In [12]:
rf_opt.fit(X_train.values,y_train.values.ravel())

RandomForestClassifier(bootstrap=False, criterion='entropy', max_depth=16,
                       n_estimators=200, n_jobs=-1, random_state=42)

## Save Model

In [None]:
# Save Model
pickle.dump(lr_opt, open(root + 'Model/LR_Model.pkl', 'wb'))

In [None]:
pickle.dump(knn_opt, open(root + 'Model/KNN_Model.pkl', 'wb'))

In [13]:
pickle.dump(rf_opt, open(root + 'Model/RF_Model.pkl', 'wb'))