### Basic Import

In [None]:
import pandas as pd
import numpy as np
import os
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

### Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,recall_score,precision_score, f1_score,roc_auc_score,confusion_matrix,roc_curve, auc
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

## Import the CSV data as a pandas DataFrame

In [None]:
# Build the path with os.path.join so it resolves on every OS. The previous
# literal 'data\credit_default.csv' depended on the unrecognised escape
# sequence '\c' and only pointed at the right file on Windows.
df = pd.read_csv(os.path.join('data', 'credit_default.csv'))

## Show the top 5 records

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

## Preparing x and y variables

In [None]:
# Every column except the last one is a feature; the last column is the target.
x, y = df.iloc[:, :-1], df.iloc[:, -1]

In [None]:
# Split the feature columns into two groups by cardinality: a column with at
# most 11 distinct values is treated as categorical, every other one as numeric.
categorical_feature = [
    column for column in x.columns
    if df[column].nunique() <= 11
]
numeric_feature = [
    column for column in x.columns
    if column not in categorical_feature
]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Stand-alone scaler configured without mean-centring; the pipelines assembled
# below build their own scaler instances and do not reference this one.
categorical_transformer = StandardScaler(with_mean=False)

# Numeric columns: StandardScaler with its default settings.
numerical_pipline = Pipeline([("scaler", StandardScaler())])

# Categorical columns: StandardScaler with mean-centring switched off.
categorical_pipline = Pipeline([("scaler", StandardScaler(with_mean=False))])

# Route each column group through its own pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ("numerical", numerical_pipline, numeric_feature),
        ("categorical", categorical_pipline, categorical_feature),
    ]
)

In [None]:
# Join the two frames on their row indexes. The keyword was previously
# misspelled ("rihjt_index"), which makes pd.merge raise a TypeError.
# NOTE(review): data1 and data2 are not defined in any cell shown here —
# confirm they exist before this cell runs on a fresh kernel.
data = pd.merge(data1, data2, left_index=True, right_index=True)

In [None]:
# Count how many rows fall under each value of the `default` column.
df['default'].value_counts()

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the minority class. A fixed random_state makes the synthetic
# samples, and every score computed from them, reproducible across kernel
# restarts.
# NOTE(review): resampling happens before the train/test split, so synthetic
# points interpolated from rows that later land in the test set can end up in
# the training set — consider resampling the training portion only.
smote = SMOTE(sampling_strategy='minority', random_state=42)
x_sm, y_sm = smote.fit_resample(x, y)

# Class counts after resampling.
y_sm.value_counts()

In [None]:
# Separate the resampled dataset into train (75%) and test (25%) parts,
# stratified on the target. A fixed random_state keeps the split, and therefore
# every reported score, identical between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x_sm,
    y_sm,
    test_size=.25,
    stratify=y_sm,
    random_state=42,
)

### Create an Evaluate Function to give all metrics after Training

In [None]:
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score
def evalueate_model(y_pred, y_test):
    """Print a classification report comparing predictions with true labels.

    Args:
        y_pred: Predicted class labels.
        y_test: Ground-truth class labels, aligned with ``y_pred``.

    Returns:
        None. The report is written to stdout.
    """
    # classification_report takes (y_true, y_pred) — the reverse of this
    # function's own parameter order.
    print('Classification_Report\n',classification_report(y_test,y_pred))

In [None]:
# input_feature_train_arr = preprocessor.fit_transform(x_train)

In [None]:
# Hyper-parameter search grids keyed by model display name. Each inner dict
# maps an estimator constructor argument to the list of candidate values.
# Commented-out entries are further candidates that are currently disabled.
params = {
    'XGBClassifier': {
        'learning_rate': [0.1, 0.01, 0.001],
#         'n_estimators':[600,800,1000],
#         'max_depth':[4,5,6],
#         'min_child_weight':[4,5,6],
#         'gamma':[i/10.0 for i in range(0,4)]
    },

    'Random Forest Classifier': {
        'n_estimators': [600, 800, 1000],
#         'max_depth':[4,5,6],
#         'min_samples_split':[2,4,5],
#         'max_features' = ['sqrt', 'log2']
#         'criterion':['gini','entropy']
    },

    'Decision Tree': {
        'criterion': ['gini'],
        # `splitter` selects the split strategy and only accepts the strings
        # 'best' or 'random'; the previous numeric candidates were invalid.
        'splitter': ['best', 'random'],
#         'max_depth':[2,3,4],
#         'min_samples_leaf':[4,6,3],
#         'max_features':[4,5,7,],
#         'criterion':['gini','entropy']
    },

# #     'CatboostClassifier':{
# #         'iterations':[],
# #         'learning_rate':[],
# #         'rsm':[],
# #         'loss_function':[]},

    'LogisticRegression': {
        'max_iter': [400, 300, 500],
#         'solver':['lbfgs','sag','saga','liblinear'],
#         'penalty':['None','l2','l1'],
#         'c_values':[100,10,1,0.1,0.01]
    },

    'KNeighborsClassifier': {
        'n_neighbors': range(1, 21, 2),
#         'weights' = ['uniform', 'distance'],
#         'metric' = ['euclidean', 'manhattan', 'minkowski']
    },

# #     'SVMClassifier':{
# #         'kernel' = ['poly', 'rbf', 'sigmoid'],
# #         'C' = [50, 10, 1.0, 0.1, 0.01],
# #         'gamma' = ['scale']},

    'GradientBoostingClassifier': {
        'n_estimators': [10, 100, 1000],
#         'learning_rate' = [0.001, 0.01, 0.1],
#         'subsample' = [0.5, 0.7, 1.0],
#         'max_depth' = [3, 7, 9]
    },

    'AdaBoost Classifier': {
        'n_estimators': [2, 3, 4],
#         'learning_rate': [(0.97 + x / 100) for x in range(0, 4)],
#         'algorithm': ['SAMME', 'SAMME.R']
    }
}

In [None]:
def evaluate_models(x_train,y_train,x_test,y_test,models):
    """Fit every model on the training split and score it on the test split.

    Args:
        x_train: Training features.
        y_train: Training labels.
        x_test: Test features.
        y_test: Test labels.
        models: Mapping of display name -> estimator exposing ``fit`` and
            ``predict``. Each estimator is fitted in place.

    Returns:
        dict mapping each model name to
        ``roc_auc_score(y_test, model.predict(x_test))``. Empty when
        ``models`` is empty.

    Raises:
        Any exception raised by ``fit``, ``predict`` or ``roc_auc_score``
        propagates unchanged. The former wrapper re-raised through
        ``CustomException``, a name this notebook never defines, so every
        failure surfaced as a NameError that hid the real cause.
    """
    report = {}
    for name, model in models.items():
        # Optional hyper-parameter search, currently disabled:
        # para = params[name]
        # gs = GridSearchCV(model, para, cv=3)
        # gs.fit(x_train,y_train)
        # model.set_params(**gs.best_params_)
        model.fit(x_train, y_train)

        # The score is computed from hard class predictions, not probabilities.
        y_test_pred = model.predict(x_test)
        report[name] = roc_auc_score(y_test, y_test_pred)

    return report
    

In [None]:
# Fit and score every candidate model on the train/test split.
# NOTE(review): `models` is assigned in a later cell, so on a fresh kernel this
# cell only works after that cell has been run — confirm the intended order.
t = evaluate_models(
    x_train=x_train,
    y_train=y_train,
    x_test=x_test,
    y_test=y_test,
    models=models,
)

In [None]:
# Show the model-name -> test ROC-AUC report returned by evaluate_models.
t

In [None]:
# Candidate classifiers, keyed by display name; all use default settings.
models = {
    'LogisticRegression': LogisticRegression(),
    'Decision Tree': DecisionTreeClassifier(),
    'KNeighborsClassifier': KNeighborsClassifier(),
    'XGBClassifier': XGBClassifier(),
    'AdaBoost Classifier': AdaBoostClassifier(),
    'Random Forest Classifier': RandomForestClassifier()
}

# Parallel lists collecting each model's name and its test ROC-AUC.
model_list = []
roc_auc_scr_list = []

for model_name, model in models.items():
    model.fit(x_train, y_train)  # train model

    # make predictions
    y_train_pred = model.predict(x_train)
    y_test_pred = model.predict(x_test)

    # Evaluate train and test datasets. evalueate_model takes
    # (y_pred, y_test) and only prints, returning None. The training call
    # previously passed the arguments reversed, which swapped precision and
    # recall in the printed training report.
    evalueate_model(y_train_pred, y_train)
    train_roc_auc_cur_scr = roc_auc_score(y_train, y_train_pred)

    evalueate_model(y_test_pred, y_test)
    test_roc_auc_cur_scr = roc_auc_score(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- auc_roc_socre : {}".format(train_roc_auc_cur_scr))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- auc_roc_sore : {} ".format(test_roc_auc_cur_scr))
    roc_auc_scr_list.append(test_roc_auc_cur_scr)

    print('='*35)
    print('\n')


Based on the ROC-AUC scores, "XGBClassifier" has good training and testing scores.

In [None]:
# Tabulate each model's test ROC-AUC, best score first.
(
    pd.DataFrame(
        list(zip(model_list, roc_auc_scr_list)),
        columns=['Model Name', 'Roc_auc_score'],
    )
    .sort_values(by='Roc_auc_score', ascending=False)
)

In [None]:
# from imblearn.under_sampling import RandomUnderSampler
# rus = RandomUnderSampler(random_state = 42)
# x= data.drop(['default.payment.next.month'],axis=1)
# y = data['default.payment.next.month']
# X_resampled, y_resampled = rus.fit_resample(x,y)
# y_resampled.value_counts()


In [None]:
# Build the integers 1..7 directly instead of appending them one by one.
# `dic['valu']` refers to the same list object as `lst`.
lst = list(range(1, 8))
dic = {'valu': lst}

In [None]:
# Display the contents of `dic`.
dic