# Installations

In [None]:
# Colab installations
!pip install pyforest
!pip install squarify
!pip install pyclustertend
!pip install catboost
!pip install optuna
!pip install pandas_profiling
!pip install termcolor
!pip install colorama

In [None]:
!pip install pycaret[full]
!pip install pycaret-nightly
# # https://pycaret.readthedocs.io/en/latest/index.html#
# # install the nightly build
# pip install pycaret-nightly
# # install the full version of the nightly build
# pip install pycaret-nightly[full]

In [None]:
# Colab installations for Plotly
!pip install --upgrade plotly
!pip install jupyter-dash
import plotly.graph_objects as go
import plotly.express as px
from jupyter_dash import JupyterDash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output 

# Libraries - ML Regression|Classification

In [None]:
## Import Libraries

## import all main libraries automatically with pyforest
# !pip install pyforest
import pyforest

## main libraries
import numpy as np
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib.ticker as mticker

# !pip install squarify
import squarify as sq

import scipy.stats as stats
from scipy.cluster.hierarchy import linkage, dendrogram
import statsmodels.api as sm
import statsmodels.formula.api as smf
import datetime as dt
from datetime import datetime
from pyclustertend import hopkins

## pre-processing
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.dummy import DummyClassifier
from sklearn.impute import SimpleImputer, KNNImputer

## feature Selection
from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, f_regression, mutual_info_regression

## scaling
from sklearn.preprocessing import scale 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer 
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import RobustScaler

## regression/prediction
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet, LogisticRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor

## ann
from sklearn.neural_network import MLPRegressor

## classification
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, plot_tree 
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier, plot_importance

## metrics
from sklearn.metrics import plot_confusion_matrix, r2_score, mean_absolute_error, mean_squared_error
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import make_scorer, precision_score, precision_recall_curve, plot_precision_recall_curve 
from sklearn.metrics import plot_roc_curve, roc_auc_score, roc_curve, f1_score, accuracy_score, recall_score
from sklearn.metrics import silhouette_samples,silhouette_score
from sklearn.metrics.cluster import adjusted_rand_score

## model selection
from sklearn import model_selection
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold, KFold, cross_val_predict, train_test_split
from sklearn.model_selection import StratifiedKFold, GridSearchCV, cross_val_score, cross_validate

## MLearning
from sklearn.pipeline import make_pipeline, Pipeline
import optuna
from sklearn.naive_bayes import GaussianNB

## clevers
# !pip install -U pandas-profiling --user
import pandas_profiling
from pandas_profiling.report.presentation.flavours.html.templates import create_html_assets

import ipywidgets
from ipywidgets import interact
import missingno as msno 
# !pip install wordcloud
from wordcloud import WordCloud

# !pip install termcolor
import colorama
from colorama import Fore, Style  # makes strings colored
from termcolor import colored
from termcolor import cprint
# grey red green yellow blue magenta cyan white (on_grey ..)
# bold dark underline blink reverse concealed
# cprint("Have a first look to:","blue","on_grey", attrs=['bold'])

## plotly and cufflinks
import plotly 
import plotly.express as px
import cufflinks as cf
import plotly.graph_objs as go
import plotly.offline as py
from plotly.offline import iplot
from plotly.subplots import make_subplots
import plotly.figure_factory as ff
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

## Ignore Warnings
import warnings
# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")
# Sanity check: with the filter above this warning is not displayed.
warnings.warn("this will not show")

## Figure&Display options
# Default matplotlib figure size as (width, height).
plt.rcParams["figure.figsize"] = (10,6)
# Raise pandas display limits so wide cells, long frames and many columns are shown.
pd.set_option('max_colwidth',200)
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 200)
# Render floats with three decimals.
pd.set_option('display.float_format', lambda x: '%.3f' % x)

# Useful Functions

In [None]:
## Some Useful User-Defined-Functions

###############################################################################

def missing_values(df):
    """Summarise the columns of ``df`` that contain at least one null.

    Returns a DataFrame indexed by column name with two columns:
    ``Missing_Number`` (null count) and ``Missing_Percent`` (null count divided
    by the number of rows, i.e. a fraction, not a 0-100 value). Both series are
    sorted in descending order before being joined; columns without nulls are
    left out of the result.
    """
    null_mask = df.isnull()
    counts = null_mask.sum().sort_values(ascending=False)
    ratios = (null_mask.sum() / null_mask.count()).sort_values(ascending=False)
    table = pd.concat([counts, ratios], axis=1, keys=['Missing_Number', 'Missing_Percent'])
    has_nulls = table['Missing_Number'] > 0
    return table[has_nulls]

###############################################################################

def first_looking(df):
    """Print a first overview of ``df`` and normalise its column names in place.

    Printed, in order: shape, ``df.info()``, unique counts per column, the
    ``missing_values(df)`` table, the column names, and the column names after
    renaming. The rename lower-cases every column name and replaces ``&`` and
    spaces with ``_``; it is assigned back to ``df.columns``, so the caller's
    frame is modified. Returns nothing.
    """
    def _title(text):
        return colored(text, 'yellow', attrs=['bold'])

    separator = colored('*'*100, 'red', attrs=['bold'])

    print(_title("Shape:"), df.shape, '\n', separator, _title("\nInfo:\n"), sep='')
    # DataFrame.info() writes its report itself and returns None, so it must
    # not be wrapped in print() (that emitted a stray "None" line).
    df.info()
    print(separator)
    print(_title("Number of Uniques:\n"), df.nunique(), '\n', separator, sep='')
    print(_title("Missing Values:\n"), missing_values(df), '\n', separator, sep='')
    print(_title("All Columns:"), *list(df.columns), sep='\n- ')
    print(separator)

    df.columns = df.columns.str.lower().str.replace('&', '_').str.replace(' ', '_')

    print(_title("Columns after rename:"), *list(df.columns), sep='\n- ')
    print(separator)
    
###############################################################################
## To view summary information about the columns

def summary(column):
    """Print a short profile of one column of the notebook-level ``df``.

    Shows the column name, its null count, its null share in percent (rounded
    to two decimals), the number of distinct values and the value counts
    (nulls included). ``df`` is read from the enclosing notebook namespace.
    """
    def _label(text):
        return colored(text, 'yellow', attrs=['bold'])

    def _rule():
        print(colored('*'*100, 'red', attrs=['bold']), sep='')

    print(_label("Column: "), column)
    _rule()
    null_count = df[column].isnull().sum()
    print(_label("Missing values: "), null_count)
    _rule()
    null_share = round(df[column].isnull().sum()/df.shape[0]*100, 2)
    print(_label("Missing values(%): "), null_share)
    _rule()
    distinct = df[column].nunique()
    print(_label("Unique values: "), distinct)
    _rule()
    frequencies = df[column].value_counts(dropna = False)
    print(_label("Value counts: \n"), frequencies, sep='')
    _rule()
    
###############################################################################
                    
def multicolinearity_control(df, threshold=.8):
    """Report pairs of columns whose absolute correlation exceeds ``threshold``.

    Parameters
    ----------
    df : DataFrame
        Only its numeric and boolean columns take part in the correlation.
    threshold : float, default .8
        A pair is reported when ``abs(correlation) > threshold``.

    Each unordered pair is reported once through ``cprint``. The "no problem"
    message is printed only when no pair was reported. Returns nothing.
    """
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    names = list(corr.columns)
    values = corr.to_numpy()
    found = False
    for i, col in enumerate(names):
        # Start after the diagonal: skips self-correlation and mirrored pairs.
        for j in range(i + 1, len(names)):
            # NaN coefficients (e.g. constant columns) compare False and are skipped.
            if abs(values[j, i]) > threshold:
                found = True
                cprint(f"multicolinearity alert in between {col} - {names[j]}", "red", attrs=["bold"])
    if not found:
        cprint("There is NO multicollinearity problem.", "blue", attrs=["bold"])
                    
###############################################################################

def duplicate_values(df):
    """Count fully duplicated rows of ``df`` and drop them in place.

    The first occurrence of each duplicated row is kept. Prints how many rows
    were dropped, or that there were none, followed by a separator line.
    """
    print(colored("Duplicate check...", 'yellow', attrs=['bold']), sep='')
    n_duplicates = df.duplicated(subset=None, keep='first').sum()
    if n_duplicates > 0:
        df.drop_duplicates(keep='first', inplace=True)
        headline = (n_duplicates, colored(" Duplicates were dropped!"))
    else:
        headline = (colored("There are no duplicates"),)
    print(*headline, '\n', colored('*'*100, 'red', attrs=['bold']), sep='')

###############################################################################
        
def drop_columns(df, drop_columns):
    """Drop the given columns from ``df`` in place.

    When ``drop_columns`` is an empty list nothing is dropped and an
    informational note about the missing-value control is printed instead.
    """
    if drop_columns != []:
        df.drop(drop_columns, axis=1, inplace=True)
        print(drop_columns, 'were dropped')
        return

    heading = colored('Missing value control...', 'yellow', attrs=['bold'])
    hint = colored('If there is a missing value above the limit you have given, the relevant columns are dropped and an information is given.')
    print(heading, '\n', hint, sep='')

###############################################################################

def drop_null(df, limit):
    """Drop, in place, every column of ``df`` whose null share exceeds ``limit``.

    Parameters
    ----------
    df : DataFrame
        Modified in place.
    limit : number
        Threshold in percent (0-100); a column is dropped when its share of
        null rows is strictly greater than this value.

    Prints one line per dropped column with its null percentage (rounded to
    two decimals), then the resulting shape.
    """
    # Computed once up front; dropping columns does not change the null share
    # of the remaining ones.
    null_percent = df.isnull().mean() * 100
    for column, percent in null_percent.items():
        if percent > limit:
            print(round(percent, 2), 'percent of', column, 'were null and dropped')
            df.drop(column, axis=1, inplace=True)
    print(colored('Last shape after missing value control:', 'yellow', attrs=['bold']), df.shape, '\n',
          colored('*'*100, 'red', attrs=['bold']), sep='')

###############################################################################

def shape_control():
    """Print the shapes of the notebook-level data objects.

    Reads ``df``, ``X``, ``y``, ``X_train``, ``y_train``, ``X_test`` and
    ``y_test`` from the enclosing notebook namespace, in that order.
    NOTE(review): a later cell in this notebook defines another
    ``shape_control`` that takes these objects as arguments and replaces this
    one once it has been run.
    """
    print(f'df.shape: {df.shape}')
    print(f'X.shape: {X.shape}')
    print(f'y.shape: {y.shape}')
    print(f'X_train.shape: {X_train.shape}')
    print(f'y_train.shape: {y_train.shape}')
    print(f'X_test.shape: {X_test.shape}')
    print(f'y_test.shape: {y_test.shape}')

###############################################################################  

## show values in bar graphic
def show_values_on_bars(axs):
    """Write each bar's height, formatted with two decimals, above the bar.

    ``axs`` is either a single axes-like object or a NumPy array of them; every
    entry of ``.patches`` on each axes gets a horizontally centred text label
    placed at the top of the patch.
    """
    def _annotate(axis):
        for bar in axis.patches:
            height = bar.get_height()
            x_mid = bar.get_x() + bar.get_width() / 2
            y_top = bar.get_y() + height
            axis.text(x_mid, y_top, '{:.2f}'.format(height), ha="center")

    targets = axs.flat if isinstance(axs, np.ndarray) else [axs]
    for axis in targets:
        _annotate(axis)
        
###############################################################################   

# Useful Functions|Classification Reports

In [None]:
###############################################################################

## Model Validation
def model_validation(y_train, y_train_pred, y_test, y_test_pred, model_name):
    """Build a table of regression scores for the train and test predictions.

    Returns a DataFrame with the rows ``R2``, ``rmse``, ``mse`` and ``mae`` and
    the columns ``<model_name>_train`` and ``<model_name>_test``.
    """
    def _regression_scores(actual, predicted):
        return {
            "R2": r2_score(actual, predicted),
            "rmse": np.sqrt(mean_squared_error(actual, predicted)),
            "mse": mean_squared_error(actual, predicted),
            "mae": mean_absolute_error(actual, predicted),
        }

    table = {}
    table[f"{model_name}_train"] = _regression_scores(y_train, y_train_pred)
    table[f"{model_name}_test"] = _regression_scores(y_test, y_test_pred)
    return pd.DataFrame(table)

# lm = model_validation(y_train, y_train_pred, y_test, y_test_pred, 'lm')

# pd.concat([lm, rs, rcvs, lss, lcvs, es, ecvs], axis = 1)

###############################################################################

def get_classification_report(y_test, y_test_pred):
    """Return scikit-learn's classification report as a DataFrame.

    The report is requested as a dict and transposed, so each label or
    aggregate entry of the report becomes a row and each metric a column.
    """
    from sklearn import metrics
    as_dict = metrics.classification_report(y_test, y_test_pred, output_dict=True)
    return pd.DataFrame(as_dict).T

###############################################################################

def shape_control(df, X, y, X_train, y_train, X_test, y_test):
    """Print the ``.shape`` of each data object passed in.

    After the seven arguments, the shape of ``y_pred`` is printed under the
    label ``y_test_pred.shape:``. ``y_pred`` is not a parameter: it is looked
    up in the enclosing notebook namespace, and a blank line is printed
    instead when it is not defined or has no ``shape``.
    """
    labelled = (
        ('df', df), ('X', X), ('y', y),
        ('X_train', X_train), ('y_train', y_train),
        ('X_test', X_test), ('y_test', y_test),
    )
    for label, obj in labelled:
        print(f'{label}.shape:', obj.shape)
    try:
        print('y_test_pred.shape:', y_pred.shape)
    except (NameError, AttributeError):
        # Best effort only: no predictions available yet.
        print()
        
###############################################################################

def calc_predict():
    """Return ``(accuracy, recall)`` for the notebook-level test predictions.

    ``y_test`` and ``y_test_pred`` are read from the enclosing notebook namespace.
    """
    accuracy = accuracy_score(y_test, y_test_pred)
    recall = recall_score(y_test, y_test_pred)
    return accuracy, recall
    
def get_report():
    """Print train and test diagnostics for the notebook-level ``model``.

    Reads ``model``, ``X_train_scaled``, ``X_test_scaled``, ``y_train`` and
    ``y_test`` from the enclosing notebook namespace and returns nothing.
    Predictions for both splits are computed here from ``model``. For each
    split it prints RMSE, accuracy, ROC AUC, precision-recall AUC, the
    confusion matrix and the classification report. The two probability-based
    scores are best effort: they are skipped with a short message when
    ``predict_proba`` cannot be used or a metric rejects the target.

    Side effect: sets the pandas float display format to three decimals.
    """
    from sklearn import metrics
    pd.set_option('display.float_format', lambda x: '%.3f' % x)

    def _positive_scores(features):
        """Return the second ``predict_proba`` column, or None if unavailable."""
        try:
            return model.predict_proba(features)[:, 1]
        except Exception as err:  # best effort, e.g. estimator without predict_proba
            print('predict_proba unavailable:', err)
            return None

    def _print_split(title, features, y_true):
        """Print every metric for one split, using that split's own predictions."""
        y_hat = model.predict(features)
        scores = _positive_scores(features)
        print(title)
        print('rmse:', np.sqrt(mean_squared_error(y_true, y_hat)))
        print('accuracy:', accuracy_score(y_true, y_hat))
        if scores is not None:
            try:
                print('roc_auc_score:', roc_auc_score(y_true, scores))
                # The curve is computed per split so the train section no
                # longer reports the test curve's area.
                precision, recall, _ = precision_recall_curve(y_true, scores)
                print('roc_auc_recall_precision_score:', metrics.auc(recall, precision), '\n')
            except ValueError as err:
                print('probability-based scores unavailable:', err, '\n')
        print('confusion_matrix:\n\n', confusion_matrix(y_true, y_hat), '\n')
        print('classification_report:\n\n', classification_report(y_true, y_hat), '\n')

    # Prints the bound method's repr, which includes the estimator's repr.
    print('Model:', model.get_params, '\n')
    if hasattr(model, 'best_params_'):
        print('model.best_params_:', model.best_params_, '\n')
    else:
        print()

    _print_split("Train:", X_train_scaled, y_train)
    print()
    _print_split("Test:", X_test_scaled, y_test)

def train_control_table():
    """Return the training features, target and model predictions side by side.

    Uses the notebook-level ``model``, ``X_train_scaled``, ``X_train`` and
    ``y_train``. Predictions are re-indexed to ``y_train.index`` and stored in
    a column named ``y_train_pred``.
    """
    predicted = (
        pd.DataFrame(model.predict(X_train_scaled))
        .rename(columns={0: 'y_train_pred'})
        .set_index(y_train.index)
    )
    return pd.concat([X_train, y_train, predicted], axis=1)

def test_control_table():
    """Return the test features, target and model predictions side by side.

    Uses the notebook-level ``model``, ``X_test_scaled``, ``X_test`` and
    ``y_test``. Predictions are re-indexed to ``y_test.index`` and stored in a
    column named ``y_test_pred``.
    """
    predicted = (
        pd.DataFrame(model.predict(X_test_scaled))
        .rename(columns={0: 'y_test_pred'})
        .set_index(y_test.index)
    )
    return pd.concat([X_test, y_test, predicted], axis=1)

## Fill Missing Values

In [1]:
###############################################################################
def fill_most(df, group_col, col_name):
    '''Fill nulls in ``col_name`` with the mode of the row's ``group_col`` group.

    Works in place on ``df``. When a group has no mode (all of its values are
    null) the mode of the whole column, as it stands at that moment, is used
    instead. Afterwards the remaining null count and the value counts of the
    column are printed.
    '''
    for key in df[group_col].unique():
        in_group = df[group_col] == key
        group_modes = df.loc[in_group, col_name].mode()
        if len(group_modes) > 0:
            replacement = group_modes[0]
        else:
            replacement = df[col_name].mode()[0]
        df.loc[in_group, col_name] = df.loc[in_group, col_name].fillna(replacement)

    remaining = df[col_name].isnull().sum()
    print("Number of NaN : ", remaining)
    print("------------------")
    print(df[col_name].value_counts(dropna=False))
    
###############################################################################

def fill_prop(df, group_col, col_name):
    """Fill nulls in ``col_name`` by forward/backward propagation within groups.

    Works in place on ``df``. Inside each ``group_col`` group nulls take the
    previous non-null value of that group, then the next one. Anything still
    null is filled the same way over the whole column. Afterwards the
    remaining null count and the value counts of the column are printed.
    """
    # .ffill()/.bfill() replace fillna(method=...), which is deprecated in
    # current pandas releases.
    for group in list(df[group_col].unique()):
        cond = df[group_col] == group
        df.loc[cond, col_name] = df.loc[cond, col_name].ffill().bfill()
    df[col_name] = df[col_name].ffill().bfill()
    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))
    
###############################################################################

def fill(df, group_col1, group_col2, col_name, method): # method can be "mode" or "median" or "ffill"
    """Fill nulls in ``col_name`` using a two-level grouping, in place.

    Parameters
    ----------
    df : DataFrame
        Modified in place.
    group_col1, group_col2 : str
        Outer and inner grouping columns.
    col_name : str
        Column whose nulls are filled.
    method : str
        ``"mode"``   - mode of the (group1, group2) cell, else of group1, else
                       of the whole column.
        ``"median"`` - median with the same fallback order.
        ``"ffill"``  - forward then backward propagation inside each
                       (group1, group2) cell, then inside each group1, then
                       over the whole column.
        Any other value leaves the column untouched.

    Afterwards the remaining null count and the value counts are printed.
    """
    outer_groups = list(df[group_col1].unique())
    inner_groups = list(df[group_col2].unique())

    if method == "mode":
        for group1 in outer_groups:
            cond1 = df[group_col1]==group1
            for group2 in inner_groups:
                cond2 = cond1 & (df[group_col2]==group2)
                mode1 = list(df[cond1][col_name].mode())
                mode2 = list(df[cond2][col_name].mode())
                if mode2 != []:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(mode2[0])
                elif mode1 != []:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(mode1[0])
                else:
                    df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[col_name].mode()[0])

    elif method == "median":
        for group1 in outer_groups:
            cond1 = df[group_col1]==group1
            for group2 in inner_groups:
                cond2 = cond1 & (df[group_col2]==group2)
                df.loc[cond2, col_name] = df.loc[cond2, col_name].fillna(df[cond2][col_name].median()).fillna(df[cond1][col_name].median()).fillna(df[col_name].median())

    elif method == "ffill":
        # .ffill()/.bfill() replace fillna(method=...), which is deprecated in
        # current pandas releases.
        for group1 in outer_groups:
            cond1 = df[group_col1]==group1
            for group2 in inner_groups:
                cond2 = cond1 & (df[group_col2]==group2)
                df.loc[cond2, col_name] = df.loc[cond2, col_name].ffill().bfill()

        for group1 in outer_groups:
            cond1 = df[group_col1]==group1
            df.loc[cond1, col_name] = df.loc[cond1, col_name].ffill().bfill()

        df[col_name] = df[col_name].ffill().bfill()

    print("Number of NaN : ",df[col_name].isnull().sum())
    print("------------------")
    print(df[col_name].value_counts(dropna=False))
###############################################################################

# Load|Read Data

In [None]:
# Template placeholder: '.csv' has no file name; put the dataset path here before running.
df0 = pd.read_csv('.csv')
# Work on a copy so the frame as loaded (df0) stays available.
df = df0.copy()
df.head(3) 

# First Looking

In [None]:
# Each helper below modifies df in place.
# Prints an overview and renames the columns (lower case, '&' and ' ' -> '_').
first_looking(df)
# Drops fully duplicated rows, keeping the first occurrence.
duplicate_values(df)
# Empty list: nothing is dropped, only an informational note is printed.
drop_columns(df, [])
# Drops every column whose share of nulls is above 90 percent.
drop_null(df, 90)
# df.describe().T

# Train-Test Split|Dummy

In [None]:
# make_dtype_object = df[['categorical1','categorical2']].astype('object')  # if we have features need to be dummy!!!
# NOTE(review): `target` is not defined in any visible cell; it must hold the
# name of the label column before this cell is run.
X_numerical = df.drop(target, axis=1).select_dtypes('number').astype('float64')
X_categorical = df.drop(target, axis=1).select_dtypes('object')

# Object-typed features are one-hot encoded (first level dropped) and appended
# to the numeric ones; otherwise every feature is used as float.
if (df.dtypes==object).any():
    dummied = pd.get_dummies(X_categorical, drop_first=True)
    X = pd.concat([X_numerical, dummied[dummied.columns]], axis=1)
    
else:
    X = df.drop(target, axis=1).astype('float64')

y = df[target]

###############################################################################
## Train - Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    stratify=y,
                                                    test_size=0.30, 
                                                    random_state=42)

###############################################################################
## Scaling
# Fit the scaler on the training split only and reuse its statistics for the
# test split, so both splits are on the same scale and no test information
# leaks into preprocessing.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

###############################################################################
## Columns
X_categorical_list = X_categorical.columns.to_list()
X_numerical_list = X_numerical.columns.to_list()
X_columns_list = X.columns.to_list()

# Preview of the scaled training features; kept as the last expression so the
# notebook actually displays it.
pd.DataFrame(X_train_scaled, columns=X.columns).head()

# OneHotEncoder

In [None]:
# One-hot encode the categorical columns into a dense frame; binary features keep a single column.
# NOTE(review): `df_` is not defined in any visible cell - presumably `df` was meant; confirm.
# NOTE(review): `sparse=` and `get_feature_names` look tied to an older scikit-learn API;
# check them against the installed version.
ohe = OneHotEncoder(sparse=False, drop="if_binary")
ohe_df = ohe.fit_transform(df_[X_categorical.columns])
ohe_df = pd.DataFrame(ohe_df, columns=ohe.get_feature_names(df_[X_categorical.columns].columns))

# SMOTE

In [None]:
!pip install imbalanced-learn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

In [None]:
# Target number of class-1 rows after over-sampling.
over_to = 100000
# Both samplers are seeded like the rest of the notebook (random_state=42) so
# the resampled set is reproducible across runs.
over = SMOTE(sampling_strategy={1:over_to}, random_state=42)
# Class 0 is cut down so the resampled set has as many rows as y_train.
under = RandomUnderSampler(sampling_strategy={0:(y_train.value_counts().sum()-over_to)}, random_state=42)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
X_resampled, y_resampled = pipeline.fit_resample(X_train_scaled, y_train)

In [None]:
from imblearn.combine import SMOTEENN,SMOTETomek
# Seeded like the rest of the notebook so the resampled sets are reproducible.
ros1 = SMOTETomek(random_state=42)
ros2 = SMOTEENN(random_state=42)
# SMOTETomek starts from the scaled training split.
X_resampled_, y_resampled_ = ros1.fit_resample(X_train_scaled, y_train)
# NOTE(review): SMOTEENN starts from the already resampled X_resampled /
# y_resampled of the previous cell, not from the training split - confirm
# this is intended.
X_resampled__, y_resampled__ = ros2.fit_resample(X_resampled, y_resampled)

In [None]:
# Class counts after each resampling strategy.
# y_resampled: SMOTE over-sampling followed by random under-sampling.
print("Over Sampling:", y_resampled.value_counts())
# y_resampled_: output of SMOTETomek (the label below reads "Omek").
print("Omek:", y_resampled_.value_counts())
# y_resampled__: output of SMOTEENN.
print("ENN", y_resampled__.value_counts())

# Model Selection 1 - Train Scores

In [None]:
def model_selection(X_train, y_train):
    """Fit eight baseline classifiers and print their training accuracy.

    All estimators are fitted first, then one accuracy line per estimator is
    printed (scored on the same data they were fitted on). Returns the fitted
    estimators as a tuple in the order: logistic regression, decision tree,
    random forest, KNN, SVC, XGBoost, AdaBoost, gradient boosting.

    NOTE(review): this function shares its name with the ``model_selection``
    module imported from sklearn in the imports cell and replaces that name
    once this cell has been run.
    """
    # (report label, estimator), in fitting, reporting and return order.
    candidates = [
        ("Logistic Regression", LogisticRegression(class_weight="balanced", random_state=42)),
        ("Decision Tree", DecisionTreeClassifier(class_weight="balanced", random_state=42)),
        ("Random Forest", RandomForestClassifier(class_weight="balanced", random_state=42)),
        ("KNN", KNeighborsClassifier()),
        ("SVC", SVC(class_weight="balanced", random_state=42)),
        ("XGBoosting", XGBClassifier(random_state=42)),
        ("AdaBoosting", AdaBoostClassifier(random_state=42)),
        ("GradiendBoosting", GradientBoostingClassifier(random_state=42)),
        # An AdaCost entry (algorithm="SAMME", cost_matrix=cost_matrix) was
        # left disabled in the original cell.
    ]

    for _, estimator in candidates:
        estimator.fit(X_train, y_train)

    # Model Accuracy on Training Data
    for rank, (label, estimator) in enumerate(candidates, start=1):
        print(f"\033[1m{rank}) {label} Training Accuracy:\033[0m {estimator.score(X_train, y_train)}")

    return tuple(estimator for _, estimator in candidates)

# The target is never scaled in this notebook; pass the training labels as they are.
model_selection(X_train_scaled, y_train)

# Model Selection 2 - Train|Test Scores

In [None]:
# Preprocessing applied inside every pipeline: one-hot encode the categorical
# columns, standardise the numeric ones, pass anything else through.
# NOTE(review): X_train was built from pd.get_dummies output in the split cell,
# so the raw columns named in X_categorical.columns are presumably no longer
# present in it - verify which frame should be fed to this transformer.
transformer = ColumnTransformer([("ohe", OneHotEncoder(drop="if_binary"), X_categorical.columns),
                                 ("scaler", StandardScaler(), X_numerical.columns)], 
                                 remainder="passthrough")


# Candidate classifiers as (short name, estimator) pairs.
models = []
models.append(("LOG", LogisticRegression(class_weight="balanced", random_state=42)))
models.append(("DTC", DecisionTreeClassifier(class_weight="balanced", random_state=42)))
models.append(("RFC", RandomForestClassifier(class_weight="balanced", random_state=42)))
models.append(("KNN", KNeighborsClassifier()))
models.append(("SVC", SVC(class_weight="balanced", random_state=42)))
models.append(("ADA", AdaBoostClassifier(random_state=42)))
models.append(("GBC", GradientBoostingClassifier(random_state=42)))
# evaluate each model in turn

# Per-model accumulators, filled in the same order as `names`.
results = []
names = []
f1_scores = []
recall_scores = []
roc_auc_scores = []


for name, model in models:
    # NOTE(review): once the SMOTE import cell has run, `Pipeline` refers to
    # imblearn.pipeline.Pipeline rather than the sklearn one.
    pipe = Pipeline([("transformer", transformer),
                     ("model", model)])
    # 10-fold stratified cross-validation on the training split, scored by recall.
    kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
    cv_results = cross_val_score(pipe, X_train, y_train, cv=kfold, scoring="recall")
    
    results.append(cv_results)
    names.append(name)
    
    print(f"{name}: {round(cv_results.mean(), 4)}")

    # Refit on the whole training split and score the held-out test split.
    y_pred = pipe.fit(X_train, y_train).predict(X_test)
    
    f1_scores.append(f1_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    # ROC AUC is computed from the predicted labels here, not from scores or probabilities.
    roc_auc_scores.append(roc_auc_score(y_test, y_pred))
    
    

# One column per model, one row per CV fold (1..10).
result_df = pd.DataFrame(results, columns=[i for i in range(1, 11)], index=names).T
result_df.iplot(kind="box", boxpoints="all", title="CV Results")

# Test-split scores per model.
compare = pd.DataFrame({"F1": f1_scores,
                        "Recall": recall_scores,
                        "ROC AUC": roc_auc_scores
                       }, index=names)

# One horizontal bar chart per metric, sorted ascending.
for score in compare.columns:
    compare[score].sort_values().iplot(kind="barh", title=f"{score} Score")
    
compare 

# Model Selection 3 - Pycaret

In [None]:
!pip install pycaret[full]
!pip install pycaret-nightly
# # https://pycaret.readthedocs.io/en/latest/index.html#
# # install the nightly build
# pip install pycaret-nightly
# # install the full version of the nightly build
# pip install pycaret-nightly[full]

In [None]:
from pycaret.classification import *
# `setup` comes from the `pycaret.classification` star import above.
# The target column name 'class', the experiment name and the 0.8 train share
# are hard-coded here; adapt them to the dataset loaded into df.
fraud_classifier = setup(df, 
                         target='class',
                         session_id=123,
                         train_size=0.8,
                         log_experiment=True,
                         log_plots=True,
                         html=False,
                         experiment_name='Creditcard_Fraud_Detection')

In [None]:
# `compare_models` comes from the `pycaret.classification` star import; it relies on the `setup` call above.
best_model_scores = compare_models()

# TRANSFORMER

In [None]:
# Shared preprocessing: one-hot encode the categorical columns, standardise the
# numeric ones, and pass any remaining column through unchanged.
# Uses the column-name lists built in the train-test split cell.
transformer = ColumnTransformer([("ohe", OneHotEncoder(drop="if_binary"), X_categorical_list),
                                 ("scaler", StandardScaler(), X_numerical_list)], 
                                 remainder="passthrough")

# Catboost, XGBoost, LightGBM-Imbalanced Data

In [None]:
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import auc
# https://machinelearningmastery.com/roc-curves-and-precision-recall-curves-for-imbalanced-classification/

In [None]:
# Recall per model on the test split, collected for the final comparison chart.
comp_recall = []
comp_recall_name = []
X_categorical_list=X_categorical.columns.to_list()
X_numerical_list=X_numerical.columns.to_list()
X_columns_list=X.columns.to_list()

transformer = ColumnTransformer([("ohe", OneHotEncoder(drop="if_binary"), X_categorical_list),
                                 ("scaler", StandardScaler(), X_numerical_list)], 
                                 remainder="passthrough")

# Boosted models with the positive class up-weighted by a factor of 600.
models = []
models.append(("XGB", XGBClassifier(random_state=42, verbosity = 0, scale_pos_weight = 600)))
models.append(("LGB", LGBMClassifier(random_state=42, scale_pos_weight = 600)))
models.append(("CAT", CatBoostClassifier(random_state=42, verbose=0, 
                                         cat_features=X_categorical_list, scale_pos_weight = 600)))

# evaluate each model in turn

results = []
names = []
f1_scores = []
recall_scores = []
precision_scores = []
roc_auc_scores = []
precision_recall_auc_scores = []

for name, model in models:
    if name == "CAT":
        # CatBoost is fitted on the frame directly and is told which columns
        # are categorical; it does not go through the transformer.
        estimator = model
        fit_params = {"cat_features": X_categorical_list}
    else:
        estimator = Pipeline([("transformer", transformer),
                              ("model", model)])
        fit_params = {}

    # 10-fold stratified cross-validation on the training split, scored by recall.
    kfold = StratifiedKFold(n_splits=10, random_state=0, shuffle=True)
    cv_results = cross_val_score(estimator, X_train, y_train, cv=kfold, scoring="recall")

    results.append(cv_results)
    names.append(name)

    print(f"{name} MODEL: {round(cv_results.mean(), 4)}")

    # Refit on the whole training split and score the held-out test split.
    y_pred = estimator.fit(X_train, y_train, **fit_params).predict(X_test)
    test_recall = recall_score(y_test, y_pred)

    f1_scores.append(f1_score(y_test, y_pred))
    recall_scores.append(test_recall)
    # Precision is reported for class 0 (pos_label=0), recall for class 1.
    precision_scores.append(precision_score(y_test, y_pred, pos_label=0))
    # Both AUC values below are computed from predicted labels, not probabilities.
    roc_auc_scores.append(roc_auc_score(y_test, y_pred))
    # calculate the precision-recall auc
    precision, recall, _ = precision_recall_curve(y_test, y_pred)
    precision_recall_auc_scores.append(auc(recall, precision))

    comp_recall.append(test_recall)
    # Same label for every model (one branch previously read "Scale Post Weight").
    comp_recall_name.append(f"{name} Scale Pos Weight 600")


# One column per model, one row per CV fold (1..10).
result_df = pd.DataFrame(results, columns=[i for i in range(1, 11)], index=names).T
result_df.iplot(kind="box", boxpoints="all", title="CV Results")

compare = pd.DataFrame({"F1": f1_scores,
                        "Recall-1": recall_scores,
                        "Precision-0": precision_scores,
                        "ROC AUC": roc_auc_scores,
                        "Recall AUC": precision_recall_auc_scores
                       }, index=names)

for score in compare.columns:
    compare[score].sort_values().iplot(kind="barh", title=f"{score} Score")
    
compare

## Compare all models
pd.DataFrame(comp_recall, index=comp_recall_name, columns=["Recall"]).sort_values(by="Recall").iplot(kind="barh",
                                                                                                    title="Model Comparison")

# Classification Reports

In [None]:
def model_validation(y_train, y_train_pred, y_test, y_test_pred, model_name):
    """Build a table of regression scores for the train and test predictions.

    Returns a DataFrame with the rows ``R2``, ``rmse``, ``mse`` and ``mae`` and
    the columns ``<model_name>_train`` and ``<model_name>_test``.
    NOTE(review): a function with this name and signature is also defined in
    the "Useful Functions|Classification Reports" cell earlier in the notebook.
    """
    def _regression_scores(actual, predicted):
        return {
            "R2": r2_score(actual, predicted),
            "rmse": np.sqrt(mean_squared_error(actual, predicted)),
            "mse": mean_squared_error(actual, predicted),
            "mae": mean_absolute_error(actual, predicted),
        }

    table = {}
    table[f"{model_name}_train"] = _regression_scores(y_train, y_train_pred)
    table[f"{model_name}_test"] = _regression_scores(y_test, y_test_pred)
    return pd.DataFrame(table)

# lm = model_validation(y_train, y_train_pred, y_test, y_test_pred, 'lm')

# pd.concat([lm, rs, rcvs, lss, lcvs, es, ecvs], axis = 1)

###############################################################################

def get_classification_report(y_test, y_test_pred):
    """Return scikit-learn's classification report as a DataFrame.

    Parameters
    ----------
    y_test : true labels.
    y_test_pred : predicted labels.

    Returns
    -------
    pd.DataFrame
        The report dictionary transposed, i.e. one row per entry of the
        report and one column per metric.
    """
    from sklearn import metrics

    report_dict = metrics.classification_report(y_test, y_test_pred, output_dict=True)
    return pd.DataFrame(report_dict).T

###############################################################################
    
def get_report(model_name):
    """Print train and test evaluation metrics for a fitted classifier.

    For each split the function prints RMSE, accuracy, ROC AUC, the area under
    the precision-recall curve, the confusion matrix and the classification
    report. Nothing is returned.

    Parameters
    ----------
    model_name : fitted estimator exposing ``predict`` and ``predict_proba``
        (despite the name, this is the model object itself).

    Notes
    -----
    Reads the notebook-level ``X_train_scaled``, ``X_test_scaled``, ``y_train``
    and ``y_test``. Column 1 of ``predict_proba`` is used as the score, which
    presumably corresponds to the positive class of a binary target - confirm.
    """

    def _print_split(title, y_true, y_hat, y_hat_proba, *report_tail):
        # `report_tail` carries the optional trailing '\n' printed after the report.
        scores = y_hat_proba[:, 1]
        print(title)
        print('rmse:', np.sqrt(mean_squared_error(y_true, y_hat)))
        print('accuracy:', accuracy_score(y_true, y_hat))
        print('roc_auc_score:', roc_auc_score(y_true, scores))
        precision, recall, _ = precision_recall_curve(y_true, scores)
        print('roc_auc_recall_precision_score:', auc(recall, precision), '\n')
        print('confusion_matrix:\n\n', confusion_matrix(y_true, y_hat), '\n')
        print('classification_report:\n\n', get_classification_report(y_true, y_hat), *report_tail)

    # All predictions are produced up front, before anything is printed.
    train_pred = model_name.predict(X_train_scaled)
    train_proba = model_name.predict_proba(X_train_scaled)
    test_pred = model_name.predict(X_test_scaled)
    test_proba = model_name.predict_proba(X_test_scaled)

    _print_split("Train:", y_train, train_pred, train_proba, '\n')
    print()
    _print_split("Test:", y_test, test_pred, test_proba)

def train_control_table(model_name):
    """Put the model's training predictions next to the training data.

    Parameters
    ----------
    model_name : fitted estimator exposing ``predict`` (the model object,
        not a string).

    Returns
    -------
    pd.DataFrame
        ``X_train``, ``y_train`` and a ``'y_train_pred'`` column concatenated
        column-wise.

    Notes
    -----
    Reads the notebook-level ``X_train``, ``X_train_scaled`` and ``y_train``.
    """
    predictions = pd.DataFrame(model_name.predict(X_train_scaled)).rename(columns={0: 'y_train_pred'})
    # Give the predictions y_train's index so that concat lines the rows up.
    predictions = predictions.set_index(y_train.index)
    return pd.concat([X_train, y_train, predictions], axis=1)

def test_control_table(model_name):
    """Put the model's test predictions next to the test data.

    Parameters
    ----------
    model_name : fitted estimator exposing ``predict`` (the model object,
        not a string).

    Returns
    -------
    pd.DataFrame
        ``X_test``, ``y_test`` and a ``'y_test_pred'`` column concatenated
        column-wise.

    Notes
    -----
    Reads the notebook-level ``X_test``, ``X_test_scaled`` and ``y_test``.
    """
    predictions = pd.DataFrame(model_name.predict(X_test_scaled)).rename(columns={0: 'y_test_pred'})
    # Give the predictions y_test's index so that concat lines the rows up.
    predictions = predictions.set_index(y_test.index)
    return pd.concat([X_test, y_test, predictions], axis=1)

# GridSearchCV|PipeLine

In [None]:
## 1-Logistic Regression - GridSearchCV|PipeLine
from sklearn.pipeline import Pipeline  # local import so this cell runs on its own

## Transformer
X_categorical_list = X_categorical.columns.to_list()
X_numerical_list = X_numerical.columns.to_list()
X_columns_list = X.columns.to_list()

# One-hot encode the categorical columns, standardise the numerical ones and
# pass every remaining column through untouched.
transformer = ColumnTransformer([("ohe", OneHotEncoder(drop="if_binary"), X_categorical_list),
                                 ("scaler", StandardScaler(), X_numerical_list)],
                                 remainder="passthrough")

## Pipeline
# The step name "classifier" must match the "classifier__" prefix used in the
# parameter grid below.
pipe = Pipeline([("transformer", transformer),
                 ("classifier", LogisticRegression())])

## GridSearchCV
# One sub-grid per solver so that only penalty/solver pairs accepted by
# LogisticRegression are fitted: saga handles l1 and l2, lbfgs handles l2 only.
# l1_ratio is not searched because it only takes effect with
# penalty="elasticnet", which is not part of this grid.
C_values = np.logspace(0, 10, 10)
params = [{"classifier__class_weight": ["balanced"],
           "classifier__penalty": ["l1", "l2"],
           "classifier__solver": ["saga"],
           "classifier__C": C_values},
          {"classifier__class_weight": ["balanced"],
           "classifier__penalty": ["l2"],
           "classifier__solver": ["lbfgs"],
           "classifier__C": C_values}]

kfold = StratifiedKFold(n_splits=10, random_state=42, shuffle=True)
log_gridCV_model = GridSearchCV(pipe, param_grid=params, cv=kfold, scoring="recall", verbose=1)

# Fit on the resampled training data; the pipeline does the preprocessing, so the
# raw X_test is passed to predict.
log_gridCV_model.fit(X_resampled, y_resampled)
y_pred = log_gridCV_model.predict(X_test)

# Start a fresh comparison list; every model evaluated in this section appends to it.
comp_recall = []
comp_recall_name = []
comp_recall.append(recall_score(y_test, y_pred))
comp_recall_name.append("Logistic Regression")

print(classification_report(y_test, y_pred))

## 2-RandomForest - GridSearchCV|PipeLine
## 3-SVM - GridSearchCV|PipeLine
## .
## .

## Compare all models
pd.DataFrame(comp_recall, index=comp_recall_name, columns=["Recall"]).sort_values(by="Recall").iplot(kind="barh",
                                                                                                    title="Model Comparison")

# GridSearchCV|User Defined Functions

In [None]:
## 1-Logistic Regression

## GridSearchCV
# class_weight is supplied by the grid below, so the base estimator keeps its defaults.
log_model = LogisticRegression()

# One sub-grid per valid penalty/solver family: lbfgs handles l2 only, saga
# handles l1, l2 and elasticnet. l1_ratio is searched only together with
# elasticnet, the one penalty that uses it.
C_values = np.logspace(0, 10, 10)
params = [{"class_weight": ["balanced"],
           "penalty": ["l1", "l2"],
           "solver": ["saga"],
           "C": C_values},
          {"class_weight": ["balanced"],
           "penalty": ["l2"],
           "solver": ["lbfgs"],
           "C": C_values},
          {"class_weight": ["balanced"],
           "penalty": ["elasticnet"],
           "solver": ["saga"],
           "l1_ratio": np.linspace(0, 1, 10),
           "C": C_values}]


# Fixed seed so the shuffled folds are the same on every run.
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
log_gridCV_model = GridSearchCV(log_model, param_grid=params, cv=kfold, scoring="recall", verbose=1).fit(X_train_scaled, y_train)
# The search is fitted on scaled features, so the hold-out set must be scaled as well.
y_pred = log_gridCV_model.predict(X_test_scaled)

print(classification_report(y_test, y_pred))
print(recall_score(y_test, y_pred))


# penalty = log_gridCV_model.best_params_["penalty"]
# C = log_gridCV_model.best_params_["C"]
# l1_ratio = log_gridCV_model.best_params_.get("l1_ratio")  # only present for elasticnet candidates
# solver = log_gridCV_model.best_params_["solver"]
# class_weight = log_gridCV_model.best_params_["class_weight"]

# print(log_gridCV_model.best_params_)

## Print Scores
# classification_report(log_gridCV_model)
# train_control_table(model)
# test_control_table(model)

# Model Tuning

In [None]:
# Model tuning
# Refit a single LogisticRegression with the best hyper-parameters found by the
# grid search, read straight from the search object so this cell does not depend
# on loose variables left over from other cells.
best_params = log_gridCV_model.best_params_
tuned_model = LogisticRegression(class_weight=best_params["class_weight"],
                                 penalty=best_params["penalty"],
                                 C=best_params["C"],
                                 solver=best_params["solver"],
                                 max_iter=5000,
                                 # falls back to None when l1_ratio is not among the best parameters
                                 l1_ratio=best_params.get("l1_ratio")).fit(X_train_scaled, y_train)
y_test_pred = tuned_model.predict(X_test_scaled)