In [23]:
import pandas as pd 
import numpy as np 

from sklearn.preprocessing import LabelEncoder

import warnings
warnings.filterwarnings("ignore")

In [20]:
def model_builder(estimator,X,y,testsize,randomstate,ml_type= "Classifier"):
    
    from sklearn.model_selection import train_test_split,StratifiedKFold,cross_val_score
    from sklearn.metrics import accuracy_score, confusion_matrix, auc, r2_score
    
    X_train,X_test,y_train,y_test = train_test_split(X,y,test_size= testsize, random_state= randomstate)
    estimator.fit(X_train,y_train)
    if ml_type == "Classifier":
        test_acc = accuracy_score(y_test,estimator.predict(X_test))
        train_acc = accuracy_score(y_train,estimator.predict(X_train))
        stf = StratifiedKFold(n_splits=10, shuffle=True, random_state= randomstate)
        cvmean = np.mean(cross_val_score(estimator,X,y,cv=stf))
        cvstd = np.std(cross_val_score(estimator,X,y,cv=stf))
    else:
        test_acc = accuracy_score(y_test,estimator.predict(X_test))
        train_acc = accuracy_scorec(y_train,estimator.predict(X_train))
        cvmean = np.mean(cross_val_score(estimator,X,y,cv=stf))
        cvstd = np.std(cross_val_score(estimator,X,y,cv=stf))
        
    list_values = [type(estimator).__name__, test_acc, train_acc, cvmean, cvstd]
    
    return list_values


In [25]:
def mainfunc(X,y,testsize,randomstate,ml_type= "Classifier"):
    
    from sklearn.linear_model import LinearRegression, LogisticRegression, Lasso, Ridge
    from sklearn.tree import DecisionTreeClassifier,DecisionTreeRegressor
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn.svm import SVC,SVR
    from sklearn.ensemble import RandomForestClassifier,RandomForestRegressor,GradientBoostingClassifier,GradientBoostingRegressor,AdaBoostClassifier,AdaBoostRegressor
    
    l1 = []
    classif_models = [LogisticRegression(),DecisionTreeClassifier(),KNeighborsClassifier(),SVC(),RandomForestClassifier(),GradientBoostingClassifier(),AdaBoostClassifier()]
    reg_models = [LinearRegression(),Lasso(),Ridge(),DecisionTreeRegressor(),KNeighborsRegressor(),SVR(),RandomForestRegressor(),GradientBoostingRegressor(),AdaBoostRegressor()]
    
    if ml_type == "Classifier":
        for i in classif_models:
            l1.append(model_builder(i,X,y,testsize,randomstate,ml_type))
    else:
        for i in reg_models:
            l1.append(model_builder(i,X,y,testsize,randomstate,ml_type))
            
    result = pd.DataFrame(l1, columns=["Model Name","Test Accuracy","Train Accuracy","Cross Val Mean","Cross Val Std"]) 
    
    return result.sort_values("Test Accuracy",ascending=False)
    
    

In [4]:
df = pd.read_csv("Bank data (1).csv")

In [5]:
df.head()

Unnamed: 0,Cust No.,First Name,Surname,Credit Score,Geography,Gender,Age,Tenure,Balance,Num Of Policies,Credit Card,Active Member,Salary,Exited
0,1,Walter,Hargrave,619,France,Female,42,2.0,,1.0,Yes,Yes,101349,1
1,2,Daniel,Hill,608,Spain,Female,41,1.0,83807.86,1.0,No,Yes,112543,0
2,3,Melissa,Onio,502,France,Female,42,8.0,159660.8,3.0,Yes,No,113932,1
3,4,Miley,Boni,699,France,Female,39,1.0,,2.0,No,No,93827,0
4,5,James,Mitchell,850,Spain,Female,43,2.0,125510.82,1.0,Yes,Yes,79084,0


In [6]:
df["Geography"].fillna(value= "France",inplace=True)
df["Gender"].fillna(value= "Male",inplace=True)
df["Tenure"].fillna(value= 5,inplace=True)
df["Balance"].fillna(value= 119843.19,inplace=True)
df["Num Of Policies"].fillna(value= 1.53,inplace=True)

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10071 entries, 0 to 10070
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Cust No.         10071 non-null  int64  
 1   First Name       10071 non-null  object 
 2   Surname          10071 non-null  object 
 3   Credit Score     10071 non-null  int64  
 4   Geography        10071 non-null  object 
 5   Gender           10071 non-null  object 
 6   Age              10071 non-null  int64  
 7   Tenure           10071 non-null  float64
 8   Balance          10071 non-null  float64
 9   Num Of Policies  10071 non-null  float64
 10  Credit Card      10071 non-null  object 
 11  Active Member    10071 non-null  object 
 12  Salary           10071 non-null  int64  
 13  Exited           10071 non-null  int64  
dtypes: float64(3), int64(5), object(6)
memory usage: 1.1+ MB


In [10]:
#Label Encoder

le = LabelEncoder()
df["Geography"] = le.fit_transform(df["Geography"])
df["Gender"] = le.fit_transform(df["Gender"])
df["Credit Card"] = le.fit_transform(df["Credit Card"])
df["Active Member"] = le.fit_transform(df["Active Member"])

In [11]:
df.head()

Unnamed: 0,Cust No.,First Name,Surname,Credit Score,Geography,Gender,Age,Tenure,Balance,Num Of Policies,Credit Card,Active Member,Salary,Exited
0,1,Walter,Hargrave,619,0,0,42,2.0,119843.19,1.0,1,1,101349,1
1,2,Daniel,Hill,608,2,0,41,1.0,83807.86,1.0,0,1,112543,0
2,3,Melissa,Onio,502,0,0,42,8.0,159660.8,3.0,1,0,113932,1
3,4,Miley,Boni,699,0,0,39,1.0,119843.19,2.0,0,0,93827,0
4,5,James,Mitchell,850,2,0,43,2.0,125510.82,1.0,1,1,79084,0


In [12]:
df.drop(["Cust No.","First Name","Surname"],axis=1,inplace=True)

In [13]:
def Outlier(my_df, col):
    q1 = my_df[col].quantile(0.25)
    q3 = my_df[col].quantile(0.75)
    IQR = q3 - q1
    lw = q1 - 1.5*IQR
    uw = q3 + 1.5*IQR
    my_df[col] = np.where(my_df[col]>uw,uw,my_df[col])
    my_df[col] = np.where(my_df[col]<lw,lw,my_df[col])
    return my_df 

In [14]:
Outlier(df,["Balance","Num Of Policies","Age","Credit Score"])

Unnamed: 0,Credit Score,Geography,Gender,Age,Tenure,Balance,Num Of Policies,Credit Card,Active Member,Salary,Exited
0,619.0,0,0,42.0,2.0,119843.19,1.00,1,1,101349,1
1,608.0,2,0,41.0,1.0,86979.11,1.00,0,1,112543,0
2,502.0,0,0,42.0,8.0,152258.35,3.00,1,0,113932,1
3,699.0,0,0,39.0,1.0,119843.19,2.00,0,0,93827,0
4,850.0,2,0,43.0,2.0,125510.82,1.00,1,1,79084,0
...,...,...,...,...,...,...,...,...,...,...,...
10066,777.0,0,0,35.0,3.0,86979.11,2.00,1,1,156120,0
10067,561.0,1,1,62.0,8.0,152258.35,1.53,0,1,24613,1
10068,450.0,0,1,60.0,9.0,86979.11,3.50,1,1,86961,0
10069,819.0,1,1,45.0,3.0,94661.00,3.00,1,1,89433,1


In [15]:
X = df.drop("Exited",axis=1)
y = df[["Exited"]]

In [26]:
mainfunc(X,y,testsize=0.2,randomstate=5)

Unnamed: 0,Model Name,Test Accuracy,Train Accuracy,Cross Val Mean,Cross Val Std
5,GradientBoostingClassifier,0.849628,0.872766,0.861086,0.006833
4,RandomForestClassifier,0.848635,1.0,0.855626,0.005385
6,AdaBoostClassifier,0.837717,0.857498,0.850958,0.005122
0,LogisticRegression,0.787097,0.797294,0.786814,0.006522
3,SVC,0.787097,0.797294,0.795254,0.000359
1,DecisionTreeClassifier,0.768734,1.0,0.785324,0.014735
2,KNeighborsClassifier,0.758313,0.816162,0.760698,0.005625
