In [None]:
"""
ML Model comparision
New versus traditional models performance comparision.

10 different models were used in this study. These models are; 
"XGBOOST", "LIGHT_GBM", "RANDOM_FOREST", "GRADIENT_BOOSTING", "DECISION_TREE", "KNN"
,"LOGISTIC_REGRESSION", "MULTINOMINAL_NAIVEBAYES", "SVM", "GAUSSIAN_NAIVEBAYES".

Churn analysis was used in this study.
Dataset consists of 112 column and 6218 rows.

Churn results of dataset are; 
1    5399
0    1429

Therefore I used Stratified kFold cross validation, because there is a big difference between the results.

We can see models performance on this task and compare them. 
Also, we can compare performance of each model with 'train test split' and 'Stratified kFold cross validation'.

The result file is attached.


"""


In [13]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tkinter import *
import openpyxl


from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import GradientBoostingClassifier
import lightgbm as ltb
import xgboost as xgb

from sklearn import preprocessing
from sklearn.model_selection import StratifiedKFold
from statistics import mean, stdev


In [2]:

# Reading dataset from excel file

fileName = r'D:\BiletBankProje\dataset.xlsx'

df = pd.read_excel(fileName)

# Results of project for create plots

fileName2 = r'D:\BiletBankProje\final_result.xlsx'

df_final = pd.read_excel(fileName2)

# Total churn results of our dataset
df["Churn"].value_counts()

1    5399
0    1429
Name: Churn, dtype: int64

In [3]:
# Get File informations.

def fileInfo():
 split_tup = os.path.splitext('D:\BiletBankProje\dataset.xlsx')
 file_name = split_tup[0]
 file_extension = split_tup[1]  
 print("File Name: ", file_name)
 print("File Extension: ", file_extension) 

In [4]:
# Drop churn values from our dataset for estimation of these values
inputs1 = df.drop('Churn', axis='columns')
target = df['Churn']
inputs1

# Split into 
x = inputs1.iloc[:,:112].values
y = target
xTrain,xTest,yTrain,yTest = train_test_split(x,y,test_size=0.2, random_state=0)



In [59]:
# Result method for models
def results(yTest,yPred):

 
 
 print("\n***********************\n")
 
 acc = accuracy_score(yTest,yPred)
 print('Accuracy :',acc)
 prec = precision_score(yTest,yPred)
 print('Precision :',prec)
 f1 =f1_score(yTest,yPred)
 print('F1 Score :',f1)
 recall = recall_score(yTest,yPred)
 print('Recall :',recall)
 cn = confusion_matrix(yTest,yPred)
 print('confusion_matrix :\n',cn) 

 print("\n***********************\n")


In [74]:
# parameters changed and tested manually for each model
# parameters that work best f1_score are wrote
# train test split method used for these models

# Decision Tree

def DecisionTree_train_test_split():
  print('\n Decision Tree \n')
  
  model = tree.DecisionTreeClassifier(criterion="entropy", splitter = "best",min_samples_split = len(df.columns),max_depth =len(df), min_samples_leaf=10,random_state=0)

  model.fit(xTrain,yTrain)
  yPred = model.predict(xTest)
  scr = model.score(xTest,yTest)
 
  results(yTest,yPred)


# Random Forest 

def RandomForest_train_test_split():
   print('\n Random Forest \n')
 
   regressor = RandomForestClassifier(n_estimators=42, random_state=0,criterion= "entropy", bootstrap= True, min_samples_split=2)
   regressor.fit(xTrain, yTrain)
   yPred = regressor.predict(xTest)
  
   results(yTest,yPred)


# LogisticRegression

def LogisticRegressionFunction_train_test_split():
 print('\n Logistic Regression \n')


 lr = LogisticRegression(penalty='l2', C=1.0,solver='newton-cg')            
 lr.fit(xTrain,yTrain)
 yPred = lr.predict(xTest)
 

 results(yTest,yPred)

 # KNN

def KNN_train_test_split():
 print('\n KNN \n')

 knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski'
 ,metric_params=None, n_jobs=1, n_neighbors=3, p=1,weights='uniform')


 knn.fit(xTrain,yTrain)
 yPred = knn.predict(xTest)


 results(yTest,yPred)


# Gaussian Naive Bayes

def GaussianNaiveBayesClass_train_test_split(): 
 print('\n Gaussian NaiveBayes \n')
 NaiveBayes = GaussianNB(var_smoothing=0.01)
 NaiveBayes.fit(xTrain,yTrain)
 NaiveBayes.fit(xTrain, yTrain)
 yPred = NaiveBayes.predict(xTest)


 results(yTest,yPred)



# Multinominal Naive Bayes

def MultiNaiveBayesClass_train_test_split(): 
 print('\n Multinominal NaiveBayes \n')
 NaiveBayesM = MultinomialNB()
 NaiveBayesM.fit(xTrain,yTrain)
 NaiveBayesM.fit(xTrain, yTrain)
 yPred = NaiveBayesM.predict(xTest)
 

 results(yTest,yPred)

def SVM_train_test_split(): 
 print('\n Support Vector Machine \n')

 clf = svm.SVC(kernel=y,gamma=x)
 
 clf.fit(xTrain, yTrain)
 yPred = clf.predict(xTest)

 results(yTest,yPred)


# Gradient Boosting Classifier
 
def GradientBoostingClass_train_test_split(): 
 print('\n Gradient Boosting Classifierxx\n')

 gradient_booster = GradientBoostingClassifier(loss="deviance",learning_rate=0.1, 
 n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=10, 
 random_state=None,max_features=None,warm_start=False)

 gradient_booster.fit(xTrain, yTrain)
 yPred = gradient_booster.predict(xTest)
 

 results(yTest,yPred)

 # Light GBM Classifier

def LGBMClass_train_test_split(): 
 print('\n Light GBM Classifier \n')
 
 lgmbc = ltb.LGBMClassifier(boosting_type='gbdt')
 
 lgmbc.fit(xTrain,yTrain)
 yPred = lgmbc.predict(xTest)
 

 results(yTest,yPred)

# XgBoost Classifier

def XgBoostClass_train_test_split(): 
 print('\n XgBoost Classifier \n')

 xgbc = xgb.XGBClassifier(booster="gbtree",max_depth=8,gamma=2)
 xgbc.fit(xTrain,yTrain)
 yPred = xgbc.predict(xTest)
 

 results(yTest,yPred)


def runAll_train_test_split():
   DecisionTree_train_test_split()
   RandomForest_train_test_split()
   LogisticRegressionFunction_train_test_split()
   KNN_train_test_split()
   GaussianNaiveBayesClass_train_test_split()
   MultiNaiveBayesClass_train_test_split()
   SVM_train_test_split()
   GradientBoostingClass_train_test_split()
   LGBMClass_train_test_split()
   XgBoostClass_train_test_split()

# Accuracy and f1 scores line charts

def lineChart():
 accuracy = df_final[['Score']]
 f1_score1 = df_final[['F1_Score']]
 algorithm = ["XGBOOST","LIGHT_GBM","RANDOM_FOREST","GRADIENT_BOOSTING","DECISION_TREE","KNN"
 ,"LOGISTIC_REGRESSION","MULTINOMINAL_NAIVEBAYES","SVM","GAUSSIAN_NAIVEBAYES"]
 plt.figure(figsize=[25,12.5])


 plt.plot(algorithm,accuracy,linewidth=4.0) #adds the line
 plt.grid() #adds a grid to the plot
 plt.plot(algorithm,f1_score1,linewidth=4.0,color='red') #adds the line
 plt.grid() #adds a grid to the plot
 plt.ylabel('F1_Score, Accuracy',fontsize=16) #xlabel
 plt.xlabel('Algorithms\n Red Line: F1 Score \n Blue Line: Accuracy',fontsize=16) #ylabel

 plt.figure(figsize=[50,25])

 plt.subplot(2,2,1) 
 plt.plot(algorithm,accuracy,linewidth=4.0) #adds the line
 plt.grid() #adds a grid to the plot
 plt.ylabel('Accuracy',fontsize=16) #xlabel
 plt.xlabel('Algorithms',fontsize=16) #ylabel
 plt.subplot(2,2,2)
 plt.plot(algorithm,f1_score1,linewidth=4.0,color='red') #adds the line
 plt.grid() #adds a grid to the plot
 plt.ylabel('F1_Score',fontsize=16) #xlabel
 plt.xlabel('Algorithms',fontsize=16) #ylabel


# Accuracy and f1 scores box charts

def bar_chart():
 accuracy = df_final[['Score']].squeeze()
 f1_score1 = df_final[['F1_Score']].squeeze()
 algorithm = ["XGBOOST","LIGHT_GBM","RANDOM_FOREST","GRADIENT_BOOSTING","DECISION_TREE","KNN"
 ,"LOGISTIC_REGRESSION","MULTINOMINAL_NAIVEBAYES","SVM","GAUSSIAN_NAIVEBAYES"]

 fig = plt.figure(figsize = (25, 12.5))
 
 plt.bar(algorithm, accuracy, color ='maroon',width = 0.4)
 
 plt.ylabel('Accuracy',fontsize=16) #xlabel
 plt.xlabel('Algorithms',fontsize=16) #ylabel
 plt.title("Accuracy in different algorithms",fontsize=16)
 plt.show()

 fig = plt.figure(figsize = (25, 12.5))
 
 plt.bar(algorithm, f1_score1, color ='blue',width = 0.4)
 
 plt.ylabel('F1 Score',fontsize=16) #xlabel
 plt.xlabel('Algorithms',fontsize=16) #ylabel
 plt.title("F1 Score in different algorithms",fontsize=16)
 
 plt.show()



In [66]:
# Feature Scaling for input features.
scaler = preprocessing.MinMaxScaler()
x_scaled = scaler.fit_transform(x)

# Create StratifiedKFold object.
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
lst_f1_stratified = []

In [67]:
# Results of Stratifed Cross Validation

def resultSkf(lst_f1_stratified):
 print("\n***********************\n")	
 print('List of possible F1_score:', lst_f1_stratified)
 print('\nMaximum F1_score That can be obtained from this model is:',
	max(lst_f1_stratified)*100, '%')
 print('\nMinimum F1_score:',
	min(lst_f1_stratified)*100, '%')
 print('\nOverall F1_score:',
	mean(lst_f1_stratified)*100, '%')
 print('\nStandard Deviation is:', stdev(lst_f1_stratified))
 print("\n***********************\n") 


In [76]:
# parameters changed and tested manually for each model
# parameters that work best f1_score are wrote
# Stratified Cross Validation used for these models

# Decision Tree

def DecisionTree_skf():
 model = tree.DecisionTreeClassifier(criterion="entropy", splitter = "best",min_samples_split = len(df.columns),
 max_depth =len(df), min_samples_leaf=10,random_state=0)

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))

 resultSkf(lst_f1_stratified)



def RandomForest_skf():
 model = RandomForestClassifier(n_estimators=42, random_state=0,criterion= "entropy", 
 bootstrap= True, min_samples_split=2)

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)


def LogisticRegressionFunction_skf():
 model = LogisticRegression() 

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def KNN_skf():
 model = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski'
 ,metric_params=None, n_jobs=1, n_neighbors=3, p=1,weights='uniform')

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def GaussianNaiveBayesClass_skf():
 model = GaussianNB(var_smoothing=0.01)

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def MultiNaiveBayesClass_skf():
 model = MultinomialNB()

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def SVM_skf():
 model = svm.SVC(kernel=y,gamma=x)

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def GradientBoostingClass_skf():
 model = GradientBoostingClassifier(loss="deviance",learning_rate=0.1, 
 n_estimators=100,max_depth=3, min_samples_split=2, min_samples_leaf=1, subsample=10, 
 random_state=None,max_features=None,warm_start=False)

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def LGBMClass_skf():
 model = ltb.LGBMClassifier(boosting_type='gbdt')

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def XgBoostClass_skf():
 model = xgb.XGBClassifier(booster="gbtree",max_depth=8,gamma=2)

 for train_index, test_index in skf.split(x, y):
  x_train_fold, x_test_fold = x_scaled[train_index], x_scaled[test_index]
  y_train_fold, y_test_fold = y[train_index], y[test_index]
  model.fit(x_train_fold, y_train_fold)
  yPred = model.predict(x_test_fold)
  lst_f1_stratified.append(f1_score(y_test_fold, yPred))


 resultSkf(lst_f1_stratified)



def runAll_skf():
   DecisionTree_skf()
   RandomForest_skf()
   #LogisticRegressionFunction_skf()
   KNN_skf()
   GaussianNaiveBayesClass_skf()
   MultiNaiveBayesClass_skf()
   SVM_skf()
   GradientBoostingClass_skf()
   LGBMClass_skf()
   XgBoostClass_skf()

In [77]:
def menu():
     print (' '
      '\n1 - Enter "1" for Decision Tree\n'
        '2 - Enter "2" for Random Forest\n'
        '4 - Enter "3" for KNN\n'
        '5 - Enter "4" for Gaussian Naive Bayes\n'
        '6 - Enter "5" for Multinominal Naive Bayes\n'
        '7 - Enter "6" for Support Vector Machine\n'
        '8 - Enter "7" for Gradient Boosting Classifier\n'
        '9 - Enter "8" for Light GBM Classifier\n'
        '10 - Enter "9" for XgBoost Classifier\n'
        '11 - Enter "10" for Run all Algorithms\n'
        '12 - Enter "11" for Show Line Charts\n'
        '13 - Enter "12" for Show Box Charts\n'
        '0 - Enter "0" for Exit\n\n')
def menu2():
     print (' '
      '\n1 - Enter "1" for Train Test Split\n'
        '2 - Enter "2" for Cross Validation (Stratified kFold)\n')

def options():
 flag = True
 while flag:
    fileInfo() 
    menu()
    option = int(input('Chose a model...'))
    
   
    if option == 1:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         DecisionTree_train_test_split()
         continue
     elif option2 == 2:
         DecisionTree_skf()
         continue
     else:
        print ('Invalid choice')
        continue

    
    elif option == 2:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         RandomForest_train_test_split()
         continue
     elif option2 == 2:
         RandomForest_skf()
         continue
     else:
        print ('Invalid choice')
        continue


    elif option == 3:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         KNN_train_test_split()
         continue
     elif option2 == 2:
           KNN_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 4:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         GaussianNaiveBayesClass_train_test_split()
         continue
     elif option2 == 2:
           GaussianNaiveBayesClass_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 5:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         MultiNaiveBayesClass_train_test_split()
         continue
     elif option2 == 2:
           MultiNaiveBayesClass_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 6:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         SVM_train_test_split()
         continue
     elif option2 == 2:
           SVM_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 7:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         GradientBoostingClass_train_test_split()
         continue
     elif option2 == 2:
           GradientBoostingClass_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 8:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         LGBMClass_train_test_split()
         continue
     elif option2 == 2:
           LGBMClass_train_test_split()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 9:     
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         XgBoostClass_train_test_split()
         continue
     elif option2 == 2:
           XgBoostClass_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 10:
     option2 = int(input('Chose an option...\n '))
     menu2()
     if option2 == 1:
         runAll_train_test_split()
         continue
     elif option2 == 2:
           runAll_skf()
           continue
     else:
        print ('Invalid choice')
        continue


    elif option == 11:
         lineChart()
         continue
    elif option == 13:
         bar_chart()
         continue
    elif option == 0:
         print('Exit...')
         break

    else:
        print ('Invalid choice')
        continue

In [78]:
def main():
    try:
        options()
    except ValueError as error:
        print('Error! Please enter a valid number...')
   


In [80]:
main()

File Name:  D:\BiletBankProje\dataset
File Extension:  .xlsx
 
1 - Enter "1" for Decision Tree
2 - Enter "2" for Random Forest
3- Enter "3" for Logistic Regression
4 - Enter "4" for KNN
5 - Enter "5" for Gaussian Naive Bayes
6 - Enter "6" for Multinominal Naive Bayes
7 - Enter "7" for Support Vector Machine
8 - Enter "8" for Gradient Boosting Classifier
9 - Enter "9" for Light GBM Classifier
10 - Enter "10" for XgBoost Classifier
11 - Enter "11" for Run all Algorithms
12 - Enter "12" for Show Line Charts
13 - Enter "13" for Show Box Charts
0 - Enter "0" for Exit


 
1 - Enter "1" for Train Test Split
2 - Enter "2" for Cross Validation (Stratified kFold)


***********************

List of possible F1_score: [0.9710550887021475, 0.9592417061611374, 0.968342644320298, 0.9740259740259741, 0.9719626168224299, 0.9593956562795091, 0.9721189591078068, 0.9719626168224299, 0.9672591206735267, 0.9766136576239476, 0.9666666666666667, 0.9504761904761904, 0.9644859813084113, 0.9676823638042474, 0.96

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(



***********************

List of possible F1_score: [0.9710550887021475, 0.9592417061611374, 0.968342644320298, 0.9740259740259741, 0.9719626168224299, 0.9593956562795091, 0.9721189591078068, 0.9719626168224299, 0.9672591206735267, 0.9766136576239476, 0.9666666666666667, 0.9504761904761904, 0.9644859813084113, 0.9676823638042474, 0.9651376146788991, 0.9549718574108818, 0.9673202614379085, 0.9652014652014653, 0.9599254426840634, 0.9750231267345051, 0.9710550887021475, 0.9592417061611374, 0.968342644320298, 0.9740259740259741, 0.9719626168224299, 0.9593956562795091, 0.9721189591078068, 0.9719626168224299, 0.9672591206735267, 0.9766136576239476, 0.9286343612334802, 0.9165186500888101, 0.9306409130816506, 0.9275618374558304, 0.9271290605794555, 0.9260563380281691, 0.9379432624113475, 0.9288194444444444, 0.9219982471516214, 0.9295774647887324]

Maximum F1_score That can be obtained from this model is: 97.66136576239475 %

Minimum F1_score: 91.651865008881 %

Overall F1_score: 95.7393165768