In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt



def feature_importance_selection(indep_X, dep_Y, n):
    """Keep the ``n`` columns a random forest ranks as most important.

    A ``RandomForestClassifier(random_state=0)`` is fitted on the supplied data
    and its ``feature_importances_`` are used to rank the columns of
    ``indep_X``.

    Parameters
    ----------
    indep_X : pandas.DataFrame
        Feature matrix; columns are picked positionally through ``.iloc``.
    dep_Y : array-like
        Target labels handed to ``fit``.
    n : int
        Number of top-ranked columns to keep. ``0`` keeps none; a value larger
        than the number of columns keeps all of them.

    Returns
    -------
    tuple
        ``(selected_features, importances)``: the column subset of ``indep_X``
        (ordered from least to most important among the kept columns) and the
        importance array covering every column of ``indep_X``.

    Raises
    ------
    ValueError
        If ``n`` is negative.
    """
    # Fail before the (comparatively expensive) fit when the request is invalid.
    if n < 0:
        raise ValueError(f"n must be non-negative, got {n}")

    model = RandomForestClassifier(random_state=0)
    model.fit(indep_X, dep_Y)
    importances = model.feature_importances_

    ranked = np.argsort(importances)  # column positions, ascending importance
    # Slice from an explicit start index: ``ranked[-n:]`` would return *every*
    # column for n == 0 because ``-0`` is simply ``0``.
    start = max(len(ranked) - n, 0)
    indices = ranked[start:]

    selected_features = indep_X.iloc[:, indices]
    return selected_features, importances
    
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 and standardise the features.

    The split uses ``train_test_split`` with ``test_size=0.25`` and
    ``random_state=0``. A ``StandardScaler`` is fitted on the training features
    only and then applied to both partitions.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` where the two feature sets are
        the scaler's outputs and the targets are returned as split.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    scaler = StandardScaler()
    # Fit on the training partition only; the test partition reuses those statistics.
    scaled_train = scaler.fit_transform(features_train)
    scaled_test = scaler.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test
    
 
def cm_prediction(classifier, X_test, y_test=None):
    """Evaluate a fitted classifier on the test split.

    Parameters
    ----------
    classifier : object
        Fitted estimator exposing ``predict``.
    X_test : array-like
        Test features passed to ``classifier.predict``.
    y_test : array-like, optional
        True labels for ``X_test``. When omitted, the module-level variable
        named ``y_test`` is used, which is how this helper originally found
        its labels. Passing the labels explicitly avoids depending on notebook
        state.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` where
        ``Accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.

    Raises
    ------
    NameError
        If ``y_test`` is not supplied and no module-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_test is None:
        # Backward-compatible fallback for callers that rely on the global
        # ``y_test`` created by the train/test split cell.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError(
                "name 'y_test' is not defined; pass y_test explicitly"
            ) from None

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm

def logistic(X_train,y_train,X_test):
    """Fit a logistic-regression classifier and evaluate it.

    Trains ``LogisticRegression(random_state=0)`` on the training data and
    passes the fitted model to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``. ``y_test`` is supplied by that helper,
        not by this function's arguments.
    """
    from sklearn.linear_model import LogisticRegression

    fitted_model = LogisticRegression(random_state=0).fit(X_train, y_train)
    return cm_prediction(fitted_model, X_test)
    
def svm_linear(X_train,y_train,X_test):
    """Fit a linear-kernel support vector classifier and evaluate it.

    Trains ``SVC(kernel='linear', random_state=0)`` on the training data and
    passes the fitted model to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from sklearn.svm import SVC

    fitted_model = SVC(kernel='linear', random_state=0).fit(X_train, y_train)
    return cm_prediction(fitted_model, X_test)
    
def svm_NL(X_train,y_train,X_test):
    """Fit an RBF-kernel (non-linear) support vector classifier and evaluate it.

    Trains ``SVC(kernel='rbf', random_state=0)`` on the training data and
    passes the fitted model to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from sklearn.svm import SVC

    fitted_model = SVC(kernel='rbf', random_state=0).fit(X_train, y_train)
    return cm_prediction(fitted_model, X_test)
   
def Navie(X_train,y_train,X_test):
    """Fit a Gaussian naive Bayes classifier and evaluate it.

    Trains ``GaussianNB()`` on the training data and passes the fitted model
    to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from sklearn.naive_bayes import GaussianNB

    fitted_model = GaussianNB().fit(X_train, y_train)
    return cm_prediction(fitted_model, X_test)
    
    
def knn(X_train,y_train,X_test):
    """Fit a k-nearest-neighbours classifier and evaluate it.

    Trains ``KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)`` on
    the training data and passes the fitted model to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    neighbour_model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    neighbour_model.fit(X_train, y_train)
    return cm_prediction(neighbour_model, X_test)
def Decision(X_train,y_train,X_test):
    """Fit an entropy-criterion decision tree and evaluate it.

    Trains ``DecisionTreeClassifier(criterion='entropy', random_state=0)`` on
    the training data and passes the fitted model to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from sklearn.tree import DecisionTreeClassifier

    tree_model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    tree_model.fit(X_train, y_train)
    return cm_prediction(tree_model, X_test)


def random(X_train,y_train,X_test):
    """Fit a 10-tree, entropy-criterion random forest and evaluate it.

    Trains ``RandomForestClassifier(n_estimators=10, criterion='entropy',
    random_state=0)`` on the training data and passes the fitted model to
    ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    # NOTE: this function's name shadows the stdlib ``random`` module wherever
    # it is defined; the name is kept because callers use it.
    from sklearn.ensemble import RandomForestClassifier

    forest_model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    forest_model.fit(X_train, y_train)
    return cm_prediction(forest_model, X_test)

def lightgbm_model(X_train, y_train, X_test):
    """Fit a LightGBM classifier and evaluate it.

    Trains ``LGBMClassifier(random_state=0)`` on the training data and passes
    the fitted model to ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from lightgbm import LGBMClassifier

    booster = LGBMClassifier(random_state=0)
    booster.fit(X_train, y_train)
    return cm_prediction(booster, X_test)

def xgboost_model(X_train, y_train, X_test):
    """Fit an XGBoost classifier and evaluate it.

    Trains ``XGBClassifier(use_label_encoder=False, eval_metric='logloss',
    random_state=0)`` on the training data and passes the fitted model to
    ``cm_prediction``.

    Returns
    -------
    tuple
        The ``(classifier, Accuracy, report, X_test, y_test, cm)`` tuple
        produced by ``cm_prediction``.
    """
    from xgboost import XGBClassifier

    booster = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=0)
    booster.fit(X_train, y_train)
    return cm_prediction(booster, X_test)
    
def featureImportance_Classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf,acclgbm,accxgb):
    """Collect per-model accuracies into a one-row comparison table.

    Parameters
    ----------
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf, acclgbm, accxgb : sequence
        Accuracy accumulators, one per classifier. Only the element at
        position 0 of each is used, matching the single ``'FeatureImportance'``
        row of the table.

    Returns
    -------
    pandas.DataFrame
        Frame indexed by ``['FeatureImportance']`` with one column per model:
        ``Logistic, SVMl, SVMnl, KNN, Navie, Decision, Random, lightgbm_model,
        XGBoost``.
    """
    # Column label -> accumulator; insertion order defines the column order.
    scores_by_column = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'Navie': accnav,
        'Decision': accdes,
        'Random': accrf,
        'lightgbm_model': acclgbm,
        'XGBoost': accxgb,
    }

    dataframe = pd.DataFrame(index=['FeatureImportance'], columns=list(scores_by_column))
    for number, idex in enumerate(dataframe.index):
        for column, scores in scores_by_column.items():
            # Use .loc rather than dataframe[column][idex] = ...: chained
            # assignment writes to a temporary under pandas copy-on-write and
            # would leave the table empty.
            dataframe.loc[idex, column] = scores[number]
    return dataframe

In [2]:
# Load the preprocessed churn data from the working directory.
dataset1 = pd.read_csv("churn_preprocess_data.csv", index_col=None)

# One-hot encode (dropping the first level of each encoded column), then map
# boolean dummy values to 1/0.
df2 = pd.get_dummies(dataset1, drop_first=True).replace({True: 1, False: 0})

# 'Churn_Yes' is the target; every other column is a feature.
dep_Y = df2['Churn_Yes']
indep_X = df2.drop(columns='Churn_Yes')

In [3]:
# Display the encoded frame (features plus the 'Churn_Yes' target column).
df2

Unnamed: 0,SeniorCitizen,tenure,MonthlyCharges,TotalCharges,gender_Male,Partner_Yes,Dependents_Yes,PhoneService_Yes,MultipleLines_No phone service,MultipleLines_Yes,...,StreamingTV_Yes,StreamingMovies_No internet service,StreamingMovies_Yes,Contract_One year,Contract_Two year,PaperlessBilling_Yes,PaymentMethod_Credit card,PaymentMethod_Electronic check,PaymentMethod_Mailed check,Churn_Yes
0,0,5,27.43,137.15,1,1,1,0,0,1,...,0,1,0,0,0,1,1,0,0,1
1,0,42,38.28,1607.76,0,0,0,1,1,0,...,0,1,0,0,1,0,0,1,0,1
2,0,61,106.44,6492.84,1,0,0,1,1,0,...,1,0,0,1,0,0,0,0,0,1
3,1,22,92.49,2034.78,1,0,1,1,0,0,...,0,0,1,0,0,0,0,1,0,0
4,1,21,19.63,412.23,1,1,1,0,0,0,...,0,1,0,1,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295,1,61,42.52,2593.72,0,0,1,0,0,1,...,0,0,0,1,0,1,0,1,0,0
296,0,49,22.31,1093.19,0,0,0,1,1,0,...,1,0,1,0,0,1,0,0,1,1
297,0,71,59.57,4229.47,1,0,1,1,0,1,...,0,0,1,0,1,0,0,0,1,1
298,1,1,31.32,31.32,1,1,1,1,0,1,...,1,0,1,0,1,0,1,0,0,0


In [24]:
# Rank the columns with the random forest and keep the 7 highest-ranked ones.
selected_features, feature_importances = feature_importance_selection(indep_X, dep_Y, 7)

# Fresh accuracy accumulators, one list per classifier.
acclog, accsvml, accsvmnl = [], [], []
accknn, accnav, accdes = [], [], []
accrf, acclgbm, accxgb = [], [], []


  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


In [25]:
# Importances returned by feature_importance_selection: one value per column
# of indep_X, not only the selected columns.
feature_importances

array([0.02542023, 0.11504174, 0.11604276, 0.13834624, 0.02083013,
       0.02125757, 0.02862483, 0.0232442 , 0.02582049, 0.03238285,
       0.02317239, 0.01911503, 0.02627953, 0.02213421, 0.02426215,
       0.03135677, 0.02252831, 0.02229584, 0.02425261, 0.01766516,
       0.03239214, 0.01145804, 0.02075214, 0.0209814 , 0.02756766,
       0.02606376, 0.02052851, 0.01972526, 0.02195264, 0.01850544])

In [26]:
# Split and standardise the selected features. The module-level `y_test`
# created here is also what cm_prediction reads when scoring each model.
X_train, X_test, y_train, y_test = split_scalar(selected_features, dep_Y)

# Each trainer is paired with the accumulator that records its accuracy.
# The order matches the original sequence of calls.
model_runs = [
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
    (lightgbm_model, acclgbm),
    (xgboost_model, accxgb),
]

for train_model, accuracy_log in model_runs:
    classifier, Accuracy, report, X_test, y_test, cm = train_model(X_train, y_train, X_test)
    accuracy_log.append(Accuracy)

# Summarise the recorded accuracies in a one-row comparison table.
result = featureImportance_Classification(
    acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf, acclgbm, accxgb
)

  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  old_joblib = LooseVersion(joblib_version) < LooseVersion('0.12')
  if _joblib.__version__ >= LooseVersion('0.12'):
  if _joblib.__version__ >= LooseVersion('0.12'):


[LightGBM] [Info] Number of positive: 108, number of negative: 117
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 220
[LightGBM] [Info] Number of data points in the train set: 225, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.480000 -> initscore=-0.080043
[LightGBM] [Info] Start training from score -0.080043


In [7]:
result
#2 -- NOTE(review): presumably the run that kept the top 2 features (n=2); confirm

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random,lightgbm_model,XGBoost
FeatureImportance,0.48,0.493333,0.426667,0.493333,0.466667,0.413333,0.453333,0.4,0.466667


In [11]:
result
#3 -- NOTE(review): presumably the run that kept the top 3 features (n=3); confirm

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random,lightgbm_model,XGBoost
FeatureImportance,0.506667,0.506667,0.44,0.493333,0.48,0.453333,0.36,0.386667,0.413333


In [15]:
result
#4 -- NOTE(review): presumably the run that kept the top 4 features (n=4); confirm

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random,lightgbm_model,XGBoost
FeatureImportance,0.546667,0.546667,0.493333,0.453333,0.506667,0.466667,0.493333,0.413333,0.44


In [19]:
result
#5 -- NOTE(review): presumably the run that kept the top 5 features (n=5); confirm

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random,lightgbm_model,XGBoost
FeatureImportance,0.493333,0.52,0.506667,0.506667,0.493333,0.493333,0.466667,0.493333,0.48


In [23]:
result
#6 -- NOTE(review): presumably the run that kept the top 6 features (n=6); confirm

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random,lightgbm_model,XGBoost
FeatureImportance,0.56,0.546667,0.573333,0.506667,0.533333,0.493333,0.493333,0.48,0.386667


In [27]:
result
#7 -- NOTE(review): presumably the run that kept the top 7 features (n=7), matching the selection call in this notebook; confirm

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Navie,Decision,Random,lightgbm_model,XGBoost
FeatureImportance,0.52,0.56,0.573333,0.573333,0.533333,0.493333,0.533333,0.533333,0.426667
