# Select K Features with Recursive Feature Elimination (RFE) — Classification

In [1]:
# Import Modules
import pandas as pd
from sklearn.model_selection import train_test_split 
import time
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier   
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import warnings
# Suppress all warnings
warnings.filterwarnings('ignore')

In [14]:
# Recursive_Feature_Elimination
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

def rfeFeature(indep_X, dep_Y, n):
    """Run recursive feature elimination once per base estimator.

    Four estimators are used, in this order: logistic regression, a
    linear-kernel SVC, a random forest and a decision tree. For each one an
    RFE selector keeping ``n`` features is fitted on the given data.

    Parameters
    ----------
    indep_X : feature matrix passed to ``RFE.fit`` / ``RFE.transform``.
    dep_Y : target values passed to ``RFE.fit``.
    n : number of features each RFE selector is asked to keep.

    Returns
    -------
    list
        One transformed copy of ``indep_X`` per estimator, in the order
        listed above.
    """
    # All estimators are built up front; RFE is then applied to each in turn.
    candidate_models = (
        LogisticRegression(solver='lbfgs', max_iter=1000),
        SVC(kernel='linear', random_state=0),
        RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0),
        DecisionTreeClassifier(criterion='gini', max_features='sqrt', splitter='best', random_state=0),
    )

    reduced_feature_sets = []
    for model in candidate_models:
        print(f"Running RFE for: {model.__class__.__name__}")
        selector = RFE(estimator=model, n_features_to_select=n)
        # fit_transform is fit(...) followed by transform(...) on the same X.
        reduced_feature_sets.append(selector.fit_transform(indep_X, dep_Y))

    return reduced_feature_sets

    
# Standard Scaler Function    
def split_scalar(indep_X,dep_Y):
    """Split the data 75/25 and standardise the feature columns.

    The split uses ``test_size=0.25`` and ``random_state=0``. The
    StandardScaler is fitted on the training features only and then applied
    to both the training and the test features; the targets are returned
    untouched.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with both feature parts scaled.
    """
    features_train, features_test, target_train, target_test = train_test_split(
        indep_X, dep_Y, test_size=0.25, random_state=0
    )

    # Fit on the training part only so test rows do not influence the scaling.
    scaler = StandardScaler().fit(features_train)
    scaled_train = scaler.transform(features_train)
    scaled_test = scaler.transform(features_test)

    return scaled_train, scaled_test, target_train, target_test
    
# Accuracy Score prediction 
def cm_prediction(classifier, X_test, y_test=None):
    """Score a fitted classifier on the test split.

    Parameters
    ----------
    classifier : fitted estimator exposing ``predict``.
    X_test : test features handed to ``classifier.predict``.
    y_test : true labels for ``X_test``. Optional for backward compatibility:
        when omitted, the notebook-level variable ``y_test`` is used, which is
        what this function always relied on implicitly. Prefer passing it
        explicitly so the result does not depend on which cell ran last.

    Returns
    -------
    tuple
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` where
        ``Accuracy`` comes from ``accuracy_score``, ``report`` from
        ``classification_report`` and ``cm`` from ``confusion_matrix``.

    Raises
    ------
    NameError
        If ``y_test`` is not passed and no notebook-level ``y_test`` exists.
    """
    from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

    if y_test is None:
        # Legacy path: existing callers pass only (classifier, X_test) and
        # expect the labels to come from the module/notebook namespace.
        try:
            y_test = globals()['y_test']
        except KeyError:
            raise NameError(
                "y_test was not passed to cm_prediction and no global 'y_test' is defined"
            ) from None

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    return classifier, Accuracy, report, X_test, y_test, cm

# Logistic Regression model
def logistic(X_train,y_train,X_test):
    """Fit a logistic regression (random_state=0) and score it on X_test.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.linear_model import LogisticRegression

    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix
    
#svm_linear model    
def svm_linear(X_train,y_train,X_test):
    """Fit a linear-kernel SVC (random_state=0) and score it on X_test.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix

#svm_nonlinear model    
def svm_NL(X_train,y_train,X_test):
    """Fit an RBF-kernel SVC (random_state=0) and score it on X_test.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    model = SVC(kernel='rbf', random_state=0)
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix

def Navie(X_train,y_train,X_test):
    """Fit a Gaussian Naive Bayes classifier and score it on X_test.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.naive_bayes import GaussianNB

    model = GaussianNB()
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix


# knn model    
def knn(X_train,y_train,X_test):
    """Fit a 5-nearest-neighbours classifier and score it on X_test.

    The classifier is configured with ``metric='minkowski'`` and ``p=2``.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix

# Decision Tree model
def Decision(X_train,y_train,X_test):
    """Fit an entropy-criterion decision tree (random_state=0) and score it.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix

# Random forest model
def random(X_train,y_train,X_test):
    """Fit a 10-tree, entropy-criterion random forest (random_state=0) and score it.

    Returns the tuple produced by ``cm_prediction`` for the fitted model:
    ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)

    # Evaluation is delegated to the shared scoring helper.
    fitted, score, summary, features, labels, matrix = cm_prediction(model, X_test)
    return fitted, score, summary, features, labels, matrix
    
#table for compare model  
def rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf):
    """Arrange per-model accuracies into a comparison table.

    Rows are the four RFE base estimators ('Logistic', 'SVC', 'RandomForest',
    'DecisionTree'); columns are the evaluated classifiers. Each argument is a
    sequence indexed by row position, so at least four entries are read from
    every sequence that is used.

    ``accnav`` is accepted but not used: the table has no Naive Bayes column.

    Returns
    -------
    pandas.DataFrame
        4 x 6 table with the accuracies filled in cell by cell.
    """
    rfe_estimators = ['Logistic','SVC','RandomForest','DecisionTree']

    # Column label -> accuracy sequence that feeds it (insertion order = column order).
    scores_by_column = {
        'Logistic': acclog,
        'SVMl': accsvml,
        'SVMnl': accsvmnl,
        'KNN': accknn,
        'DecisionTree': accdes,
        'RandomForest': accrf,
    }

    table = pd.DataFrame(index=rfe_estimators, columns=list(scores_by_column))

    for position, row_label in enumerate(rfe_estimators):
        for column_label, scores in scores_by_column.items():
            table.at[row_label, column_label] = scores[position]

    return table

In [17]:
# Load dataset
dataset=pd.read_csv("prep.csv",index_col=None)

# df2 initially refers to the same object as dataset (no copy is made here);
# the get_dummies call below rebinds df2 to a new frame.
df2=dataset

# One-hot encode, dropping the first level of each encoded column, then cast to int
df2 = pd.get_dummies(df2, drop_first=True).astype(int)

# Split into independent features X (everything except the target) and dependent Y.
# dep_Y is kept as a single-column DataFrame.
indep_X=df2.drop(columns=['classification_yes'])
dep_Y=df2[['classification_yes']]

# Run recursive feature elimination, keeping 3 features per base estimator.
# NOTE(review): RFE is fitted here on the full dataset, before the train/test
# split performed in the loop below, so test rows take part in choosing the
# features. Consider fitting the selector on the training split only.
rfelist=rfeFeature(indep_X,dep_Y,3) 


acclog=[]
accsvml=[]
accsvmnl=[]
accknn=[]
# accnav must exist because it is passed to rfe_classification below; it stays
# empty while the Naive Bayes evaluation in the loop is commented out.
accnav=[]
accdes=[]
accrf=[]

# One pass per RFE-reduced feature set (one per base estimator used in rfeFeature).
for i in rfelist:   
    # The model helpers score through cm_prediction, which is called without
    # labels and reads the notebook-level y_test assigned on this line.
    X_train, X_test, y_train, y_test=split_scalar(i,dep_Y)   
    
        
    classifier,Accuracy,report,X_test,y_test,cm=logistic(X_train,y_train,X_test)
    acclog.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=svm_linear(X_train,y_train,X_test)  
    accsvml.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=svm_NL(X_train,y_train,X_test)  
    accsvmnl.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=knn(X_train,y_train,X_test)  
    accknn.append(Accuracy)
    
    #classifier,Accuracy,report,X_test,y_test,cm=Navie(X_train,y_train,X_test)  
    #accnav.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=Decision(X_train,y_train,X_test)  
    accdes.append(Accuracy)
    
    classifier,Accuracy,report,X_test,y_test,cm=random(X_train,y_train,X_test)  
    accrf.append(Accuracy)
    
# Rows: RFE base estimator; columns: classifier evaluated on the selected features.
result=rfe_classification(acclog,accsvml,accsvmnl,accknn,accnav,accdes,accrf)

result

Running RFE for: LogisticRegression
Running RFE for: SVC
Running RFE for: RandomForestClassifier
Running RFE for: DecisionTreeClassifier


Unnamed: 0,Logistic,SVMl,SVMnl,KNN,DecisionTree,RandomForest
Logistic,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.85,0.85,0.85,0.85,0.85,0.85
RandomForest,0.95,0.94,0.95,0.95,0.92,0.94
DecisionTree,0.96,0.96,0.96,0.96,0.94,0.95


In [18]:
# K=3
# Comparison table for the run that asks rfeFeature to keep 3 features
# (rows: RFE base estimator, columns: classifier evaluated).
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,DecisionTree,RandomForest
Logistic,0.94,0.94,0.94,0.94,0.94,0.94
SVC,0.85,0.85,0.85,0.85,0.85,0.85
RandomForest,0.95,0.94,0.95,0.95,0.92,0.94
DecisionTree,0.96,0.96,0.96,0.96,0.94,0.95


In [16]:
# K=5
# NOTE(review): this cell's execution count (16) is lower than that of the
# cell that builds `result` with 3 features (17), so the table saved under this
# cell presumably came from an earlier run with 5 features. On a fresh
# Restart & Run All this cell would show the 3-feature table again — confirm
# and re-run with n=5 if a K=5 comparison is intended.
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,DecisionTree,RandomForest
Logistic,0.98,0.98,0.98,0.98,0.98,0.98
SVC,0.97,0.97,0.97,0.97,0.97,0.97
RandomForest,0.98,0.97,0.99,0.99,0.96,0.99
DecisionTree,0.97,0.97,0.97,0.96,0.93,0.97
