# SelectKBest (Chi-Square) Feature Selection for Classification

In [1]:
# Import Modules
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 
import time
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import pickle
import matplotlib.pyplot as plt
import warnings
# Suppress all warnings
# NOTE(review): no category or module is given, so every warning raised by the
# cells below is hidden for the rest of the session — confirm this is intended.
warnings.filterwarnings('ignore')

In [2]:
# Keep the n features with the highest chi-square scores
def selectkbest(indep_X, dep_Y, n):
    """Reduce ``indep_X`` to its ``n`` best features according to ``chi2``.

    Parameters:
        indep_X: feature matrix handed to ``SelectKBest.fit``.
        dep_Y: target values handed to ``SelectKBest.fit``.
        n: number of features to keep (the ``k`` argument of ``SelectKBest``).

    Returns:
        The feature matrix after the fitted selector's ``transform``.
    """
    selector = SelectKBest(score_func=chi2, k=n)
    selector.fit(indep_X, dep_Y)
    return selector.transform(indep_X)
    
# Train/test split followed by standard scaling
def split_scalar(indep_X, dep_Y):
    """Split the data 75/25 and standardise the features.

    The split uses ``test_size=0.25`` and ``random_state=0``. The scaler is
    fitted on the training features only and then applied to both splits.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` where the two feature arrays
        are the scaled versions and the targets are returned as split.
    """
    parts = train_test_split(indep_X, dep_Y, test_size=0.25, random_state=0)
    raw_train, raw_test, y_train, y_test = parts

    scaler = StandardScaler()
    scaler.fit(raw_train)
    return scaler.transform(raw_train), scaler.transform(raw_test), y_train, y_test
    
# Evaluate a fitted classifier on the test split
def cm_prediction(classifier, X_test, y_test=None):
    """Predict on ``X_test`` and compute evaluation metrics.

    Parameters:
        classifier: an already fitted estimator exposing ``predict``.
        X_test: test features passed to ``classifier.predict``.
        y_test: true test labels. Optional for backward compatibility: when
            omitted, the module-level variable ``y_test`` is used, which is
            what existing two-argument callers rely on.

    Returns:
        ``(classifier, Accuracy, report, X_test, y_test, cm)`` — the
        classifier, its accuracy score, the text classification report, the
        test features and labels that were used, and the confusion matrix.

    Raises:
        NameError: if ``y_test`` is not passed and no module-level ``y_test``
            exists.
    """
    if y_test is None:
        # Legacy path: earlier versions silently read the global ``y_test``.
        # Keep that behaviour, but fail with an explicit message when the
        # global has not been created yet.
        try:
            y_test = globals()["y_test"]
        except KeyError:
            raise NameError(
                "cm_prediction needs y_test: pass it explicitly or define a "
                "module-level 'y_test' before calling"
            ) from None

    from sklearn.metrics import accuracy_score
    from sklearn.metrics import classification_report
    from sklearn.metrics import confusion_matrix

    y_pred = classifier.predict(X_test)

    cm = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)
    return classifier, Accuracy, report, X_test, y_test, cm

# Logistic regression model
def logistic(X_train, y_train, X_test):
    """Fit ``LogisticRegression(random_state=0)`` and evaluate it.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.linear_model import LogisticRegression

    # Fit the logistic regression classifier on the training set
    model = LogisticRegression(random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)
    
# Linear-kernel SVM model
def svm_linear(X_train, y_train, X_test):
    """Fit ``SVC(kernel='linear', random_state=0)`` and evaluate it.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    # Fit the support vector classifier on the training set
    model = SVC(kernel='linear', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

# Non-linear (RBF-kernel) SVM model
def svm_NL(X_train, y_train, X_test):
    """Fit ``SVC(kernel='rbf', random_state=0)`` and evaluate it.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.svm import SVC

    # Fit the support vector classifier on the training set
    model = SVC(kernel='rbf', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

# Gaussian naive Bayes model
def Navie(X_train, y_train, X_test):
    """Fit ``GaussianNB()`` and evaluate it.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.naive_bayes import GaussianNB

    # Fit the naive Bayes classifier on the training set
    model = GaussianNB()
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)
    
# K-nearest-neighbours model
def knn(X_train, y_train, X_test):
    """Fit a 5-neighbour Minkowski (p=2) ``KNeighborsClassifier`` and evaluate it.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.neighbors import KNeighborsClassifier

    # Fit the K-NN classifier on the training set
    model = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

# Decision tree model
def Decision(X_train, y_train, X_test):
    """Fit ``DecisionTreeClassifier(criterion='entropy', random_state=0)`` and evaluate it.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.tree import DecisionTreeClassifier

    # Fit the decision tree classifier on the training set
    model = DecisionTreeClassifier(criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)

# Random forest model
def random(X_train, y_train, X_test):
    """Fit a 10-tree entropy ``RandomForestClassifier`` (``random_state=0``) and evaluate it.

    NOTE: this function's name would shadow the standard-library ``random``
    module if that module were imported into the same namespace.

    Returns:
        The 6-tuple from ``cm_prediction``:
        ``(classifier, Accuracy, report, X_test, y_test, cm)``.
    """
    from sklearn.ensemble import RandomForestClassifier

    # Fit the random forest classifier on the training set
    model = RandomForestClassifier(n_estimators=10, criterion='entropy', random_state=0)
    model.fit(X_train, y_train)
    return cm_prediction(model, X_test)
    
# Comparison table of model accuracies
def selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf):
    """Build a one-row accuracy table, one column per model.

    Each argument is an indexable sequence of accuracy scores; the entry at
    position 0 of each sequence fills the single 'ChiSquare' row.

    Returns:
        A DataFrame indexed by ['ChiSquare'] with columns 'Logistic', 'SVMl',
        'SVMnl', 'KNN', 'Naive', 'DecisionTree' and 'RandomForest'.
    """
    model_columns = ['Logistic', 'SVMl', 'SVMnl', 'KNN', 'Naive', 'DecisionTree', 'RandomForest']
    score_lists = (acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

    # Start from an empty frame and fill it cell by cell
    table = pd.DataFrame(index=['ChiSquare'], columns=model_columns)

    for position, row_label in enumerate(table.index):
        # Column order and score-list order match pairwise
        for column, scores in zip(model_columns, score_lists):
            table.at[row_label, column] = scores[position]

    return table

In [8]:
# Read the preprocessed data
dataset = pd.read_csv("prep.csv", index_col=None)

# One-hot encode the frame (first level of each category dropped), cast to int
df2 = pd.get_dummies(dataset, drop_first=True).astype(int)

# Features are every column except 'classification_yes'; that column is the target
indep_X = df2.drop(columns=['classification_yes'])
dep_Y = df2[['classification_yes']]

# Keep the 6 best chi-square-scored features
kbest = selectkbest(indep_X, dep_Y, 6)

# One accuracy list per model, consumed by selectk_Classification below
acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf = [], [], [], [], [], [], []

X_train, X_test, y_train, y_test = split_scalar(kbest, dep_Y)

# Train and score each model in turn; the order matches the result columns.
# The per-model outputs are rebound at module level on every pass, so after
# the loop they hold the values from the last model (random forest).
for _run_model, _scores in (
    (logistic, acclog),
    (svm_linear, accsvml),
    (svm_NL, accsvmnl),
    (knn, accknn),
    (Navie, accnav),
    (Decision, accdes),
    (random, accrf),
):
    classifier, Accuracy, report, X_test, y_test, cm = _run_model(X_train, y_train, X_test)
    _scores.append(Accuracy)

result = selectk_Classification(acclog, accsvml, accsvmnl, accknn, accnav, accdes, accrf)

result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,DecisionTree,RandomForest
ChiSquare,0.97,0.97,0.98,0.92,0.89,0.99,0.98


In [5]:
# K=4
# NOTE(review): presumably the accuracy table from a run of the pipeline cell
# with 4 selected features — the run itself is not shown here; confirm.
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,DecisionTree,RandomForest
ChiSquare,0.82,0.81,0.82,0.84,0.78,0.83,0.88


In [7]:
# K=5
# NOTE(review): presumably the accuracy table from a run of the pipeline cell
# with 5 selected features — the run itself is not shown here; confirm.
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,DecisionTree,RandomForest
ChiSquare,0.94,0.94,0.95,0.9,0.84,0.97,0.97


In [9]:
# K=6
# Accuracy table for 6 selected features; the values match the output of the
# pipeline cell that calls selectkbest(indep_X, dep_Y, 6).
result

Unnamed: 0,Logistic,SVMl,SVMnl,KNN,Naive,DecisionTree,RandomForest
ChiSquare,0.97,0.97,0.98,0.92,0.89,0.99,0.98
