In [14]:
# import all packages and libraries here.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from sklearn import (
    compose,
    decomposition,
    impute,
    linear_model,
    metrics,
    model_selection,
    pipeline,
    preprocessing,
    svm,
)
from sklearn.datasets import load_breast_cancer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest, chi2, f_classif
from sklearn.svm import LinearSVC


In [52]:
# Load the breast-cancer dataset and hold out 20% of the rows for testing;
# the fixed seed keeps the split reproducible across runs.
dataset = load_breast_cancer()
X = dataset.data
y = dataset.target
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [137]:
# Number of top-scoring features each SelectKBest selector keeps.
# NOTE(review): the recorded cell outputs list 12 selected features, so they
# were produced with a different k_ value; re-run all cells to refresh them.
k_ = 5

In [105]:
# ANOVA F-test feature selection: score every column against the labels and
# keep the k_ highest-scoring ones. The selector is fitted on the training
# split only, so the test split stays unseen.
fs_anova = SelectKBest(score_func=f_classif, k=k_)
X_selected_anova = fs_anova.fit_transform(X_train, y_train)

In [106]:
# Names of the columns kept by the ANOVA selector. They come back as generic
# 'x<i>' labels; presumably <i> is the column index into X (confirm), so they
# can be mapped through dataset.feature_names for readable names.
fs_anova.get_feature_names_out()

array(['x0', 'x2', 'x3', 'x5', 'x6', 'x7', 'x20', 'x22', 'x23', 'x25',
       'x26', 'x27'], dtype=object)

In [107]:
# Shape after selection (training rows only) next to the shape of the full, unsplit X.
X_selected_anova.shape, X.shape

((455, 12), (569, 30))

In [108]:
X[X<0].sum() # sums only the negative entries of X; 0.0 means there are none, so the chi2 score function (which needs non-negative features) can be used

0.0

In [109]:
# Chi-squared feature selection: score on the training split, then keep the
# k_ best-scoring columns of that same split.
fs_chi2 = SelectKBest(chi2, k=k_)
fs_chi2.fit(X_train, y_train)
X_selected_chi2 = fs_chi2.transform(X_train)

In [110]:
# Names of the columns kept by the chi-squared selector. They come back as
# generic 'x<i>' labels; presumably <i> is the column index into X (confirm).
fs_chi2.get_feature_names_out()

array(['x0', 'x1', 'x2', 'x3', 'x10', 'x12', 'x13', 'x20', 'x21', 'x22',
       'x23', 'x26'], dtype=object)

In [111]:
# One linear SVM per feature subset, configured identically so that only the
# feature-selection method differs between the two models.
model_anova, model_chi2 = LinearSVC(dual='auto'), LinearSVC(dual='auto')
model_anova.fit(X_selected_anova, y_train)
model_chi2.fit(X_selected_chi2, y_train)

In [112]:
# Reduce the test split with the selectors that were fitted on the training
# split, then score each model on its own feature subset.
X_test_anova = fs_anova.transform(X_test)
pred_anova = model_anova.predict(X_test_anova)

X_test_chi2 = fs_chi2.transform(X_test)
pred_chi2 = model_chi2.predict(X_test_chi2)

# Print one classification report per selector, ANOVA first.
for heading, predictions in (
    ('ANOVA prediction: \n\n', pred_anova),
    ('CHI2 prediction: \n\n', pred_chi2),
):
    print(heading, metrics.classification_report(y_test, predictions))

ANOVA prediction: 

               precision    recall  f1-score   support

           0       1.00      0.98      0.99        43
           1       0.99      1.00      0.99        71

    accuracy                           0.99       114
   macro avg       0.99      0.99      0.99       114
weighted avg       0.99      0.99      0.99       114

CHI2 prediction: 

               precision    recall  f1-score   support

           0       0.98      0.93      0.95        43
           1       0.96      0.99      0.97        71

    accuracy                           0.96       114
   macro avg       0.97      0.96      0.96       114
weighted avg       0.97      0.96      0.96       114



In [135]:
def _evaluate_selector(score_func, k, X_train, X_test, y_train, y_test, random_state=None):
    """Select k features with ``score_func``, fit a LinearSVC, and score it on the test split.

    The selector and the model are fitted on the training split only; the test
    split is reduced with the already-fitted selector.

    Returns:
        dict: ``metrics.classification_report(..., output_dict=True)`` for the
        test-split predictions.
    """
    selector = SelectKBest(score_func=score_func, k=k)
    X_train_selected = selector.fit_transform(X_train, y_train)

    # random_state only matters if LinearSVC ends up using its dual solver;
    # passing it keeps the run reproducible in that case too.
    model = LinearSVC(dual='auto', random_state=random_state)
    model.fit(X_train_selected, y_train)

    predictions = model.predict(selector.transform(X_test))
    return metrics.classification_report(y_test, predictions, output_dict=True)


def feature_selcetion(K_=5, test_size=0.2, random_state=42):
    """Compare ANOVA-F and chi-squared feature selection for k = 1 .. K_.

    For every k, the k best features are chosen once with ``f_classif`` and
    once with ``chi2`` on the training split of the breast-cancer dataset, a
    LinearSVC is trained on each subset, and both models are scored on the
    held-out test split.

    Args:
        K_: Largest number of selected features to try (inclusive). Values
            below 1 yield an empty frame.
        test_size: Fraction of rows held out for testing.
        random_state: Seed for the train/test split and for LinearSVC.

    Returns:
        pandas.DataFrame: One row per k with the columns "K",
        "F1 Score (anova|chi2)", "Recall (anova|chi2)" and
        "Accuracy (anova|chi2)". F1 and recall are those of the class
        labelled 1.
    """
    dataset = load_breast_cancer()
    X, y = dataset.data, dataset.target
    split = model_selection.train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )  # (X_train, X_test, y_train, y_test)

    results = []
    for _k in range(1, K_ + 1):
        meter_anova = _evaluate_selector(f_classif, _k, *split, random_state=random_state)
        meter_chi2 = _evaluate_selector(chi2, _k, *split, random_state=random_state)

        # classification_report keys its per-class entries by str(label).
        results.append({
            "K": _k,

            "F1 Score (anova)": meter_anova['1']['f1-score'],
            "F1 Score (chi2)": meter_chi2['1']['f1-score'],

            "Recall (anova)": meter_anova['1']['recall'],
            "Recall (chi2)": meter_chi2['1']['recall'],

            "Accuracy (anova)": meter_anova['accuracy'],
            "Accuracy (chi2)": meter_chi2['accuracy'],
        })

    return pd.DataFrame(results)

In [136]:
# Sweep k from 1 to 30; 30 is the total number of feature columns in X.
feature_selcetion(30)

Unnamed: 0,K,F1 Score (anova),F1 Score (chi2),Recall (anova),Recall (chi2),Accuracy (anova),Accuracy (chi2)
0,1,0.931507,0.959459,0.957746,1.0,0.912281,0.947368
1,2,0.938776,0.972603,0.971831,1.0,0.921053,0.964912
2,3,0.972222,0.97931,0.985915,1.0,0.964912,0.973684
3,4,0.972222,0.972603,0.985915,1.0,0.964912,0.964912
4,5,0.97931,0.965986,1.0,1.0,0.973684,0.95614
5,6,0.986111,0.986111,1.0,1.0,0.982456,0.982456
6,7,0.993007,0.986111,1.0,1.0,0.991228,0.982456
7,8,0.993007,0.985915,1.0,0.985915,0.991228,0.982456
8,9,0.986111,0.972222,1.0,0.985915,0.982456,0.964912
9,10,0.993007,0.972222,1.0,0.985915,0.991228,0.964912
