<a href="https://colab.research.google.com/github/anthonyibr24/Feature-Selection-Methods/blob/Waveform/Waveform.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install ucimlrepo

Collecting ucimlrepo
  Downloading ucimlrepo-0.0.7-py3-none-any.whl.metadata (5.5 kB)
Downloading ucimlrepo-0.0.7-py3-none-any.whl (8.0 kB)
Installing collected packages: ucimlrepo
Successfully installed ucimlrepo-0.0.7


In [None]:
import pandas as pd
import numpy as np

from ucimlrepo import fetch_ucirepo

# Download UCI repository dataset id 107.
waveform_database_generator_version_1 = fetch_ucirepo(id=107)

# Pull the feature frame and the target frame out of the fetched payload.
_waveform_payload = waveform_database_generator_version_1.data
X, y = _waveform_payload.features, _waveform_payload.targets

# Put features and targets side by side in one frame and show its dimensions.
df = pd.concat([X, y], axis="columns")
df.shape


(5000, 22)

In [None]:
# Display the (rows, columns) dimensions of the combined features + target frame.
df.shape

(5000, 22)

In [None]:
# Count how many rows carry each value of the 'class' column.
class_counts = df['class'].value_counts()
print(class_counts)

class
2    1696
0    1657
1    1647
Name: count, dtype: int64


In [None]:
from sklearn.utils import Bunch
from sklearn.preprocessing import StandardScaler

# Build the Bunch (data / target / feature_names) directly from the fetched
# dataset instead of from `df`. This cell rebinds `df` further down and stores
# the labels under 'target', so reading `df['class']` here would raise KeyError
# the second time the cell is executed.
features_raw = waveform_database_generator_version_1.data.features
labels_raw = waveform_database_generator_version_1.data.targets['class']

data_set = Bunch(
    data=features_raw.values,
    target=labels_raw.values,
    feature_names=features_raw.columns.tolist(),
)

# `data` is kept as a second global name for the same Bunch, as in the original cell.
data = data_set

# Standardise every feature column to mean 0 / std 1.
# NOTE: the scaler is fitted on all rows, so this frame is only suitable for
# inspection; cross-validated models should fit their own scaler per fold.
scaler = StandardScaler()
features_unscaled = pd.DataFrame(data.data, columns=data.feature_names)
df = pd.DataFrame(
    scaler.fit_transform(features_unscaled),
    columns=data.feature_names,
)

# Re-attach the (unscaled) class labels and summarise the result.
df['target'] = data.target
df.describe().round(3)

Unnamed: 0,Attribute1,Attribute2,Attribute3,Attribute4,Attribute5,Attribute6,Attribute7,Attribute8,Attribute9,Attribute10,...,Attribute13,Attribute14,Attribute15,Attribute16,Attribute17,Attribute18,Attribute19,Attribute20,Attribute21,target
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,...,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,-0.0,0.0,-0.0,-0.0,-0.0,0.0,0.0,0.0,-0.0,-0.0,...,-0.0,-0.0,0.0,-0.0,0.0,-0.0,-0.0,0.0,-0.0,1.008
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.819
min,-3.312,-3.406,-4.102,-3.414,-2.855,-2.623,-2.968,-3.539,-3.639,-3.121,...,-3.203,-3.107,-2.58,-2.756,-2.932,-3.596,-3.476,-3.632,-3.87,0.0
25%,-0.678,-0.675,-0.692,-0.715,-0.759,-0.776,-0.77,-0.727,-0.723,-0.724,...,-0.726,-0.732,-0.757,-0.751,-0.758,-0.715,-0.703,-0.654,-0.671,0.0
50%,0.005,0.001,-0.01,-0.036,-0.114,-0.076,-0.08,0.035,0.083,0.007,...,0.091,0.029,-0.078,-0.1,-0.081,-0.043,-0.035,-0.007,-0.009,1.0
75%,0.678,0.675,0.663,0.691,0.732,0.74,0.768,0.734,0.762,0.713,...,0.759,0.756,0.76,0.734,0.717,0.679,0.675,0.661,0.683,2.0
max,3.896,3.361,3.407,3.363,3.092,3.1,3.026,2.967,3.143,3.031,...,2.919,2.899,3.008,3.236,3.237,3.681,3.858,3.97,4.044,2.0


# Full features

In [None]:
#Cross validation
from sklearn.model_selection import cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Baseline: logistic regression on every feature, scored with 4-fold CV.
# `df` was standardised on all rows in an earlier cell, which lets statistics
# of the held-out fold leak into the scaler. Putting a StandardScaler inside
# the pipeline re-fits it on each training fold only, which is also how the
# feature-selection experiments below are evaluated. Re-standardising already
# standardised columns is harmless: the per-fold result is the same as scaling
# the raw values, because standardisation is invariant to a prior per-column
# affine rescaling.
lr = Pipeline([
    ('scaler', StandardScaler()),
    ('classifier', LogisticRegression()),
])

cv_results = cross_validate(
    lr,
    df.drop('target', axis=1),
    df['target'],
    cv=4,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro'],
    return_train_score=False
)

# Report mean ± standard deviation over the folds for each metric.
print("\nAverage Metrics Across 4 Folds:")
for label, key in [
    ("Accuracy:", 'test_accuracy'),
    ("Precision:", 'test_precision_macro'),
    ("Recall:", 'test_recall_macro'),
    ("F1 Score:", 'test_f1_macro'),
]:
    fold_scores = cv_results[key]
    # Labels are left-padded to a common width of 11 so the numbers line up.
    print(f"{label:<11}{np.mean(fold_scores):.4f} ± {np.std(fold_scores):.4f}")


Average Metrics Across 4 Folds:
Accuracy:  0.8692 ± 0.0052
Precision: 0.8690 ± 0.0053
Recall:    0.8690 ± 0.0053
F1 Score:  0.8689 ± 0.0053


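# Using 13 features

> **Note (review):** no cell in this notebook produces the figures listed below, and they are far above the full-feature Waveform baseline reported above (≈0.87). They may have been carried over from a different dataset — confirm or replace them.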


---


Accuracy:  0.9777


Precision: 0.9773


Recall:    0.9806


F1 Score:  0.9781


---



# Mutual Information

In [None]:
#Mutual gain
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, mutual_info_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

#load dataset
data=data_set
df=pd.DataFrame(data.data,columns=data.feature_names)
df['target']=data.target
df.head()

X=df.drop('target',axis=1)
y=df['target']



In [None]:
#Pipeline
from functools import partial

# mutual_info_classif adds small random noise to continuous features, so
# without a fixed random_state the scores (and therefore possibly the selected
# columns and the reported metrics) can change from run to run. Binding
# random_state=42 makes the experiment reproducible, matching the seed used by
# the other stochastic selectors in this notebook.
seeded_mutual_info = partial(mutual_info_classif, random_state=42)

# Scale -> keep the 10 highest-scoring features -> logistic regression.
# Every step is fitted inside each CV fold.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectKBest(seeded_mutual_info, k=10)),
    ('classifier', LogisticRegression())
])
pipeline.verbose=True

#Cross-validate and calculate all metrics
scores = cross_validate(
    pipeline,
    X, y,
    cv=4,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
)

# Report mean ± standard deviation over the folds for each metric.
print("\nAverage Scores Across 4 Folds:")
for metric in ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']:
    mean_score = scores[f'test_{metric}'].mean()
    std_score = scores[f'test_{metric}'].std()
    print(f"{metric.capitalize()}: {mean_score:.4f} ± {std_score:.4f}")

pipeline

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s

Average Scores Across 4 Folds:
Accuracy: 0.8278 ± 0.0124
Precision_macro: 0.8276 ± 0.0126
Recall_macro: 0.8275 ± 0.0123
F1_macro: 0.8269 ± 0.0123


In [None]:
#ANOVA
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline


# Rebuild the unscaled working frame from the Bunch and split it into
# inputs (X) and labels (y).
data = data_set
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)
y = df['target']
X = df.drop(columns='target')

# Scale -> keep the 10 features with the best ANOVA F-score -> logistic
# regression. verbose=True prints the per-step fit timings.
anova_selector = SelectKBest(score_func=f_classif, k=10)
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('selector', anova_selector),
        ('classifier', LogisticRegression()),
    ],
    verbose=True,
)

# 4-fold cross-validation, collecting four metrics per fold.
metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(pipeline, X, y, cv=4, scoring=metric_names)

# Summarise each metric as mean ± standard deviation over the folds.
print("\nAverage Scores Across 4 Folds:")
for name in metric_names:
    fold_values = scores['test_' + name]
    print(f"{name.capitalize()}: {fold_values.mean():.4f} ± {fold_values.std():.4f}")

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.1s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.1s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s

Average Scores Across 4 Folds:
Accuracy: 0.8308 ± 0.0102
Precision_macro: 0.8305 ± 0.0104
Recall_macro: 0.8304 ± 0.0101
F1_macro: 0.8299 ± 0.0098


# Wrapper Methods

# Forward

In [None]:
#FORWARD
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

#load dataset
data=data_set
df=pd.DataFrame(data.data,columns=data.feature_names)
df['target']=data.target
df.head()

X=df.drop('target',axis=1)
y=df['target']

knn=KNeighborsClassifier(5)



In [None]:
# Forward sequential selection: the selector is asked for 10 features, judging
# candidate subsets by the accuracy of the KNN estimator.
forward_selector = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select=10,
    direction='forward',
    scoring='accuracy',
)

# Scale -> select -> logistic regression; verbose=True prints per-step timings.
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('selector', forward_selector),
        ('classifier', LogisticRegression()),
    ],
    verbose=True,
)

# 4-fold cross-validation, collecting four metrics per fold.
metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(pipeline, X, y, cv=4, scoring=metric_names)

# Summarise each metric as mean ± standard deviation over the folds.
print("\nAverage Scores Across 4 Folds:")
for name in metric_names:
    fold_values = scores['test_' + name]
    print(f"{name.capitalize()}: {fold_values.mean():.4f} ± {fold_values.std():.4f}")



[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  48.2s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  45.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  45.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.1s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  45.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s

Average Scores Across 4 Folds:
Accuracy: 0.8362 ± 0.0089
Precision_macro: 0.8360 ± 0.0089
Recall_macro: 0.8360 ± 0.0089
F1_macro: 0.8358 ± 0.0089


# Backward

In [None]:
#BACKWARD
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier

#load dataset
data=data_set
df=pd.DataFrame(data.data,columns=data.feature_names)
df['target']=data.target
df.head()

X=df.drop('target',axis=1)
y=df['target']

knn=KNeighborsClassifier(5)

In [None]:
# Backward sequential selection: the selector is asked to end with 10 features,
# judging candidate subsets by the accuracy of the KNN estimator.
backward_selector = SequentialFeatureSelector(
    estimator=knn,
    n_features_to_select=10,
    direction='backward',
    scoring='accuracy',
)

# Scale -> select -> logistic regression; verbose=True prints per-step timings.
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('selector', backward_selector),
        ('classifier', LogisticRegression()),
    ],
    verbose=True,
)

# 4-fold cross-validation, collecting four metrics per fold.
metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(pipeline, X, y, cv=4, scoring=metric_names)

# Summarise each metric as mean ± standard deviation over the folds.
print("\nAverage Scores Across 4 Folds:")
for name in metric_names:
    fold_values = scores['test_' + name]
    print(f"{name.capitalize()}: {fold_values.mean():.4f} ± {fold_values.std():.4f}")


[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  47.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  49.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  47.6s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=  49.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s

Average Scores Across 4 Folds:
Accuracy: 0.8322 ± 0.0091
Precision_macro: 0.8320 ± 0.0092
Recall_macro: 0.8319 ± 0.0092
F1_macro: 0.8317 ± 0.0092


# RFE

In [None]:
#RFE
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

#load dataset
data=data_set
df=pd.DataFrame(data.data,columns=data.feature_names)
df['target']=data.target
df.head()

X=df.drop('target',axis=1)
y=df['target']



In [None]:
# Recursive feature elimination driven by a linear-kernel SVC.
# NOTE(review): this keeps 6 features while the other selectors in this
# notebook keep 10 — confirm the difference is intentional before comparing scores.
rfe_selector = RFE(estimator=SVC(kernel='linear'), n_features_to_select=6)

# Scale -> select -> logistic regression; verbose=True prints per-step timings.
pipeline = Pipeline(
    [
        ('scaler', StandardScaler()),
        ('selector', rfe_selector),
        ('classifier', LogisticRegression()),
    ],
    verbose=True,
)

# 4-fold cross-validation, collecting four metrics per fold.
metric_names = ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
scores = cross_validate(pipeline, X, y, cv=4, scoring=metric_names)

# Summarise each metric as mean ± standard deviation over the folds.
print("\nAverage Scores Across 4 Folds:")
for name in metric_names:
    fold_values = scores['test_' + name]
    print(f"{name.capitalize()}: {fold_values.mean():.4f} ± {fold_values.std():.4f}")


[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   4.4s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   3.0s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   3.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   3.1s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s

Average Scores Across 4 Folds:
Accuracy: 0.8028 ± 0.0089
Precision_macro: 0.8027 ± 0.0089
Recall_macro: 0.8025 ± 0.0089
F1_macro: 0.8024 ± 0.0090


# Embedded Methods

# Random Forest

In [None]:
#RF
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel


#load dataset
data=data_set
df=pd.DataFrame(data.data,columns=data.feature_names)
df['target']=data.target
df.head()

X=df.drop('target',axis=1)
y=df['target']


In [None]:
#Pipeline
# Keep exactly the 10 features with the highest random-forest importance.
# SelectFromModel applies its importance `threshold` in addition to
# `max_features`, so with max_features=10 alone the default threshold can
# leave fewer than 10 columns. threshold=-np.inf disables that cut-off so the
# selection is by rank only, matching the 10 features kept by the other
# selectors in this notebook.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(
        RandomForestClassifier(random_state=42),
        threshold=-np.inf,
        max_features=10,
    )),
    ('classifier', LogisticRegression())
])
pipeline.verbose=True

#Cross-validate and calculate all metrics
scores = cross_validate(
    pipeline,
    X, y,
    cv=4,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
)

# Report mean ± standard deviation over the folds for each metric.
print("\nAverage Scores Across 4 Folds:")
for metric in ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']:
    mean_score = scores[f'test_{metric}'].mean()
    std_score = scores[f'test_{metric}'].std()
    print(f"{metric.capitalize()}: {mean_score:.4f} ± {std_score:.4f}")

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   1.9s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.2s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   1.9s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   1.5s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   1.6s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.0s

Average Scores Across 4 Folds:
Accuracy: 0.8446 ± 0.0097
Precision_macro: 0.8445 ± 0.0098
Recall_macro: 0.8444 ± 0.0098
F1_macro: 0.8442 ± 0.0098


# Logistic using L1 penalty

In [None]:
#LR L1 penalty
from sklearn.datasets import load_wine
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectKBest


#load dataset
data=data_set
df=pd.DataFrame(data.data,columns=data.feature_names)
df['target']=data.target
df.head()

X=df.drop('target',axis=1)
y=df['target']


In [None]:
#Pipeline
# This section's import cell brings in SelectKBest, but the selector used here
# is SelectFromModel. Import it explicitly so the cell does not rely on the
# Random Forest section having been run first.
from sklearn.feature_selection import SelectFromModel

# Embedded selection: an L1-penalised logistic regression (saga solver, C=0.1)
# ranks the features and SelectFromModel keeps at most 10 of them. A plain
# logistic regression is then fitted on the surviving columns.
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('selector', SelectFromModel(LogisticRegression(penalty='l1',solver='saga',C=0.1,max_iter=3000,random_state=42),max_features=10)),
    ('classifier', LogisticRegression())
])
pipeline.verbose=True

#Cross-validate and calculate all metrics
scores = cross_validate(
    pipeline,
    X, y,
    cv=4,
    scoring=['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']
)

# Report mean ± standard deviation over the folds for each metric.
print("\nAverage Scores Across 4 Folds:")
for metric in ['accuracy', 'precision_macro', 'recall_macro', 'f1_macro']:
    mean_score = scores[f'test_{metric}'].mean()
    std_score = scores[f'test_{metric}'].std()
    print(f"{metric.capitalize()}: {mean_score:.4f} ± {std_score:.4f}")

[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.3s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.2s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.3s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.2s
[Pipeline] ............ (step 1 of 3) Processing scaler, total=   0.0s
[Pipeline] .......... (step 2 of 3) Processing selector, total=   0.3s
[Pipeline] ........ (step 3 of 3) Processing classifier, total=   0.1s

Average Scores Across 4 Folds:
Accuracy: 0.8418 ± 0.0080
Precision_macro: 0.8417 ± 0.0079
Recall_macro: 0.8415 ± 0.0080
F1_macro: 0.8413 ± 0.0079
