# Data

In [148]:
import numpy as np
import pandas as pd
from sklearn.model_selection  import train_test_split
from sklearn.preprocessing import  MinMaxScaler
from collections import Counter
import seaborn as sns
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier

In [92]:
df=pd.read_csv('Breast_Cancer.csv')
df.head()

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,


In [93]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 33 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   id                       569 non-null    int64  
 1   diagnosis                569 non-null    object 
 2   radius_mean              569 non-null    float64
 3   texture_mean             569 non-null    float64
 4   perimeter_mean           569 non-null    float64
 5   area_mean                569 non-null    float64
 6   smoothness_mean          569 non-null    float64
 7   compactness_mean         569 non-null    float64
 8   concavity_mean           569 non-null    float64
 9   concave points_mean      569 non-null    float64
 10  symmetry_mean            569 non-null    float64
 11  fractal_dimension_mean   569 non-null    float64
 12  radius_se                569 non-null    float64
 13  texture_se               569 non-null    float64
 14  perimeter_se             5

In [94]:
df.shape

(569, 33)

In [95]:
df.columns

Index(['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean',
       'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean',
       'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean',
       'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se',
       'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se',
       'fractal_dimension_se', 'radius_worst', 'texture_worst',
       'perimeter_worst', 'area_worst', 'smoothness_worst',
       'compactness_worst', 'concavity_worst', 'concave points_worst',
       'symmetry_worst', 'fractal_dimension_worst', 'Unnamed: 32'],
      dtype='object')

In [96]:
df.drop(['id','Unnamed: 32'],axis=1, inplace=True)

In [97]:
df['diagnosis']=[1 if i=='M' else 0  for i in df['diagnosis']]

In [98]:
df['diagnosis'].unique()

array([1, 0], dtype=int64)

In [99]:
x=df.drop('diagnosis',axis=1)
y=df['diagnosis']

In [100]:
Counter(y)

Counter({0: 357, 1: 212})

In [101]:
smote=SMOTE(sampling_strategy='minority',random_state=42)
x, y=smote.fit_resample(x,y)

In [102]:
Counter(y)

Counter({1: 357, 0: 357})

# split_data

In [103]:
x_train, x_test, y_train, y_test=train_test_split(x,y,test_size=.2 , random_state=42)

In [104]:
scaler=MinMaxScaler()
x_train=scaler.fit_transform(x_train)
x_test=scaler.transform(x_test)

# Model

In [105]:
def svc_grid_search(x_train, y_train):
  params={
      'C':[ .1,1, 10],
      'kernel':['linear','rbf','poly'],
      'gamma':['auto','scale']  }

  model=SVC(random_state=42)
  grid=GridSearchCV(model,params,scoring='accuracy',cv=5) #cross validation >> folds
  grid.fit(x_train, y_train)
  print(grid.best_params_)

In [106]:
def svc_build_model(x_train, y_train, x_test, y_test,c,k,g):
  model=SVC(C=c  , kernel=k , gamma=g)
  model.fit(x_train, y_train)
  y_pred=model.predict(x_test)
  print('Accuracy',accuracy_score(y_test, y_pred))

# Feature Selection

# 1. what is a Feature Selection?
**Feature selection is the process of selecting a subset of relevant and important features (or variables) from the original set of features in a dataset**

# 2. Why is Feature Selection Important in Machine Learning?
1. **Avoiding curse of dimensionality** as instead of working with a lot of features that may become overhead to model and take high computational and speed up training process
2. **improve model performance** as you remove irrelevant and redundant features which can improve generalization and reduce overfitting as you make the model less likely to deal with noise pattern

# Feature Selection Methods
#### 1. **Filter Methods**
###### 1. Correlation coefficient "Pearson’s Correlation"
###### 2. Mutual Information
######  3. Chi-Square
#### 2. **Wrapper methods**
 ######    1. Recursive Features Elimination (RFE)
 ######  2. Forward Selection
######   3. Backward Elimination
#### 3. **Embedded methods**
######   1. Tree-based method

# Filter Methods

### Correlation coefficient "Pearson correlation"

1. calculate correlation between each feature and the target variable. Select features with the highest correlation coefficient.
2. When 2 features are highly correlated with one another, then keeping just one in the model will be enough because otherwise they provide duplicate information (multicollinearity). The second variable will only be redundant and serve to contribute unnecessary noise.
to know which variable to remove from the 2 features u will look at their correlation between them and target variable and select the lower correlation and remove it.
3. correaltion is easy to interpret.
3. **`Disadvantage`** is it correlation so it just capture lienar dependencies, as there may be a polynomial relation but this correlation tell it has weak or zero correlation.


In [159]:
from sklearn.feature_selection import SelectKBest , f_regression
fs=SelectKBest(f_regression ,k=8)
fs.fit(x_train,y_train)
x_train_selecetd_corr =fs.transform(x_train)
x_test_selecetd_corr =fs.transform(x_test)

In [160]:
svc_grid_search(x_train_selecetd_corr, y_train)

{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}


In [161]:
svc_build_model(x_train_selecetd_corr, y_train, x_test_selecetd_corr, y_test,10,'poly','scale')

Accuracy 0.951048951048951


### Mutual Information

1. Mutual information measures mutual dependences between 2 variables and can be used to measure how much a feature contributes to the prediction of the target variable, as it quantifies the amount of information obtained about one variable through the other variable.
2. its non-negative number and its symmetric. It’s like Pearson’s correlation, but it’s **`not limited to detect linear association`**.
3. Another advantage of mutual information is that it also works on discrete features or target, unlike correlation. Although, categorical variables need to be numerically encoded first.

In [162]:
from sklearn.feature_selection import SelectKBest , mutual_info_classif
fs=SelectKBest(mutual_info_classif ,k=8)
fs.fit(x_train,y_train)
x_train_selecetd_mi =fs.transform(x_train)
x_test_selecetd_mi =fs.transform(x_test)

In [163]:
svc_grid_search(x_train_selecetd_mi, y_train)

{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}


In [165]:
svc_build_model(x_train_selecetd_mi, y_train, x_test_selecetd_mi, y_test,10,'poly','scale')

Accuracy 0.951048951048951


### Chi square

1. Used for classification tasks to check the independence of 2 variables **`"categorical only"`**. 

In [166]:
from sklearn.feature_selection import SelectKBest , chi2
fs=SelectKBest(chi2 ,k=8)
fs.fit(x_train,y_train)
x_train_selecetd_chi =fs.transform(x_train)
x_test_selecetd_chi =fs.transform(x_test)

In [167]:
svc_grid_search(x_train_selecetd_chi, y_train)

{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}


In [168]:
svc_build_model(x_train_selecetd_chi, y_train, x_test_selecetd_chi, y_test,10,'poly','scale')

Accuracy 0.986013986013986


### ANOVA

1. can work for both categorical and non-categorical data 
2. **`Only captures linear relationships.`**

In [169]:
from sklearn.feature_selection import SelectKBest, f_classif
fs=SelectKBest(f_classif ,k=8)
fs.fit(x_train,y_train)
x_train_selecetd_anova =fs.transform(x_train)
x_test_selecetd_anova =fs.transform(x_test)

In [170]:
svc_grid_search(x_train_selecetd_anova, y_train)

{'C': 10, 'gamma': 'scale', 'kernel': 'poly'}


In [171]:
svc_build_model(x_train_selecetd_anova, y_train, x_test_selecetd_anova, y_test,10,'poly','scale')

Accuracy 0.951048951048951


# Wrapper Methods


### Forward Selection

1. start with no features and add them one by one based on performance improvement until a stopping criterion is met.
2. Greedy algorithm may not find the global optimum as it doesn't take into account **`Feature interaction`**.

In [174]:
model_SVC = SVC()
from sklearn.feature_selection import SequentialFeatureSelector
forw=SequentialFeatureSelector(estimator=model_SVC , n_features_to_select=10 , direction='forward')
forw.fit(x_train, y_train)

x_train_selected_for=forw.transform(x_train)
x_test_selected_for=forw.transform(x_test)

In [176]:
model_SVC.fit(x_train_selected_for,y_train)     # default HYperParamerets
pred = model_SVC.predict(x_test_selected_for)
print('Accuracy',accuracy_score(pred,y_test) )

Accuracy 0.986013986013986


### Backward Elimination

1. start with all features and eliminate them one by one based on performance deterioration.
2. Greedy algorithm may not find the global optimum as it doesn't take into account **`Feature interaction`**.

In [178]:
model_SVC = SVC()
from sklearn.feature_selection import SequentialFeatureSelector
forw=SequentialFeatureSelector(estimator=model_SVC , n_features_to_select=10 , direction='backward')
forw.fit(x_train, y_train)

x_train_selected_back=forw.transform(x_train)
x_test_selected_back=forw.transform(x_test)

In [179]:
model_SVC.fit(x_train_selected_back,y_train)  # default HYperParamerets
pred = model_SVC.predict(x_test_selected_back)
print('Accuracy',accuracy_score(pred,y_test) )

Accuracy 0.993006993006993


### RFE

1. build model and recursively remove the least important features based on coefficient or feature importance.
2. Begins by training the model with all features. The model can be any supervise model that provides a way to rank features by importance
3. it takes into account `feature interaction` which gurantee to find the best features 
4. **Disadvantages** Sensitive to the choice of the model used for rankin, and very high computational if data was large as it fit a model with every combination of features. 

In [180]:
model_SVC = SVC(kernel='linear')
from sklearn.feature_selection  import RFE
REC=RFE(model_SVC , n_features_to_select=10 )
REC.fit(x_train, y_train)

x_train_selected_rfe = forw.transform(x_train)
x_test_selected_rfe  = forw.transform(x_test)

In [181]:
model_SVC.fit(x_train_selected_rfe,y_train)
pred = model_SVC.predict(x_test_selected_rfe)
accuracy_score(pred,y_test)

0.986013986013986

## Note
Filter methods evaluate each feature individually and select the most meaningful features based on statistical measures such as correlation and mutual information. Its quick and easy to implement but may not consider the interactions between features and may not be affective with high dimensions dataset.  

# Embeded Methods

### Tree base Method 

1. Handles nonlinear relationships and interactions and provides feature importance scores naturally.

In [182]:
rf= RandomForestClassifier()
rf.fit(x_train,y_train)
print(rf.feature_importances_)

[0.02870012 0.02033554 0.03870045 0.06372455 0.00585075 0.01124981
 0.04059012 0.13585323 0.00357806 0.00279266 0.01339489 0.00534591
 0.00876524 0.03674547 0.0046644  0.00380625 0.00424405 0.0072166
 0.00760691 0.00478683 0.09020183 0.01760011 0.15438463 0.13143905
 0.01377912 0.01565482 0.03387083 0.07766387 0.01046892 0.00698499]


In [183]:
mask = rf.feature_importances_ >.1
X_selected = x.loc[:,mask]
X_selected.columns

Index(['concave points_mean', 'perimeter_worst', 'area_worst'], dtype='object')