## Import library

In [49]:
import numpy as np
import pandas as pd

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import scorer, accuracy_score, f1_score, confusion_matrix, roc_auc_score
from sklearn.model_selection import train_test_split

from sklearn.preprocessing import LabelEncoder

from collections import Counter

pd.set_option('max_column',30)

import matplotlib.pyplot as plt
import seaborn as sns

## Load dataset

In [50]:
# classification data mobile price 
clasfi_data = pd.read_csv(r'C:\\Users\\Benai\\Documents\\machin-learning\\feature-selection\mobile-price.csv')

In [51]:
# classification data
reg_data = pd.read_csv(r'C:\\Users\\Benai\\Documents\\machin-learning\\feature-selection\housing.csv')

In [52]:
reg_data.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [53]:
reg_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
date             4600 non-null object
price            4600 non-null float64
bedrooms         4600 non-null float64
bathrooms        4600 non-null float64
sqft_living      4600 non-null int64
sqft_lot         4600 non-null int64
floors           4600 non-null float64
waterfront       4600 non-null int64
view             4600 non-null int64
condition        4600 non-null int64
sqft_above       4600 non-null int64
sqft_basement    4600 non-null int64
yr_built         4600 non-null int64
yr_renovated     4600 non-null int64
street           4600 non-null object
city             4600 non-null object
statezip         4600 non-null object
country          4600 non-null object
dtypes: float64(4), int64(9), object(5)
memory usage: 647.0+ KB


In [54]:
# convert categorical variable to numeric variable
reg_data = reg_data.apply(LabelEncoder().fit_transform)

In [55]:
clasfi_data.head()

Unnamed: 0,battery_power,blue,clock_speed,dual_sim,fc,four_g,int_memory,m_dep,mobile_wt,n_cores,pc,px_height,px_width,ram,sc_h,sc_w,talk_time,three_g,touch_screen,wifi,price_range
0,842,0,2.2,0,1,0,7,0.6,188,2,2,20,756,2549,9,7,19,0,0,1,1
1,1021,1,0.5,1,0,1,53,0.7,136,3,6,905,1988,2631,17,3,7,1,1,0,2
2,563,1,0.5,1,2,1,41,0.9,145,5,6,1263,1716,2603,11,2,9,1,1,0,2
3,615,1,2.5,0,0,0,10,0.8,131,6,9,1216,1786,2769,16,8,11,1,0,0,2
4,1821,1,1.2,0,13,1,44,0.6,141,2,14,1208,1212,1411,8,2,15,1,1,0,1


In [56]:
clasfi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 21 columns):
battery_power    2000 non-null int64
blue             2000 non-null int64
clock_speed      2000 non-null float64
dual_sim         2000 non-null int64
fc               2000 non-null int64
four_g           2000 non-null int64
int_memory       2000 non-null int64
m_dep            2000 non-null float64
mobile_wt        2000 non-null int64
n_cores          2000 non-null int64
pc               2000 non-null int64
px_height        2000 non-null int64
px_width         2000 non-null int64
ram              2000 non-null int64
sc_h             2000 non-null int64
sc_w             2000 non-null int64
talk_time        2000 non-null int64
three_g          2000 non-null int64
touch_screen     2000 non-null int64
wifi             2000 non-null int64
price_range      2000 non-null int64
dtypes: float64(2), int64(19)
memory usage: 328.2 KB


In [57]:
print('Classifciation :',clasfi_data.shape)
print('Regression:',reg_data.shape)

Classifciation : (2000, 21)
Regression: (4600, 18)


In [58]:
clasfi_data.price_range.value_counts()

3    500
2    500
1    500
0    500
Name: price_range, dtype: int64

## Separate target and response variable

In [59]:
# regression
x_r = reg_data.drop('price',axis=1)
y_r = reg_data['price']

# classification
x_c = clasfi_data.iloc[:,:-1]
y_c = clasfi_data.iloc[:,-1]

## Train test split

In [60]:
# regression  
x_train_r, x_test_r, y_train_r,y_test_r = train_test_split(x_r,y_r, test_size=0.20,random_state=42 )

In [61]:
# classification 
x_train_c, x_test_c, y_train_c,y_test_c = train_test_split(x_c,y_c, test_size=0.20,random_state=42 )

# 1. Filter method

## a) Basic Filter mehod
### 1.Remove feature with low variance

In [63]:
from sklearn.feature_selection import VarianceThreshold

vt = VarianceThreshold(threshold=0)

# take only numeric features
vt.fit_transform(x_r)

selected_features = vt.get_support()

vt_features = x_r.columns[selected_features]

print('Before features  selection --->',x_r.shape[1])
print('After features selection --->',len(vt_features))
print('\nSelected features --->',vt_features)

Before features  selection ---> 17
After features selection ---> 16

Selected features ---> Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'street', 'city', 'statezip'],
      dtype='object')


### 2. Quasi-constant features

In [15]:
theshold = 0.98

# create empty list
quasi_constant_feature = []

# loop over all the columns
for features in x_r.columns:
    # calulate the ratio
    ratio = (x_r[features].value_counts() / np.float(len(x_c))).sort_values(ascending=False).values[0]
    
    if ratio >= theshold:
        quasi_constant_feature.append(features)
        
print('Before feature selection --->',x_r.shape[1])
print('After feature selection --->',len(quasi_constant_feature))
print('\nSelected features--->',quasi_constant_feature)

Before feature selection ---> 17
After feature selection ---> 8

Selected features---> ['bedrooms', 'floors', 'waterfront', 'view', 'condition', 'sqft_basement', 'yr_renovated', 'country']


### 3. Duplication

In [16]:
# transpose the feature matrix
features_T = x_r.T

print(features_T.duplicated().sum())

# select the duplicated feature column names
duplicate_features = features_T[features_T.duplicated()].index.values

print('No of duplicate feature --->',duplicate_features)

0
No of duplicate feature ---> []


## b) Correlation Filter method
### 1. Pearson correlation

In [17]:
from sklearn.feature_selection import SelectKBest, f_regression

pc = SelectKBest(f_regression,k=8)

x_r_new = pc.fit_transform(x_r,y_r)

print('No of Features before selection --->',x_r.shape)
print('No of Feature after selection --->',x_r_new.shape)

No of Features before selection ---> (4600, 17)
No of Feature after selection ---> (4600, 8)


  corr /= X_norms
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


### 2. Spearman's rank correlation

In [18]:
from scipy.stats import spearmanr

corr, p = spearmanr(reg_data)

df = pd.DataFrame(corr,reg_data.columns)
df

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17
date,1.0,0.02146,0.011031,0.021892,0.032308,0.011743,0.035727,0.017569,0.001781,0.005571,0.046941,-0.017774,0.00489,-0.019776,-0.004087,-0.02166,-0.004405,
price,0.02146,1.0,0.337863,0.492451,0.631264,0.075483,0.320929,0.083184,0.267564,0.023843,0.534038,0.236591,0.084415,-0.070935,0.087595,0.113391,-0.039309,
bedrooms,0.011031,0.337863,1.0,0.537882,0.651607,0.237566,0.220267,-0.008774,0.092243,0.006001,0.53272,0.248473,0.159718,-0.056341,-0.037215,-0.157756,-0.168333,
bathrooms,0.021892,0.492451,0.537882,1.0,0.746818,0.091993,0.539832,0.049207,0.165425,-0.163474,0.695998,0.189853,0.530261,-0.2128,0.01089,-0.124747,-0.206647,
sqft_living,0.032308,0.631264,0.651607,0.746818,1.0,0.3252,0.396749,0.069558,0.249297,-0.066262,0.843412,0.322549,0.32243,-0.126955,0.00364,-0.151491,-0.219579,
sqft_lot,0.011743,0.075483,0.237566,0.091993,0.3252,1.0,-0.204243,0.094542,0.112377,0.091853,0.305195,0.022575,-0.012471,0.051275,-0.16172,-0.342143,-0.337685,
floors,0.035727,0.320929,0.220267,0.539832,0.396749,-0.204243,1.0,0.02117,0.019924,-0.303411,0.603723,-0.28801,0.537538,-0.229399,0.053898,0.036383,-0.070226,
waterfront,0.017569,0.083184,-0.008774,0.049207,0.069558,0.094542,0.02117,1.0,0.272121,0.00157,0.052719,0.052983,-0.026801,0.005345,0.035439,0.001598,0.006591,
view,0.001781,0.267564,0.092243,0.165425,0.249297,0.112377,0.019924,0.272121,1.0,0.061721,0.152727,0.27001,-0.083885,0.027501,0.068984,0.023135,0.094563,
condition,0.005571,0.023843,0.006001,-0.163474,-0.066262,0.091853,-0.303411,0.00157,0.061721,1.0,-0.183099,0.201031,-0.433361,-0.241569,-0.013943,-0.004311,0.018044,


In [19]:
corr_matrix = reg_data.corr(method='spearman')

# to hold corellated feature
core_feature = set()

for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i,j]) > 0.8:
            col_name = corr_matrix.columns[i]
            core_feature.add(col_name)
            print('Features to be removed --->',col_name)

# drop  highly  corelated features 
spearman_feature = reg_data.drop(core_feature,axis=1)

print('Before feature selection --->',reg_data.shape)
print('After feature selection --->',spearman_feature.shape)

Features to be removed ---> sqft_above
Before feature selection ---> (4600, 18)
After feature selection ---> (4600, 17)


### 3. kendall's rank corellation

In [20]:
from scipy.stats import kendalltau

kendal_corr_matrix = reg_data.corr()

# to hold corellated feature
kendal_corr_features = set()

for i in range(len(kendal_corr_matrix.columns)):
    for j in range(i):
        if abs(kendal_corr_matrix.iloc[i,j]) > 0.8:
            col_name = kendal_corr_matrix.columns[i]
            kendal_corr_features.add(col_name)
            print('Features tp be removed ---->',col_name)
            
kendall_feature = reg_data.drop(kendal_corr_features,axis=1)

print('Before feature selection --->', reg_data.shape)
print('After feature selection --->',kendall_feature.shape)

Features tp be removed ----> sqft_above
Before feature selection ---> (4600, 18)
After feature selection ---> (4600, 17)


## c) Statistical and ranking filter method 
### 1. Mutual  information 
#### i) Mutual  information  for classifications 

In [21]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

mi_feature_c = SelectKBest(mutual_info_classif,  k=10).fit(x_c, y_c)

final_mi_feature_c = x_c.columns[mi_feature_c.get_support()]

print('Before feature selection --->',x_c.shape[1])
print('After feature selection --->', final_mi_feature_c.shape[0])
print('\nSelected features for classification --->',final_mi_feature_c)

Before feature selection ---> 20
After feature selection ---> 10

Selected features for classification ---> Index(['battery_power', 'fc', 'mobile_wt', 'px_height', 'px_width', 'ram',
       'sc_h', 'sc_w', 'three_g', 'touch_screen'],
      dtype='object')


#### ii) Mutual  information  for regression  

In [22]:
from sklearn.feature_selection import SelectKBest, mutual_info_regression

mi_feature_r = SelectKBest(mutual_info_regression,  k=10).fit(x_r, y_r)

final_mi_feature_r = x_r.columns[mi_feature_r.get_support()]

print('Before feature selection --->',x_r.shape[1])
print('After feature selections --->',final_mi_feature_r.shape[0])
print('\nSelected features for regression ---->',final_mi_feature_r)

Before feature selection ---> 17
After feature selections ---> 10

Selected features for regression ----> Index(['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'view',
       'sqft_above', 'sqft_basement', 'yr_built', 'statezip'],
      dtype='object')


### 2. Chi-squared score

In [23]:
# only  categorical variable
from sklearn.feature_selection import SelectKBest, chi2

cs_feature = SelectKBest(chi2,  k=10).fit(x_c, y_c)

final_cs_feature = x_c.columns[cs_feature.get_support()]

print('Before feature selection -->', x_c.shape[1])
print('After feature selection --->',final_cs_feature.shape[0])
print('\nSelected features ---->',final_cs_feature)

Before feature selection --> 20
After feature selection ---> 10

Selected features ----> Index(['battery_power', 'fc', 'int_memory', 'mobile_wt', 'px_height',
       'px_width', 'ram', 'sc_h', 'sc_w', 'talk_time'],
      dtype='object')


### 3. ANOVA test

In [24]:
from sklearn.feature_selection import SelectKBest, f_classif

anova_feature = SelectKBest(f_classif,  k=10).fit(x_r, y_r)

final_anova_feature = x_r.columns[anova_feature.get_support()]

print('Before feature selection --->',x_r.shape[1])
print('After features  selection --->',final_anova_feature.shape[0])
print('\nSelected features ---->',final_anova_feature)

Before feature selection ---> 17
After features  selection ---> 10

Selected features ----> Index(['bedrooms', 'bathrooms', 'sqft_living', 'floors', 'waterfront', 'view',
       'sqft_above', 'sqft_basement', 'yr_built', 'city'],
      dtype='object')


  f = msb / msw


### 4. ROC-AUC and RMSE

In [25]:
# we will  uses  decide  tree to evaluate  the model 
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import roc_auc_score

# list of score
roc_values = []

# loop over  all the features
for feature in x_c.columns:
    tree_model = DecisionTreeClassifier()
    tree_model.fit(x_c[feature].to_frame(),y_c)
    y_score = tree_model.predict_proba(x_c[feature].to_frame())
    roc_values.append(roc_auc_score(y_c,y_score,multi_class='ovr'))
    
print(np.sort(roc_values))

[0.50566667 0.507      0.507      0.50766667 0.51033333 0.51333333
 0.53601933 0.5429     0.55096933 0.55501067 0.555096   0.56558333
 0.566919   0.567824   0.62096067 0.662339   0.93816667 0.942139
 0.9429     0.99635233]


# 2. Wrapper method

### 1. Forward Feature Selection

In [26]:
from mlxtend.feature_selection import SequentialFeatureSelector
 
# import the algorithm you want to evaluate on your features.
from sklearn.ensemble import RandomForestClassifier

# create the SequentialFeatureSelector object, and configure the parameters.
sfs = SequentialFeatureSelector(RandomForestClassifier(), 
           k_features=10, 
           forward=True, 
           floating=False,
           scoring='accuracy',
           cv=2)

# fit the object to the training data.
sfs = sfs.fit(x_train_c, y_train_c)

# print the selected features.
selected_features = x_train_c.columns[list(sfs.k_feature_idx_)]
print('Selected features are --->',selected_features)

# print the final prediction score.
print('\nAccuracy score --->',sfs.k_score_)

# transform to the newly selected features.
x_train_sfs = sfs.transform(x_train_c)
x_test_sfs = sfs.transform(x_test_c)

Selected features are ---> Index(['battery_power', 'dual_sim', 'four_g', 'int_memory', 'px_height',
       'px_width', 'ram', 'sc_w', 'touch_screen', 'wifi'],
      dtype='object')

Accuracy score ---> 0.88375


### 2. Backward Feature Elimination

In [27]:
from mlxtend.feature_selection import SequentialFeatureSelector
 
# import the algorithm you want to evaluate on your features.
from sklearn.ensemble import RandomForestClassifier

# create the SequentialFeatureSelector object, and configure the parameters.
sfs = SequentialFeatureSelector(RandomForestClassifier(), 
           k_features=10, 
           forward=False, 
           floating=False,
           scoring='accuracy',
           cv=2)

# fit the object to the training data.
sfs = sfs.fit(x_train_c, y_train_c)

# print the selected features.
selected_features = x_train_c.columns[list(sfs.k_feature_idx_)]
print('Selected features are --->',selected_features)


# print the final prediction score.
print('\nAccuracy score --->',sfs.k_score_)

# transform to the newly selected features.
x_train_sfs = sfs.transform(x_train_c)
x_test_sfs = sfs.transform(x_test_c)

Selected features are ---> Index(['battery_power', 'clock_speed', 'four_g', 'int_memory', 'n_cores', 'pc',
       'px_height', 'px_width', 'ram', 'three_g'],
      dtype='object')

Accuracy score ---> 0.881875


### 3. Exhaustive Feature Selection

In [None]:
from mlxtend.feature_selection import ExhaustiveFeatureSelector
           
# import the algorithm you want to evaluate on your features.
from sklearn.ensemble import RandomForestClassifier

# create the ExhaustiveFeatureSelector object.
efs = ExhaustiveFeatureSelector(RandomForestClassifier(), 
           min_features=4,
           max_features=10, 
           scoring='accuracy',
           cv=2)

# fit the object to the training data.
efs = efs.fit(x_train_c, y_train_c)

# print the selected features.
selected_features = x_train.columns[list(efs.k_feature_idx_)]
print('Selected features are --->',selected_features)

# print the final prediction score.
print('\nAccuracy score --->',efs.k_score_)

# transform our data to the newly selected features.
x_train_sfs = efs.transform(x_train_c)
x_test_sfs = efs.transform(x_test_c)

### 4. Bidirectional Search (BDS)

In [14]:
from mlxtend.feature_selection import SequentialFeatureSelector
 
# import the algorithm you want to evaluate on your features.
from sklearn.ensemble import RandomForestClassifier

# create the SequentialFeatureSelector object, and configure the parameters.
sfs = SequentialFeatureSelector(RandomForestClassifier(), 
           k_features=10, 
           forward=True, 
           floating=True,
           scoring='accuracy',
           cv=2)

# fit the object to the training data.
sfs = sfs.fit(x_train_c, y_train_c)

# print the selected features.
selected_features = x_train_c.columns[list(sfs.k_feature_idx_)]
print('Selected features are --->',selected_features)

# print the final prediction score.
print('\nAccuracy score --->',sfs.k_score_)

# transform to the newly selected features.
x_train_sfs = sfs.transform(x_train_c)
x_test_sfs = sfs.transform(x_test_c)

Selected features are ---> Index(['battery_power', 'dual_sim', 'int_memory', 'm_dep', 'pc', 'px_height',
       'px_width', 'ram', 'sc_h', 'touch_screen'],
      dtype='object')

Accuracy score ---> 0.8812500000000001


# 3. Embedded method 

## 1. Regularization

### i) LASSO or L1 Regularization

In [48]:
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel

# using logistic regression with penalty l1.
selection = SelectFromModel(LogisticRegression(C=1, penalty='l1',solver='liblinear'))
selection.fit(x_train_r, y_train_r)

# see the selected features.
selected_features = selection.get_support()

print('Original features --->',len(x_train_r.columns),'\n',x_train_r.columns)
print('\nSelected features --->',len(x_train_r.columns[selected_features]),'\n',x_train_r.columns[selected_features])

Original features ---> 17 
 Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country'],
      dtype='object')

Selected features ---> 15 
 Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'view', 'condition', 'sqft_above', 'sqft_basement', 'yr_built',
       'yr_renovated', 'street', 'city', 'statezip'],
      dtype='object')




### ii) Ridge or L2 Regularization

In [47]:
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.feature_selection import SelectFromModel

ridge_features = SelectFromModel(LogisticRegression(C=1, penalty='l2'))

ridge_features.fit(x_train_r,y_train_r)
ridge_selected_features = ridge_features.get_support()

print('Original features --->',len(x_train_r.columns), '\n',x_train_r.columns)
print('\nSelected features --->',len(x_train_r.columns[ridge_selected_features]),'\n', x_train_r.columns[ridge_selected_features])

Original features ---> 17 
 Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country'],
      dtype='object')

Selected features ---> 7 
 Index(['sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement', 'yr_built',
       'street', 'statezip'],
      dtype='object')


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


### iii) Elastic net

In [46]:
from sklearn.linear_model import ElasticNet

elastic = ElasticNet(random_state=0)
elastic.fit(x_train_r,y_train_r)

ss = elastic.n_features_in_

print(ss)

print('Original features --->', len(x_train_r.columns),'\n',x_train_r.columns)
#print('\nSelected features --->', len(x_train_r.columns[ss]),'\n',x_train_r.columns[ss])

17
Original features ---> 17 
 Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country'],
      dtype='object')


## 2. Tree based

### i) Random Forest

In [45]:
from sklearn.ensemble import RandomForestClassifier

# create the random forest with your hyperparameters.
rfc = RandomForestClassifier(n_estimators=150)

# fit the model to start training.
rfc.fit(x_train_c, y_train_c)

# get the importance of the resulting features.
f_importances = rfc.feature_importances_

# create a data frame for visualization.
final_df = pd.DataFrame({"Features": x_train_c.columns, "Importances":f_importances})

final_df.sort_values('Importances', ascending=False)

Unnamed: 0,Features,Importances
13,ram,0.478404
0,battery_power,0.07458
11,px_height,0.057197
12,px_width,0.054081
8,mobile_wt,0.040661
6,int_memory,0.037776
10,pc,0.029615
16,talk_time,0.029349
14,sc_h,0.028833
2,clock_speed,0.027982


### ii) Decision tree

In [44]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import SelectFromModel

clf = DecisionTreeClassifier()
clf = clf.fit(x_train_c, y_train_c)
f=clf.feature_importances_

final_dfs = pd.DataFrame({"Features": x_train_c.columns, "Importances":f})

final_dfs.sort_values('Importances', ascending=False)

Unnamed: 0,Features,Importances
13,ram,0.627295
0,battery_power,0.126164
11,px_height,0.077545
12,px_width,0.071689
8,mobile_wt,0.016911
16,talk_time,0.014282
7,m_dep,0.012335
9,n_cores,0.011974
2,clock_speed,0.007632
4,fc,0.007327


# 4. Hybrid method 

### 1. Recursive Feature Elimination

In [15]:
from sklearn.feature_selection import RFECV

# use any other model you want here.
from sklearn.ensemble import RandomForestClassifier

rfe_model = RandomForestClassifier(n_estimators=411)

# build the RFE with CV option.
rfe = RFECV(rfe_model, min_features_to_select = 3, step = 1 , cv=5, scoring='accuracy')

# fit the RFE to our data.
selection  = rfe.fit(x_train_c, y_train_c)

# print the selected features.
print(x_train_c.columns[selection.support_]) 
print('Original features --->',len(x_train_r.columns), '\n',x_train_r.columns)
print('\nSelected features --->',len(x_train_c.columns[selection.support_]),'\n', x_train_c.columns[selection.support_])

Index(['battery_power', 'px_height', 'px_width', 'ram'], dtype='object')
Original features ---> 17 
 Index(['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors',
       'waterfront', 'view', 'condition', 'sqft_above', 'sqft_basement',
       'yr_built', 'yr_renovated', 'street', 'city', 'statezip', 'country'],
      dtype='object')

Selected features ---> 4 
 Index(['battery_power', 'px_height', 'px_width', 'ram'], dtype='object')


### 2. Recursive Feature Addition

In [None]:
# you can use any other algorithm.
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score 

# array to hold the feature to be keept.
features_to_keep = [x_train_c.columns[0]]

# set this value according to you.
threshold = 0.002

# create your prefered model and  fit it to the training data.
model_one_feature = RandomForestClassifier(n_estimators=332)
model_one_feature.fit(x_train_c[[x_train_c.columns[0]]], y_train_c)

# evaluate against your metric.
y_pred_test = model_one_feature.predict(x_test_c[[x_train_c.columns[0]]])
auc_score_all = roc_auc_score(x_test_c, y_pred_test, multi_class='ovr')

# start iterating from the feature.
for feature in x_train_c.columns[1:]:
    
    model = RandomForestClassifier(n_estimators=332)
    
    # fit model with  the selected features and the feature to be evaluated
    model.fit(x_train_c[features_to_keep + [feature]], y_train_c)
    y_pred_test = model.predict(x_test_c[features_to_keep + [feature]])
    auc_score_int = roc_auc_score(y_test_c, y_pred_test[0] )

    # determine the drop in the roc-auc
    diff_auc = auc_score_int - auc_score_all

    # compare the drop in roc-auc with the threshold
    if diff_auc >= threshold:
        
        # if the increase in the roc is bigger than the threshold
        # we keep the feature and re-adjust the roc-auc to the new value
        # considering the added feature
        auc_score_all = auc_score_int
        features_to_keep.append(feature)

# print the feature to keep.
print(features_to_keep)