# Feature selection Technique 

#  On Credit_Risk Data

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
cr = pd.read_csv("CreditRisk.csv")

In [4]:
cr.isnull().sum()

Loan_ID               0
Gender               24
Married               3
Dependents           25
Education             0
Self_Employed        55
ApplicantIncome       0
CoapplicantIncome     0
LoanAmount           27
Loan_Amount_Term     20
Credit_History       79
Property_Area         0
Loan_Status           0
dtype: int64

In [5]:
cr.Gender = cr.Gender.fillna("Male")
cr.Married = cr.Married.fillna("No")
cr.Dependents = cr.Dependents.fillna(0)
cr.Self_Employed = cr.Self_Employed.fillna("No")
cr.LoanAmount = cr.LoanAmount.fillna(cr.LoanAmount.mean())
cr.Loan_Amount_Term = cr.Loan_Amount_Term.fillna(cr.Loan_Amount_Term.mean())
cr.Credit_History = cr.Credit_History.fillna(0)

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()

In [7]:
cr.Gender = le.fit_transform(cr.Gender)
cr.Married = le.fit_transform(cr.Married)
cr.Education = le.fit_transform(cr.Education)
cr.Self_Employed = le.fit_transform(cr.Self_Employed)
cr.Property_Area = le.fit_transform(cr.Property_Area)

In [8]:
cr.Loan_Status = cr.Loan_Status.replace({ "Y" :1 , "N" :0 })

In [9]:
cr.head()

Unnamed: 0,Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area,Loan_Status
0,LP001002,1,0,0.0,0,0,5849,0.0,142.51153,360.0,1.0,2,1
1,LP001003,1,1,1.0,0,0,4583,1508.0,128.0,360.0,1.0,0,0
2,LP001005,1,1,0.0,0,1,3000,0.0,66.0,360.0,1.0,2,1
3,LP001006,1,1,0.0,1,0,2583,2358.0,120.0,360.0,1.0,2,1
4,LP001008,1,0,0.0,0,0,6000,0.0,141.0,360.0,1.0,2,1


In [10]:
cr_x = cr.iloc[ : , 1:12]
cr_y = cr.iloc[ : , 12]

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
cr_x_train, cr_x_test, cr_y_train, cr_y_test = train_test_split(cr_x,cr_y, test_size = 0.2, random_state = 555)

# Using boruta Technique

In [13]:
import boruta
from boruta import  BorutaPy

In [14]:
cr_x = np.array(cr_x)
cr_y = np.array(cr_y)

In [15]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

In [16]:
boruta_feature_selector = BorutaPy(rf, max_iter=30, verbose=2)

In [17]:
boruta_feature_selector.fit(cr_x,cr_y)

Iteration: 	1 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	2 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	3 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	4 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	5 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	6 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	7 / 30
Confirmed: 	0
Tentative: 	11
Rejected: 	0
Iteration: 	8 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	9 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	10 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	11 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	12 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	13 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	14 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	15 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	16 / 30
Confirmed: 	2
Tentative: 	1
Rejected: 	8
Iteration: 	17 / 30
Confir

BorutaPy(estimator=RandomForestClassifier(n_estimators=1000,
                                          random_state=RandomState(MT19937) at 0x19A82ED0540),
         max_iter=30, random_state=RandomState(MT19937) at 0x19A82ED0540,
         verbose=2)

In [18]:
boruta_feature_selector.support_

array([False, False, False, False, False,  True, False, False, False,
        True, False])

# Using RFE  Technique

In [19]:
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()

In [20]:
from sklearn.feature_selection import RFE

In [21]:
rfe = RFE( dt, 5)



In [22]:
rfe.fit(cr_x_train,cr_y_train)

RFE(estimator=DecisionTreeClassifier(), n_features_to_select=5)

In [23]:
rfe.support_

array([False, False, False, False, False,  True,  True,  True, False,
        True,  True])

In [24]:
cr_x_train.columns

Index(['Gender', 'Married', 'Dependents', 'Education', 'Self_Employed',
       'ApplicantIncome', 'CoapplicantIncome', 'LoanAmount',
       'Loan_Amount_Term', 'Credit_History', 'Property_Area'],
      dtype='object')

In [25]:
feature_importance = pd.DataFrame({"Feature Name":cr_x_train.columns,
                                 "Importance": rfe.support_})

In [26]:
feature_importance.sort_values("Importance", ascending=False)

Unnamed: 0,Feature Name,Importance
5,ApplicantIncome,True
6,CoapplicantIncome,True
7,LoanAmount,True
9,Credit_History,True
10,Property_Area,True
0,Gender,False
1,Married,False
2,Dependents,False
3,Education,False
4,Self_Employed,False


In [27]:
# ; colon is used to hide the op which used at the  end of input code 