In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Colab-only step: opens a file picker in the browser.
# Upload bank.csv from the "Feature Selection" folder of the course.
from google.colab import files
uploaded = files.upload()

Saving bank.csv to bank.csv


In [None]:
# Load the uploaded bank marketing file and preview the first five rows
bank_data = pd.read_csv("bank.csv")
bank_data.head(5)

Unnamed: 0,age,job,marital,education,default,housing,loan,contact,month,day_of_week,...,campaign,pdays,previous,poutcome,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,y
0,56,housemaid,married,basic.4y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
1,57,services,married,high.school,unknown,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
2,37,services,married,high.school,no,yes,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
3,40,admin.,married,basic.6y,no,no,no,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no
4,56,services,married,high.school,no,no,yes,telephone,may,mon,...,1,999,0,nonexistent,1.1,93.994,-36.4,4.857,5191.0,no


In [None]:
# The duration of a call is not known when approaching new customers.
# A longer call usually means that the customer bought the product, so using
# the duration as a predictive feature would be cheating in the forecast.
#
# errors='ignore' keeps this cell re-runnable: on a second run 'duration' has
# already been removed and a plain drop would raise KeyError.
bank_data = bank_data.drop(columns=['duration'], errors='ignore')
bank_data.shape

(41188, 20)

In [None]:
# Number of missing values in each column
bank_data.isna().sum()

age               0
job               0
marital           0
education         0
default           0
housing           0
loan              0
contact           0
month             0
day_of_week       0
campaign          0
pdays             0
previous          0
poutcome          0
emp.var.rate      0
cons.price.idx    0
cons.conf.idx     0
euribor3m         0
nr.employed       0
y                 0
dtype: int64

In [None]:
# Check the dtype of every column to see whether dummy variables are needed
# (object columns are the categorical features)
bank_data.dtypes

age                 int64
job                object
marital            object
education          object
default            object
housing            object
loan               object
contact            object
month              object
day_of_week        object
campaign            int64
pdays               int64
previous            int64
poutcome           object
emp.var.rate      float64
cons.price.idx    float64
cons.conf.idx     float64
euribor3m         float64
nr.employed       float64
y                  object
dtype: object

In [None]:
# Replace every categorical column with dummy (0/1) columns.
# drop_first=True leaves out the first level of each categorical feature.
bank_data = pd.get_dummies(data=bank_data, drop_first=True)
bank_data.head(5)

Unnamed: 0,age,campaign,pdays,previous,emp.var.rate,cons.price.idx,cons.conf.idx,euribor3m,nr.employed,job_blue-collar,...,month_nov,month_oct,month_sep,day_of_week_mon,day_of_week_thu,day_of_week_tue,day_of_week_wed,poutcome_nonexistent,poutcome_success,y_yes
0,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,1,0,0
1,57,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,1,0,0
2,37,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,1,0,0
3,40,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,1,0,0
4,56,1,999,0,1.1,93.994,-36.4,4.857,5191.0,0,...,0,0,0,1,0,0,0,1,0,0


In [None]:
# X (independent variables) is every column except the last one;
# Y (predicted / target variable) is the last column.
X = bank_data.drop(columns=bank_data.columns[-1])
Y = bank_data[bank_data.columns[-1]]

In [None]:
# Train/test split with stratified sampling: stratify=Y splits the data in a
# stratified fashion using Y as the class labels; 30% of the rows go to the test set.

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.3,
    stratify=Y,
    random_state=1234,
)


In [None]:
# Baseline model: random forest classifier trained on all features of the train split
from sklearn.ensemble import RandomForestClassifier

bank_RF_classifier = RandomForestClassifier(random_state=1234)
bank_RF_classifier.fit(X=X_train, y=Y_train)

RandomForestClassifier(random_state=1234)

In [None]:
# Score of the baseline forest on the held-out test split
score = bank_RF_classifier.score(X=X_test, y=Y_test)
print('Model Score is:', score)

Model Score is: 0.8943109168892126


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the baseline forest: rows are true classes, columns are predictions
Y_predict = bank_RF_classifier.predict(X=X_test)
cm = confusion_matrix(y_true=Y_test, y_pred=Y_predict)
cm

array([[10647,   318],
       [  988,   404]])

#RFE (Recursive Feature Elimination)
Easiest method to use compared to F-regression, SelectKBest and GenericUnivariateSelect. 

`rfe.transform(X_test)` reduces the X_test dataset to the selected variables only, something that you had to do manually in the previous session!

In [None]:
from sklearn.feature_selection import RFE
# Shape of the encoded frame; the column count includes the target (last) column
bank_data.shape

(41188, 53)

In [None]:
bank_RF_classifier2 = RandomForestClassifier(random_state=1234)

# Keep only 30 of the 52 features, removing one feature per iteration (step=1).
rfe = RFE(estimator=bank_RF_classifier2, n_features_to_select=30, step=1)

# Fit the selector on the TRAINING split only. Fitting it on the entire X, Y
# would let the test rows influence which features are kept (data leakage),
# which makes the test score computed afterwards too optimistic.
rfe.fit(X_train, Y_train)

X_train_rfe = rfe.transform(X_train)  # keep the 30 selected features in the train set
X_test_rfe = rfe.transform(X_test)    # keep the same 30 features in the test set



In [None]:
# Test split after RFE: it should have the 30 features we asked for
X_test_rfe.shape

(12357, 30)

In [None]:
# Train split after RFE: it should have the 30 features we asked for
X_train_rfe.shape

(28831, 30)

In [None]:
# Untransformed train split, for comparison with the RFE-reduced one
X_train.shape

(28831, 52)

In [None]:
# Train the second forest on the RFE-selected columns only
bank_RF_classifier2.fit(X=X_train_rfe, y=Y_train)

RandomForestClassifier(random_state=1234)

In [None]:
# Score of the RFE-based forest on the reduced test split
score2 = bank_RF_classifier2.score(X=X_test_rfe, y=Y_test)
print('Model Score is:', score2)

Model Score is: 0.8943109168892126


In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the RFE-based forest. Compare it cell by cell with `cm`
# (the baseline): a gain in one cell can be offset by a loss in another.
Y_predict2 = bank_RF_classifier2.predict(X=X_test_rfe)
cm_rfe = confusion_matrix(y_true=Y_test, y_pred=Y_predict2)
cm_rfe

array([[10638,   327],
       [  979,   413]])

**Feature importances**

In [None]:
# One rank per feature: all features with rank 1 are the ones selected by RFE.
# The rank does not say how important a selected feature is, so the importances
# are looked up separately.
ranking = rfe.ranking_
ranking

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  5, 11,  1,  1,  3,  1,  6,
        1,  8, 16,  1,  1, 20,  4,  1,  1, 22,  1,  1,  2,  1, 23,  7,  1,
       13,  1,  1, 17, 21, 15, 14, 10, 12, 19,  9, 18,  1,  1,  1,  1,  1,
        1])

In [None]:
# One entry per input feature (52 variables in total)
rfe.ranking_.shape

(52,)

In [None]:
# Number of features with rank 1, i.e. the features RFE has selected
(rfe.ranking_ == 1).sum()

30

In [None]:
# Importances are read from bank_RF_classifier (the forest fitted on X_train with
# every feature), so there is one value for each of the 52 features
feature_importance = bank_RF_classifier.feature_importances_
feature_importance

array([1.70809903e-01, 8.39670422e-02, 3.09585398e-02, 1.91089531e-02,
       2.20649131e-02, 2.31893481e-02, 2.78846783e-02, 1.21091042e-01,
       6.26615200e-02, 1.40153409e-02, 6.52409605e-03, 5.32901247e-03,
       1.14962225e-02, 7.62594112e-03, 7.65622686e-03, 1.14585079e-02,
       6.14242701e-03, 1.75824848e-02, 5.61703920e-03, 2.60052665e-03,
       2.11730222e-02, 1.76798976e-02, 9.09749915e-04, 7.64070771e-03,
       1.21102433e-02, 1.73136763e-02, 5.71709699e-04, 1.26410579e-02,
       1.78213433e-02, 7.69009624e-03, 1.66857423e-02, 1.39446286e-07,
       3.31006585e-03, 3.89403727e-02, 3.21863249e-03, 2.38703939e-02,
       1.64336689e-02, 2.42264097e-03, 9.09274796e-04, 2.25768711e-03,
       2.98676679e-03, 3.84770285e-03, 4.23807825e-03, 2.12292725e-03,
       5.45740279e-03, 1.87285593e-03, 1.62182135e-02, 1.61434986e-02,
       1.55667196e-02, 1.59937293e-02, 9.71855647e-03, 2.44496615e-02])

In [None]:
# Put feature name, RFE rank and forest importance side by side, one row per feature.
# (No empty frame is created first: it would be overwritten by the concat anyway.)
columns = list(X.columns)
rfe_selected = pd.concat(
    [pd.DataFrame(columns), pd.DataFrame(ranking), pd.DataFrame(feature_importance)],
    axis=1,
)
rfe_selected.head(3)

Unnamed: 0,0,0.1,0.2
0,age,1,0.17081
1,campaign,1,0.083967
2,pdays,1,0.030959


In [None]:
# Give the three columns of the summary frame readable names
rfe_selected = rfe_selected.set_axis(
    ['Feature Name', 'Ranking', 'Feature Importance'],
    axis=1,
)
rfe_selected

Unnamed: 0,Feature Name,Ranking,Feature Importance
0,age,1,0.1708099
1,campaign,1,0.08396704
2,pdays,1,0.03095854
3,previous,1,0.01910895
4,emp.var.rate,1,0.02206491
5,cons.price.idx,1,0.02318935
6,cons.conf.idx,1,0.02788468
7,euribor3m,1,0.121091
8,nr.employed,1,0.06266152
9,job_blue-collar,1,0.01401534


#Multicollinearity and feature selection
##LASSO (L1) and Ridge Regularization

**Read Data**

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

In [None]:
# Colab-only step: opens a file picker in the browser.
# Upload mcl.csv from the main folder of the course.
from google.colab import files
uploaded = files.upload()

Saving mcl.csv to mcl.csv


In [None]:
# X1 and X2 are 100% correlated: X2 = 1.8*X1
# Also, Xi has less importance than Xj for all i>j
df = pd.read_csv("mcl.csv")
df.head(5)

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15,Y
0,7,12.6,2,16,12,19,19,2,5,11,13,12,6,20,10,294.958
1,4,7.2,13,14,12,16,11,18,20,1,9,6,17,17,19,344.721
2,10,18.0,20,9,2,10,14,7,3,9,15,19,2,14,14,343.366
3,15,27.0,1,20,2,18,18,15,8,14,11,4,19,5,6,280.772
4,6,10.8,20,2,17,16,15,11,4,13,20,2,19,20,19,374.397


In [None]:
# X (independent variables) is every column except the last one;
# Y (predicted / target variable) is the last column.
X = df.drop(columns=df.columns[-1])
Y = df[df.columns[-1]]

In [None]:
# Pairwise correlation between the features, shown as a colour-coded table
# so that strongly correlated pairs stand out
correlation = X.corr()
correlation.style.background_gradient(cmap='coolwarm')

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,X11,X12,X13,X14,X15
X1,1.0,1.0,-0.209711,0.119736,-0.128403,-0.251226,-0.245742,0.13516,0.066598,0.206307,-0.15545,0.044458,0.003999,-0.078752,0.067695
X2,1.0,1.0,-0.209711,0.119736,-0.128403,-0.251226,-0.245742,0.13516,0.066598,0.206307,-0.15545,0.044458,0.003999,-0.078752,0.067695
X3,-0.209711,-0.209711,1.0,-0.046544,0.071759,-0.212747,0.033865,0.009074,-0.245954,-0.085453,-0.05144,0.188441,0.154852,0.156795,0.339773
X4,0.119736,0.119736,-0.046544,1.0,-0.295913,0.221554,0.316329,0.171753,-0.155269,-0.086964,-0.29826,0.219644,0.264612,0.302779,-0.096975
X5,-0.128403,-0.128403,0.071759,-0.295913,1.0,-0.191911,-0.377967,-0.173086,0.06295,0.141345,0.318774,-0.292829,0.445463,-0.053803,-0.06531
X6,-0.251226,-0.251226,-0.212747,0.221554,-0.191911,1.0,0.195824,-0.141802,0.132583,0.227458,0.173524,0.134872,0.087818,0.182317,-0.021305
X7,-0.245742,-0.245742,0.033865,0.316329,-0.377967,0.195824,1.0,0.107389,-0.316919,-0.044498,-0.290376,-0.024236,-0.150627,0.315024,0.02788
X8,0.13516,0.13516,0.009074,0.171753,-0.173086,-0.141802,0.107389,1.0,0.036707,0.072132,-0.196835,-0.313592,0.109849,0.462225,0.107362
X9,0.066598,0.066598,-0.245954,-0.155269,0.06295,0.132583,-0.316919,0.036707,1.0,-0.243126,-0.159466,0.076146,0.256291,-0.193855,0.293955
X10,0.206307,0.206307,-0.085453,-0.086964,0.141345,0.227458,-0.044498,0.072132,-0.243126,1.0,-0.052777,-0.091408,-0.064017,0.059012,-0.238839


**No Regularization: Linear Regression Model**

In [None]:
from sklearn.linear_model import Lasso, Ridge, LinearRegression

In [None]:
# Plain linear regression (no regularization); its coefficients are the
# reference point for the Lasso and Ridge fits
LR = LinearRegression().fit(X, Y)
LR_coefficienct, LR_intercept = LR.coef_, LR.intercept_
LR_coefficienct

array([ 0.9685666 ,  1.74341988,  5.22151795,  4.76932763,  6.34161228,
        2.27250423,  3.10401606,  1.41894342,  1.14483366,  0.13845666,
        1.27299503,  0.18844984, -2.33424536, -0.47582293,  0.48954817])

In [None]:
import statsmodels.api as sm

# Same regression through statsmodels to get the full summary table.
# add_constant appends the constant (intercept) column to the features.
# Note: the variable X2 is that augmented feature matrix, not the column named 'X2'.
X2 = sm.add_constant(X)
model = sm.OLS(endog=Y, exog=X2)
LR2 = model.fit()
print(LR2.summary())

  import pandas.util.testing as tm


                            OLS Regression Results                            
Dep. Variable:                      Y   R-squared:                       0.811
Model:                            OLS   Adj. R-squared:                  0.433
Method:                 Least Squares   F-statistic:                     2.146
Date:                Wed, 27 Apr 2022   Prob (F-statistic):              0.157
Time:                        05:15:16   Log-Likelihood:                -97.352
No. Observations:                  22   AIC:                             224.7
Df Residuals:                       7   BIC:                             241.1
Df Model:                          14                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         16.0186     79.038      0.203      0.8

  x = pd.concat(x[::order], 1)


**Regularization: Lasso**

In [None]:
# Lasso (L1) regression with penalty strength alpha=10.
# The fitted model gets its own name: binding it to `Lasso` would replace the
# imported class with an instance, and re-running this cell would then fail
# with "TypeError: 'Lasso' object is not callable".
lasso_model = Lasso(alpha=10)
lasso_model.fit(X, Y)
Lasso_coefficienct = lasso_model.coef_
Lasso_intercept = lasso_model.intercept_
Lasso_coefficienct

array([ 0.        ,  1.7309199 ,  3.93145036,  3.18600347,  4.26593065,
        1.27754125,  1.64825311,  0.66718862,  0.        ,  0.        ,
        0.69332269,  0.50395826, -0.16795267,  0.12460803,  0.51203443])

**Regularization: Ridge**

In [None]:
# Ridge (L2) regression with penalty strength alpha=100.
# The fitted model gets its own name: binding it to `Ridge` would replace the
# imported class with an instance, and re-running this cell would then fail
# with "TypeError: 'Ridge' object is not callable".
ridge_model = Ridge(alpha=100)
ridge_model.fit(X, Y)
Ridge_coefficienct = ridge_model.coef_
Ridge_intercept = ridge_model.intercept_
Ridge_coefficienct

array([ 0.70168778,  1.26303801,  3.63253895,  2.96709834,  3.84581771,
        0.94041065,  1.87287829,  0.76603986,  0.24004053,  0.59800807,
        1.15741106,  0.90270645, -0.04607831,  0.36735591,  0.73950502])

Comment:
Originally (using Linear Regression), X1 was given a coefficient of about 0.97.
But notice that X1 has been given a coefficient of exactly 0 in Lasso (i.e. it has been totally eliminated from the features), while Ridge only reduced the X1 coefficient to about 0.70.
Lasso's L1 penalty can shrink coefficients all the way to zero, so of two perfectly collinear features (here X1 and X2) it tends to keep only one, whereas Ridge's L2 penalty only makes the coefficients smaller. Thus, Lasso can be used for FEATURE SELECTION.


#Let's try f_regression on this dataset:

In [None]:
from sklearn.feature_selection import f_regression as fr

# fr returns a pair: the F-scores first, the p-values second (one value per feature)
result = fr(X, Y)
F_Score = result[0]
P_Value = result[1]

columns = list(X.columns)

# Print "<feature> : <p-value> , <F-score>" for every feature; zip walks the
# three sequences together, so no hand-maintained index counter is needed.
for c, p_value, f_score in zip(columns, P_Value, F_Score):
    print(c, ':', p_value, ',', f_score)
# Based on the P-Values printed below, keep the features with P-Value<0.05 and drop the others

X1 : 0.45335509278154507 , 0.5848295391821596
X2 : 0.45335509278154507 , 0.5848295391821604
X3 : 0.015253075433945438 , 7.040528874358367
X4 : 0.1199381720590338 , 2.638803387482131
X5 : 0.1248163950902143 , 2.566640287112773
X6 : 0.986294160746121 , 0.0003025728718993146
X7 : 0.7399719519755656 , 0.1132567284710993
X8 : 0.5941667391090402 , 0.29318667453918357
X9 : 0.5364540698762195 , 0.3956681629756933
X10 : 0.589469246321285 , 0.30077156415603423
X11 : 0.8095189111122011 , 0.05966223159422576
X12 : 0.5650724891029977 , 0.3422561198512775
X13 : 0.1353573995552795 , 2.4215777091370385
X14 : 0.17318370042941048 , 1.9950738426800527
X15 : 0.2535993773935698 , 1.3817997310099814


#Also let's try GenericUnivariateSelect on this dataset:

In [None]:
from sklearn.feature_selection import  GenericUnivariateSelect, f_regression  

# mode='k_best' with param=5: the selector uses f_regression to pick the top 5 features.
# Other modes exist (e.g. mode='percentile') and change how `param` is interpreted.
Selecor = GenericUnivariateSelect(score_func=f_regression, mode='k_best', param=5)

x_G1 = Selecor.fit_transform(X, Y)

# Index numbers of the selected features. They get their own name so that the
# `columns` list of feature names is not overwritten with integers.
selected_idx = Selecor.get_support(indices=True)
Selected_columns = X.columns[selected_idx].tolist()
Selected_columns

['X3', 'X4', 'X5', 'X13', 'X14']