### Method We should be followed:
1. Dropping Constant Features
2. Pearson Correlation Coefficient
3. Information Gain 
4. Decision Tree
5. Random Forest
6. Gradient Boosting
7. Chi Square Method
8. Wrapper Method - Forward and Backward Method

## Dropping Constant Feature

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score, silhouette_score
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans, AgglomerativeClustering, DBSCAN
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC, SVR
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, BaggingClassifier
from xgboost import XGBClassifier, XGBRegressor

In [2]:
######## Import the dataset ########
df = pd.read_csv('santa.csv')
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.03,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0


In [3]:
###### Scaling the dataset #######
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df), columns = df.columns)
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,0.0,0.999764,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001545,0.0
1,1.3e-05,0.999764,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002003,0.0
2,2e-05,0.999764,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002822,0.0
3,4.6e-05,0.999764,0.32,0.0,0.01513,0.009275,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002671,0.0
4,5.9e-05,0.999764,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005091,0.0


In [4]:
##### print the shape ######
df.shape
##### There are lots of features in this dataset ######

(76020, 371)

In [5]:
###### X, y ######
X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

###### Shape ######
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((53214, 370), (22806, 370), (53214,), (22806,))

In [6]:
##### Dropping Constant Features #######
from sklearn.feature_selection import VarianceThreshold
variance_threshold_ = VarianceThreshold(threshold = 0.0)
##### Take only X_train dataset ######
variance_threshold_.fit_transform(X_train)

array([[0.0025883 , 0.99976406, 0.66      , ..., 0.        , 0.        ,
        0.00726423],
       [0.82137424, 0.99976406, 0.18      , ..., 0.        , 0.        ,
        0.00480158],
       [0.42777452, 0.99976406, 0.33      , ..., 0.        , 0.        ,
        0.00382138],
       ...,
       [0.7212603 , 0.99976406, 0.22      , ..., 0.        , 0.        ,
        0.0026587 ],
       [0.01120939, 0.99976406, 0.12      , ..., 0.        , 0.        ,
        0.00437269],
       [0.20883579, 0.99976406, 0.19      , ..., 0.        , 0.        ,
        0.00509076]])

In [7]:
len(df.columns[:-1][variance_threshold_.get_support()])

324

In [8]:
len(variance_threshold_.get_support())

370

#### Disadvantage:
1. There is no relationship of features and target columns.

## Pearson Correlation

In [9]:
######## Import the dataset ########
df = pd.read_csv('santa.csv')
df.head()

###### Scaling the dataset #######
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df), columns = df.columns)
df.head()

##### print the shape ######
df.shape
##### There are lots of features in this dataset ######

###### X, y ######
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

###### Shape ######
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((53214, 370), (22806, 370), (53214,), (22806,))

In [10]:
##### import seaborn #####

# import seaborn as sns

# plt.figure(figsize = (12, 7))
# corr_ = df.iloc[:, :-1].corr()
# sns.heatmap(corr_, annot = True)

In [11]:
def correlation(dataset, threshold_):
    col_corr_ = set()
    corr_matrix_ = dataset.corr()
    ###### Run a Loop ######
    for i in range(len(corr_matrix_.columns)):
        for j in range(i):
            if abs(corr_matrix_.iloc[i, j] > threshold_):
                colname_ = corr_matrix_.columns[i]
                col_corr_.add(colname_)
    return col_corr_

In [12]:
pearson_ = correlation(X_train, 0.90)

In [13]:
count_ = 0
for column_ in pearson_:
    print(column_, end = " | ")
    count_ = count_ + 1
print("\n\n")
print("Total columns are = ", count_)

ind_var13_largo | saldo_medio_var12_ult3 | num_var25 | num_meses_var5_ult3 | saldo_var6 | num_op_var39_comer_ult3 | saldo_var13_medio | num_var13_corto | ind_var34 | num_var29_0 | num_var40 | num_op_var39_ult1 | num_var7_recib_ult1 | ind_var13_medio | ind_var24 | ind_var25 | num_var24_0 | num_trasp_var17_in_hace3 | num_op_var41_comer_ult3 | ind_var13_corto | num_op_var39_efect_ult1 | num_var32_0 | num_op_var40_ult3 | saldo_var34 | imp_trasp_var33_out_ult1 | num_var41_0 | num_op_var39_hace3 | num_var1_0 | num_op_var41_comer_ult1 | ind_var13 | saldo_medio_var13_medio_ult1 | num_var45_hace2 | imp_op_var41_comer_ult1 | ind_var40_0 | num_med_var22_ult3 | delta_imp_amort_var34_1y3 | saldo_medio_var13_corto_ult3 | saldo_medio_var17_ult3 | ind_var26_cte | delta_num_trasp_var17_in_1y3 | ind_var9_cte_ult1 | num_op_var41_efect_ult3 | num_var26_0 | saldo_var29 | imp_amort_var34_ult1 | num_op_var41_ult3 | num_op_var39_hace2 | num_trasp_var17_out_ult1 | saldo_medio_var13_largo_ult1 | delta_num_venta

In [14]:
df.drop(pearson_, axis = 1).head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,...,saldo_medio_var29_hace2,saldo_medio_var29_hace3,saldo_medio_var29_ult1,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,var38,TARGET
0,0.0,0.999764,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.001545,0.0
1,1.3e-05,0.999764,0.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002003,0.0
2,2e-05,0.999764,0.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002822,0.0
3,4.6e-05,0.999764,0.32,0.0,0.01513,0.009275,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.002671,0.0
4,5.9e-05,0.999764,0.34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.005091,0.0


#### Disadvantage:
1. There is no relationship between features and target

## Mutual Information Gain

In [15]:
######## Import the dataset ########
df = pd.read_csv('santa.csv')
df.head()

###### Scaling the dataset #######
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df), columns = df.columns)
df.head()

##### print the shape ######
df.shape
##### There are lots of features in this dataset ######

###### X, y ######
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

###### Shape ######
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((53214, 370), (22806, 370), (53214,), (22806,))

In [16]:
##### import mutual information gain ######
from sklearn.feature_selection import mutual_info_classif

##### it will take X_train and y_train
mutual_info_classifier_ = mutual_info_classif(X_train, y_train)

In [17]:
check_ = pd.DataFrame(pd.Series(mutual_info_classifier_, index = df.columns[:-1]).sort_values(ascending = False))
for column_, value_ in zip(check_.head(50).index, check_.head(50).values):
    print(column_, value_, end = " , ")

ind_var30 [0.01613683] , saldo_var30 [0.01559151] , num_meses_var5_ult3 [0.01476765] , ind_var5 [0.01362487] , saldo_var42 [0.01329595] , num_var4 [0.01182153] , saldo_var5 [0.01178978] , saldo_medio_var5_hace2 [0.01166634] , ind_var30_0 [0.01163128] , var15 [0.01145863] , num_var30 [0.01131306] , saldo_medio_var5_ult3 [0.01128319] , saldo_medio_var5_ult1 [0.01123409] , num_var35 [0.01116127] , saldo_medio_var5_hace3 [0.01112151] , ind_var5_0 [0.01087783] , num_var42 [0.01038293] , ind_var39_0 [0.01031685] , ind_var41_0 [0.00940457] , num_var5 [0.00874369] , var36 [0.00731394] , var3 [0.00669615] , num_meses_var39_vig_ult3 [0.00612832] , saldo_var13_corto [0.00359215] , ind_var13_corto_0 [0.00305478] , num_var5_0 [0.00250102] , num_var45_ult1 [0.00240052] , num_aport_var13_hace3 [0.00226543] , num_var39_0 [0.00212014] , saldo_medio_var13_corto_ult3 [0.00207094] , num_var42_0 [0.00206134] , saldo_medio_var12_ult3 [0.00197979] , saldo_medio_var13_corto_ult1 [0.00197587] , num_var31 [0.00

## Feature Importance - Using Decision Tree, Random Forest

In [18]:
######## Import the dataset ########
df = pd.read_csv('santa.csv')
df.head()

###### Scaling the dataset #######
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df), columns = df.columns)
df.head()

##### print the shape ######
df.shape
##### There are lots of features in this dataset ######

###### X, y ######
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

###### Shape ######
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((53214, 370), (22806, 370), (53214,), (22806,))

In [19]:
##### Call Decision Tree ######
decisionTree_ = DecisionTreeClassifier()
decisionTree_.fit(X_train, y_train)
predicted_ = decisionTree_.predict(X_test)
accuracy_score(predicted_, y_test)

0.9257651495220556

In [20]:
##### Get the feature importance ######
check_ = pd.Series(decisionTree_.feature_importances_, index = df.columns[:-1]).sort_values(ascending = False)
columns_ = check_[check_ > 0].index

##### import dataset again #####
df = pd.read_csv('santa.csv', usecols = columns_)
df.head()

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_efect_ult1,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,...,saldo_medio_var5_ult3,saldo_medio_var8_hace2,saldo_medio_var8_hace3,saldo_medio_var8_ult1,saldo_medio_var8_ult3,saldo_medio_var12_hace3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_ult3,var38
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.17
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,240.75,49278.03
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.07,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.77
3,8,2,37,0.0,195.0,195.0,0.0,195.0,195.0,0.0,...,138.84,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.97
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,13501.47,0.0,0.0,0.0,0.0,0.0,85501.89,85501.89,0.0,117310.979016


In [21]:
###### Scaling the dataset #######
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df), columns = df.columns)
df.head()

##### print the shape ######
df.shape
##### There are lots of features in this dataset ######

###### X, y ######
X = df.iloc[:, :-1].values
df1 = pd.read_csv('santa.csv')
y = df1.iloc[:, -1]

###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

###### Shape ######
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((53214, 116), (22806, 116), (53214,), (22806,))

In [22]:
##### Call Decision Tree ######
decisionTree_ = DecisionTreeClassifier()
decisionTree_.fit(X_train, y_train)
predicted_ = decisionTree_.predict(X_test)
accuracy_score(predicted_, y_test)

0.9272998333771815

In [23]:
##### Random Forest #####
decisionTree_ = RandomForestClassifier()
decisionTree_.fit(X_train, y_train)
predicted_ = decisionTree_.predict(X_test)
accuracy_score(predicted_, y_test)

0.9503200912040691

In [24]:
##### XGboost ######
decisionTree_ = XGBClassifier()
decisionTree_.fit(X_train, y_train)
predicted_ = decisionTree_.predict(X_test)
accuracy_score(predicted_, y_test)

0.9599666754362887

## Chi Square Test 

In [33]:
######## Import the dataset ########
df = pd.read_csv('santa.csv')
df.head()

###### Scaling the dataset #######
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df), columns = df.columns)
df.head()

##### print the shape ######
df.shape
##### There are lots of features in this dataset ######

###### X, y ######
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

###### train and test split #######
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)

###### Shape ######
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((53214, 370), (22806, 370), (53214,), (22806,))

In [34]:
df.isnull().sum().sum()

0

In [35]:
##### Import the Chi Square Method ######
from sklearn.feature_selection import chi2
##### call the function ######
chi_square_ = chi2(X_train, y_train)

In [39]:
pd.Series(chi_square_[1], index = df.columns[:-1]).sort_values(ascending = True)

num_meses_var5_ult3              5.113657e-78
var36                            1.692993e-76
ind_var5                         7.227574e-76
ind_var30                        2.526500e-75
ind_var8_0                       3.395205e-23
                                     ...     
num_trasp_var17_out_hace3                 NaN
num_trasp_var33_out_hace3                 NaN
saldo_var2_ult1                           NaN
saldo_medio_var13_medio_hace3             NaN
saldo_medio_var29_hace3                   NaN
Length: 370, dtype: float64

## Wrapper Method - Forward Selection

In [51]:
##### import the dataset #####
df = pd.read_csv('Iris.csv')
df1 = pd.read_csv('Iris.csv')
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [52]:
df.iloc[:, -1].value_counts()

Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: Species, dtype: int64

In [53]:
##### Label Encoding in Target class #####
df.loc[:, 'Species'] = df.loc[:, 'Species'].map({'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2})
##### Drop the ID Column #####
df.drop(['Id'], axis = 1, inplace = True)
##### print the dataset #####
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [54]:
##### sacling the dataset #####
min_max_scaler_ = MinMaxScaler()
df = pd.DataFrame(min_max_scaler_.fit_transform(df.iloc[:, :-1]), columns = df.columns[:-1])
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


In [55]:
#### Warning Remove
import sys

if not sys.warnoptions:
    import warnings
    warnings.simplefilter("ignore")

In [56]:
##### X and y
X = df.loc[:].values
y = df1.iloc[:, -1].values

##### train and test 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 42)
##### Shape of this train and test
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((105, 4), (45, 4), (105,), (45,))

In [57]:
##### import the file ######
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
##### call this with this Parameters 
SFS_ = SFS(RandomForestClassifier(),
          k_features = 'best',
          forward = True,
          floating = False,
          scoring = 'accuracy')
##### Done ######

In [58]:
SFS_.fit(X_train, y_train)

In [59]:
SFS_.k_feature_names_, SFS_.k_feature_idx_

(('2', '3'), (2, 3))

In [60]:
df = pd.DataFrame(SFS_.get_metric_dict()).T
df.loc[:, ['feature_idx', 'avg_score']]

Unnamed: 0,feature_idx,avg_score
1,"(2,)",0.933333
2,"(2, 3)",0.961905
3,"(0, 2, 3)",0.942857
4,"(0, 1, 2, 3)",0.942857


#### Advantage:
1. It performs well than all other techniques

#### Disadvantage:
1. Takes huge time 
2. Computational Expensive

## Wrapper Method - Backward Selection

In [61]:
SFS_ = SFS(RandomForestClassifier(),
          k_features = 'best',
          forward = False,
          floating = False,
          verbose = 0,
          scoring = 'accuracy')

SFS_.fit(X_train, y_train)

In [62]:
SFS_.k_feature_names_

('2', '3')

In [65]:
pd.DataFrame(SFS_.get_metric_dict()).T[['feature_idx', 'avg_score']]

Unnamed: 0,feature_idx,avg_score
4,"(0, 1, 2, 3)",0.942857
3,"(0, 2, 3)",0.942857
2,"(2, 3)",0.952381
1,"(2,)",0.933333


#### Disadvantage:
1. Takes huge time 
2. Required High Computational Power

#### Advantage:
1. Well performance than all techniques