In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt 
import re
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")

In [3]:
def Constant_Features(x_train, x_test,threshold=0):
  """
  Removing Constant Features using Variance Threshold
  Input: threshold parameter to identify the variable as constant
         train data (pd.Dataframe) 
         test data (pd.Dataframe)
  Output: train data, test data after applying filter methods
  """
  # import and create the VarianceThreshold object.
  from sklearn.feature_selection import VarianceThreshold
  vs_constant = VarianceThreshold(threshold)

  # select the numerical columns only.
  numerical_x_train = x_train[x_train.select_dtypes([np.number]).columns]

  # fit the object to our data.
  vs_constant.fit(numerical_x_train)

  # get the constant colum names.
  constant_columns = [column for column in numerical_x_train.columns
                      if column not in numerical_x_train.columns[vs_constant.get_support()]]

  # detect constant categorical variables.
  constant_cat_columns = [column for column in x_train.columns 
                          if (x_train[column].dtype == "O" and len(x_train[column].unique())  == 1 )]

  # concatenating the two lists.
  all_constant_columns = constant_cat_columns + constant_columns
  
  return all_constant_columns


In [4]:
def Quansi_Constant_Feature(x_train, x_test,threshold=0.98):
  # create empty list
  quasi_constant_feature = []

  # loop over all the columns
  for feature in x_train.columns:
    # calculate the ratio.
    predominant = (x_train[feature].value_counts() / np.float(len(x_train))).sort_values(ascending=False).values[0]
    
    # append the column name if it is bigger than the threshold
    if predominant >= threshold:
        quasi_constant_feature.append(feature)   
  return quasi_constant_feature

In [5]:
def Dupplicate_Feature(x_train,x_test):
  # transpose the feature matrice
  train_features_T = x_train.T

  # print the number of duplicated features
  print(train_features_T.duplicated().sum())

  # select the duplicated features columns names
  duplicated_columns = train_features_T[train_features_T.duplicated()].index.values

  return duplicated_columns

In [6]:
def Correlated_Feature(x_train,x_test,threshold=0.8):
  # creating set to hold the correlated features
  corr_features = set()

  # create the correlation matrix (default to pearson)
  corr_matrix = x_train.corr()
  '''
  # optional: display a heatmap of the correlation matrix
  plt.figure(figsize=(11,11))
  sns.heatmap(corr_matrix)
  #'''

  for i in range(len(corr_matrix .columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            colname = corr_matrix.columns[i]
            corr_features.add(colname)
  return corr_features

In [7]:
def Mutual_Information(x_train, x_test, select_k = 10):
  # import the required functions and object.
  from sklearn.feature_selection import mutual_info_classif
  from sklearn.feature_selection import SelectKBest

  # get only the numerical features.
  numerical_x_train = x_train[x_train.select_dtypes([np.number]).columns]


  # create the SelectKBest with the mutual info strategy.
  selection = SelectKBest(mutual_info_classif, k=select_k)
  selection.fit(numerical_x_train, y_train)

  # display the retained features.
  features = x_train.columns[selection.get_support()]
  return features

In [8]:
def Select_Model(x_train,y_train,x_test,y_test):
  from sklearn.feature_selection import SelectFromModel
  from sklearn.ensemble import RandomForestClassifier
  # define model
  rfc = RandomForestClassifier(n_estimators=100)
  # feature extraction
  select_model = SelectFromModel(rfc)
  # fit on train set
  fit = select_model.fit(x_train, y_train)
  # transform train set
  x_train = fit.transform(x_train)
  x_test = fit.transform(x_test)
  return x_train, x_test

In [9]:
def PCA_Feature(x_train,x_test):
  import numpy as np
  from sklearn.decomposition import PCA
  pca = PCA(n_components=2)
  pca.fit(x_train)
  x_train = pca.transform(x_train)

In [10]:
def RFE_Feature(x_train,y_train,x_test,y_test):
  from sklearn.feature_selection import RFE
  from sklearn.ensemble import RandomForestClassifier
  # define model
  rfc = RandomForestClassifier(n_estimators=100)
  rfe = RFE(estimator=rfc, n_features_to_select=3)
  # fit the model
  rfe.fit(x_train, y_train)
  #transform the data
  x_train, y_train = rfe.transform(x_train, y_train)
  x_test, y_test = rfe.transform(x_test, y_test)
  return x_train,y_train,x_test,y_test

In [11]:
def Drop_Columns(x_train,x_test,columns):
  x_train = x_train.drop(labels=columns, axis=1, inplace=True)
  x_test = x_test.drop(labels=columns, axis=1, inplace=True)
  return x_train, x_test

In [12]:
def RFCaccuracy(X_train,X_test,Y_train,Y_test):
  import time
  from sklearn.ensemble import RandomForestClassifier
  from sklearn.metrics import accuracy_score
  start = time.time()
  clf = RandomForestClassifier()
  clf.fit(X_train,Y_train)
  Y_predict = clf.predict(X_test)
  score = accuracy_score(Y_test,Y_predict)
  end = time.time()
  print("Accuracy score for Random Forest Classifier: ", score)
  print("Time estimated: ",end-start)
  return score

In [13]:

path='/content/drive/My Drive/Datasets/Py4DS_Lab6/Santander_train.csv'
df = pd.read_csv(path)
df
#'''

Unnamed: 0,ID,var3,var15,imp_ent_var16_ult1,imp_op_var39_comer_ult1,imp_op_var39_comer_ult3,imp_op_var40_comer_ult1,imp_op_var40_comer_ult3,imp_op_var40_efect_ult1,imp_op_var40_efect_ult3,imp_op_var40_ult1,imp_op_var41_comer_ult1,imp_op_var41_comer_ult3,imp_op_var41_efect_ult1,imp_op_var41_efect_ult3,imp_op_var41_ult1,imp_op_var39_efect_ult1,imp_op_var39_efect_ult3,imp_op_var39_ult1,imp_sal_var16_ult1,ind_var1_0,ind_var1,ind_var2_0,ind_var2,ind_var5_0,ind_var5,ind_var6_0,ind_var6,ind_var8_0,ind_var8,ind_var12_0,ind_var12,ind_var13_0,ind_var13_corto_0,ind_var13_corto,ind_var13_largo_0,ind_var13_largo,ind_var13_medio_0,ind_var13_medio,ind_var13,...,saldo_medio_var5_ult1,saldo_medio_var5_ult3,saldo_medio_var8_hace2,saldo_medio_var8_hace3,saldo_medio_var8_ult1,saldo_medio_var8_ult3,saldo_medio_var12_hace2,saldo_medio_var12_hace3,saldo_medio_var12_ult1,saldo_medio_var12_ult3,saldo_medio_var13_corto_hace2,saldo_medio_var13_corto_hace3,saldo_medio_var13_corto_ult1,saldo_medio_var13_corto_ult3,saldo_medio_var13_largo_hace2,saldo_medio_var13_largo_hace3,saldo_medio_var13_largo_ult1,saldo_medio_var13_largo_ult3,saldo_medio_var13_medio_hace2,saldo_medio_var13_medio_hace3,saldo_medio_var13_medio_ult1,saldo_medio_var13_medio_ult3,saldo_medio_var17_hace2,saldo_medio_var17_hace3,saldo_medio_var17_ult1,saldo_medio_var17_ult3,saldo_medio_var29_hace2,saldo_medio_var29_hace3,saldo_medio_var29_ult1,saldo_medio_var29_ult3,saldo_medio_var33_hace2,saldo_medio_var33_hace3,saldo_medio_var33_ult1,saldo_medio_var33_ult3,saldo_medio_var44_hace2,saldo_medio_var44_hace3,saldo_medio_var44_ult1,saldo_medio_var44_ult3,var38,TARGET
0,1,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.00,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,39205.170000,0
1,3,2,34,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,1,1,1,0,0,0,0,1,...,0.00,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,300.0,122.22,300.0,240.75,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,49278.030000,0
2,4,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3.00,2.07,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,67333.770000,0
3,8,2,37,0.0,195.0,195.0,0.0,0.0,0.0,0.0,0.0,195.0,195.0,0.0,0.0,195.0,0.0,0.0,195.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,91.56,138.84,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,64007.970000,0
4,10,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,...,40501.08,13501.47,0.0,0.0,0.0,0.0,0.00,0.0,85501.89,85501.89,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,117310.979016,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
76015,151829,2,48,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0.00,0.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,60926.490000,0
76016,151830,2,39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,...,0.00,0.00,0.0,0.0,0.0,0.0,15498.42,0.0,48175.62,31837.02,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,118634.520000,0
76017,151835,2,23,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3.00,3.00,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,74028.150000,0
76018,151836,2,25,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,3.00,2.58,0.0,0.0,0.0,0.0,0.00,0.0,0.00,0.00,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,84278.160000,0


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 76020 entries, 0 to 76019
Columns: 371 entries, ID to TARGET
dtypes: float64(111), int64(260)
memory usage: 215.2 MB


In [15]:
df.select_dtypes(include=['object']).columns

Index([], dtype='object')

In [16]:
list_nan = []
for col in df.columns:
  if (df[col].isnull().sum()>0):
    list_nan.append(col)
list_nan

[]

In [17]:

X = df.drop(['TARGET'],axis = 1)
y = df['TARGET']
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
#'''
'''
path='/content/drive/My Drive/Datasets/Py4DS_Lab6/Santander_train.csv'
df_train = pd.read_csv(path)
x_train = df_train.drop(['TARGET'],axis = 1)
y_train = df_train['TARGET']
path='/content/drive/My Drive/Datasets/Py4DS_Lab6/Santander_test.csv'
df_test = pd.read_csv(path)
x_train = df_test
#'''

"\npath='/content/drive/My Drive/Datasets/Py4DS_Lab6/Santander_train.csv'\ndf_train = pd.read_csv(path)\nx_train = df_train.drop(['TARGET'],axis = 1)\ny_train = df_train['TARGET']\npath='/content/drive/My Drive/Datasets/Py4DS_Lab6/Santander_test.csv'\ndf_test = pd.read_csv(path)\nx_train = df_test\n#"

In [18]:
cf = Constant_Features(x_train, x_test, threshold=0)
Drop_Columns(x_train,x_test,cf)
print(cf)
#'''

['ind_var2_0', 'ind_var2', 'ind_var13_medio_0', 'ind_var13_medio', 'ind_var18_0', 'ind_var18', 'ind_var27_0', 'ind_var28_0', 'ind_var28', 'ind_var27', 'ind_var41', 'ind_var46_0', 'ind_var46', 'num_var13_medio_0', 'num_var13_medio', 'num_var18_0', 'num_var18', 'num_var27_0', 'num_var28_0', 'num_var28', 'num_var27', 'num_var41', 'num_var46_0', 'num_var46', 'saldo_var13_medio', 'saldo_var18', 'saldo_var28', 'saldo_var27', 'saldo_var41', 'saldo_var46', 'delta_imp_amort_var18_1y3', 'delta_imp_reemb_var33_1y3', 'delta_num_reemb_var33_1y3', 'imp_amort_var18_hace3', 'imp_amort_var18_ult1', 'imp_amort_var34_hace3', 'imp_reemb_var13_hace3', 'imp_reemb_var33_hace3', 'imp_reemb_var33_ult1', 'imp_trasp_var17_out_hace3', 'imp_trasp_var33_out_hace3', 'num_var2_0_ult1', 'num_var2_ult1', 'num_meses_var13_medio_ult3', 'num_reemb_var13_hace3', 'num_reemb_var33_hace3', 'num_reemb_var33_ult1', 'num_trasp_var17_out_hace3', 'num_trasp_var33_out_hace3', 'saldo_var2_ult1', 'saldo_medio_var13_medio_hace2', 'sal

In [19]:

qcf = Quansi_Constant_Feature(x_train,y_train)
print(qcf)
Drop_Columns(x_train,x_test,qcf)
#'''

['imp_op_var40_comer_ult1', 'imp_op_var40_comer_ult3', 'imp_op_var40_efect_ult1', 'imp_op_var40_efect_ult3', 'imp_op_var40_ult1', 'imp_sal_var16_ult1', 'ind_var1_0', 'ind_var1', 'ind_var6_0', 'ind_var6', 'ind_var13_largo_0', 'ind_var13_largo', 'ind_var14', 'ind_var17_0', 'ind_var17', 'ind_var19', 'ind_var20_0', 'ind_var20', 'ind_var29_0', 'ind_var29', 'ind_var30_0', 'ind_var31_0', 'ind_var31', 'ind_var32_cte', 'ind_var32_0', 'ind_var32', 'ind_var33_0', 'ind_var33', 'ind_var34_0', 'ind_var34', 'ind_var40_0', 'ind_var40', 'ind_var39', 'ind_var44_0', 'ind_var44', 'num_var1_0', 'num_var1', 'num_var6_0', 'num_var6', 'num_var13_largo_0', 'num_var13_largo', 'num_var14', 'num_var17_0', 'num_var17', 'num_var20_0', 'num_var20', 'num_op_var40_hace2', 'num_op_var40_hace3', 'num_op_var40_ult1', 'num_op_var40_ult3', 'num_op_var41_hace3', 'num_op_var39_hace3', 'num_var29_0', 'num_var29', 'num_var31_0', 'num_var31', 'num_var32_0', 'num_var32', 'num_var33_0', 'num_var33', 'num_var34_0', 'num_var34', 'n

(None, None)

In [20]:

d_f = Dupplicate_Feature(x_train, x_test)
print(d_f)
Drop_Columns(x_train,x_test,d_f)
#'''

6
['ind_var26' 'ind_var25' 'ind_var37' 'num_var26' 'num_var25' 'num_var37']


(None, None)

In [21]:

c_f = Correlated_Feature(x_train,x_test,threshold=0.8)
print(c_f)
Drop_Columns(x_train,x_test,c_f)
#'''


{'saldo_medio_var8_ult3', 'num_var45_ult3', 'saldo_medio_var12_ult3', 'ind_var8', 'ind_var24_0', 'num_var12_0', 'num_var35', 'num_op_var41_efect_ult3', 'imp_op_var39_comer_ult3', 'num_var26_0', 'num_var5_0', 'num_op_var39_hace2', 'imp_trans_var37_ult1', 'ind_var10_ult1', 'num_meses_var12_ult3', 'ind_var37_0', 'num_op_var41_ult3', 'ind_var24', 'ind_var26_0', 'num_meses_var8_ult3', 'saldo_medio_var12_hace2', 'ind_var9_ult1', 'num_var25_0', 'num_var39_0', 'saldo_var13', 'num_op_var39_efect_ult3', 'num_meses_var13_corto_ult3', 'imp_op_var39_efect_ult1', 'imp_op_var41_ult1', 'ind_var25_0', 'saldo_medio_var12_ult1', 'ind_var30', 'num_var13_0', 'num_var37_0', 'saldo_medio_var13_corto_hace2', 'imp_op_var39_efect_ult3', 'num_op_var39_comer_ult3', 'saldo_var30', 'ind_var13_corto', 'delta_num_aport_var13_1y3', 'num_op_var41_comer_ult3', 'imp_op_var41_comer_ult1', 'saldo_medio_var8_ult1', 'ind_var41_0', 'saldo_medio_var13_corto_ult1', 'num_var13_corto_0', 'saldo_var25', 'ind_var10cte_ult1', 'num_m

(None, None)

In [22]:

x_train_mi = x_train.copy()
y_train_mi = y_train.copy()
x_test_mi = x_test.copy()
y_test_mi = y_test.copy()
m_i = Mutual_Information(x_train_mi,x_test_mi, select_k = 10)
print(m_i)
x_train_mi = x_train_mi[m_i]
x_test_mi = x_test_mi[m_i]
#'''

Index(['var15', 'ind_var5_0', 'ind_var5', 'ind_var39_0', 'num_var4',
       'saldo_var5', 'var36', 'saldo_medio_var5_hace2',
       'saldo_medio_var5_hace3', 'saldo_medio_var5_ult3'],
      dtype='object')


In [23]:

x_train_sm = x_train.copy()
y_train_sm = y_train.copy()
x_test_sm = x_test.copy()
y_test_sm = y_test.copy()
Select_Model(x_train_sm,y_train_sm,x_test_sm,y_test_sm)
#'''

(array([[5.61830000e+04, 2.40000000e+01, 9.00000000e+01, ...,
         7.75800000e+01, 8.58600000e+01, 1.17310979e+05],
        [4.33540000e+04, 2.60000000e+01, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 4.07115540e+05],
        [3.14000000e+04, 4.30000000e+01, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 8.55647100e+04],
        ...,
        [1.09515000e+05, 2.70000000e+01, 3.00000000e+00, ...,
         8.70000000e-01, 2.28000000e+00, 6.37337700e+04],
        [1.70300000e+03, 1.70000000e+01, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 1.01492340e+05],
        [3.17100000e+04, 2.40000000e+01, 3.00000000e+00, ...,
         2.70000000e-01, 2.10000000e+00, 1.17310979e+05]]),
 array([[1.0101900e+05, 4.5000000e+01, 7.8147000e+02, ..., 4.2030000e+01,
         3.8763000e+02, 7.7930250e+04],
        [1.0518000e+04, 6.0000000e+01, 3.0000000e+00, ..., 5.4195090e+04,
         1.8067020e+04, 1.6512504e+05],
        [4.3966000e+04, 2.7000000e+01

In [24]:
x_train_pca = x_train.copy()
y_train_pca = y_train.copy()
x_test_pca = x_test.copy()
y_test_pca = y_test.copy()
PCA_Feature(x_train_pca,y_train_pca)

In [25]:
'''
x_train_rfe = x_train.copy()
y_train_rfe = y_train.copy()
x_test_rfe = x_test.copy()
y_test_rfe = y_test.copy()
RFE_Feature(x_train_rfe,y_train_rfe,x_test_rfe,y_test_rfe)
#'''

'\nx_train_rfe = x_train.copy()\ny_train_rfe = y_train.copy()\nx_test_rfe = x_test.copy()\ny_test_rfe = y_test.copy()\nRFE_Feature(x_train_rfe,y_train_rfe,x_test_rfe,y_test_rfe)\n#'

In [26]:
print("Select Model")
RFCaccuracy(x_train_sm,x_test_sm,y_train_sm,y_test_sm)
print("*"*30)
print("Mutual Infomation")
RFCaccuracy(x_train_mi,x_test_mi,y_train_mi,y_test_mi)
print("*"*30)
print("PCA")
RFCaccuracy(x_train_pca,x_test_pca,y_train_pca,y_test_pca)
print("*"*30)

Select Model
Accuracy score for Random Forest Classifier:  0.9577869015824929
Time estimated:  9.66458773612976
******************************
Mutual Infomation
Accuracy score for Random Forest Classifier:  0.9590624626300475
Time estimated:  5.2014000415802
******************************
PCA
Accuracy score for Random Forest Classifier:  0.9577869015824929
Time estimated:  9.609281539916992
******************************


In [27]:
y.value_counts()

0    73012
1     3008
Name: TARGET, dtype: int64