In [2]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.preprocessing import power_transform
from sklearn.feature_extraction.text import CountVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.feature_selection import f_regression


In [3]:
data = pd.read_csv('/Users/abhil/Projects/bank_project/data_after_eda.csv')

In [4]:
print(len(data[data.duplicated(keep="last")]))
data = data.drop_duplicates() # Removed duplicate rows of data

163


In [5]:
data_x = data.loc[:, data.columns.drop('deposit')]
print("Shape of X:", data_x.shape)
data_y = data["deposit"]
print("Shape of Y:", data_y.shape)

Shape of X: (41025, 18)
Shape of Y: (41025,)


In [6]:
from sklearn.model_selection import train_test_split

X_rest, X_test, y_rest, y_test = train_test_split(data_x, data_y, test_size=0.2)
X_train, X_cv, y_train, y_cv = train_test_split(X_rest, y_rest, test_size=0.2)

print("X Train:", X_train.shape)
print("X CV:", X_cv.shape)
print("X Test:", X_test.shape)
print("Y Train:", y_train.shape)
print("Y CV:", y_cv.shape)
print("Y Test:", y_test.shape)

X Train: (26256, 18)
X CV: (6564, 18)
X Test: (8205, 18)
Y Train: (26256,)
Y CV: (6564,)
Y Test: (8205,)


In [7]:
def add_onehot_to_dataframe(sparse, df, vectorizer, name):
  '''
      This function will add the one hot encoded to the dataframe.

  '''
  for i, col in enumerate(vectorizer.get_feature_names()):
    colname = name+"_"+col
    df[colname] = sparse[:, i].toarray().ravel().tolist()
  
  return df

def OneHotEncoder(categorical_cols, X_train, X_test, X_cv=None, include_cv=False):
  '''
    This function takes categorical column names as inputs. The objective
    of this function is to take the column names iteratively and encode the 
    features using One hot Encoding mechanism and also adding the encoded feature
    to the respective dataframe.

    The include_cv parameter indicates whether we should include CV dataset or not.
    This is added specifically because when using GridSearchCV or RandomizedSearchCV,
    we only split the dataset into train and test to give more data to training purposes.
    This is done because GridSearchCV splits the data internally anyway.
  '''

  for i in categorical_cols:
    Vectorizer = CountVectorizer(token_pattern="[A-Za-z0-9-.]+")
    print("Encoding for feature: ", i)
    # Encoding training dataset 
    temp_cols = Vectorizer.fit_transform(X_train[i])
    X_train = add_onehot_to_dataframe(temp_cols, X_train, Vectorizer, i)

    # Encoding Cross validation dataset
    if include_cv:
        temp_cols = Vectorizer.transform(X_cv[i])
        X_cv = add_onehot_to_dataframe(temp_cols, X_cv, Vectorizer, i)

    # Encoding Test dataset
    temp_cols = Vectorizer.transform(X_test[i])
    X_test = add_onehot_to_dataframe(temp_cols, X_test, Vectorizer, i)


In [8]:
cats = [] # categorical features
nums = [] # numerical features
for col in X_train.columns:
    if data[col].dtype == 'object':
        cats.append(col)
    else:
        nums.append(col)

In [9]:
OneHotEncoder(cats, X_train, X_test, X_cv, True)

# Drop the categorical features as the one hot encoded representation is present
X_train = X_train.drop(cats, axis=1)
X_cv = X_cv.drop(cats, axis=1)
X_test = X_test.drop(cats, axis=1)

print("Shape of train: ", X_train.shape)
print("Shape of CV: ", X_cv.shape)
print("Shape of test: ", X_test.shape)

Encoding for feature:  job


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[colname] = sparse[:, i].toarray().ravel().tolist()


Encoding for feature:  marital
Encoding for feature:  education
Encoding for feature:  default
Encoding for feature:  housing
Encoding for feature:  loan
Encoding for feature:  contact
Encoding for feature:  month
Encoding for feature:  poutcome
Encoding for feature:  age_cat
Encoding for feature:  job_mapped
Encoding for feature:  pdays_cat
Shape of train:  (26256, 71)
Shape of CV:  (6564, 71)
Shape of test:  (8205, 71)


In [10]:
# transform the dataset
x_train = pd.DataFrame(X_train)
X_resample, y_resampled = SMOTE().fit_resample(X_train, y_train)

In [11]:
X_resample

Unnamed: 0,duration,campaign,previous,cons.price.idx,cons.conf.idx,nr.employed,job_admin.,job_blue-collar,job_entrepreneur,job_housemaid,...,job_mapped_collar,job_mapped_entrepreneur,job_mapped_other,job_mapped_pink,job_mapped_self-employed,job_mapped_technician,job_mapped_white,pdays_cat_month,pdays_cat_never,pdays_cat_week
0,0.828074,0.845048,0,92.893,-46.2,5099.1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
1,-2.513390,1.972611,0,92.893,-46.2,5099.1,0,0,0,0,...,0,0,0,0,0,1,0,0,1,0
2,-0.913819,0.245075,0,92.843,-50.0,5099.1,1,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,-0.926667,0.245075,0,93.918,-42.7,5228.1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
4,-0.863826,0.245075,0,94.465,-41.8,5228.1,0,1,0,0,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
46491,1.021512,-1.045497,2,93.369,-34.8,5008.7,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
46492,0.477140,-1.045497,0,92.431,-26.9,5017.5,0,0,0,0,...,0,0,1,0,0,0,0,0,1,0
46493,-0.169484,-0.183257,2,93.876,-40.0,5008.7,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
46494,0.225598,-0.971082,0,94.601,-49.5,4963.6,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


## Feature selection not implemented

In [12]:

from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestClassifier
# define model
rfc = RandomForestClassifier(n_estimators=30)
# feature extraction
rfe = RFE(rfc, n_features_to_select=30)
# fit on train set
fit = rfe.fit(X_resample, y_resampled)
# transform train set
recursive_features = fit.transform(X_resample)

In [13]:
recursive_features

array([[ 0.8280735 ,  0.84504757, 92.893     , ...,  0.        ,
         0.        ,  1.        ],
       [-2.51338996,  1.97261139, 92.893     , ...,  0.        ,
         0.        ,  1.        ],
       [-0.91381882,  0.24507501, 92.843     , ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.16948396, -0.18325678, 93.876     , ...,  0.        ,
         0.        ,  0.        ],
       [ 0.22559829, -0.97108181, 94.601     , ...,  0.        ,
         0.        ,  1.        ],
       [ 2.09143938,  0.81287899, 93.994     , ...,  1.        ,
         0.        ,  1.        ]])

In [15]:
X_resample = X_resample.to_csv('X_train.csv', index = False)
y_resampled = y_resampled.to_csv('y_train.csv', index = False)
X_cv = X_cv.to_csv('X_cv.csv', index = False)
y_cv = y_cv.to_csv('y_cv.csv', index = False)
X_test = X_test.to_csv('X_test.csv', index = False)
y_test = y_test.to_csv('y_test.csv', index = False)
