In [1]:
# --- standard library ---
import random
import sys  # interpreter details: https://docs.python.org/3/library/sys.html
import time
import warnings

# --- third-party scientific stack ---
import IPython
import matplotlib  # plotting
import numpy as np  # numerical arrays
import pandas as pd  # tabular data handling
import scipy as sp  # scientific computing routines
import sklearn  # machine learning algorithms
from IPython import display  # rich display of objects inside the notebook

# Report the versions this notebook was run against so results can be reproduced.
print("Python version: {}".format(sys.version))
print("pandas version: {}".format(pd.__version__))
print("matplotlib version: {}".format(matplotlib.__version__))
print("NumPy version: {}".format(np.__version__))
print("SciPy version: {}".format(sp.__version__))
print("IPython version: {}".format(IPython.__version__))
print("scikit-learn version: {}".format(sklearn.__version__))

# Silence all warnings for the remainder of the session.
# NOTE(review): this blanket filter also hides pandas warnings raised by later cells.
warnings.filterwarnings('ignore')
print('-' * 25)


Python version: 3.8.8 (default, Apr 13 2021, 15:08:03) [MSC v.1916 64 bit (AMD64)]
pandas version: 1.2.4
matplotlib version: 3.3.4
NumPy version: 1.20.1
SciPy version: 1.6.2
IPython version: 7.22.0
scikit-learn version: 0.24.1
-------------------------


In [2]:
#Common Model Algorithms
from sklearn import svm, tree, linear_model, neighbors, naive_bayes, ensemble, discriminant_analysis, gaussian_process
from xgboost import XGBClassifier

#Common Model Helpers
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn import feature_selection
from sklearn import model_selection
from sklearn import metrics

#Visualization
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

#Configure Visualization Defaults
#The %matplotlib inline magic renders figures directly beneath the cell that produced them
%matplotlib inline
#Order matters: matplotlib's 'ggplot' style sheet is applied first, then seaborn's 'white' style is applied after it
mpl.style.use('ggplot')
sns.set_style('white')
#Default figure size as (width, height) = (12, 8); set last so the style calls above cannot override it
pylab.rcParams['figure.figsize'] = 12,8

In [3]:
from func import encode_features, remove_features

In [5]:
# Load the data: read the train and test CSV files, then preview the training rows
df_train, df_test = (
    pd.read_csv(csv_path)
    for csv_path in ('data/train_data.csv', 'data/test_data.csv')
)
df_train.head()

Unnamed: 0,index,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y
0,0,33,unemployed,single,unknown,0,170,yes,0,unknown,3,6,1,-1,0,unknown,0
1,1,37,services,married,secondary,0,6089,yes,0,cellular,14,5,2,-1,0,unknown,1
2,2,35,blue-collar,married,secondary,0,137,no,1,unknown,4,10,1,808,12,failure,0
3,3,38,admin.,divorced,secondary,0,673,yes,0,cellular,7,5,1,170,3,other,0
4,4,44,services,divorced,secondary,0,2416,yes,0,unknown,14,5,3,-1,0,unknown,0


In [6]:
# Pearson correlation of every numeric column with the column 'y'.
# The numeric columns are selected explicitly: on pandas >= 2.0 DataFrame.corr()
# no longer silently skips string columns (e.g. 'job', 'marital') and raises instead.
df_train.select_dtypes(include='number').corr()['y']

index       0.005211
age         0.022991
default    -0.024070
balance     0.044079
loan       -0.066298
day        -0.032512
month       0.022443
campaign   -0.074181
pdays       0.101997
previous    0.110285
y           1.000000
Name: y, dtype: float64

In [9]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns of df_train
df_train.describe()

Unnamed: 0,index,age,default,balance,loan,day,month,campaign,pdays,previous,y
count,22606.0,22606.0,22606.0,22606.0,22606.0,22606.0,22606.0,22606.0,22606.0,22606.0,22606.0
mean,11302.5,40.952137,0.018048,1349.075334,0.161196,15.840662,6.129125,2.770326,40.191763,0.576617,0.115589
std,6525.934429,10.573667,0.133129,2955.257344,0.36772,8.322493,2.395658,3.138755,99.916916,1.948676,0.319738
min,0.0,18.0,0.0,-8019.0,0.0,1.0,1.0,1.0,-1.0,0.0,0.0
25%,5651.25,33.0,0.0,69.0,0.0,8.0,5.0,1.0,-1.0,0.0,0.0
50%,11302.5,39.0,0.0,447.0,0.0,16.0,6.0,2.0,-1.0,0.0,0.0
75%,16953.75,48.0,0.0,1427.25,0.0,21.0,8.0,3.0,-1.0,0.0,0.0
max,22605.0,95.0,1.0,71188.0,1.0,31.0,12.0,63.0,871.0,58.0,1.0


In [10]:
# Stack the training rows on top of the test rows in one frame with a fresh 0..n-1 index.
data = pd.concat([df_train, df_test], ignore_index=True)

In [12]:
# Discretise the two continuous columns:
#   balance -> 8 quantile-based (equal-frequency) bins
#   age     -> 6 equal-width bins
data = data.assign(
    balance_bins=pd.qcut(data['balance'], q=8),
    age_bins=pd.cut(data['age'], bins=6),
)

# Pass every column except the first one to the project helper encode_features.
# NOTE(review): this selection is positional and covers far more than the two new
# bin columns (numeric columns and 'y' are included as well) - confirm this is intended.
features_to_encode = data.columns[1:]
data = encode_features(data, features_to_encode)


In [13]:
# Split the combined frame back into its train and test parts.
# df_train was placed first in the concat, so the first len(df_train) rows are the training rows.
n_train = df_train.shape[0]

# Use independent frames rather than slices of `data`: dropping columns in place on a
# slice raises pandas' SettingWithCopyWarning and leaves it ambiguous which object is modified.
data_train = data.iloc[:n_train].copy()

# The test part is saved without the 'y' column.
data_test = data.iloc[n_train:].drop(columns=['y'])

In [14]:
# Preview the first rows of the training part after encoding
data_train.head()

Unnamed: 0,index,age,job,marital,education,default,balance,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,y,balance_bins,age_bins
0,0,15,10,2,3,0,1086,1,0,2,2,5,0,0,0,3,0,2,1
1,1,19,7,1,1,0,5610,1,0,0,13,4,1,0,0,3,1,7,1
2,2,17,1,1,1,0,1053,0,1,2,3,9,0,550,12,0,0,2,1
3,3,20,0,0,1,0,1589,1,0,0,6,4,0,167,3,1,0,4,1
4,4,26,7,0,1,0,3290,1,0,2,13,4,2,0,0,3,0,6,2


In [15]:
# Show the column labels of the combined frame in their current order
data.columns

Index(['index', 'age', 'job', 'marital', 'education', 'default', 'balance',
       'housing', 'loan', 'contact', 'day', 'month', 'campaign', 'pdays',
       'previous', 'poutcome', 'y', 'balance_bins', 'age_bins'],
      dtype='object')

In [16]:
# Column order used for the training frame: all other columns first, 'y' last.
feature_cols = [
    'index', 'age', 'job', 'marital', 'education', 'default', 'balance',
    'housing', 'loan', 'contact', 'day', 'month', 'campaign', 'pdays',
    'previous', 'poutcome', 'balance_bins', 'age_bins',
]
cols = feature_cols + ['y']


In [17]:
# Reorder the training columns to the order given by `cols`
data_train = data_train[cols]

In [22]:
# Remove the raw 'balance' and 'age' columns from the test frame; their binned
# counterparts 'balance_bins' and 'age_bins' stay.
# Reassigning instead of using inplace=True avoids mutating a frame derived from a slice
# of `data`, which triggers pandas' SettingWithCopyWarning.
data_test = data_test.drop(columns=['balance', 'age'])


In [23]:
# Remove the raw 'balance' and 'age' columns from the training frame; their binned
# counterparts 'balance_bins' and 'age_bins' stay.
# Reassigning instead of using inplace=True avoids mutating a frame derived from a
# selection of another frame, which triggers pandas' SettingWithCopyWarning.
data_train = data_train.drop(columns=['balance', 'age'])


In [24]:
# Saving the data: write both processed frames to CSV in the working directory,
# leaving the DataFrame index out of the files.
for frame, filename in (
    (data_train, 'data_train_mod.csv'),
    (data_test, 'data_test_mod.csv'),
):
    frame.to_csv(filename, index=False)

In [25]:
# Preview the first rows of the training frame as it was written to data_train_mod.csv
data_train.head()

Unnamed: 0,index,job,marital,education,default,housing,loan,contact,day,month,campaign,pdays,previous,poutcome,balance_bins,age_bins,y
0,0,10,2,3,0,1,0,2,2,5,0,0,0,3,2,1,0
1,1,7,1,1,0,1,0,0,13,4,1,0,0,3,7,1,1
2,2,1,1,1,0,0,1,2,3,9,0,550,12,0,2,1,0
3,3,0,0,1,0,1,0,0,6,4,0,167,3,1,4,1,0
4,4,7,0,1,0,1,0,2,13,4,2,0,0,3,6,2,0
