In [56]:
import numpy as np, pandas as pd
import matplotlib.pyplot as plt, seaborn as sns
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline

# Data Prep

In [11]:
# Read the raw data directly from the zipped CSV (no manual extraction step).
raw_archive = 'bank marketing v2.zip'
bank0 = pd.read_csv(raw_archive, compression='zip')

In [12]:
# Preview the first rows to confirm the file parsed into the expected columns.
bank0.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,poutcome,deposit,p_recency
0,59,admin.,married,secondary,no,2343,yes,no,unknown,5,may,1042,unknown,yes,
1,56,admin.,married,secondary,no,45,no,no,unknown,5,may,1467,unknown,yes,
2,41,technician,married,secondary,no,1270,yes,no,unknown,5,may,1389,unknown,yes,
3,55,services,married,secondary,no,2476,yes,no,unknown,5,may,579,unknown,yes,
4,54,admin.,married,tertiary,no,184,no,no,unknown,5,may,673,unknown,yes,


In [13]:
# Class balance of the raw target, expressed as proportions.
bank0['deposit'].value_counts(normalize=True)

no     0.52616
yes    0.47384
Name: deposit, dtype: float64

#### Modifying the target variable to have 0/1 values

In [14]:
# Encode the target: 'yes' -> 1, 'no' -> 0.
# The dtype guard makes the cell safe to re-run: pushing an already-numeric
# column through the yes/no mapping would silently replace every value with NaN.
if not pd.api.types.is_numeric_dtype(bank0['deposit']):
    bank0['deposit'] = bank0['deposit'].map({'yes': 1, 'no': 0})

In [15]:
# Re-check the proportions after recoding; a mismatch with the yes/no split
# would mean the mapping lost or mislabelled rows.
bank0['deposit'].value_counts(normalize=True)

0    0.52616
1    0.47384
Name: deposit, dtype: float64

**Dropping `day` and `duration` columns**

In [16]:
# Build the working frame without `duration` and `day`.
# drop() returns a new frame, so bank0 keeps all of its columns.
bank1 = bank0.drop(columns=['duration', 'day'])
bank1.columns

Index(['age', 'job', 'marital', 'education', 'default', 'balance', 'housing',
       'loan', 'contact', 'month', 'poutcome', 'deposit', 'p_recency'],
      dtype='object')

In [17]:
# Numeric feature columns, with the target excluded.
# This runs before the yes/no columns are mapped to 0/1, so those columns
# are still non-numeric here and are not picked up.
num_cols = bank1.select_dtypes('number').columns.drop('deposit')
num_cols

Index(['age', 'balance'], dtype='object')

#### Creating dummy variables for the categorical variables

Encoding the binary yes/no columns `default`, `loan`, `housing` as 0/1

In [18]:
def binary_map(col):
    """Encode a yes/no Series as integers: 'no' -> 0, 'yes' -> 1.

    Values that are neither 'no' nor 'yes' have no entry in the mapping and
    come back as NaN.
    """
    yes_no_codes = {'no': 0, 'yes': 1}
    return col.map(yes_no_codes)

In [19]:
# Columns to be passed through binary_map.
binary_cols = [
    'default',
    'loan',
    'housing',
]

In [20]:
# Encode the yes/no columns as 0/1.
# The values are read from bank0, which still holds the original yes/no
# strings for these columns, rather than from bank1 itself. Mapping bank1 onto
# itself would turn the columns into NaN if this cell were ever run twice.
bank1[binary_cols] = bank0[binary_cols].apply(binary_map)

**Creating dummy features for education, marital, p_recency, poutcome, contact, job, month**

In [21]:
# Multi-level categorical columns that will be one-hot encoded.
dumm_cols = [
    'education',
    'marital',
    'p_recency',
    'contact',
    'poutcome',
    'job',
    'month',
]

In [22]:
# One-hot encode the categorical columns; drop_first removes one level per
# column so the remaining indicators are not perfectly collinear.
# NOTE(review): the preview of bank0 shows blank p_recency values. If those are
# NaN, get_dummies leaves their indicator columns all zero, which is
# indistinguishable from the dropped first level. Confirm that is intended.
categorical_block = bank1.loc[:, dumm_cols]
bank_dummies = pd.get_dummies(categorical_block, drop_first=True)

In [23]:
# Inspect the generated indicator columns.
bank_dummies.head()

Unnamed: 0,education_secondary,education_tertiary,education_unknown,marital_married,marital_single,p_recency_a_6m,p_recency_b_1yr,p_recency_c_1yr+,contact_telephone,contact_unknown,...,month_dec,month_feb,month_jan,month_jul,month_jun,month_mar,month_may,month_nov,month_oct,month_sep
0,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
1,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
2,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
3,1,0,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0
4,0,1,0,1,0,0,0,0,0,1,...,0,0,0,0,0,0,1,0,0,0


In [24]:
# (rows, indicator columns) of the encoded block.
bank_dummies.shape

(11162, 35)

**Concatenating the dummy columns back onto the main frame and dropping the original categorical columns**

In [25]:
# Swap the raw categorical columns for their indicator columns.
# Both the raw columns and any indicator columns attached by an earlier run are
# dropped first (missing names are ignored). Re-running the cell therefore
# rebuilds the same frame instead of stacking duplicate indicator columns and
# then failing on the already-removed raw columns.
stale_cols = list(dumm_cols) + list(bank_dummies.columns)
bank1 = pd.concat(
    [bank1.drop(columns=stale_cols, errors='ignore'), bank_dummies],
    axis=1,
)
bank1.shape

(11162, 41)

In [26]:
# Full column list of the model-ready frame after encoding.
bank1.columns

Index(['age', 'default', 'balance', 'housing', 'loan', 'deposit',
       'education_secondary', 'education_tertiary', 'education_unknown',
       'marital_married', 'marital_single', 'p_recency_a_6m',
       'p_recency_b_1yr', 'p_recency_c_1yr+', 'contact_telephone',
       'contact_unknown', 'poutcome_other', 'poutcome_success',
       'poutcome_unknown', 'job_blue-collar', 'job_entrepreneur',
       'job_housemaid', 'job_management', 'job_retired', 'job_self-employed',
       'job_services', 'job_student', 'job_technician', 'job_unemployed',
       'job_unknown', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep'],
      dtype='object')

---

#### Dividing into train and test datasets

In [27]:
from sklearn.model_selection import train_test_split

In [28]:
# Hold out 20% of the rows for testing. The split is stratified on the target
# and uses a fixed seed so it can be reproduced.
df_train, df_test = train_test_split(
    bank1,
    test_size=0.2,
    stratify=bank1['deposit'],
    random_state=42,
)

In [29]:
# (rows, columns) of the train and test splits.
df_train.shape, df_test.shape

((8929, 41), (2233, 41))

In [30]:
# Target proportions in the training split.
df_train['deposit'].value_counts(normalize=True)

0    0.526151
1    0.473849
Name: deposit, dtype: float64

In [31]:
# Target proportions in the test split.
df_test['deposit'].value_counts(normalize=True)

0    0.526198
1    0.473802
Name: deposit, dtype: float64

#### MinMax scaling for numeric features

In [32]:
from sklearn.preprocessing import MinMaxScaler

# Continuous features to rescale; the remaining columns are already 0/1.
scale_cols = ['age', 'balance']

# The splits are row selections taken from bank1. Work on explicit copies so
# the column assignments below cannot trigger pandas' SettingWithCopyWarning,
# which the notebook-wide warnings filter would otherwise hide.
df_train = df_train.copy()
df_test = df_test.copy()

# Fit the scaler on the training split only, then reuse it for the test split,
# so no test-set statistics enter the transformation.
scaler = MinMaxScaler()
df_train[scale_cols] = scaler.fit_transform(df_train[scale_cols])
df_test[scale_cols] = scaler.transform(df_test[scale_cols])
df_train[scale_cols].describe()

Unnamed: 0,age,balance
count,8929.0,8929.0
mean,0.301283,0.095031
std,0.154103,0.034781
min,0.0,0.0
25%,0.181818,0.079136
50%,0.272727,0.084008
75%,0.402597,0.097387
max,1.0,1.0


In [34]:
# Persist both prepared splits as CSV files, without the index column.
for split_frame, csv_name in (
    (df_train, "Azra Marketing - Train.csv"),
    (df_test, "Azra Marketing - Test.csv"),
):
    split_frame.to_csv(csv_name, index=False)