In [187]:
import pandas as pd
import numpy as np

In [188]:
# load dataset
df = pd.read_csv('bank_marketing.csv')

# Chunking the Dataset

Separate bank-marketing dataset into three dataset.

In [189]:
# create client dataset
df_client = df[['client_id', 'age', 'job', 'marital', 'education', 'credit_default', 'mortgage']]

# create campaign dataset
df_campaign = df[['client_id', 'number_contacts', 'contact_duration', 'previous_campaign_contacts', 'previous_outcome', 
                  'campaign_outcome', 'day', 'month']]

# create economics dataset
df_economics = df[['client_id', 'cons_price_idx', 'euribor_three_months']]

# Data Cleaning

Cleaning each dataset as requirement.

### 1. Client Dataset

In [190]:
df_client.sample(10)

Unnamed: 0,client_id,age,job,marital,education,credit_default,mortgage
8537,8537,33,admin.,married,university.degree,no,no
12606,12606,47,technician,married,professional.course,no,no
34488,34488,30,admin.,married,university.degree,no,no
37977,37977,54,technician,married,professional.course,no,no
21089,21089,32,unknown,married,basic.9y,no,no
3717,3717,46,services,divorced,high.school,no,no
10774,10774,44,services,single,high.school,no,no
3389,3389,47,blue-collar,married,professional.course,unknown,no
39337,39337,27,admin.,single,university.degree,no,no
3408,3408,38,student,single,university.degree,no,no


In [191]:
# check datatype
df_client.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   client_id       41188 non-null  int64 
 1   age             41188 non-null  int64 
 2   job             41188 non-null  object
 3   marital         41188 non-null  object
 4   education       41188 non-null  object
 5   credit_default  41188 non-null  object
 6   mortgage        41188 non-null  object
dtypes: int64(2), object(5)
memory usage: 2.2+ MB


no missing values in client dataset.
column credit_default and mortgage should convert to boolean data type.

### credit_default column and mortgage column

In [192]:
# check unique values from the columns
print('Unique values in credit_default column before conversion: ', df_client.credit_default.unique())
print('Unique values in mortgage column before conversion: ', df_client.mortgage.unique())

# replace values
df_client['credit_default'] = np.where(df_client['credit_default'] == "yes", 1, 0)
df_client['mortgage'] = np.where(df_client['mortgage'] == "yes", 1, 0)

# check unique values from the columns
print('Unique values in credit_default column after conversion: ', df_client.credit_default.unique())
print('Unique values in mortgage column after conversion: ', df_client.mortgage.unique())

Unique values in credit_default column before conversion:  ['no' 'unknown' 'yes']
Unique values in mortgage column before conversion:  ['no' 'yes' 'unknown']
Unique values in credit_default column after conversion:  [0 1]
Unique values in mortgage column after conversion:  [0 1]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['credit_default'] = np.where(df_client['credit_default'] == "yes", 1, 0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['mortgage'] = np.where(df_client['mortgage'] == "yes", 1, 0)


In [193]:
df_client.dtypes

client_id          int64
age                int64
job               object
marital           object
education         object
credit_default     int64
mortgage           int64
dtype: object

In [194]:
# convert datatype
print("Type of credit_default column before conversion: ", df_client.credit_default.dtype)
df_client['credit_default'] = df.credit_default.astype('bool')
print("Type of credit_default column after conversion: ", df_client.credit_default.dtype)

print("Type of mortgage column before conversion: ", df_client.mortgage.dtype)
df_client['mortgage'] = df.mortgage.astype('bool')
print("Type of mortgage column after conversion: ", df_client.mortgage.dtype)

Type of credit_default column before conversion:  int64
Type of credit_default column after conversion:  bool
Type of mortgage column before conversion:  int64
Type of mortgage column after conversion:  bool


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['credit_default'] = df.credit_default.astype('bool')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['mortgage'] = df.mortgage.astype('bool')


In [195]:
df_client.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41188 entries, 0 to 41187
Data columns (total 7 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   client_id       41188 non-null  int64 
 1   age             41188 non-null  int64 
 2   job             41188 non-null  object
 3   marital         41188 non-null  object
 4   education       41188 non-null  object
 5   credit_default  41188 non-null  bool  
 6   mortgage        41188 non-null  bool  
dtypes: bool(2), int64(2), object(3)
memory usage: 1.6+ MB


### job column and education column

In [196]:
# check unique values
print("Unique values in job column before conversion: ", df_client.job.unique())

# change "." to "_"
df_client['job'] = df_client['job'].apply(lambda x: x.replace('.', '_'))

print("Unique values in job column after conversion: ", df_client.job.unique())

Unique values in job column before conversion:  ['housemaid' 'services' 'admin.' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']
Unique values in job column after conversion:  ['housemaid' 'services' 'admin_' 'blue-collar' 'technician' 'retired'
 'management' 'unemployed' 'self-employed' 'unknown' 'entrepreneur'
 'student']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['job'] = df_client['job'].apply(lambda x: x.replace('.', '_'))


In [197]:
# check unique values
print("Unique values in education column before conversion: ", df_client.education.unique())

# change "." to "_" and "unknown" to "np.NaN"
df_client['education'] = df_client['education'].apply(lambda x: x.replace('.', '_'))
df_client['education'] = df_client['education'].replace('unknown', np.NaN)

print("Unique values in education column after conversion: ", df_client.education.unique())

Unique values in education column before conversion:  ['basic.4y' 'high.school' 'basic.6y' 'basic.9y' 'professional.course'
 'unknown' 'university.degree' 'illiterate']
Unique values in education column after conversion:  ['basic_4y' 'high_school' 'basic_6y' 'basic_9y' 'professional_course' nan
 'university_degree' 'illiterate']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['education'] = df_client['education'].apply(lambda x: x.replace('.', '_'))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_client['education'] = df_client['education'].replace('unknown', np.NaN)
