# Bank Marketing - Data Cleaning

## Importing Data

In [5]:
import pandas as pd
import numpy as np

In [6]:
df = pd.read_csv("workspace/sources/datacamp/project_01_bank_mkt/bank_marketing.csv")
print(df.head())

   client_id  age        job  marital    education credit_default mortgage  \
0          0   56  housemaid  married     basic.4y             no       no   
1          1   57   services  married  high.school        unknown       no   
2          2   37   services  married  high.school             no      yes   
3          3   40     admin.  married     basic.6y             no       no   
4          4   56   services  married  high.school             no       no   

  month  day  contact_duration  number_contacts  previous_campaign_contacts  \
0   may   13               261                1                           0   
1   may   19               149                1                           0   
2   may   23               226                1                           0   
3   may   27               151                1                           0   
4   may    3               307                1                           0   

  previous_outcome  cons_price_idx  euribor_three_months

## Building client.csv

### Important Columns for client.csv

In [7]:
# Selecting columns
df_client_cols_list = ['client_id', 'age', 'job', 'marital', 'education', 'credit_default', 'mortgage' ]
df_client = df[df_client_cols_list]
print(df_client.head())

   client_id  age        job  marital    education credit_default mortgage
0          0   56  housemaid  married     basic.4y             no       no
1          1   57   services  married  high.school        unknown       no
2          2   37   services  married  high.school             no      yes
3          3   40     admin.  married     basic.6y             no       no
4          4   56   services  married  high.school             no       no


### Checking Data Types

In [8]:
df_client.dtypes

client_id          int64
age                int64
job               object
marital           object
education         object
credit_default    object
mortgage          object
dtype: object

### Saving a copy

In [9]:
df_client_treated = df_client.copy()

### Converting to boolean

In [10]:
# Checking the boolean columns
boolean_cols_client = ["credit_default", "mortgage"]

for col in boolean_cols_client:
    print("--------------")
    print(df_client_treated[col].value_counts())
    print("--------------")

--------------
credit_default
no         32588
unknown     8597
yes            3
Name: count, dtype: int64
--------------
--------------
mortgage
yes        21576
no         18622
unknown      990
Name: count, dtype: int64
--------------


### Converting 'yes' and 'no' values to boolean (keeping 'unknown') and 'object' type

In [11]:
# Clean and convert client columns to bool data type
for col in ["credit_default", "mortgage"]:
  df_client_treated[col] = df_client_treated[col].map({"yes": 1,
                                                       "no": 0,
                                                       "unknown": 0})
  df_client_treated[col] = df_client_treated[col].astype(bool)


In [12]:
# Checking the boolean columns
boolean_cols_client = ["credit_default", "mortgage"]

for col in boolean_cols_client:
    print("--------------")
    print(df_client_treated[col].value_counts())
    print("--------------")

--------------
credit_default
False    41185
True         3
Name: count, dtype: int64
--------------
--------------
mortgage
True     21576
False    19612
Name: count, dtype: int64
--------------


### Converting 'unknown' values to NaN only for the education column

In [13]:
client_cols_with_unknown = ["education"]
df_client_treated[client_cols_with_unknown] = df_client_treated[client_cols_with_unknown].replace('unknown', np.nan)
print(df_client_treated[client_cols_with_unknown].head())
print(df_client_treated[client_cols_with_unknown].value_counts())

     education
0     basic.4y
1  high.school
2  high.school
3     basic.6y
4  high.school
education          
university.degree      12168
high.school             9515
basic.9y                6045
professional.course     5243
basic.4y                4176
basic.6y                2292
illiterate                18
Name: count, dtype: int64


### Cleaning education and job columns

In [14]:
# Replace "." with "_" in multiple columns without using lambda
client_columns_to_replace = ["education", "job"]
df_client_treated[client_columns_to_replace] = df_client_treated[client_columns_to_replace].apply(lambda x: x.str.replace('.', '_'))
df_client_treated[["education", "job"]].head(10)

Unnamed: 0,education,job
0,basic_4y,housemaid
1,high_school,services
2,high_school,services
3,basic_6y,admin_
4,high_school,services
5,basic_9y,services
6,professional_course,admin_
7,,blue-collar
8,professional_course,technician
9,high_school,services


### Saving client as CSV

In [15]:
# Saving dataframe as CSV
df_client_treated.to_csv("workspace/sources/datacamp/project_01_bank_mkt/client.csv", index=False)

## Building campaign.csv

In [16]:
# Selecting columns
df_campaign_cols_list = ['client_id', 'number_contacts', 'contact_duration', 'previous_campaign_contacts', 'previous_outcome', 'campaign_outcome', 'month', 'day']
df_campaign = df[df_campaign_cols_list]
print(df_campaign.head())

   client_id  number_contacts  contact_duration  previous_campaign_contacts  \
0          0                1               261                           0   
1          1                1               149                           0   
2          2                1               226                           0   
3          3                1               151                           0   
4          4                1               307                           0   

  previous_outcome campaign_outcome month  day  
0      nonexistent               no   may   13  
1      nonexistent               no   may   19  
2      nonexistent               no   may   23  
3      nonexistent               no   may   27  
4      nonexistent               no   may    3  


### Checking Data Types

In [17]:
df_campaign.dtypes

client_id                      int64
number_contacts                int64
contact_duration               int64
previous_campaign_contacts     int64
previous_outcome              object
campaign_outcome              object
month                         object
day                            int64
dtype: object

### Saving a copy

In [18]:
df_campaign_treated = df_campaign.copy()

### Converting to boolean

In [19]:
# Checking the boolean columns
boolean_cols_campaign = ["previous_outcome", "campaign_outcome"]

for col in boolean_cols_campaign:
    print("--------------")
    print(df_campaign_treated[col].value_counts())
    print("--------------")

--------------
previous_outcome
nonexistent    35563
failure         4252
success         1373
Name: count, dtype: int64
--------------
--------------
campaign_outcome
no     36548
yes     4640
Name: count, dtype: int64
--------------


### Converting 'yes/no' and "success/failure" values to boolean (keeping 'nonexistent') and 'object' type

In [20]:
# Convert yes/no column to 1 and 0
df_campaign_treated["campaign_outcome"] = df_campaign_treated["campaign_outcome"].map({"yes": 1, "no": 0})
print(df_campaign_treated["campaign_outcome"].unique())
df_campaign_treated["campaign_outcome"] = df_campaign_treated["campaign_outcome"].astype(bool)
print(df_campaign_treated["campaign_outcome"].dtype)

# Convert success/failure column to 1 and 0
df_campaign_treated["previous_outcome"] = df_campaign_treated["previous_outcome"].map({"success": 1, "failure": 0, "nonexistent": 0})
print(df_campaign_treated["previous_outcome"].unique())
df_campaign_treated["previous_outcome"] = df_campaign_treated["previous_outcome"].astype(bool)
print(df_campaign_treated["previous_outcome"].dtype)

[0 1]
bool
[0 1]
bool


In [21]:
# Checking the boolean columns
boolean_cols_campaign = ["previous_outcome", "campaign_outcome"]

for col in boolean_cols_campaign:
    print("--------------")
    print(df_campaign_treated[col].value_counts())
    print("--------------")

--------------
previous_outcome
False    39815
True      1373
Name: count, dtype: int64
--------------
--------------
campaign_outcome
False    36548
True      4640
Name: count, dtype: int64
--------------


In [22]:
print(df_campaign_treated["previous_outcome"].unique())
print(df_campaign_treated["campaign_outcome"].unique())
print(df_campaign_treated.isnull().sum())

[False  True]
[False  True]
client_id                     0
number_contacts               0
contact_duration              0
previous_campaign_contacts    0
previous_outcome              0
campaign_outcome              0
month                         0
day                           0
dtype: int64


### Converting 'nonexistent' values to NaN only for the previous_outcome column

In [23]:
campaign_cols_with_nonexistent = ["previous_outcome"]
df_campaign_treated[campaign_cols_with_nonexistent] = df_campaign_treated[campaign_cols_with_nonexistent].replace('nonexistent', np.nan)
print(df_campaign_treated[campaign_cols_with_nonexistent].value_counts())

previous_outcome
False               39815
True                 1373
Name: count, dtype: int64


### Creating last_contact_date column

In [24]:
# Check date-related columns
print(df_campaign_treated[["month", "day"]].dtypes)
print(df_campaign_treated["month"].unique())
print(df_campaign_treated["day"].unique())


month    object
day       int64
dtype: object
['may' 'jun' 'jul' 'aug' 'oct' 'nov' 'dec' 'mar' 'apr' 'sep']
[13 19 23 27  3  5 12 21  8  9 29 14  1  6  2 16 20 10 28 30 22 25 11 17
 15 26 18  4 24 31  7]


In [25]:
# Create year column
df_campaign_treated['year'] = '2022'

# Create a new column for concatenated values
df_campaign_treated['last_contact_date'] = ''

# Change day type from int to object
df_campaign_treated['day'] = df_campaign_treated['day'].astype(str)

# Concatenate the columns using the pd.concat() function
df_campaign_treated['last_contact_date'] = df_campaign_treated['year'] + '-' + df_campaign_treated['month'] + '-' + df_campaign_treated['day']

print(df_campaign_treated[['last_contact_date', 'year', 'month', 'day']].head())

df_campaign_treated['last_contact_date'] = pd.to_datetime(df_campaign_treated['last_contact_date'],
                                             # Attempt to infer format of each date
                                             infer_datetime_format=True,
                                             # Return NA for rows where conversion failed
                                             errors = 'coerce')
print("------------------------------")
print(df_campaign_treated['last_contact_date'].head())

# Treating the Date data
df_campaign_treated['last_contact_date'] = df_campaign_treated['last_contact_date'].dt.strftime("%Y-%m-%d")

  last_contact_date  year month day
0       2022-may-13  2022   may  13
1       2022-may-19  2022   may  19
2       2022-may-23  2022   may  23
3       2022-may-27  2022   may  27
4        2022-may-3  2022   may   3
------------------------------
0   2022-05-13
1   2022-05-19
2   2022-05-23
3   2022-05-27
4   2022-05-03
Name: last_contact_date, dtype: datetime64[ns]


  df_campaign_treated['last_contact_date'] = pd.to_datetime(df_campaign_treated['last_contact_date'],
  df_campaign_treated['last_contact_date'] = pd.to_datetime(df_campaign_treated['last_contact_date'],


In [26]:
# Selecting Important Columns
df_campaign_treated_cols = ['client_id', 'number_contacts', 'contact_duration', 'previous_campaign_contacts', 'previous_outcome', 'campaign_outcome', 'last_contact_date']
df_campaign_treated = df_campaign_treated[df_campaign_treated_cols]
print(df_campaign_treated.dtypes)

client_id                      int64
number_contacts                int64
contact_duration               int64
previous_campaign_contacts     int64
previous_outcome                bool
campaign_outcome                bool
last_contact_date             object
dtype: object


### Saving campaign as CSV

In [27]:
# Saving dataframe as CSV
df_campaign_treated.to_csv("workspace/sources/datacamp/project_01_bank_mkt/campaign.csv", index=False)

## Building economics.csv

In [28]:
# Selecting columns
df_economics_cols_list = ['client_id', 'cons_price_idx', 'euribor_three_months']
df_economics = df[df_economics_cols_list]
print(df_economics.head())

   client_id  cons_price_idx  euribor_three_months
0          0          93.994                 4.857
1          1          93.994                 4.857
2          2          93.994                 4.857
3          3          93.994                 4.857
4          4          93.994                 4.857


### Checking Data Types

In [29]:
df_economics.dtypes

client_id                 int64
cons_price_idx          float64
euribor_three_months    float64
dtype: object

### Saving a copy

In [30]:
df_economics_treated = df_economics.copy()
print(df_economics_treated.head())

   client_id  cons_price_idx  euribor_three_months
0          0          93.994                 4.857
1          1          93.994                 4.857
2          2          93.994                 4.857
3          3          93.994                 4.857
4          4          93.994                 4.857


### Saving economics as CSV

In [31]:
# Saving dataframe as CSV
df_economics_treated.to_csv("workspace/sources/datacamp/project_01_bank_mkt/economics.csv", index=False)