## Package imports

In [1]:
import pandas as pd
import matplotlib.pyplot as plt

## Display settings

In [2]:
# Let pandas use up to 140 characters per line when printing frames as text
pd.options.display.width = 140

## Reading in the data

In [3]:
# Relative paths to the two raw CSV files this notebook can clean.
# `data_path` is set to one of these, and later cells compare it against
# RETAIL to decide whether the retail-only columns must be handled.
RETAIL = "../data/raw/Retail data.csv"
POTENTIAL_CUSTOMERS = "../data/raw/Potential Customers.csv"

In [4]:
# Selects which raw file the rest of the notebook loads and cleans.
# Set it to RETAIL or POTENTIAL_CUSTOMERS; the cleaning cells check
# `data_path == RETAIL` to enable the retail-only steps.
data_path = POTENTIAL_CUSTOMERS

In [5]:
# Load the selected CSV file; its fields are separated by semicolons.
# Only the separator is specified here: column names and data types are
# whatever pandas infers and are adjusted in the cleaning cells.
df = pd.read_csv(filepath_or_buffer=data_path, sep=";")

## Cleaning the data

### Removing redundant columns

In [6]:
# Remove the "Cocunut" column, which is treated as redundant for this analysis
df.drop(columns=["Cocunut"], inplace=True)

### Giving some columns more descriptive names

In [7]:
# Mapping of raw column name -> cleaned column name, applied regardless of
# which file was loaded.
column_names = {
    "MARTIAL_STATUS": "MARITAL_STATUS",
    "CUST_INCOME": "INCOME",
    "CURRENT_ADDRESS_DATE": "ADDRESS_DATE",
    "CURRENT_JOB_DATE": "JOB_DATE",
    "CURRENT_WITH_BANK_DATE": "WITH_BANK_DATE",
    "CURRENT_BALANCE_EUR": "BALANCE"
}

# Extra renames that are only applied when the retail file is selected
retail_only_names = {
    "Mortgage_YN": "MORTGAGE",
    "AGE_AT_ORIGINATION": "MORTGAGE_TAKING_AGE",
}
if data_path == RETAIL:
    column_names.update(retail_only_names)

df.rename(column_names, axis="columns", inplace=True)

### Setting correct column data types

In [8]:
# Columns to store with the pandas "category" dtype
categorical_columns = ["MARITAL_STATUS", "EDUCATION", "EMPLOYMENT", "GENDER"]

# Convert one column at a time so each assignment replaces the whole column
for categorical_column in categorical_columns:
    df[categorical_column] = df[categorical_column].astype("category")

In [9]:
# Columns to parse as datetimes
date_columns = ["ADDRESS_DATE", "JOB_DATE", "WITH_BANK_DATE"]

# errors="coerce" turns every value that cannot be parsed as a date into
# NaT (Not a Time) instead of raising
for date_column in date_columns:
    df[date_column] = pd.to_datetime(df[date_column], errors="coerce")

In [10]:
# Set monetary columns (decimal separator is comma)
monetary_columns = [
    "BALANCE",
    "INCOME",
]


def _parse_decimal_comma(values):
    """Return `values` as floats, reading "," as the decimal separator.

    Parameters
    ----------
    values : pd.Series
        One monetary column, either as text (e.g. "1234,56") or already
        numeric.

    Returns
    -------
    pd.Series
        The same values with dtype float.

    Raises
    ------
    ValueError
        If a text value is still not a valid number after the comma has been
        replaced.
    """
    # A column is already numeric when pandas inferred a numeric dtype while
    # reading the file (no value contained a comma) or when this cell has
    # been run before. The `.str` accessor would raise AttributeError on such
    # a column, so it is only cast.
    if pd.api.types.is_numeric_dtype(values):
        return values.astype(float)

    # regex=False makes the replacement a plain character substitution on
    # every pandas version.
    return values.str.replace(",", ".", regex=False).astype(float)


df[monetary_columns] = df[monetary_columns].apply(_parse_decimal_comma)

In [11]:
# The retail file has two additional columns that need explicit data types
if data_path == RETAIL:
    retail_dtypes = {
        # Mortgage column as a categorical
        "MORTGAGE": "category",
        # Mortgage taking age as a nullable integer
        "MORTGAGE_TAKING_AGE": "Int64",
    }

    for retail_column, retail_dtype in retail_dtypes.items():
        df[retail_column] = df[retail_column].astype(retail_dtype)

### Removing future dates

In [12]:
# Any date later than the current moment is replaced with NaT (Not a Time)
today = pd.to_datetime("today")

for column in date_columns:
    is_future = df[column] > today
    # mask() blanks out the entries where the condition is True
    df[column] = df[column].mask(is_future)