## Package imports

In [1]:
import pandas as pd

----
## Display settings

In [2]:
# Let printed output use up to 140 characters per line before pandas wraps it
pd.options.display.width = 140

----
## Reading in the data

In [3]:
# Explicit dtypes for the non-date columns, so the parser does not have to
# infer them. The three date columns are handled by `parse_dates` below.
dtypes = dict(
    MORTGAGE="category",
    MORTGAGE_GETTING_AGE="Int64",  # pandas nullable integer (capital "I")
    AGE="int64",
    YEARS_WITH_BANK="int64",
    MARITAL_STATUS="category",
    EDUCATION="category",
    EMPLOYMENT="category",
    GENDER="category",
    INCOME="float64",
    BALANCE="float64",
)

# The cleaned file is semicolon-separated
df = pd.read_csv(
    "../data/processed/clean/Retail data.csv",
    dtype=dtypes,
    parse_dates=["ADDRESS_DATE", "JOB_DATE", "WITH_BANK_DATE"],
    sep=";",
)

In [4]:
# Print a summary of the loaded frame (per-column non-null count and dtype)
# as a quick check that the requested dtypes and date parsing were applied
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23983 entries, 0 to 23982
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   MORTGAGE              23983 non-null  category      
 1   MORTGAGE_GETTING_AGE  306 non-null    Int64         
 2   AGE                   23983 non-null  int64         
 3   YEARS_WITH_BANK       23983 non-null  int64         
 4   MARITAL_STATUS        23940 non-null  category      
 5   EDUCATION             23983 non-null  category      
 6   EMPLOYMENT            23983 non-null  category      
 7   GENDER                23983 non-null  category      
 8   INCOME                23983 non-null  float64       
 9   ADDRESS_DATE          23775 non-null  datetime64[ns]
 10  JOB_DATE              18212 non-null  datetime64[ns]
 11  WITH_BANK_DATE        23983 non-null  datetime64[ns]
 12  BALANCE               23983 non-null  float64       
dtypes: Int64(1), cat

----
## Estimating when the data was created

In [5]:
# Temporary column: the year of WITH_BANK_DATE plus YEARS_WITH_BANK gives, for
# each row, a candidate for the year in which the data was created
df["_CREATION_YEAR"] = df["WITH_BANK_DATE"].dt.year.add(df["YEARS_WITH_BANK"])

In [6]:
# Temporary value containing the year of the data creation: the largest
# candidate year over all rows (cast so it really is a plain `int`)
_creation_year: int = int(df["_CREATION_YEAR"].max())

# WITH_BANK_DATE of every row whose candidate year equals the creation year
_candidates = df.loc[df["_CREATION_YEAR"] == _creation_year, "WITH_BANK_DATE"]

# Temporary date with the month and day values closest to the actual data creation.
# Candidates are ranked by (month, day) only: taking the latest *full* date would
# always favour the most recent year, even when a row from an earlier year has a
# later month/day and therefore gives a tighter estimate.
# NOTE(review): assumes YEARS_WITH_BANK counts completed anniversaries - confirm.
_month_day = _candidates.dt.month * 100 + _candidates.dt.day
_creation_date = pd.Timestamp(_candidates.iloc[_month_day.argmax()])

# Year and day corrections: move the date into the creation year and add one day
DATA_CREATION = _creation_date + pd.DateOffset(years=_creation_year - _creation_date.year, days=1)

print(DATA_CREATION)

2017-07-03 00:00:00


In [7]:
# Remove temporary column. `df` is rebound instead of mutated in place, and a
# column that is already gone is ignored, so this cell can be re-run safely.
df = df.drop(columns="_CREATION_YEAR", errors="ignore")