In [None]:
import pandas as pd
import numpy as np
from datetime import date

In [None]:
# Load the raw inputs.
# `today` is the current date as an ISO 'YYYY-MM-DD' string; it is used later
# as the cut-off when filtering out future dates.
today = date.today().isoformat()
df_branch_service = pd.read_json(path_or_buf="branch_service_transaction_info.json")
df_customer_transaction = pd.read_json(path_or_buf="customer_transaction_info.json")

In [None]:
# Profile the data: preview the first 10 branch/service rows
df_branch_service.head(n=10)

In [None]:
# Preview the first 10 customer transaction rows
df_customer_transaction.head(n=10)

In [None]:
# (rows, columns) of each raw frame, one per line
print(df_branch_service.shape, df_customer_transaction.shape, sep="\n")

In [None]:
# Number of distinct txn_id values per frame; compare with the row counts
# to see whether txn_id is duplicated.
for frame in (df_branch_service, df_customer_transaction):
    print(frame['txn_id'].nunique())

In [None]:
# Drop duplicate transactions: keep the first row seen for each txn_id
df_branch_service = df_branch_service.drop_duplicates(subset='txn_id', keep='first')
df_customer_transaction = df_customer_transaction.drop_duplicates(subset='txn_id', keep='first')

In [None]:
# Confirm the shapes after de-duplication
for frame in (df_branch_service, df_customer_transaction):
    print(frame.shape)

In [None]:
# Count missing values per column (isna is the same check as isnull)
df_branch_service.isna().sum()

In [None]:
# Count missing values per column in the customer transactions
df_customer_transaction.isna().sum()

In [None]:
# List the distinct branch_name values to spot empty or missing entries
# (this cell only inspects; nothing is filled here)
df_branch_service.loc[:, 'branch_name'].unique()

In [None]:
# '' and None both mean "missing" in branch_name.
# Step 1: turn '' into NaN so the fill methods treat it as missing. Only this
#         column is touched. Grouping by txn_id is not needed: txn_id has
#         already been de-duplicated, so every group would hold a single row
#         (and rows with a null txn_id would be blanked by the groupby).
# Step 2: forward fill from the previous row, then backward fill to cover any
#         gap at the very start of the frame.
df_branch_service['branch_name'] = (
    df_branch_service['branch_name']
    .replace('', np.nan)
    .ffill()
    .bfill()
)

In [None]:
# Confirm: the branch_name count below should now be 0
df_branch_service.isna().sum()

In [None]:
# Distinct branch names after the fill
pd.unique(df_branch_service['branch_name'])

In [None]:
# Fill missing prices with the mean price of the same (branch_name, service)
# group. transform('mean') returns a Series aligned row-by-row with the frame,
# so fillna only substitutes where price is missing.
mean_price_by_group = df_branch_service.groupby(['branch_name', 'service'])['price'].transform('mean')
df_branch_service['price'] = df_branch_service['price'].fillna(mean_price_by_group)

In [None]:
# Confirm: check the price count below after the group-mean fill
df_branch_service.isna().sum()

In [None]:
# First 10 rows of the cleaned branch/service frame
df_branch_service.iloc[:10]

In [None]:
# Inner-join customer transactions with branch/service info.
# No `on=` is given, so pandas joins on every column name the two frames share.
# NOTE(review): presumably that is just txn_id -- confirm, and consider on='txn_id'.
df_merged = df_customer_transaction.merge(df_branch_service, how='inner')

In [None]:
# Profile the merged frame: missing values per column
df_merged.isna().sum()

In [None]:
# Look at up to 20 random rows. Capping n avoids the ValueError that sample()
# raises when the frame has fewer rows than requested; the fixed random_state
# makes the displayed sample reproducible across runs.
df_merged.sample(n=min(20, len(df_merged)), random_state=42)

In [None]:
# Keep letters only in last_name and first_name.
# The character class removes every non-word character (\W, which includes
# whitespace and punctuation) plus digits and underscores, which \W alone
# would have kept. Unicode letters (e.g. accented ones) are preserved.
# A raw string is used so that the backslashes reach the regex engine without
# triggering Python's invalid-escape-sequence warning.
for name_col in ('last_name', 'first_name'):
    df_merged[name_col] = df_merged[name_col].str.replace(r'[\W\d_]+', '', regex=True)

In [None]:
# Normalise both name columns to upper case
for name_col in ('last_name', 'first_name'):
    df_merged[name_col] = df_merged[name_col].str.upper()

In [None]:
#check if no special characters and all uppercase for names
# n is capped so sample() cannot raise on a frame with fewer than 20 rows;
# the fixed random_state makes the displayed rows reproducible.
df_merged.sample(n=min(20, len(df_merged)), random_state=42)

In [None]:
#profiling date columns: value range of each column
print(df_merged['birthday'].min())
print(df_merged['birthday'].max())

print(df_merged['avail_date'].min())
print(df_merged['avail_date'].max())

# describe must be *called*: printing the bare attribute only shows the repr
# of the bound method instead of the summary statistics.
print(df_merged['avail_date'].describe())
print(df_merged['birthday'].describe())

# rows whose avail_date is on or before the birthday (suspicious records)
print(df_merged[(df_merged['avail_date'] <= df_merged['birthday'])])

In [None]:
#data type of birthday and avail_date should be datetime instead of object
# An explicit format is given; values that do not match it raise an error.
df_merged['avail_date'] = pd.to_datetime(df_merged['avail_date'], format='%Y-%m-%d')
df_merged['birthday'] = pd.to_datetime(df_merged['birthday'], format='%Y-%m-%d')

#confirming: show the column dtypes first, because the Series returned by
# describe() carries its own dtype rather than the column's.
print(df_merged[['avail_date', 'birthday']].dtypes)
# describe must be called to get the summary (count, min, max, ...)
print(df_merged['avail_date'].describe())
print(df_merged['birthday'].describe())

In [None]:
# Keep only rows whose avail_date and birthday are both on or before today.
# `today` is an ISO date string; it is compared directly against each date column.
df_merged = df_merged.loc[
    lambda frame: (frame['avail_date'] <= today) & (frame['birthday'] <= today)
]

In [None]:
# Keep only rows where the service was availed strictly after the birthday
df_merged = df_merged.loc[lambda frame: frame['avail_date'] > frame['birthday']]

In [None]:
# Confirm the date filters: latest remaining birthday and avail_date
for date_col in ('birthday', 'avail_date'):
    print(df_merged[date_col].max())

# Expected to print an empty frame: no avail_date on or before the birthday
print(df_merged.loc[df_merged['avail_date'] <= df_merged['birthday']])

In [None]:
# Spot-check up to 20 random rows of the filtered frame. The date filters may
# leave fewer than 20 rows, in which case sample(20) would raise a ValueError,
# so n is capped at the frame length. random_state keeps the output reproducible.
df_merged.sample(n=min(20, len(df_merged)), random_state=42)

In [None]:
# Final (rows, columns) of the cleaned, merged frame
(len(df_merged.index), len(df_merged.columns))