In [2]:
import pandas as pd
import numpy as np

## Loading the Data

In [3]:
# Read the two raw JSON exports into DataFrames. Both paths are relative, so
# the files are looked up from the notebook's working directory.
df_branch_service, df_customer_transaction = (
    pd.read_json(json_path)
    for json_path in (
        "branch_service_transaction_info.json",
        "customer_transaction_info.json",
    )
)

## Profiling the Data

Checking the data at a high level

In [4]:
# Preview the first ten rows of the customer transaction frame.
df_customer_transaction.head(n=10)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
5,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
6,TXN-40462,2021-08-21,KUHN,TOD,2002-11-25
7,TXN-40462,2021-08-21,KUHN,TOD,2002-11-25
8,TXN-08102,2010-04-03,JOHNSON,MILTON,2003-07-10
9,TXN-08102,2010-04-03,JOHNSON,MILTON,2003-07-10


In [5]:
# Preview the first ten rows of the branch/service frame.
df_branch_service.head(n=10)

Unnamed: 0,txn_id,branch_name,service,price
0,TXN-24546,MallOfAsia,Manicure,
1,TXN-14642,Starmall,HairColor,
2,TXN-60295,SmallMall,FootSpa,
3,TXN-60295,Starmall,FootSpa,
4,TXN-60295,MayMall,FootSpa,
5,TXN-60295,FrankMall,FootSpa,
6,TXN-40462,Starmall,HairColor,
7,TXN-40462,Megamall,HairColor,
8,TXN-08102,RobinsonsMall,HairColor,
9,TXN-08102,SmallMall,HairColor,


In [6]:
# Preview the last ten rows of the branch/service frame as well, so both ends
# of the file are inspected.
df_branch_service.tail(n=10)

Unnamed: 0,txn_id,branch_name,service,price
130643,TXN-11897,FrankMall,FootSpa,100.12123
130644,TXN-11897,RobinsonsMall,FootSpa,100.12123
130645,TXN-11897,FrankMall,FootSpa,100.12123
130646,TXN-43876,,Haircut,66.123457
130647,TXN-65468,RobinsonsMall,Haircut,66.123457
130648,TXN-65468,MallOfAsia,Haircut,66.123457
130649,TXN-60822,MallOfAsia,Rebond,400.23123
130650,TXN-60822,FrankMall,Rebond,400.23123
130651,TXN-01784,RobinsonsMall,HairColor,88.09393
130652,TXN-01784,MallOfAsia,HairColor,88.09393


Checking the dataset shape

In [7]:
# Print the (rows, columns) shape of each frame, customers first, one per line.
print(df_customer_transaction.shape, df_branch_service.shape, sep="\n")

(130653, 5)
(130653, 4)


Checking the unique Transaction IDs

In [8]:
# Count the distinct transaction ids in each frame (customers first, then
# branch/service), one count per line.
print(
    df_customer_transaction['txn_id'].nunique(),
    df_branch_service['txn_id'].nunique(),
    sep="\n",
)

62354
62354


Checking the services

In [9]:
# List the distinct values of the service column, in order of first appearance.
df_branch_service.service.unique()

array(['Manicure', 'HairColor', 'FootSpa', 'Rebond', 'Haircut',
       'NailColor', 'Pedicure'], dtype=object)

In [10]:
# Summary statistics for the price column; `count` covers non-null prices only.
df_branch_service.price.describe()

count    110653.000000
mean         95.966969
std         115.874117
min           0.000000
25%          30.123790
50%          66.123457
75%          88.093930
max         400.231230
Name: price, dtype: float64

Checking the availment dates

In [11]:
# Print the smallest and then the largest avail_date value.
# NOTE(review): avail_date has not been converted with pd.to_datetime at this
# point, so the comparison uses whatever type read_json produced - confirm.
avail_dates = df_customer_transaction['avail_date']
print(avail_dates.min(), avail_dates.max(), sep="\n")

2005-01-01
2030-12-30


## Creating the Data Pipeline Diagram

The pipeline diagram is created separately in draw.io; it is not generated by this notebook.

## Data Cleaning

Removing duplicates

In [12]:
# Drop customer rows that are identical in every column, printing the shape
# before and after so the number of removed rows is visible.
shape_before = df_customer_transaction.shape
df_customer_transaction = df_customer_transaction.drop_duplicates()
print(shape_before, df_customer_transaction.shape, sep="\n")

(130653, 5)
(72355, 5)


In [27]:
# Snapshot the de-duplicated customer frame to Parquet.
# NOTE(review): this cell's execution count (27) is higher than that of the
# cells that follow it (13, 14), so it was last run after them; the file
# written in that session may already contain the lower-cased last_name.
# Restart the kernel and run all cells in order to confirm the snapshot.
df_customer_transaction.to_parquet("customer_txn_duplicates_removed.parquet")

Converting last name to lower case

In [13]:
# Show two rows before the change so the effect of the next cell is visible.
df_customer_transaction.head(n=2)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26


In [14]:
# Lower-case the last_name column only (every other column is left as is),
# then show two rows to confirm the change.
df_customer_transaction = df_customer_transaction.assign(
    last_name=df_customer_transaction['last_name'].str.lower()
)
df_customer_transaction.head(n=2)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ortiz,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,nienow,LEA,2000-11-26


In [28]:
# Snapshot the customer frame to Parquet after the last_name lower-casing step.
df_customer_transaction.to_parquet("customer_txn_last_name_clean.parquet")

## Yearly Sales per Branch View

In [29]:
# Join each customer transaction to its branch/service rows on the
# transaction id. The key is spelled out on purpose: with no `on=`, pandas
# joins on every column name the two frames share, so an extra shared column
# appearing later would silently change the join.
# NOTE(review): txn_id is not unique in either frame (the row counts printed
# earlier exceed the number of distinct ids), so this is a many-to-many join
# and may repeat price rows, inflating the sums below - confirm the intended
# grain before relying on the totals.
df_merged = pd.merge(
    df_customer_transaction,
    df_branch_service,
    on="txn_id",
    how="inner",
)
# Saved before the date conversion below, so the file keeps avail_date exactly
# as it was loaded.
df_merged.to_parquet("merged_data.parquet")

# Parse avail_date so the calendar year can be extracted for grouping.
df_merged['avail_date'] = pd.to_datetime(df_merged['avail_date'])

# Total price per (year, branch). Missing prices are skipped by sum().
# NOTE(review): rows with a blank branch_name form their own group in the
# recorded output - decide whether they should be cleaned or excluded.
df_merged.groupby([df_merged.avail_date.dt.year, 'branch_name'])['price'].sum().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
avail_date,branch_name,Unnamed: 2_level_1
2005,,4298.024691
2005,FrankMall,56711.249909
2005,MallOfAsia,62502.141612
2005,MayMall,65314.419050
2005,Megamall,63722.520395
...,...,...
2030,Megamall,61514.464025
2030,,2048.417700
2030,RobinsonsMall,56816.266169
2030,SmallMall,60680.995068
