In [28]:
import pandas as pd
import numpy as np

## Loading the Data

In [83]:
df_branch_service = pd.read_json("branch_service_transaction_info.json")
df_customer_transaction = pd.read_json("customer_transaction_info.json")

## Profiling the Data

Checking the data high level

In [84]:
df_customer_transaction.head(10)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26
2,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
3,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
4,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
5,TXN-60295,2006-09-25,LESCH,FLETA,1993-05-22
6,TXN-40462,2021-08-21,KUHN,TOD,2002-11-25
7,TXN-40462,2021-08-21,KUHN,TOD,2002-11-25
8,TXN-08102,2010-04-03,JOHNSON,MILTON,2003-07-10
9,TXN-08102,2010-04-03,JOHNSON,MILTON,2003-07-10


In [58]:
df_branch_service.head(10)

Unnamed: 0,txn_id,branch_name,service,price
0,TXN-24546,MallOfAsia,Manicure,
1,TXN-14642,Starmall,HairColor,
2,TXN-60295,SmallMall,FootSpa,
3,TXN-60295,Starmall,FootSpa,
4,TXN-60295,MayMall,FootSpa,
5,TXN-60295,FrankMall,FootSpa,
6,TXN-40462,Starmall,HairColor,
7,TXN-40462,Megamall,HairColor,
8,TXN-08102,RobinsonsMall,HairColor,
9,TXN-08102,SmallMall,HairColor,


In [63]:
df_branch_service.tail(10)

Unnamed: 0,txn_id,branch_name,service,price
130641,TXN-16095,FrankMall,HairColor,88.09393
130643,TXN-11897,FrankMall,FootSpa,100.12123
130644,TXN-11897,RobinsonsMall,FootSpa,100.12123
130646,TXN-43876,,Haircut,66.123457
130647,TXN-65468,RobinsonsMall,Haircut,66.123457
130648,TXN-65468,MallOfAsia,Haircut,66.123457
130649,TXN-60822,MallOfAsia,Rebond,400.23123
130650,TXN-60822,FrankMall,Rebond,400.23123
130651,TXN-01784,RobinsonsMall,HairColor,88.09393
130652,TXN-01784,MallOfAsia,HairColor,88.09393


In [92]:
print(df_branch_service['branch_name'].unique())
print(df_branch_service['service'].unique())
print(df_branch_service['price'].unique())

['MallOfAsia' 'Starmall' 'SmallMall' 'MayMall' 'FrankMall' 'Megamall'
 'RobinsonsMall' '' None 'N/A']
['Manicure' 'HairColor' 'FootSpa' 'Rebond' 'Haircut' 'NailColor'
 'Pedicure']
[         nan   0.          30.1237897   66.12345678  77.987989
 100.12123    400.23123     55.2324      88.09393   ]


Checking the dataset shape

In [9]:
print(df_customer_transaction.shape)
print(df_branch_service.shape)

(130653, 5)
(130653, 4)


Checking the unique Transaction IDs

In [21]:
print(df_customer_transaction['txn_id'].nunique())
print(df_branch_service['txn_id'].nunique())

print(df_branch_service.columns)

62354
62354
Index(['txn_id', 'branch_name', 'service', 'price'], dtype='object')


Checking Unique Columns

In [48]:
print(df_branch_service['branch_name'].unique())   
print(df_branch_service['service'].unique())   

['MallOfAsia' 'Starmall' 'SmallMall' 'MayMall' 'FrankMall' 'Megamall'
 'RobinsonsMall' '' None 'N/A']
['Manicure' 'HairColor' 'FootSpa' 'Rebond' 'Haircut' 'NailColor'
 'Pedicure']


Checking the services

In [47]:
df_branch_service['service'].unique()

array(['Manicure', 'HairColor', 'FootSpa', 'Rebond', 'Haircut',
       'NailColor', 'Pedicure'], dtype=object)

In [28]:
df_branch_service['price'].describe()

count    110653.000000
mean         95.966969
std         115.874117
min           0.000000
25%          30.123790
50%          66.123457
75%          88.093930
max         400.231230
Name: price, dtype: float64

Checking the availment dates

In [37]:
print(df_customer_transaction['avail_date'].min())
print(df_customer_transaction['avail_date'].max())

2005-01-01
2030-12-30


## Creating the Data Pipeline Diagram

Go to draw.io

## Data Cleaning

Removing duplicates

In [49]:
print(df_customer_transaction.shape)
df_customer_transaction = df_customer_transaction.drop_duplicates()
print(df_customer_transaction.shape)

(130653, 5)
(72355, 5)


In [81]:
print(df_branch_service.shape)
df_branch_service = df_branch_service.drop_duplicates()
print(df_branch_service.shape)

df_customer_transaction.to_parquet('branch_service_nullbranch_removed.parquet')

(87831, 4)
(87831, 4)


In [82]:
df_branch_service = df_branch_service.dropna(subset=['branch_name'])
df_branch_service = df_branch_service.dropna(subset=['service'])
df_branch_service = df_branch_service.dropna(subset=['price'])
print(df_branch_service.shape)

df_customer_transaction.to_parquet('branch_service_nullbranch_removed.parquet')

(87831, 4)


In [79]:
df_branch_service.head(10)

Unnamed: 0,txn_id,branch_name,service,price
20000,TXN-05451,MayMall,HairColor,0.0
20001,TXN-05451,Starmall,HairColor,0.0
20002,TXN-02255,FrankMall,Rebond,0.0
20003,TXN-02255,Megamall,Rebond,0.0
20004,TXN-02255,MayMall,Rebond,0.0
20005,TXN-02255,SmallMall,Rebond,0.0
20006,TXN-59105,FrankMall,FootSpa,0.0
20007,TXN-59105,Starmall,FootSpa,0.0
20008,TXN-57176,MallOfAsia,Rebond,0.0
20010,TXN-57176,FrankMall,Rebond,0.0


In [69]:
df_customer_transaction.to_parquet('customer_txn_duplicates_removed.parquet')

Converting last name to lower case

In [15]:
df_customer_transaction.head(2)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ORTIZ,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,NIENOW,LEA,2000-11-26


In [33]:
df_customer_transaction['last_name'] = df_customer_transaction['last_name'].str.lower()
df_customer_transaction.head(2)

Unnamed: 0,txn_id,avail_date,last_name,first_name,birthday
0,TXN-24546,2030-09-08,ortiz,EDUARDO,1990-07-08
1,TXN-14642,2026-05-26,nienow,LEA,2000-11-26


In [80]:
df_customer_transaction.to_parquet("customer_txn_last_name_clean.parquet")

## Yearly Sales per Branch View

In [70]:
df_merged = pd.merge(df_customer_transaction, df_branch_service)
df_merged.to_parquet("merged_data.parquet")

df_merged['avail_date'] = pd.to_datetime(df_merged['avail_date'])
df_merged.groupby([df_merged.avail_date.dt.year, 'branch_name'])['price'].sum().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,price
avail_date,branch_name,Unnamed: 2_level_1
2005,,4298.024691
2005,FrankMall,121802.964985
2005,MallOfAsia,134477.566060
2005,MayMall,137079.079594
2005,Megamall,142955.903482
...,...,...
2030,Megamall,130639.080429
2030,,2048.417700
2030,RobinsonsMall,126197.920506
2030,SmallMall,131526.305978
