## Imports and Loading

In [1]:
import pandas as pd
import numpy as np
import re
import datetime
# Read the two raw JSON exports (paths are relative to the notebook's working
# directory) into DataFrames: branch/service rows first, customer rows second.
df_branch_service, df_customer_transaction = (
    pd.read_json(json_path)
    for json_path in (
        "branch_service_transaction_info.json",
        "customer_transaction_info.json",
    )
)

## Profiling the Data 
### Branch Service

In [2]:
# Size of the raw branch-service frame as (rows, columns).
branch_service_shape = df_branch_service.shape
print(branch_service_shape)

(130653, 4)


In [3]:
# List the distinct branch names to spot inconsistent or placeholder values
# (empty strings, None, 'N/A') that need cleaning later.
branch_names = df_branch_service['branch_name']
branch_names.unique()

array(['MallOfAsia', 'Starmall', 'SmallMall', 'MayMall', 'FrankMall',
       'Megamall', 'RobinsonsMall', '', None, 'N/A'], dtype=object)

In [4]:
# List the distinct service names to check the category values are consistent.
service_names = df_branch_service['service']
service_names.unique()

array(['Manicure', 'HairColor', 'FootSpa', 'Rebond', 'Haircut',
       'NailColor', 'Pedicure'], dtype=object)

## Data Cleaning
### Remove Duplicates

In [5]:
# Drop rows that are exact copies of an earlier row (every column equal),
# keeping the first occurrence. No `subset` is given, so rows that merely share
# an ID but differ in any other column are NOT removed.
# NOTE(review): if the intent is one row per transaction ID, pass `subset=` — confirm.
df_branch_service = df_branch_service.drop_duplicates(keep="first")
# Checkpoint the de-duplicated frame.
df_branch_service.to_parquet("branch_service_duplicates_removed.parquet")

In [6]:
# Size of the branch-service frame, as (rows, columns), after duplicate removal.
branch_service_shape = df_branch_service.shape
print(branch_service_shape)

(118276, 4)


## Round branch service prices to two decimal places

In [7]:
# Round every price to 2 decimal places.
df_branch_service['price'] = df_branch_service['price'].round(2)
# Checkpoint the frame with rounded prices.
df_branch_service.to_parquet("branch_service_prices_rounded.parquet")
# Kept as the cell's LAST expression so the distinct prices are actually
# displayed; in the middle of the cell the result was silently discarded.
df_branch_service['price'].unique()

## Fill in missing (null) prices

In [8]:
# Fallback price for each service, used only where `price` is null.
# NOTE(review): presumably these are the standard per-service prices seen in the
# rows that do have a price — confirm against the data.
DEFAULT_SERVICE_PRICES = {
    'Manicure': 55.23,
    'HairColor': 88.09,
    'FootSpa': 100.12,
    'Rebond': 400.23,
    'Haircut': 66.12,
    'NailColor': 30.12,
    'Pedicure': 77.99,
}

# Map each row's service to its fallback price and use it only where the price
# is missing. Existing prices are left untouched, and a service that is not in
# the mapping maps to NaN, so its missing price stays null.
df_branch_service['price'] = df_branch_service['price'].fillna(
    df_branch_service['service'].map(DEFAULT_SERVICE_PRICES)
)

In [9]:
# Checkpoint the frame after the null prices were filled.
df_branch_service.to_parquet("branch_service_null_prices_filled.parquet")
# Kept as the cell's LAST expression so the distinct prices are displayed and
# can be checked for leftover NaN; placed before `to_parquet` the result was
# silently discarded.
df_branch_service['price'].unique()

## Drop rows with a null, empty, or 'N/A' branch_name

In [10]:
# There is no sound basis for inferring a missing branch, so rows whose
# branch_name is an empty string, the 'N/A' placeholder, or null are dropped
# instead of imputed.
branch_names = df_branch_service['branch_name']
has_no_branch = branch_names.isin(['', 'N/A']) | branch_names.isnull()
df_branch_service = df_branch_service.drop(index=df_branch_service.index[has_no_branch])
print(df_branch_service.shape)
# Checkpoint the frame containing only rows with a usable branch name.
df_branch_service.to_parquet("branch_service_null_branches_deleted.parquet")

(103677, 4)


## Profiling the Data
### Customer Transactions

In [11]:
# Size of the raw customer-transaction frame as (rows, columns).
customer_transaction_shape = df_customer_transaction.shape
print(customer_transaction_shape)

(130653, 5)


In [12]:
# Number of distinct transaction IDs, for comparison with the row count above.
distinct_txn_ids = df_customer_transaction['txn_id'].nunique()
print(distinct_txn_ids)

62354


## Cleaning the Data

In [13]:
# Drop rows that are exact copies of an earlier row (all columns equal),
# keeping the first occurrence.
df_customer_transaction = df_customer_transaction.drop_duplicates(keep="first")
print(df_customer_transaction.shape)
# Checkpoint the de-duplicated frame.
df_customer_transaction.to_parquet("customer_transaction_removed_duplicates.parquet")

(72355, 5)


## Remove whitespace and special characters from customer names

In [14]:
# Strip every character that is not a letter or digit (case-insensitive match)
# from both name columns; this removes whitespace and punctuation alike.
df_customer_transaction = df_customer_transaction.assign(**{
    name_column: df_customer_transaction[name_column].str.replace(
        r'[^a-z0-9]', '', regex=True, flags=re.IGNORECASE
    )
    for name_column in ('last_name', 'first_name')
})
# Checkpoint the frame with cleaned names, then show it.
df_customer_transaction.to_parquet("customer_transaction_cleaned_names.parquet")
print(df_customer_transaction)

           txn_id  avail_date last_name first_name    birthday
0       TXN-24546  2030-09-08     ORTIZ    EDUARDO  1990-07-08
1       TXN-14642  2026-05-26    NIENOW        LEA  2000-11-26
2       TXN-60295  2006-09-25     LESCH      FLETA  1993-05-22
6       TXN-40462  2021-08-21      KUHN        TOD  2002-11-25
8       TXN-08102  2010-04-03   JOHNSON     MILTON  2003-07-10
...           ...         ...       ...        ...         ...
130643  TXN-11897  2027-06-19   Reinger     Conrad  1996-12-03
130646  TXN-43876  2021-11-13    Jewess      Lucio  1996-07-05
130647  TXN-65468  2012-06-16  Cummings      Henry  2005-08-14
130649  TXN-60822  2020-09-28      Feil     Jermey  2010-06-15
130651  TXN-01784  2015-01-11   Schmidt     Emilie  1996-05-21

[72355 rows x 5 columns]


## Standardize the name format (e.g., Doe John)

In [15]:
# Normalise the casing of both name columns: first character upper-case, the
# rest lower-case (e.g. 'ORTIZ' -> 'Ortiz').
df_customer_transaction = df_customer_transaction.assign(**{
    name_column: df_customer_transaction[name_column].str.capitalize()
    for name_column in ('last_name', 'first_name')
})
# Checkpoint the frame with standardised names, then show it.
df_customer_transaction.to_parquet("customer_transaction_formatted_names.parquet")
print(df_customer_transaction)

           txn_id  avail_date last_name first_name    birthday
0       TXN-24546  2030-09-08     Ortiz    Eduardo  1990-07-08
1       TXN-14642  2026-05-26    Nienow        Lea  2000-11-26
2       TXN-60295  2006-09-25     Lesch      Fleta  1993-05-22
6       TXN-40462  2021-08-21      Kuhn        Tod  2002-11-25
8       TXN-08102  2010-04-03   Johnson     Milton  2003-07-10
...           ...         ...       ...        ...         ...
130643  TXN-11897  2027-06-19   Reinger     Conrad  1996-12-03
130646  TXN-43876  2021-11-13    Jewess      Lucio  1996-07-05
130647  TXN-65468  2012-06-16  Cummings      Henry  2005-08-14
130649  TXN-60822  2020-09-28      Feil     Jermey  2010-06-15
130651  TXN-01784  2015-01-11   Schmidt     Emilie  1996-05-21

[72355 rows x 5 columns]


## Remove rows with future birthdays or avail dates

In [16]:
# Parse both date columns to datetimes so they can be compared with a timestamp.
df_customer_transaction['avail_date'] = pd.to_datetime(df_customer_transaction['avail_date'])
df_customer_transaction['birthday'] = pd.to_datetime(df_customer_transaction['birthday'])

# Take ONE snapshot of the current time so both filters use the same cutoff
# (calling now() per filter gives each a slightly different cutoff).
cutoff = datetime.datetime.now()

# A row is dropped when either its avail date or its birthday lies in the future.
is_future_dated = (
    (df_customer_transaction['avail_date'] > cutoff)
    | (df_customer_transaction['birthday'] > cutoff)
)
df_customer_transaction = df_customer_transaction.drop(
    index=df_customer_transaction.index[is_future_dated]
)
# NOTE(review): rows whose birthday is later than their avail_date are still
# kept by this filter — confirm whether those should be treated as invalid too.

# Checkpoint the date-filtered frame, then show it.
df_customer_transaction.to_parquet("customer_transaction_removed_invalid_dates.parquet")
print(df_customer_transaction)

           txn_id avail_date last_name first_name   birthday
2       TXN-60295 2006-09-25     Lesch      Fleta 1993-05-22
6       TXN-40462 2021-08-21      Kuhn        Tod 2002-11-25
8       TXN-08102 2010-04-03   Johnson     Milton 2003-07-10
12      TXN-64262 2018-11-23   Hackett       Maci 2005-06-20
22      TXN-49423 2008-08-13     Berge    Rasheed 2010-02-05
...           ...        ...       ...        ...        ...
130637  TXN-17937 2016-01-04    Torphy     Garret 1998-11-26
130646  TXN-43876 2021-11-13    Jewess      Lucio 1996-07-05
130647  TXN-65468 2012-06-16  Cummings      Henry 2005-08-14
130649  TXN-60822 2020-09-28      Feil     Jermey 2010-06-15
130651  TXN-01784 2015-01-11   Schmidt     Emilie 1996-05-21

[52515 rows x 5 columns]
