IMPORTS

In [37]:
import pandas as pd
import numpy as np

# <b>Importing Data <b>

In [38]:
df_branch_service = pd.read_json("branch_service_transaction_info.json")

## DATA PROFILING

<b>INITIAL SHAPE ANG UNIQUE ROWS<b>

In [39]:
print(df_branch_service.shape)
print(df_branch_service['txn_id'].nunique())


(130653, 4)
62354


In [40]:
df_branch_service['price'].describe()

count    110653.000000
mean         95.966969
std         115.874117
min           0.000000
25%          30.123790
50%          66.123457
75%          88.093930
max         400.231230
Name: price, dtype: float64

In [41]:
print(df_branch_service['branch_name'].unique())   
print(df_branch_service['service'].unique()) 
print(df_branch_service['price'].unique()) 
  

['MallOfAsia' 'Starmall' 'SmallMall' 'MayMall' 'FrankMall' 'Megamall'
 'RobinsonsMall' '' None 'N/A']
['Manicure' 'HairColor' 'FootSpa' 'Rebond' 'Haircut' 'NailColor'
 'Pedicure']
[         nan   0.          30.1237897   66.12345678  77.987989
 100.12123    400.23123     55.2324      88.09393   ]


### REMOVE DUPLICATES


In [42]:
print(df_branch_service.shape)
df_branch_service = df_branch_service.drop_duplicates()
print(df_branch_service.shape)



(130653, 4)
(118276, 4)


REMOVE ALMOST EXACT DUPLICATES

In [43]:
cols = df_branch_service.columns.tolist()
cols.remove('txn_id')

for col in cols:
    observed_cols = df_branch_service.drop(col, axis=1).columns.tolist()
    df_branch_service.drop_duplicates(observed_cols, keep='first', inplace=True)

print(df_branch_service.shape)
print(df_branch_service.nunique())

(62355, 4)
txn_id         62354
branch_name        9
service            7
price              8
dtype: int64


In [44]:
df_branch_service.to_parquet('branch_service_duplicates_removed.parquet')

REMOVE NULL VALUES

In [45]:
df_branch_service = df_branch_service.dropna(subset=['branch_name'])
df_branch_service = df_branch_service.dropna(subset=['service'])
#df_branch_service = df_branch_service.dropna(subset=['price'])
print(df_branch_service.shape)



(53475, 4)


REMOVE MISSING VALUES

In [46]:
##df_branch_service = df_branch_service.drop(df_branch_service[df_branch_service['price'] == 0].index)
df_branch_service = df_branch_service.drop(df_branch_service[df_branch_service['branch_name'] == ''].index)
df_branch_service = df_branch_service.drop(df_branch_service[df_branch_service['branch_name'] == 'N/A'].index)



print(df_branch_service.shape)
df_branch_service.to_parquet('branch_service_nullbranch_removed.parquet')

(49785, 4)


### Formatting Values

CHANGE VALUES TO CAMELCASE

In [47]:
df_branch_service.loc[df_branch_service['branch_name'] == 'Starmall', 'branch_name'] = 'StarMall'
df_branch_service.loc[df_branch_service['branch_name'] == 'Megamall', 'branch_name'] = 'MegaMall'

ROUND  PRICES TO 2 DECIMAL PLACES

In [48]:
df_branch_service = df_branch_service.round({'price': 2})

In [49]:
df_branch_service.to_parquet('branch_service_formatted_values.parquet')

SHAUN TEST


In [50]:
# conditions to fill in missing prices
cond1 = (df_branch_service['service'] == 'Manicure') & (df_branch_service['price'].isnull())
cond2 = (df_branch_service['service'] == 'HairColor') & (df_branch_service['price'].isnull())
cond3 = (df_branch_service['service'] == 'FootSpa') & (df_branch_service['price'].isnull())
cond4 = (df_branch_service['service'] == 'Rebond') & (df_branch_service['price'].isnull())
cond5 = (df_branch_service['service'] == 'Haircut') & (df_branch_service['price'].isnull())
cond6 = (df_branch_service['service'] == 'NailColor') & (df_branch_service['price'].isnull())
cond7 = (df_branch_service['service'] == 'Pedicure') & (df_branch_service['price'].isnull())
df_branch_service.loc[cond1, 'price'] = df_branch_service.loc[cond1, 'price'].fillna(55.23)
df_branch_service.loc[cond2, 'price'] = df_branch_service.loc[cond2, 'price'].fillna(88.09)
df_branch_service.loc[cond3, 'price'] = df_branch_service.loc[cond3, 'price'].fillna(100.12)
df_branch_service.loc[cond4, 'price'] = df_branch_service.loc[cond4, 'price'].fillna(400.23)
df_branch_service.loc[cond5, 'price'] = df_branch_service.loc[cond5, 'price'].fillna(66.12)
df_branch_service.loc[cond6, 'price'] = df_branch_service.loc[cond6, 'price'].fillna(30.12)
df_branch_service.loc[cond7, 'price'] = df_branch_service.loc[cond7, 'price'].fillna(77.99)

CHECKER

In [51]:
##df_branch_service.sort_values(by=['txn_id'], inplace=True, ascending=False)     
df_branch_service.head(30)

Unnamed: 0,txn_id,branch_name,service,price
0,TXN-24546,MallOfAsia,Manicure,55.23
1,TXN-14642,StarMall,HairColor,88.09
2,TXN-60295,SmallMall,FootSpa,100.12
6,TXN-40462,StarMall,HairColor,88.09
8,TXN-08102,RobinsonsMall,HairColor,88.09
12,TXN-64262,SmallMall,Rebond,400.23
15,TXN-41934,RobinsonsMall,HairColor,88.09
19,TXN-39050,StarMall,Manicure,55.23
22,TXN-49423,RobinsonsMall,Rebond,400.23
25,TXN-08158,MayMall,NailColor,30.12
