IMPORTS

In [2]:
import pandas as pd
import numpy as np

# <b>Importing Data <b>

In [3]:
df_branch_service = pd.read_json("branch_service_transaction_info.json")

## DATA PROFILING

<b>INITIAL SHAPE ANG UNIQUE ROWS<b>

In [4]:
print(df_branch_service.shape)
print(df_branch_service['txn_id'].nunique())


(130653, 4)
62354


In [5]:
df_branch_service['price'].describe()

count    110653.000000
mean         95.966969
std         115.874117
min           0.000000
25%          30.123790
50%          66.123457
75%          88.093930
max         400.231230
Name: price, dtype: float64

In [6]:
print(df_branch_service['branch_name'].unique())   
print(df_branch_service['service'].unique()) 
print(df_branch_service['price'].unique()) 
  

['MallOfAsia' 'Starmall' 'SmallMall' 'MayMall' 'FrankMall' 'Megamall'
 'RobinsonsMall' '' None 'N/A']
['Manicure' 'HairColor' 'FootSpa' 'Rebond' 'Haircut' 'NailColor'
 'Pedicure']
[         nan   0.          30.1237897   66.12345678  77.987989
 100.12123    400.23123     55.2324      88.09393   ]


# END OF PROFILING

### REMOVE DUPLICATES


In [7]:
print(df_branch_service.shape)
df_branch_service = df_branch_service.drop_duplicates()
print(df_branch_service.shape)



(130653, 4)
(118276, 4)


REMOVE ALMOST EXACT DUPLICATES

In [8]:
cols = df_branch_service.columns.tolist()
cols.remove('txn_id')

for col in cols:
    observed_cols = df_branch_service.drop(col, axis=1).columns.tolist()
    df_branch_service.drop_duplicates(observed_cols, keep='first', inplace=True)

print(df_branch_service.shape)
print(df_branch_service.nunique())

(62355, 4)
txn_id         62354
branch_name        9
service            7
price              8
dtype: int64


In [9]:
df_branch_service.to_parquet('branch_service_duplicates_removed.parquet')

REMOVE NULL VALUES

In [10]:
df_branch_service = df_branch_service.dropna(subset=['branch_name'])
df_branch_service = df_branch_service.dropna(subset=['service'])
print('Null Values Dropped Currently at:',df_branch_service.shape)


Null Values Dropped Currently at: (53475, 4)


REMOVE / REPLACE MISSING VALUES

In [11]:
df_branch_service = df_branch_service.drop(df_branch_service[df_branch_service['branch_name'] == ''].index)
df_branch_service = df_branch_service.drop(df_branch_service[df_branch_service['branch_name'] == 'N/A'].index)

# conditions to fill in missing prices
serviceArray = ['Manicure', 'HairColor', 'FootSpa', 'Rebond', 'Haircut', 'NailColor', 'Pedicure']
priceArray = [55.23, 88.09, 100.12, 400.23, 66.12, 30.12, 77.99]

for i in range(len(serviceArray)):
    df_branch_service.loc[(df_branch_service['service'] == serviceArray[i]) & (df_branch_service['price'].isnull()), 'price'] = priceArray[i]


print(df_branch_service.shape)
df_branch_service.to_parquet('branch_service_nullbranch_removed.parquet')

(49785, 4)


### Formatting Values

CHANGE VALUES TO CAMELCASE

In [12]:
df_branch_service.loc[df_branch_service['branch_name'] == 'Starmall', 'branch_name'] = 'StarMall'
df_branch_service.loc[df_branch_service['branch_name'] == 'Megamall', 'branch_name'] = 'MegaMall'

ROUND  PRICES TO 2 DECIMAL PLACES

In [13]:
df_branch_service = df_branch_service.round({'price': 2})

In [14]:
df_branch_service.to_parquet('branch_service_formatted_values.parquet')

SHAUN TEST


CHECKER

In [15]:
df_branch_service.sort_values(by=['txn_id'], inplace=True, ascending=True)     
df_branch_service.head(30)

Unnamed: 0,txn_id,branch_name,service,price
105465,TXN-00000,MallOfAsia,HairColor,88.09
102747,TXN-00001,MegaMall,Haircut,66.12
83113,TXN-00005,RobinsonsMall,Haircut,66.12
88556,TXN-00006,FrankMall,Pedicure,77.99
50273,TXN-00007,SmallMall,HairColor,88.09
69179,TXN-00008,FrankMall,Manicure,55.23
99529,TXN-00009,RobinsonsMall,HairColor,88.09
42495,TXN-00010,StarMall,HairColor,88.09
71998,TXN-00012,StarMall,Rebond,400.23
82114,TXN-00013,SmallMall,Manicure,55.23
