## Buy Now, Pay Later Project
### MAST30034: Applied Data Science 
#### Notebook 2: Preprocessing Data 2: Data Aggregation

In [2]:
# create modeling spark session
from pyspark.sql import SparkSession
import pandas as pd

# Spark settings for this notebook, applied to the builder in the order listed.
# eagerEval lets a bare DataFrame expression at the end of a cell render as a table.
SPARK_CONF = (
    ('spark.sql.repl.eagerEval.enabled', True),
    ('spark.sql.parquet.cacheMetadata', 'true'),
    ("spark.sql.session.timeZone", "Etc/UTC"),
    ("spark.executor.memory", "8g"),
    ("spark.driver.memory", "8g"),
)

session_builder = SparkSession.builder.appName('Project 2 test')
for conf_key, conf_value in SPARK_CONF:
    session_builder = session_builder.config(conf_key, conf_value)

# Reuses an already-running session if one exists, otherwise starts a new one.
spark = session_builder.getOrCreate()

In [3]:
# Load the curated, cleaned dataset (parquet directory) produced upstream.
full = spark.read.format("parquet").load('../data/curated/clean_full_dataset/')
full

postcode,user_id,merchant_abn,dollar_value,order_datetime,name,business_area,revenue_level,take_rate,consumer_id,state,gender,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size,business_area_type
3332,14,68004106739,35.066583865041444,2021-08-08,Nec Ante Ltd,"cable, satellite,...",a,5.61,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail_trade
3432,4943,67609108741,87.09715299772967,2022-03-05,Metus Sit Amet In...,"cable, satellite,...",e,0.38,533495,VIC,Male,51,2000,791,550,2374,0.8,1792,2.7,Retail_trade
3332,14,70052129860,10.624562993647425,2021-07-25,Donec Tempus Lore...,"cable, satellite,...",b,3.37,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail_trade
3432,4943,19237425345,36.91374841278192,2022-03-08,A Scelerisque Ass...,"cable, satellite,...",c,2.04,533495,VIC,Male,51,2000,791,550,2374,0.8,1792,2.7,Retail_trade
3332,14,79645157255,21.83858073111989,2021-04-16,Consectetuer Maur...,"cable, satellite,...",a,6.46,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail_trade
3432,4943,67979471799,9.354001353615072,2022-03-16,Laoreet Ipsum Corp.,"cable, satellite,...",d,1.04,533495,VIC,Male,51,2000,791,550,2374,0.8,1792,2.7,Retail_trade
3332,14,21439773999,57.50723650600155,2021-06-01,Mauris Non Institute,"cable, satellite,...",a,6.1,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail_trade
3432,4943,68501926042,72.64904714060505,2022-03-23,Dolor Corp.,"cable, satellite,...",a,5.78,533495,VIC,Male,51,2000,791,550,2374,0.8,1792,2.7,Retail_trade
3332,14,66370248931,24.001048123394057,2021-05-04,Morbi Non PC,"cable, satellite,...",b,3.15,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail_trade
2082,4949,44454840859,34.990386491563186,2021-08-27,Erat Eget Ipsum PC,"cable, satellite,...",a,5.62,801928,NSW,Undisclosed,42,2500,977,580,2804,0.8,2449,2.9,Retail_trade


### Calculating the Gender Percentage for Consumers

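Note that the "male_consumer_percentage"/"female_consumer_percentage"/"undisclosed_consumer_percentage" columns computed below are, for each merchant, the number of transactions made by male/female/undisclosed consumers divided by that merchant's total number of transactions. The values are stored as fractions between 0 and 1. Because the calculation is transaction-based, a consumer who made several transactions with the same merchant is counted once per transaction, not once per person.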

In [4]:
# Count rows (transaction records) per merchant and consumer gender.
# This is a transaction count, not a count of unique consumers.
gender_count_sdf = (
    full
    .groupBy("merchant_abn", "gender")
    .count()
    .orderBy("merchant_abn")
)
gender_count_sdf

merchant_abn,gender,count
10023283211,Undisclosed,272
10023283211,Male,1189
10023283211,Female,1215
10142254217,Undisclosed,253
10142254217,Male,1183
10142254217,Female,1070
10165489824,Female,2
10165489824,Undisclosed,1
10165489824,Male,1
10187291046,Undisclosed,33


In [5]:
# Total number of transactions per merchant: sum the per-gender counts.
# The aggregated column keeps Spark's default name "sum(count)".
total_count = (
    gender_count_sdf
    .groupBy('merchant_abn')
    .sum('count')
    .orderBy("merchant_abn")
)
total_count

merchant_abn,sum(count)
10023283211,2676
10142254217,2506
10165489824,4
10187291046,290
10192359162,320
10206519221,8043
10255988167,682
10264435225,4147
10279061213,456
10323485998,8559


In [6]:
# Attach each merchant's total transaction count to its per-gender rows.
# The total is renamed before the join so the joined frame already has its final column name.
merchant_totals = total_count.withColumnRenamed("sum(count)", "total_transactions_count")
gender_count_sdf = (
    gender_count_sdf
    .join(merchant_totals, on='merchant_abn')
    .orderBy("merchant_abn")
)
gender_count_sdf


merchant_abn,gender,count,total_transactions_count
10023283211,Undisclosed,272,2676
10023283211,Male,1189,2676
10023283211,Female,1215,2676
10142254217,Undisclosed,253,2506
10142254217,Male,1183,2506
10142254217,Female,1070,2506
10165489824,Female,2,4
10165489824,Undisclosed,1,4
10165489824,Male,1,4
10187291046,Undisclosed,33,290


In [7]:
# For every (merchant, gender) row, compute that gender's share of the merchant's
# transactions. Despite the column name, "gender_percentage" is a fraction in [0, 1].
from pyspark.sql import functions as F

transaction_share = F.col("count") / F.col("total_transactions_count")
gender_count_sdf = gender_count_sdf.withColumn("gender_percentage", transaction_share)
gender_count_sdf

merchant_abn,gender,count,total_transactions_count,gender_percentage
10023283211,Undisclosed,272,2676,0.1016442451420029
10023283211,Male,1189,2676,0.4443198804185351
10023283211,Female,1215,2676,0.4540358744394618
10142254217,Undisclosed,253,2506,0.1009577015163607
10142254217,Male,1183,2506,0.4720670391061452
10142254217,Female,1070,2506,0.426975259377494
10165489824,Female,2,4,0.5
10165489824,Undisclosed,1,4,0.25
10165489824,Male,1,4,0.25
10187291046,Undisclosed,33,290,0.1137931034482758


In [8]:
# Split the long per-gender table into one two-column frame per gender
# (merchant_abn plus that gender's share under a gender-specific column name).
male_percentage = (
    gender_count_sdf
    .filter("gender == 'Male'")
    .select("merchant_abn", F.col("gender_percentage").alias("male_consumer_percentage"))
)
female_percentage = (
    gender_count_sdf
    .filter("gender == 'Female'")
    .select("merchant_abn", F.col("gender_percentage").alias("female_consumer_percentage"))
)
undisclosed_percentage = (
    gender_count_sdf
    .filter("gender == 'Undisclosed'")
    .select("merchant_abn", F.col("gender_percentage").alias("undisclosed_consumer_percentage"))
)

In [9]:
# Sanity check: display one of the three per-gender frames. It should hold
# merchant_abn and male_consumer_percentage only.
male_percentage

merchant_abn,male_consumer_percentage
10023283211,0.4443198804185351
10142254217,0.4720670391061452
10165489824,0.25
10187291046,0.4448275862068965
10192359162,0.459375
10206519221,0.453562103692652
10255988167,0.4486803519061583
10264435225,0.4458644803472389
10279061213,0.4342105263157895
10323485998,0.4632550531604159


In [10]:
# Combine the three per-gender shares and the total transaction count into one
# row per merchant.
#
# Start from total_count (which has every merchant) and LEFT-join each gender
# frame. Inner joins would silently drop any merchant that has no transactions
# for one of the gender categories, since that merchant is absent from the
# corresponding per-gender frame. A missing share means "no such transactions",
# so it is filled with 0.0.
percentage_cols = [
    "male_consumer_percentage",
    "female_consumer_percentage",
    "undisclosed_consumer_percentage",
]

agg_df = (
    total_count
    .withColumnRenamed("sum(count)", "total_transactions_count")
    .join(male_percentage, on="merchant_abn", how="left")
    .join(female_percentage, on="merchant_abn", how="left")
    .join(undisclosed_percentage, on="merchant_abn", how="left")
    .fillna(0.0, subset=percentage_cols)
    # Keep the column order downstream code and outputs already rely on.
    .select("merchant_abn", *percentage_cols, "total_transactions_count")
)
agg_df

merchant_abn,male_consumer_percentage,female_consumer_percentage,undisclosed_consumer_percentage,total_transactions_count
10023283211,0.4443198804185351,0.4540358744394618,0.1016442451420029,2676
10142254217,0.4720670391061452,0.426975259377494,0.1009577015163607,2506
10165489824,0.25,0.5,0.25,4
10187291046,0.4448275862068965,0.4413793103448276,0.1137931034482758,290
10192359162,0.459375,0.45625,0.084375,320
10206519221,0.453562103692652,0.4482158398607485,0.0982220564465995,8043
10255988167,0.4486803519061583,0.4560117302052786,0.095307917888563,682
10264435225,0.4458644803472389,0.4528574873402459,0.101278032312515,4147
10279061213,0.4342105263157895,0.4736842105263157,0.0921052631578947,456
10323485998,0.4632550531604159,0.4303072788877205,0.1064376679518635,8559


In [11]:
# Preview (first 10 merchants) of the per-merchant mean of the postcode-level
# median age, median weekly personal income and median weekly rent, averaged
# over that merchant's transaction rows. The result is displayed, not stored.
(
    full
    .groupBy("merchant_abn")
    .agg(
        *[
            F.mean(source_col).alias(output_col)
            for source_col, output_col in (
                ("Median_age_persons", "ave_age"),
                ("Median_tot_prsnl_inc_weekly", "ave_income"),
                ("Median_rent_weekly", "ave_rent"),
            )
        ]
    )
    .limit(10)
)

merchant_abn,ave_age,ave_income,ave_rent
19839532017,43.40066225165563,942.0645695364238,307.20364238410593
38700038932,43.48916936721815,824.3348115299335,301.4045710387174
57798993346,45.11764705882353,941.1764705882352,335.11764705882354
45339304653,43.78842105263158,797.061052631579,294.7052631578948
51561881468,43.61951219512195,784.0390243902439,294.2829268292683
66610548417,44.0,768.1141304347826,307.3097826086956
98545158925,43.553484036355165,812.2920065252855,302.18433931484503
46331355995,43.15366289458011,857.8147706968433,303.5092316855271
90568944804,43.4614516311096,815.516198216503,301.28027994130264
43719937438,43.21381847914963,797.8556827473426,300.1749795584628
