## Buy Now, Pay Later Project
### MAST30034: Applied Data Science 
#### Notebook 2: Preprocessing Data 2: Data Aggregation

In [29]:
# create modeling spark session
from pyspark.sql import SparkSession
import pandas as pd

spark = (
    SparkSession.builder.appName('Project 2 test')
    .config('spark.sql.repl.eagerEval.enabled', True) 
    .config('spark.sql.parquet.cacheMetadata', 'true')
    .config("spark.sql.session.timeZone", "Etc/UTC")
    .config("spark.executor.memory", "8g")
    .config("spark.driver.memory", "8g")
    .getOrCreate()
)

In [30]:
full = spark.read.parquet('../data/curated/clean_full_dataset/')
full

postcode,user_id,merchant_abn,dollar_value,order_datetime,name,business_area,revenue_level,take_rate,consumer_id,state,gender,Median_age_persons,Median_mortgage_repay_monthly,Median_tot_prsnl_inc_weekly,Median_rent_weekly,Median_tot_fam_inc_weekly,Average_num_psns_per_bedroom,Median_tot_hhd_inc_weekly,Average_household_size,business_area_type,annual_turnover_percentage
3332,14,68004106739,35.066583865041444,2021-08-08,Nec Ante Ltd,"cable, satellite,...",a,5.61,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail trade,21.9
3722,15147,47663262928,94.21710419172264,2021-04-29,Eget Lacus LLP,"cable, satellite,...",a,6.66,432764,VIC,Male,47,1600,756,331,1790,0.7,1360,2.3,Retail trade,21.9
3332,14,70052129860,10.624562993647425,2021-07-25,Donec Tempus Lore...,"cable, satellite,...",b,3.37,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail trade,21.9
3722,15147,57021295485,221.6034024277662,2021-04-22,Enim Etiam Imperd...,"cable, satellite,...",a,5.75,432764,VIC,Male,47,1600,756,331,1790,0.7,1360,2.3,Retail trade,21.9
3332,14,79645157255,21.83858073111989,2021-04-16,Consectetuer Maur...,"cable, satellite,...",a,6.46,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail trade,21.9
3722,15147,21439773999,48.68734843879943,2021-04-26,Mauris Non Institute,"cable, satellite,...",a,6.1,432764,VIC,Male,47,1600,756,331,1790,0.7,1360,2.3,Retail trade,21.9
3332,14,21439773999,57.50723650600155,2021-06-01,Mauris Non Institute,"cable, satellite,...",a,6.1,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail trade,21.9
3722,15147,94472466107,21.050051462382854,2021-12-26,Eu Dolor Egestas PC,"cable, satellite,...",a,6.23,432764,VIC,Male,47,1600,756,331,1790,0.7,1360,2.3,Retail trade,21.9
3332,14,66370248931,24.001048123394057,2021-05-04,Morbi Non PC,"cable, satellite,...",b,3.15,1343547,VIC,Male,38,1733,797,350,2096,0.8,1955,2.9,Retail trade,21.9
3722,15147,21439773999,141.83408413991097,2021-12-09,Mauris Non Institute,"cable, satellite,...",a,6.1,432764,VIC,Male,47,1600,756,331,1790,0.7,1360,2.3,Retail trade,21.9


### Calculating the Gender Percentage for Consumers

Noting that the "male_percentage"/"female_percentage"/"undisclosed_percentage" here refer to the amount of transactions made by male/female/undisclosed consumers divided by the total amount of transactions. Multiple transaction records made by the same consumers are included in this calculation algorithm.

In [31]:
# count the number of comsumers by gender and merchant
gender_count_sdf = full.groupBy(["merchant_abn", "gender"]).count().sort("merchant_abn")
gender_count_sdf

                                                                                

merchant_abn,gender,count
10023283211,Undisclosed,272
10023283211,Male,1189
10023283211,Female,1215
10142254217,Undisclosed,253
10142254217,Male,1183
10142254217,Female,1070
10187291046,Female,128
10187291046,Undisclosed,33
10187291046,Male,129
10192359162,Female,146


In [32]:
# count the total number of consumers
total_count = gender_count_sdf.groupBy('merchant_abn').sum('count').sort("merchant_abn")
total_count

merchant_abn,sum(count)
10023283211,2676
10142254217,2506
10187291046,290
10192359162,320
10206519221,8043
10255988167,682
10264435225,4147
10279061213,456
10323485998,8559
10342410215,708


In [33]:
gender_count_sdf = gender_count_sdf.join(total_count, on='merchant_abn').sort("merchant_abn")
gender_count_sdf = gender_count_sdf.withColumnRenamed("sum(count)","total_transactions_count")
gender_count_sdf


                                                                                

merchant_abn,gender,count,total_transactions_count
10023283211,Undisclosed,272,2676
10023283211,Male,1189,2676
10023283211,Female,1215,2676
10142254217,Undisclosed,253,2506
10142254217,Male,1183,2506
10142254217,Female,1070,2506
10187291046,Female,128,290
10187291046,Undisclosed,33,290
10187291046,Male,129,290
10192359162,Female,146,320


In [34]:
# calculate the consumer gender percentage for each gender and for each merchants, save the percentage as "gender_percentage"
from pyspark.sql import functions as F
gender_count_sdf = gender_count_sdf.withColumn("gender_percentage", F.col("count")/F.col("total_transactions_count"))
gender_count_sdf

merchant_abn,gender,count,total_transactions_count,gender_percentage
10023283211,Undisclosed,272,2676,0.1016442451420029
10023283211,Male,1189,2676,0.4443198804185351
10023283211,Female,1215,2676,0.4540358744394618
10142254217,Undisclosed,253,2506,0.1009577015163607
10142254217,Male,1183,2506,0.4720670391061452
10142254217,Female,1070,2506,0.426975259377494
10187291046,Female,128,290,0.4413793103448276
10187291046,Undisclosed,33,290,0.1137931034482758
10187291046,Male,129,290,0.4448275862068965
10192359162,Female,146,320,0.45625


In [35]:
# separate gender percentage by male, female and undisclosed
male_percentage = gender_count_sdf.filter("gender == 'Male'").select(F.col("merchant_abn"),F.col("gender_percentage")).withColumnRenamed("gender_percentage","male_consumer_percentage")
female_percentage = gender_count_sdf.filter("gender == 'Female'").select(F.col("merchant_abn"),F.col("gender_percentage")).withColumnRenamed("gender_percentage","female_consumer_percentage")
undisclosed_percentage = gender_count_sdf.filter("gender == 'Undisclosed'").select(F.col("merchant_abn"),F.col("gender_percentage")).withColumnRenamed("gender_percentage","undisclosed_consumer_percentage")

In [36]:
# observe one of the outcome dataframe
male_percentage

                                                                                

merchant_abn,male_consumer_percentage
10023283211,0.4443198804185351
10142254217,0.4720670391061452
10187291046,0.4448275862068965
10192359162,0.459375
10206519221,0.453562103692652
10255988167,0.4486803519061583
10264435225,0.4458644803472389
10279061213,0.4342105263157895
10323485998,0.4632550531604159
10342410215,0.4477401129943503


In [37]:
# combine the 3 gender percentages together into 1 dataframe
agg_df = male_percentage.join(female_percentage, on="merchant_abn")
agg_df = agg_df.join(undisclosed_percentage, on="merchant_abn")
# add the total transaction count into aggregated dataframe and rename the column name
agg_df = agg_df.join(total_count, on="merchant_abn")
agg_df = agg_df.withColumnRenamed("sum(count)","total_transactions_count")
agg_df

                                                                                

merchant_abn,male_consumer_percentage,female_consumer_percentage,undisclosed_consumer_percentage,total_transactions_count
10023283211,0.4443198804185351,0.4540358744394618,0.1016442451420029,2676
10142254217,0.4720670391061452,0.426975259377494,0.1009577015163607,2506
10187291046,0.4448275862068965,0.4413793103448276,0.1137931034482758,290
10192359162,0.459375,0.45625,0.084375,320
10206519221,0.453562103692652,0.4482158398607485,0.0982220564465995,8043
10255988167,0.4486803519061583,0.4560117302052786,0.095307917888563,682
10264435225,0.4458644803472389,0.4528574873402459,0.101278032312515,4147
10279061213,0.4342105263157895,0.4736842105263157,0.0921052631578947,456
10323485998,0.4632550531604159,0.4303072788877205,0.1064376679518635,8559
10342410215,0.4477401129943503,0.4519774011299435,0.1002824858757062,708


In [38]:
# calculate the average 
temp = full.groupBy("merchant_abn") \
    .agg(F.mean("Median_age_persons").alias("avg_comsumer_age"), \
         F.mean("Median_tot_prsnl_inc_weekly").alias("avg_consumer_weekly_income"), \
         F.mean("Median_rent_weekly").alias("avg_comsumer_weekly_rent"), \
         F.mean("dollar_value").alias("avg_total_value"),\
     )

In [39]:
temp

                                                                                

merchant_abn,avg_comsumer_age,avg_consumer_weekly_income,avg_comsumer_weekly_rent,avg_total_value
19839532017,43.40066225165563,942.0645695364238,307.20364238410593,157.0
38700038932,43.48916936721815,824.3348115299335,301.4045710387174,1344.882392211653
57798993346,45.11764705882353,941.1764705882352,335.11764705882354,968.0714002697254
45339304653,43.78842105263158,797.061052631579,294.7052631578948,80.71460424560924
51561881468,43.61951219512195,784.0390243902439,294.2829268292683,44.42169968085328
66610548417,44.0,768.1141304347826,307.3097826086956,921.2551840167838
98545158925,43.553484036355165,812.2920065252855,302.18433931484503,38.80709025200576
46331355995,43.15366289458011,857.8147706968433,303.5092316855271,446.212715726676
90568944804,43.4614516311096,815.516198216503,301.28027994130264,897.483749240815
43719937438,43.21381847914963,797.8556827473426,300.1749795584628,99.17411864759902


In [40]:
agg_df = agg_df.join(temp, on="merchant_abn")

In [41]:
temp2 = full.select("merchant_abn","name", "business_area", "revenue_level", "take_rate", "annual_turnover_percentage")
temp2

merchant_abn,name,business_area,revenue_level,take_rate,annual_turnover_percentage
68004106739,Nec Ante Ltd,"cable, satellite,...",a,5.61,21.9
47663262928,Eget Lacus LLP,"cable, satellite,...",a,6.66,21.9
70052129860,Donec Tempus Lore...,"cable, satellite,...",b,3.37,21.9
57021295485,Enim Etiam Imperd...,"cable, satellite,...",a,5.75,21.9
79645157255,Consectetuer Maur...,"cable, satellite,...",a,6.46,21.9
21439773999,Mauris Non Institute,"cable, satellite,...",a,6.1,21.9
21439773999,Mauris Non Institute,"cable, satellite,...",a,6.1,21.9
94472466107,Eu Dolor Egestas PC,"cable, satellite,...",a,6.23,21.9
66370248931,Morbi Non PC,"cable, satellite,...",b,3.15,21.9
21439773999,Mauris Non Institute,"cable, satellite,...",a,6.1,21.9


In [42]:
agg_df = agg_df.join(temp2, on="merchant_abn")


In [43]:
agg_df = agg_df.distinct()
agg_df 

                                                                                

merchant_abn,male_consumer_percentage,female_consumer_percentage,undisclosed_consumer_percentage,total_transactions_count,avg_comsumer_age,avg_consumer_weekly_income,avg_comsumer_weekly_rent,avg_total_value,name,business_area,revenue_level,take_rate,annual_turnover_percentage
10023283211,0.4443198804185351,0.4540358744394618,0.1016442451420029,2676,43.45104633781764,788.5007473841555,298.20216741405085,216.00966289621505,Felis Limited,"furniture, home f...",e,0.18,17.0
10342410215,0.4477401129943503,0.4519774011299435,0.1002824858757062,708,43.59039548022599,783.3333333333334,305.15677966101697,380.3336273255781,Facilisis Facilis...,"computers, comput...",a,6.34,21.9
10385163239,0.4603174603174603,0.4444444444444444,0.0952380952380952,63,41.96825396825397,805.4761904761905,321.5873015873016,350.60472561042235,Sed Et Company,florists supplies...,a,6.61,21.9
10648956813,0.4522796352583587,0.4437137330754352,0.1040066316662061,18095,43.596684166896935,852.708040895275,300.661619231832,64.57669527052369,Proin Nisl Institute,"computers, comput...",a,6.66,21.9
10714068705,0.462478184991274,0.4317626527050611,0.1057591623036649,2865,43.59581151832461,790.0024432809773,302.73228621291446,125.69312268344446,Sollicitudin Comm...,"furniture, home f...",c,2.51,17.0
11024352823,0.4441489361702128,0.4441489361702128,0.1117021276595744,376,43.56117021276596,776.75,304.156914893617,214.44147093989704,Aliquet Metus Urn...,antique shops - s...,c,2.62,21.9
11076688542,0.4545454545454545,0.2954545454545454,0.25,44,43.04545454545455,794.0909090909091,313.72727272727275,698.1462248423107,Amet Diam Industries,"stationery, offic...",c,2.52,21.9
11243046390,0.4559585492227979,0.4352331606217616,0.1088082901554404,386,43.24870466321244,780.6709844559585,291.0854922279793,245.76489261479708,Ornare Placerat F...,"books, periodical...",c,1.94,21.9
11633090957,0.4596491228070175,0.4421052631578947,0.0982456140350877,285,44.02105263157895,780.2385964912281,308.5543859649123,149.43110458918156,Cum Sociis Incorp...,lawn and garden s...,b,4.0,21.9
11788487195,0.437125748502994,0.4910179640718562,0.0718562874251497,167,43.952095808383234,805.1736526946107,285.5149700598802,1105.417010503279,Id Magna LLP,music shops - mus...,b,3.51,21.9


### Calculate AP_rate 

AP_rate of a merchant is the percentage of existing customers of this merchant who will potentially uses BNPL(Afterpay) service, indicating the amount of potential consumers that this merchant can bring after cooperation. 
This feature will be calculated based on the consumer gender, the consumer age and the consumer income/rent.

It is observed that the average age is about the same for all merchants so we decide to remove the age feature. 

In [44]:
agg_df = agg_df.drop("avg_comsumer_age")

In [45]:
agg_df

                                                                                

merchant_abn,male_consumer_percentage,female_consumer_percentage,undisclosed_consumer_percentage,total_transactions_count,avg_consumer_weekly_income,avg_comsumer_weekly_rent,avg_total_value,name,business_area,revenue_level,take_rate,annual_turnover_percentage
10023283211,0.4443198804185351,0.4540358744394618,0.1016442451420029,2676,788.5007473841555,298.20216741405085,216.00966289621505,Felis Limited,"furniture, home f...",e,0.18,17.0
10342410215,0.4477401129943503,0.4519774011299435,0.1002824858757062,708,783.3333333333334,305.15677966101697,380.3336273255781,Facilisis Facilis...,"computers, comput...",a,6.34,21.9
10385163239,0.4603174603174603,0.4444444444444444,0.0952380952380952,63,805.4761904761905,321.5873015873016,350.60472561042235,Sed Et Company,florists supplies...,a,6.61,21.9
10648956813,0.4522796352583587,0.4437137330754352,0.1040066316662061,18095,852.708040895275,300.661619231832,64.57669527052369,Proin Nisl Institute,"computers, comput...",a,6.66,21.9
10714068705,0.462478184991274,0.4317626527050611,0.1057591623036649,2865,790.0024432809773,302.73228621291446,125.69312268344446,Sollicitudin Comm...,"furniture, home f...",c,2.51,17.0
11024352823,0.4441489361702128,0.4441489361702128,0.1117021276595744,376,776.75,304.156914893617,214.44147093989704,Aliquet Metus Urn...,antique shops - s...,c,2.62,21.9
11076688542,0.4545454545454545,0.2954545454545454,0.25,44,794.0909090909091,313.72727272727275,698.1462248423107,Amet Diam Industries,"stationery, offic...",c,2.52,21.9
11243046390,0.4559585492227979,0.4352331606217616,0.1088082901554404,386,780.6709844559585,291.0854922279793,245.76489261479708,Ornare Placerat F...,"books, periodical...",c,1.94,21.9
11633090957,0.4596491228070175,0.4421052631578947,0.0982456140350877,285,780.2385964912281,308.5543859649123,149.43110458918156,Cum Sociis Incorp...,lawn and garden s...,b,4.0,21.9
11788487195,0.437125748502994,0.4910179640718562,0.0718562874251497,167,805.1736526946107,285.5149700598802,1105.417010503279,Id Magna LLP,music shops - mus...,b,3.51,21.9


##### AP_rate: income/rent

We suggests that the probablity of a consumer uses BNPL or not might depend on the spare money he/she owns. If a person has less spare money, then she/he is more likely to use BNPL service. We compute the 'spare_money' by "weekly_income"-"weekly_rent" as an approximation.

In [46]:
agg_df = agg_df.withColumn('avg_consumer_weekly_spare_money', F.col('avg_consumer_weekly_income') - F.col('avg_comsumer_weekly_rent'))
# remove income and rent
agg_df = agg_df.drop('avg_consumer_weekly_income')
agg_df = agg_df.drop('avg_comsumer_weekly_rent')

In [47]:
agg_df

                                                                                

merchant_abn,male_consumer_percentage,female_consumer_percentage,undisclosed_consumer_percentage,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,annual_turnover_percentage,avg_consumer_weekly_spare_money
10023283211,0.4443198804185351,0.4540358744394618,0.1016442451420029,2676,216.00966289621505,Felis Limited,"furniture, home f...",e,0.18,17.0,490.29857997010464
10342410215,0.4477401129943503,0.4519774011299435,0.1002824858757062,708,380.3336273255781,Facilisis Facilis...,"computers, comput...",a,6.34,21.9,478.1765536723164
10385163239,0.4603174603174603,0.4444444444444444,0.0952380952380952,63,350.60472561042235,Sed Et Company,florists supplies...,a,6.61,21.9,483.8888888888889
10648956813,0.4522796352583587,0.4437137330754352,0.1040066316662061,18095,64.57669527052369,Proin Nisl Institute,"computers, comput...",a,6.66,21.9,552.046421663443
10714068705,0.462478184991274,0.4317626527050611,0.1057591623036649,2865,125.69312268344446,Sollicitudin Comm...,"furniture, home f...",c,2.51,17.0,487.2701570680629
11024352823,0.4441489361702128,0.4441489361702128,0.1117021276595744,376,214.44147093989704,Aliquet Metus Urn...,antique shops - s...,c,2.62,21.9,472.593085106383
11076688542,0.4545454545454545,0.2954545454545454,0.25,44,698.1462248423107,Amet Diam Industries,"stationery, offic...",c,2.52,21.9,480.3636363636364
11243046390,0.4559585492227979,0.4352331606217616,0.1088082901554404,386,245.76489261479708,Ornare Placerat F...,"books, periodical...",c,1.94,21.9,489.5854922279793
11633090957,0.4596491228070175,0.4421052631578947,0.0982456140350877,285,149.43110458918156,Cum Sociis Incorp...,lawn and garden s...,b,4.0,21.9,471.68421052631584
11788487195,0.437125748502994,0.4910179640718562,0.0718562874251497,167,1105.417010503279,Id Magna LLP,music shops - mus...,b,3.51,21.9,519.6586826347304


Now we need to standardise the weekly spare money with mean 0. We uses mean=0 because after standardisation, we need to invert the sign as the poor people is more likely to use BNPL so they should have a higher AP_rate, whereas the richers should have a lower rate.

In [48]:
spare_money_list = agg_df.select('avg_consumer_weekly_spare_money').rdd.flatMap(lambda x: x).collect()

                                                                                

In [49]:
from sklearn import preprocessing
# normal standardise the spare money and reverse the signs
scaled = preprocessing.scale(spare_money_list) * -1
scaled = scaled.tolist()
# convert to pandas and append list as new column
agg_df_pandas = agg_df.toPandas()
agg_df_pandas['consumer_scaled_spare_money'] = scaled

                                                                                

In [50]:
# convert back to pyspark
agg_df = spark.createDataFrame(agg_df_pandas) 
agg_df = agg_df.drop("avg_consumer_weekly_spare_money")

In [51]:
agg_df

merchant_abn,male_consumer_percentage,female_consumer_percentage,undisclosed_consumer_percentage,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,annual_turnover_percentage,consumer_scaled_spare_money
10023283211,0.4443198804185351,0.4540358744394618,0.1016442451420029,2676,216.00966289621505,Felis Limited,"furniture, home f...",e,0.18,17.0,0.1933527945132402
10342410215,0.4477401129943503,0.4519774011299435,0.1002824858757062,708,380.3336273255781,Facilisis Facilis...,"computers, comput...",a,6.34,21.9,0.27941998039967986
10385163239,0.4603174603174603,0.4444444444444444,0.0952380952380952,63,350.60472561042235,Sed Et Company,florists supplies...,a,6.61,21.9,0.23886202376332882
10648956813,0.4522796352583587,0.4437137330754352,0.1040066316662061,18095,64.57669527052369,Proin Nisl Institute,"computers, comput...",a,6.66,21.9,-0.24506095237630954
10714068705,0.462478184991274,0.4317626527050611,0.1057591623036649,2865,125.69312268344446,Sollicitudin Comm...,"furniture, home f...",c,2.51,17.0,0.2148547967734017
11024352823,0.4441489361702128,0.4441489361702128,0.1117021276595744,376,214.44147093989704,Aliquet Metus Urn...,antique shops - s...,c,2.62,21.9,0.31906297532066624
11076688542,0.4545454545454545,0.2954545454545454,0.25,44,698.1462248423107,Amet Diam Industries,"stationery, offic...",c,2.52,21.9,0.2638915491029093
11243046390,0.4559585492227979,0.4352331606217616,0.1088082901554404,386,245.76489261479708,Ornare Placerat F...,"books, periodical...",c,1.94,21.9,0.1984157644942609
11633090957,0.4596491228070175,0.4421052631578947,0.0982456140350877,285,149.43110458918156,Cum Sociis Incorp...,lawn and garden s...,b,4.0,21.9,0.3255160447587661
11788487195,0.437125748502994,0.4910179640718562,0.0718562874251497,167,1105.417010503279,Id Magna LLP,music shops - mus...,b,3.51,21.9,-0.01510587003712...


##### AP_rate: gender

Now we have "male_consumer_percentage", "female_consumer_percentage" and "undisclosed_consumer_percentage" amongst all the existing consumers for each merchants. 

According to a research from Roy Morgan, women are significantly more likely to use buy-now-pay-later payment services, over one-in-ten women (11.6%) report using a buy-now-pay-later service in the last year compared to only 5.5% of men. (https://www.roymorgan.com/findings/women-more-likely-to-use-buy-now-pay-later-services). Therefore, it is reasonable that we believe that the AP_rate also depends on gender. 

Now we have 
1. "male_consumer_percentage", "female_consumer_percentage" and "undisclosed_consumer_percentage" amongst all the existing consumers for each merchants
2. "male_ap_percentage"= 5.5 %, "female_ap_percentage" = 11.6%
3. the mean of female and male afterpay percentage would be used as the percentage for undisclosed gender, so "undisclosed_ap_percentage" = (5.5+11.6)/2 = 8.55%

We calculate the percentage of existing consumers using BNPL for each merchant using the following formula: 
ap_percentage_by_gender = male_consumer_percentage * male_ap_percentage + female_consumer_percentage * female_ap_percentage + disclosed_consumer_percentage * disclosed_ap_percentage.

In [52]:
import numpy as np
male_ap_percentage = 0.055
female_ap_percentage = 0.116
undisclosed_ap_percentage = np.mean([male_ap_percentage, female_ap_percentage])

In [53]:
gender = agg_df.select("male_consumer_percentage", "female_consumer_percentage","undisclosed_consumer_percentage")
gender = gender.withColumn("ap_percentage_by_gender", F.col("male_consumer_percentage")*male_ap_percentage + F.col("female_consumer_percentage")*female_ap_percentage + F.col("undisclosed_consumer_percentage")*undisclosed_ap_percentage)

In [54]:
agg_df.join(gender, on=["male_consumer_percentage", "female_consumer_percentage","undisclosed_consumer_percentage"]).drop("male_consumer_percentage", "female_consumer_percentage", "undisclosed_consumer_percentage")

                                                                                

merchant_abn,total_transactions_count,avg_total_value,name,business_area,revenue_level,take_rate,annual_turnover_percentage,consumer_scaled_spare_money,ap_percentage_by_gender
21532935983,3443,38.66947142585576,Eleifend Nec Inco...,"cable, satellite,...",a,5.58,21.9,0.0182907208117619,0.0849064769096718
55501929396,6314,187.68267696817625,Mauris Sagittis C...,health and beauty...,b,3.5,2.4,0.0996724442744066,0.0853502534051314
37621714049,891,681.8839848218494,Ut Sem Company,"hobby, toy and ga...",a,5.9,21.9,-0.5407985266186767,0.0845072951739618
93110588804,17,11965.797133836164,Aenean Sed Pede C...,"jewelry, watch, c...",b,3.86,21.9,0.3378917014679146,0.0890882352941176
78916025936,51,336.2989263016063,Urna Nec Corporation,florists supplies...,e,0.37,21.9,0.0911992147399022,0.0890882352941176
25078409316,3045,146.79002731620812,Vel LLP,florists supplies...,a,6.6,21.9,0.1139140764833553,0.0842779967159277
32897338221,160,77.14942422013578,Vel Nisl Incorpor...,"gift, card, novel...",a,6.2,21.9,0.3423553458560171,0.08855
46017523620,256,501.3433974296067,Lorem Donec Eleme...,health and beauty...,c,2.79,2.4,0.1489062839703425,0.0869296874999999
69655310165,107,1054.232169868249,Elit Pharetra Cor...,"hobby, toy and ga...",b,3.21,21.9,0.0590452969717677,0.0783738317757009
27093785141,21319,376.5372786372577,Placerat Orci Ins...,"stationery, offic...",c,2.73,21.9,-0.054786025007017,0.085072236033585


In [55]:
agg_df.count()

3782

#### AP_rate: combining "gender" and "scaled spare money"