In [78]:
import sys, os, time
sys.path.insert(1, '../../scripts/')
from s3_support import *

import pandas as pd

# load data

## forms

In [79]:
# Fetch id, org, type and template for every row of the `form` table.
q = 'select id, org, type, template from form'
forms = redshift_query_read(q, schema='production')

In [80]:
# Distinct form type codes present in the form table.
forms.loc[:, 'type'].unique()

array([1, 2, 3, 4, 5])

## transactions

In [81]:
# Load transactions with status 'A' from year 2023 onward. All columns are
# selected here; the ones not needed for the analysis are dropped in a later
# cell.
# NOTE(review): `redshift_query_read` comes from the star import of
# s3_support; it presumably returns a pandas DataFrame -- confirm.
q = '''select * from transactions 
        where 
            status='A' and
            year>=2023'''
trans = redshift_query_read(q, schema='production')

In [82]:
# Preview the first three transaction rows.
trans.iloc[:3]

Unnamed: 0,id,org,form,status,amount,donations_amt,purchases_amt,events_amt,registrations_amt,events_tickets,...,is_fraud,channel,useragent,gift_assist_count,gift_assist_amt,qgiv_fee,platform,recurring_origin,is_new_form,isexpresscheckout
0,18705059,437286,867448,A,25.0,25.0,0.0,0.0,0.0,0,...,False,0,,0,0.0,0.0,,0,0,False
1,18706899,436247,928464,A,19.0,19.0,0.0,0.0,0.0,0,...,False,0,,0,0.0,0.0,,0,0,False
2,18706291,436247,964981,A,100.0,100.0,0.0,0.0,0.0,0,...,False,0,,0,0.0,0.0,,0,0,False


In [83]:
# Columns that are not used by the per-form aggregation that follows.
drop_cols = ['id', 'status', 'source_id', 'hour', 'day',
             'month', 'year', 'zip', 'state', 'email',
             'is_fraud']
# `trans` is rebound to the trimmed frame, so on a second run of this cell the
# columns are already gone and a strict drop would raise KeyError.
# errors='ignore' keeps the cell safe to re-run; the trade-off is that a
# misspelled column name in `drop_cols` is silently skipped.
trans = trans.drop(columns=drop_cols, errors='ignore')

In [84]:
# Per-transaction columns that are summed up to one value per form.
# NOTE(review): `recurring` and `isexpresscheckout` are summed like the
# *_count columns; confirm both are 0/1 flags or counts rather than ids.
sum_cols = [
    'donations_count', 'purchases_count', 'events_count',
    'registrations_count', 'recurring', 'matchinggifts_count',
    'smspledge_count', 'auctionpurchase_count',
    'gift_assist_count', 'isexpresscheckout',
]

# One groupby object shared by both aggregations.
trans_by_form = trans.groupby('form')

# Feature totals per form.
feat_grpd = trans_by_form[sum_cols].sum().reset_index()
# Number of non-null qgiv_fee rows ('count') and total fee ('sum') per form.
form_totals = trans_by_form['qgiv_fee'].agg(['count', 'sum']).reset_index()

In [85]:
# Give the generic aggregate columns descriptive names:
# 'count' -> 'fee_count', 'sum' -> 'fee_vol'.
form_totals = form_totals.rename(columns={'count': 'fee_count', 'sum': 'fee_vol'})

# Attach the fee totals to the per-form feature sums (inner join on form).
feat_grpd = feat_grpd.merge(form_totals, on='form')

# analysis

## correlation

In [86]:
# Turn every per-form total into a boolean flag: True when the total exceeds 1.
# NOTE(review): with `> 1` a form whose total for a feature is exactly 1 is
# flagged False; confirm that `> 0` was not intended.
flag_grpd = feat_grpd.gt(1)

# The comparison also converted the key and the fee measures to booleans;
# restore their original values.
for passthrough_col in ('form', 'fee_vol', 'fee_count'):
    flag_grpd[passthrough_col] = feat_grpd[passthrough_col]

# Attach each form's type code (inner join on form id).
form_types = forms[['id', 'type']]
flag_grpd = flag_grpd.merge(form_types, left_on='form', right_on='id').drop(columns='id')
flag_grpd.head()

Unnamed: 0,form,donations_count,purchases_count,events_count,registrations_count,recurring,matchinggifts_count,smspledge_count,auctionpurchase_count,gift_assist_count,isexpresscheckout,fee_count,fee_vol,type
0,1,True,False,True,False,True,False,False,False,True,False,2892,15650.62,1
1,3,True,False,True,False,True,False,False,False,True,False,657,5795.78,1
2,9,False,False,False,False,True,False,False,False,False,False,1,5.85,1
3,11,True,False,False,False,True,False,False,False,False,False,55,806.02,1
4,14,True,False,False,False,True,False,False,False,True,False,293,3233.37,1


In [87]:
# Correlation of every remaining column with the two fee measures; the form
# key and the type code are excluded first.
feature_corr = flag_grpd.drop(columns=['form', 'type']).corr()
feature_corr[['fee_count', 'fee_vol']]

Unnamed: 0,fee_count,fee_vol
donations_count,0.040582,0.064206
purchases_count,0.006991,0.032437
events_count,-0.005651,0.026024
registrations_count,0.021603,0.03595
recurring,0.071783,0.072329
matchinggifts_count,,
smspledge_count,,
auctionpurchase_count,-0.000992,0.049088
gift_assist_count,0.032311,0.066992
isexpresscheckout,0.026939,0.035579


## form types

In [95]:
# Mean and median fee volume across all forms, as a baseline for the
# per-type breakdown.
fee_vol_all_forms = flag_grpd['fee_vol']
fee_vol_all_forms.agg(['mean', 'median'])

mean      679.199206
median     71.450000
Name: fee_vol, dtype: float64

In [88]:
# Mean and median fee volume, and number of forms, per form type.
fee_vol_by_type = flag_grpd.groupby('type')['fee_vol']
fee_vol_by_type.agg(['mean', 'median', 'count']).reset_index()

Unnamed: 0,type,mean,median,count
0,1,602.747516,54.66,23527
1,3,883.505179,179.855,3524
2,5,1512.819498,760.975,1294


## feature averages

In [89]:
# Feature-flag columns: everything except the key, the fee measures and type.
non_feature_cols = ('form', 'fee_count', 'fee_vol', 'type')
grp_cols = [col for col in flag_grpd.columns if col not in non_feature_cols]

# Total fee count and fee volume for every distinct combination of flags.
fee_totals_by_combo = flag_grpd.groupby(grp_cols)[['fee_count', 'fee_vol']].sum()
fee_groups = fee_totals_by_combo.reset_index()

# NOTE: despite the column name, the denominator is fee_count (the number of
# qgiv_fee rows), so this is fee volume per counted fee row, not per form.
fee_groups['fee_vol_per_form'] = fee_groups['fee_vol'].div(fee_groups['fee_count'])

In [90]:
# For each feature flag, summarise fee volume over the forms where it is True.
d = []
for c in grp_cols:
    # Filter once per feature and reuse the subset for all three statistics.
    forms_with_feature = flag_grpd[flag_grpd[c]]
    feature_fee_vol = forms_with_feature['fee_vol']
    d.append({
        'feature': c.replace('_count', ''),
        'count': len(forms_with_feature['form'].unique()),
        'mean fees': feature_fee_vol.mean(),
        'median fees': feature_fee_vol.median()
    })
pd.DataFrame(d).sort_values('median fees', ascending=False)

Unnamed: 0,feature,count,mean fees,median fees
7,auctionpurchase,963,1872.773666,1082.08
1,purchases,619,1669.103473,850.1
3,registrations,2288,1232.402832,408.335
9,isexpresscheckout,625,1759.663408,246.16
2,events,6863,889.150396,204.37
8,gift_assist,14312,981.683266,193.495
0,donations,19966,868.861195,116.35
4,recurring,9603,1139.957201,107.83
5,matchinggifts,0,,
6,smspledge,0,,


In [91]:
# Flag combinations in which express checkout is set.
# The column is named explicitly instead of reusing a loop variable left over
# from an earlier cell, so this cell does not depend on that cell's leftover
# state.
fee_groups[fee_groups['isexpresscheckout']]

Unnamed: 0,donations_count,purchases_count,events_count,registrations_count,recurring,matchinggifts_count,smspledge_count,auctionpurchase_count,gift_assist_count,isexpresscheckout,fee_count,fee_vol,fee_vol_per_form
5,False,False,False,False,True,False,False,False,True,True,56,153.03,2.732679
21,True,False,False,False,False,False,False,False,False,True,2783,9027.0,3.243622
23,True,False,False,False,False,False,False,False,True,True,5793,43591.24,7.524813
26,True,False,False,False,True,False,False,False,False,True,45089,66919.94,1.484174
28,True,False,False,False,True,False,False,False,True,True,248671,773987.72,3.112497
35,True,False,True,False,False,False,False,False,True,True,4944,42117.28,8.518867
39,True,False,True,False,True,False,False,False,False,True,8044,56238.24,6.991328
41,True,False,True,False,True,False,False,False,True,True,24245,107755.18,4.444429


In [92]:
# Ten flag combinations with the highest fee_vol_per_form, ignoring
# combinations whose total fee_count is 10 or fewer.
has_enough_fees = fee_groups['fee_count'].gt(10)
(
    fee_groups
    .loc[has_enough_fees]
    .sort_values('fee_vol_per_form', ascending=False)
    .head(10)
)

Unnamed: 0,donations_count,purchases_count,events_count,registrations_count,recurring,matchinggifts_count,smspledge_count,auctionpurchase_count,gift_assist_count,isexpresscheckout,fee_count,fee_vol,fee_vol_per_form
42,True,True,False,False,False,False,False,False,False,False,1105,30337.57,27.454814
43,True,True,False,False,False,False,False,False,True,False,2164,39781.15,18.383156
18,False,True,True,False,False,False,False,True,False,False,857,13669.18,15.950035
17,False,True,True,False,False,False,False,False,True,False,32,509.13,15.910312
37,True,False,True,False,False,False,False,True,True,False,82556,1032829.08,12.510648
50,True,True,True,False,False,False,False,False,True,False,470,5233.76,11.13566
52,True,True,True,False,False,False,False,True,True,False,35851,374922.66,10.457802
51,True,True,True,False,False,False,False,True,False,False,4282,44327.31,10.352011
36,True,False,True,False,False,False,False,True,False,False,14785,147542.3,9.979188
0,False,False,False,False,False,False,False,False,False,False,3281,32357.62,9.862121
