In [1]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

import pandas as pd
import numpy as np
%matplotlib inline

In [8]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error as mse

# data load

In [2]:
q = '''select
            t.org,
            t.year,
            count(distinct(t.form)) as forms,
            count(distinct(case when t.recurring=0 then t.id else null end)) as onetime_count,
            sum(case when t.recurring=0 then t.amount else 0 end) as onetime_vol,
            count(distinct(case when t.recurring!=0 then t.id else null end)) as recurring_count,
            sum(case when t.recurring!=0 then t.amount else 0 end) as recurring_vol,
            sum(ga.views) as pageviews
        from transactions as t
            left join ga on t.form=ga.form and t.date=ga.date
        group by t.org, t.year'''
df = redshift_query_read(q, schema='production')

In [16]:
len(df)

36263

In [3]:
df.head(3)

Unnamed: 0,org,year,forms,onetime_count,onetime_vol,recurring_count,recurring_vol,pageviews
0,421,2023,8,1941,3100821.23,234,48271.04,111659.0
1,31764,2020,3,4370,55338783.33,1485,2817450.91,1251094.0
2,454,2017,2,862,1706266.46,635,255265.2,45131.0


# analysis

In [34]:
df.drop(['org', 'year'], axis=1).corr()

Unnamed: 0,forms,onetime_count,onetime_vol,recurring_count,recurring_vol,pageviews
forms,1.0,0.544619,0.101189,0.709724,0.151844,0.247688
onetime_count,0.544619,1.0,0.234251,0.516381,0.252048,0.40326
onetime_vol,0.101189,0.234251,1.0,0.114271,0.176638,0.225696
recurring_count,0.709724,0.516381,0.114271,1.0,0.308386,0.416093
recurring_vol,0.151844,0.252048,0.176638,0.308386,1.0,0.652579
pageviews,0.247688,0.40326,0.225696,0.416093,0.652579,1.0


In [20]:
drop_cols = ['onetime_count', 'onetime_vol', 'recurring_count', 
             'recurring_vol', 'org']
df_train = df.fillna(0)

In [21]:
for c in drop_cols:
    if c == 'org':
        continue
    rf = RandomForestRegressor(oob_score=True)
    rf.fit(df_train.drop(drop_cols, axis=1), df_train[c])
    rmse = mse(df_train[c], rf.oob_prediction_, squared=False)
    print("{}: {}".format(c, rmse))

onetime_count: 1654.9885401069694
onetime_vol: 2528555243.677813
recurring_count: 1592.0758819630687
recurring_vol: 22165578.141259126


In [32]:
print("Binning forms")
bins=(1, 2, 3, 4, 5, 6, 7, 8, 10, 20, 50, 100, 500)
df['bin'] = pd.cut(df['forms'], bins=bins)
df.groupby('bin')[['onetime_count', 'recurring_count']].agg(['mean', 'median', 'std']).reset_index()

Binning forms


Unnamed: 0_level_0,bin,onetime_count,onetime_count,onetime_count,recurring_count,recurring_count,recurring_count
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,std,mean,median,std
0,"(1, 2]",319.022709,96.0,903.957612,98.907531,4.0,436.946228
1,"(2, 3]",432.181947,167.0,964.949764,140.148319,12.0,452.366825
2,"(3, 4]",568.933968,222.0,1365.737237,203.648254,18.0,992.392623
3,"(4, 5]",799.678704,258.0,5125.286544,262.848148,28.0,1075.652956
4,"(5, 6]",643.037037,306.0,1287.609805,223.161866,41.0,954.855182
5,"(6, 7]",700.691622,360.0,1301.038703,268.411765,50.0,725.492192
6,"(7, 8]",970.553377,409.0,2289.99262,416.400871,72.0,1367.242174
7,"(8, 10]",1257.846986,621.0,3444.652229,501.703246,81.0,1617.219589
8,"(10, 20]",1595.345763,851.5,2383.265737,456.367797,90.0,1248.926955
9,"(20, 50]",3943.007547,2084.0,6126.248623,1731.026415,477.0,5602.619344


In [33]:
print("Binning traffic")
bins = (0, 100, 500, 1000, 10000, 25000, 50000, 100000, 500000, 1000000, 5000000)
df['bin'] = pd.cut(df['pageviews'], bins=bins)
df.groupby('bin')[['onetime_count', 'recurring_count']].agg(['mean', 'median', 'std']).reset_index()

Binning traffic


Unnamed: 0_level_0,bin,onetime_count,onetime_count,onetime_count,recurring_count,recurring_count,recurring_count
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,std,mean,median,std
0,"(0, 100]",60.133365,10.0,281.860881,21.104775,0.0,61.990584
1,"(100, 500]",137.038793,49.0,342.950173,53.823851,9.0,136.96046
2,"(500, 1000]",205.867031,88.0,432.881826,97.369156,14.0,271.491246
3,"(1000, 10000]",391.277003,206.0,875.452016,151.372954,24.0,431.538057
4,"(10000, 25000]",750.195266,459.0,1120.440768,242.081226,40.0,629.986972
5,"(25000, 50000]",1081.047025,675.0,1362.912781,359.386756,47.5,961.682824
6,"(50000, 100000]",1557.777354,955.5,2023.560448,532.058524,77.0,1335.982904
7,"(100000, 500000]",2646.02572,1641.0,3996.923342,976.912551,84.0,2970.664702
8,"(500000, 1000000]",4604.060606,3640.5,3988.149365,2053.878788,580.0,4074.633963
9,"(1000000, 5000000]",7617.527363,5155.0,8705.130593,3591.233831,648.0,7150.03209
