In [1]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

import numpy as np

# load data

## orgs

In [3]:
q = '''select id as org, state, segment from organization'''
orgs = redshift_query_read(q, schema='production')

In [4]:
print("{:,} entries".format(len(orgs)))
print("{:,} unique org's".format(len(orgs['org'].unique())))

14,600 entries
14,600 unique org's


## transactions

In [5]:
q = '''select
            org,
            date_trunc('week', date) as week,
            count(id) as count_all,
            sum(amount) as vol_all,
            count(distinct(case when recurring=0 then id else null end)) as count_onetime,
            count(distinct(case when recurring_origin=1 then id else null end)) as count_recurring,
            avg(case when recurring=0 then amount else null end) as avg_onetime,
            avg(case when recurring_origin=1 then amount else null end) as avg_recurring
        from transactions
        where status='A' and year >= 2018 and source != 'vt' and source != 'mobilevt'
        group by org, date_trunc('week', date)'''
trans = redshift_query_read(q, schema='production')

In [6]:
print("{:,} entries".format(len(trans)))
print("{:,} unique org's".format(len(trans['org'].unique())))
print("{} - {}".format(trans['week'].min(), trans['week'].max()))

569,355 entries
7,509 unique org's
2018-01-01 00:00:00 - 2024-03-11 00:00:00


## traffic

In [7]:
q = '''select
            org,
            date_trunc('week', date) as week,
            sum(views) as pageviews
        from ga
        where date >= 2018
        group by org, date_trunc('week', date)'''
traffic = redshift_query_read(q, schema='production')

In [8]:
print("{:,} traffic entries".format(len(traffic)))
print("{:,} unique org's".format(len(traffic['org'].unique())))
print("{} - {}".format(traffic['week'].min(), traffic['week'].max()))

646,071 traffic entries
8,644 unique org's
2018-01-01 00:00:00 - 2024-02-26 00:00:00


# merge

In [9]:
df = traffic.merge(trans, on=['org', 'week'], how='left').merge(orgs, on='org').fillna(0)
df['conversion_ot'] = df['count_onetime'] / df['pageviews']
df['conversion_rec'] = df['count_recurring'] / df['pageviews']

In [10]:
len(trans), len(traffic.merge(trans, on=['org', 'week'])), len(traffic.merge(trans, on=['org', 'week'], how='left'))

(569355, 393393, 646071)

In [11]:
print("{:,} entries".format(len(df)))
print("{:,} unique org's".format(len(df['org'].unique())))
print("{} - {}".format(df['week'].min(), df['week'].max()))

645,669 entries
8,612 unique org's
2018-01-01 00:00:00 - 2024-02-26 00:00:00


In [12]:
df.drop('org', axis=1).describe()

Unnamed: 0,pageviews,count_all,vol_all,count_onetime,count_recurring,avg_onetime,avg_recurring,conversion_ot,conversion_rec
count,645669.0,645669.0,645669.0,645669.0,645669.0,645669.0,645669.0,644509.0,644220.0
mean,248.794799,19.56064,2395.276,11.60069,0.47772,146.832417,13.252489,inf,inf
std,2193.703746,160.673246,14482.27,79.461898,10.534647,593.431831,189.546722,,
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,6.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,23.0,2.0,100.0,1.0,0.0,1.0,0.0,0.005479452,0.0
75%,102.0,9.0,1146.0,5.0,0.0,120.0,0.0,0.1,0.0
max,473098.0,30760.0,1880078.0,24073.0,3034.0,73500.0,61800.0,inf,inf


In [13]:
conv_ot = df['conversion_ot'].replace([np.inf, -np.inf], np.nan).dropna().describe()
conv_rec = df['conversion_rec'].replace([np.inf, -np.inf], np.nan).dropna().describe()

pd.DataFrame([conv_ot, conv_rec]).transpose()

Unnamed: 0,conversion_ot,conversion_rec
count,644186.0,644186.0
mean,0.513355,0.015985
std,6.794323,0.603654
min,0.0,0.0
25%,0.0,0.0
50%,0.005405,0.0
75%,0.1,0.0
max,2320.5,379.25


In [14]:
count_zero = len(df[df['count_all']==0])
traffic_zero = len(df[df['pageviews']==0])
onetime_zero = len(df[df['count_onetime']==0])
rec_zero = len(df[df['count_recurring']==0])

trans_gt_traff = len(df[df['count_all']>df['pageviews']])
rec_gt_ot = len(df[df['count_onetime']<df['count_recurring']])
ottrans_gt_traff = len(df[df['count_onetime']>df['pageviews']])
rectrans_gt_traff = len(df[df['count_recurring']>df['pageviews']])
rectrans_gt_std = len(df[df['count_recurring']>13])

conv_ot_mean = df['conversion_ot'].replace([np.inf, -np.inf], np.nan).dropna().mean()
conv_ot_median = df['conversion_ot'].replace([np.inf, -np.inf], np.nan).dropna().median()
conv_rec_mean = df['conversion_rec'].replace([np.inf, -np.inf], np.nan).dropna().mean()
conv_rec_median = df['conversion_rec'].replace([np.inf, -np.inf], np.nan).dropna().median()
conv_mean = conv_ot_mean + conv_rec_mean
conv_median = conv_ot_median + conv_rec_median

len_all = len(df)

print("Zero transactions: {:,} ({:.1f}%)".format(count_zero, (count_zero / len_all) * 100.))
print("Zero traffic: {:,} ({:.1f}%)".format(traffic_zero, (traffic_zero / len_all) * 100.))
print("Zero one time: {:,} ({:.1f}%)".format(onetime_zero, (onetime_zero / len_all) * 100.))
print("Zero recurring: {:,} ({:.1f}%)".format(rec_zero, (rec_zero / len_all) * 100.))
print()
print("Trans > traffic: {:,} ({:.1f}%)".format(trans_gt_traff, (trans_gt_traff / len_all) * 100.))
print("One time trans > traffic: {:,} ({:.1f}%)".format(ottrans_gt_traff, (ottrans_gt_traff / len_all) * 100.))
print("Recurring trans > traffic: {:,} ({:.1f}%)".format(rectrans_gt_traff, (rectrans_gt_traff / len_all) * 100.))
print("Recurring trans > one time: {:,} ({:.1f}%)".format(rec_gt_ot, (rec_gt_ot / len_all) * 100.))
print("Recurring trans > std: {:,} ({:.1f}%)".format(rectrans_gt_std, (rectrans_gt_std / len_all) * 100.))
print()
print("Conversion mean: {:.2f}%".format(conv_mean * 100.))
print("Conversion median: {:.2f}%".format(conv_median * 100.))
print("Conversion one time mean: {:.2f}%".format(conv_ot_mean * 100.))
print("Conversion one time median: {:.2f}%".format(conv_ot_median * 100.))
print("Conversion recurring mean: {:.2f}%".format(conv_rec_mean * 100.))
print("Conversion recurring median: {:.2f}%".format(conv_rec_median * 100.))

Zero transactions: 252,277 (39.1%)
Zero traffic: 1,483 (0.2%)
Zero one time: 311,798 (48.3%)
Zero recurring: 572,131 (88.6%)

Trans > traffic: 55,247 (8.6%)
One time trans > traffic: 32,260 (5.0%)
Recurring trans > traffic: 1,115 (0.2%)
Recurring trans > one time: 8,511 (1.3%)
Recurring trans > std: 2,122 (0.3%)

Conversion mean: 52.93%
Conversion median: 0.54%
Conversion one time mean: 51.34%
Conversion one time median: 0.54%
Conversion recurring mean: 1.60%
Conversion recurring median: 0.00%


In [15]:
# drop 0 traffic observations since we're concerned with donor behavior, not org performance
df = df[df['pageviews']!=0]

In [16]:
# clean up states
df['state'] = df['state'].apply(clean_states)

In [17]:
df['state'].unique()

array(['CA', 'NV', 'ID', 'DC', 'NC', 'VA', 'TX', 'NY', 'MD', 'CO', 'WY',
       'N/A', 'TN', 'MI', 'PA', 'MA', 'OH', 'FL', 'OR', 'AZ', 'ME', 'LA',
       'RI', 'CT', 'MO', 'WA', 'KY', 'NE', 'UT', 'SC', 'AL', 'IA', 'SD',
       'WI', 'KS', 'OK', 'IL', 'IN', 'NJ', 'GA', 'ND', 'MS', 'WV', 'NM',
       'MN', 'MT', 'NH', 'HI', 'BC', 'AR', 'DE', 'ON', 'AK', 'VT', 'AB',
       'MB', 'PR', 'GU', 'PE', 'QC', 'NB', 'VI', 'NS', 'AS', 'UM'],
      dtype=object)

In [18]:
#df.to_csv("seg_state.csv", index=False)

# one hot encode

In [19]:
segments = ['E', 'R', 'O', 'P', 'A', 'S', 'D', 'G', 'W', 'X', 'B', 'L',
            'C', 'K', 'F', 'I', 'T', 'Q', 'U', 'H', 'J', 'N', 'M', 'V', 
            'Y', 'D', 'T', 'R', 'Y', 'Z']
for segment in segments:
    df["segment_{}".format(segment)] = df['segment'].str.contains('{} - '.format(segment))
    df["segment_{}".format(segment)] = df["segment_{}".format(segment)].fillna(False)

In [20]:
df[[c for c in df.columns if 'segment' in c]].head(2)

Unnamed: 0,segment,segment_E,segment_R,segment_O,segment_P,segment_A,segment_S,segment_D,segment_G,segment_W,...,segment_T,segment_Q,segment_U,segment_H,segment_J,segment_N,segment_M,segment_V,segment_Y,segment_Z
0,B - Educational Institutions,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,B - Educational Institutions,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [21]:
df[[c for c in df.columns if 'segment' in c]].sum()

  """Entry point for launching an IPython kernel.


segment_E    31520
segment_R     9038
segment_O    56624
segment_P    92363
segment_A    39988
segment_S    18726
segment_D    20953
segment_G    21639
segment_W     9014
segment_X    79588
segment_B    73604
segment_L    22522
segment_C    18465
segment_K    13231
segment_F    15163
segment_I     8516
segment_T    21565
segment_Q    14111
segment_U     1246
segment_H     4408
segment_J     4010
segment_N    11702
segment_M     4489
segment_V     1742
segment_Y     1980
segment_Z    14672
dtype: int64

In [22]:
print("Segment observations:")
df.groupby('segment')['org'].count().agg(['mean', 'median'])

Segment observations:


mean      20780.193548
median    14111.000000
Name: org, dtype: float64

In [23]:
grpd = df.groupby(['segment', 'week']).count().reset_index()

In [24]:
grpd['org'].agg(['mean', 'median', 'min', 'max'])

mean       71.968048
median     46.000000
min         1.000000
max       441.000000
Name: org, dtype: float64

In [25]:
len_sml = len(grpd[grpd['org']<20])
print("{:,} ({:.1f}%) entries with fewer than 20 observations".format(len_sml, (len_sml / len(grpd)) * 100.))

2,688 (30.0%) entries with fewer than 20 observations


In [26]:
print(df.columns)
df.head(2)

Index(['org', 'week', 'pageviews', 'count_all', 'vol_all', 'count_onetime',
       'count_recurring', 'avg_onetime', 'avg_recurring', 'state', 'segment',
       'conversion_ot', 'conversion_rec', 'segment_E', 'segment_R',
       'segment_O', 'segment_P', 'segment_A', 'segment_S', 'segment_D',
       'segment_G', 'segment_W', 'segment_X', 'segment_B', 'segment_L',
       'segment_C', 'segment_K', 'segment_F', 'segment_I', 'segment_T',
       'segment_Q', 'segment_U', 'segment_H', 'segment_J', 'segment_N',
       'segment_M', 'segment_V', 'segment_Y', 'segment_Z'],
      dtype='object')


Unnamed: 0,org,week,pageviews,count_all,vol_all,count_onetime,count_recurring,avg_onetime,avg_recurring,state,...,segment_T,segment_Q,segment_U,segment_H,segment_J,segment_N,segment_M,segment_V,segment_Y,segment_Z
0,442289,2021-02-08,118,4.0,300.0,4.0,0.0,75.0,0.0,CA,...,False,False,False,False,False,False,False,False,False,False
1,442289,2023-01-23,3464,0.0,0.0,0.0,0.0,0.0,0.0,CA,...,False,False,False,False,False,False,False,False,False,False


In [27]:
drop_cols = ['org', 'week']
segment_columns = [col for col in df.columns if 'segment_' in col]
agg_cols = [c for c in df.columns if 'segment_' not in c and c not in drop_cols]

In [28]:
data = {}
for segment in segment_columns:
    data[segment] = df[df[segment]][agg_cols].median()

  This is separate from the ipykernel package so we can avoid doing imports until


In [29]:
pd.DataFrame(data).transpose()

Unnamed: 0,pageviews,count_all,vol_all,count_onetime,count_recurring,avg_onetime,avg_recurring,conversion_ot,conversion_rec
segment_E,24.0,1.0,72.79,1.0,0.0,0.0,0.0,0.002356,0.0
segment_R,18.0,1.0,50.0,0.0,0.0,0.0,0.0,0.0,0.0
segment_O,32.0,2.0,204.0,1.0,0.0,38.75,0.0,0.017857,0.0
segment_P,24.0,2.0,148.25,1.0,0.0,25.0,0.0,0.015873,0.0
segment_A,22.0,1.0,26.25,0.0,0.0,0.0,0.0,0.0,0.0
segment_S,13.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
segment_D,33.0,7.0,410.0,3.0,0.0,43.363636,0.0,0.037559,0.0
segment_G,25.0,1.0,52.47,1.0,0.0,2.777778,0.0,0.008403,0.0
segment_W,18.0,1.0,30.0,0.0,0.0,0.0,0.0,0.0,0.0
segment_X,22.0,5.0,525.0,2.0,0.0,71.4825,0.0,0.058824,0.0


In [30]:
state_data = {}
for segment in segment_columns:
    state_data[segment] = df[df[segment]&(df['pageviews']>0)].groupby('state')['conversion_ot'].median()

In [31]:
pd.DataFrame(state_data).fillna(0)

Unnamed: 0_level_0,segment_E,segment_R,segment_O,segment_P,segment_A,segment_S,segment_D,segment_G,segment_W,segment_X,...,segment_T,segment_Q,segment_U,segment_H,segment_J,segment_N,segment_M,segment_V,segment_Y,segment_Z
state,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AB,0.024923,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.500000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
AK,0.000000,0.009693,0.000000,0.019231,0.000000,0.00000,0.000000,0.226190,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
AL,0.400000,0.000000,0.013699,0.062782,0.037437,0.00000,0.032520,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
AR,0.000000,0.000000,0.040000,0.000000,0.000000,0.00000,0.000000,0.000000,0.5,0.000000,...,0.000000,0.000000,0.000000,0.024070,0.0,0.000000,0.000000,0.0,0.0,0.0
AS,0.000000,0.000000,0.000000,0.000000,0.000000,0.00000,0.000000,0.000000,0.0,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
VT,0.100000,0.000000,0.123743,0.020619,0.000000,0.25000,0.118056,0.000000,0.0,0.068966,...,0.000000,0.000000,0.000000,0.028007,0.0,0.019615,0.000000,0.0,0.0,0.0
WA,0.000000,0.000000,0.035857,0.040000,0.000000,0.00000,0.090909,0.000000,0.0,0.109563,...,0.000000,0.083333,0.007192,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0
WI,0.013799,0.047619,0.044211,0.022222,0.000000,0.00000,0.015385,0.049683,0.0,0.010526,...,0.000000,0.292857,0.027588,0.000000,0.0,0.000000,0.048902,0.0,0.0,0.0
WV,0.000000,0.000000,0.000000,0.005952,0.000000,0.00000,0.000000,0.000000,0.0,0.078148,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0


In [38]:
state_segment_aggs = pd.DataFrame(state_data).fillna(0)
state_segment_aggs.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
segment_E,63.0,0.024512,0.063455,0.0,0.0,0.0,0.01768,0.4
segment_R,63.0,0.013218,0.038778,0.0,0.0,0.0,0.0,0.239086
segment_O,63.0,0.026279,0.049089,0.0,0.0,0.014286,0.033333,0.353571
segment_P,63.0,0.026391,0.066792,0.0,0.0,0.005511,0.030922,0.5
segment_A,63.0,0.021074,0.110766,0.0,0.0,0.0,0.003787,0.875
segment_S,63.0,0.022794,0.077179,0.0,0.0,0.0,0.0,0.52381
segment_D,63.0,0.041251,0.083154,0.0,0.0,0.0,0.049845,0.527778
segment_G,63.0,0.046577,0.138002,0.0,0.0,0.0,0.025358,0.820225
segment_W,63.0,0.0282,0.100834,0.0,0.0,0.0,0.0,0.5
segment_X,63.0,0.064639,0.077305,0.0,0.0,0.043478,0.104781,0.380682


In [None]:
#df.to_csv("seg_state.onehot.csv", index=False)