### Primary metrics

1. acquisition growth rate
2. retention rate
3. recurring growth rate

In [22]:
import sys
import numpy as np
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

In [3]:
gt_dates = [
    '2022-11-29',
    '2021-11-30',
    '2020-12-01',
    '2019-12-03',
    '2018-11-27',
    '2017-11-28',
    '2016-11-29',
    '2015-12-01',
    '2014-12-02',
    '2013-12-03',
    '2012-11-27'
]

In [4]:
q = '''select
            id,
            org,
            email,
            amount,
            recurring,
            recurring_origin,
            year
        from transactions
        where
            status='A' and
            date in ({})'''.format(', '.join(["'{}'".format(d) for d in gt_dates]))
trans = redshift_query_read(q, schema='production')

In [5]:
trans.groupby('year')[['id', 'org']].nunique().reset_index()

Unnamed: 0,year,id,org
0,2012,758,238
1,2013,1811,452
2,2014,3153,506
3,2015,5773,704
4,2016,7268,681
5,2017,10877,852
6,2018,14804,1117
7,2019,20032,1321
8,2020,40318,1702
9,2021,40490,1821


In [6]:
org_years = trans.groupby(['year', 'org'])['amount'].sum().reset_index()
org_years.tail(3)

Unnamed: 0,year,org,amount
11047,2022,449556,476.25
11048,2022,449557,3533.8
11049,2022,449561,1.0


In [7]:
primary_metrics = []
for o in trans['org'].unique():
    years = trans[trans['org']==o]['year'].unique()
    
    if len(years) > 1:
        for i in range(1, len(years)):
            this_years_donors = trans[(trans['year']==years[i])&(trans['org']==o)]['email'].tolist()
            last_years_donors = trans[(trans['year']==years[i-1])&(trans['org']==o)]['email'].tolist()
            
            acquisition = len(set(this_years_donors) - set(last_years_donors)) / len(this_years_donors)
            retention = len(list(set(this_years_donors) & set(last_years_donors))) / len(last_years_donors)
            primary_metrics.append({
                'org': o,
                'year': years[i],
                'retention': retention,
                'acquisition': acquisition
            })

In [8]:
org_years = org_years.merge(pd.DataFrame(primary_metrics), on=['org', 'year'])

In [9]:
org_years.tail(3)

Unnamed: 0,year,org,amount,retention,acquisition
7502,2022,447806,5184.05,0.105263,0.86
7503,2022,447844,17021.24,0.0,0.894737
7504,2022,447856,5835.1,0.25,0.736842


In [10]:
new_data = None

for o in org_years['org'].unique():
    this_df = org_years[org_years['org']==o].sort_values("year", ascending=True).copy()
    this_df['acquisition_growth'] = this_df['acquisition'].pct_change()
    
    new_data = pd.concat([new_data, this_df])

In [16]:
new_data['acquisition_growth'].describe()

count    5154.000000
mean             inf
std              NaN
min        -1.000000
25%        -0.057405
50%         0.000000
75%         0.051097
max              inf
Name: acquisition_growth, dtype: float64

In [54]:
new_data.groupby('year')['org'].nunique().rolling(window=3).mean()

year
2012            NaN
2013            NaN
2014     258.666667
2015     363.666667
2016     426.333333
2017     501.666667
2018     576.000000
2019     656.000000
2020     899.666667
2021    1122.333333
2022    1276.333333
Name: org, dtype: float64

### 1. What is the difference in “primary metrics” between orgs with majority revenue from small donors (<5k) vs those with majority revenue from large donors (>5k)?

In [18]:
print("Orgs > $5k")
new_data[new_data['amount']>5000][['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median'])

Orgs > $5k


Unnamed: 0,retention,acquisition,acquisition_growth
mean,0.098132,0.86619,inf
median,0.076923,0.887496,0.0


In [20]:
print("Orgs < $5k")
new_data[new_data['amount']<5000][['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median'])

Orgs < $5k


Unnamed: 0,retention,acquisition,acquisition_growth
mean,0.051703,0.89856,inf
median,0.0,1.0,0.0


In [24]:
print("Orgs > $5k")
new_data.replace([np.inf, -np.inf], np.nan).dropna()[new_data['amount']>5000][['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median'])

Orgs > $5k


  


Unnamed: 0,retention,acquisition,acquisition_growth
mean,0.095695,0.870287,0.017657
median,0.074074,0.888889,0.0


In [25]:
print("Orgs < $5k")
new_data.replace([np.inf, -np.inf], np.nan).dropna()[new_data['amount']<5000][['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median'])

Orgs < $5k


  


Unnamed: 0,retention,acquisition,acquisition_growth
mean,0.051203,0.902311,0.017739
median,0.0,1.0,0.0


### 6. Is there a difference in the “primary metrics” between orgs with a goal vs those without a goal?

In [46]:
goal_dates = pd.DataFrame(gt_dates)
goal_dates.columns = ['date']
goal_dates['date'] = pd.to_datetime(goal_dates['date'])
goal_dates['next'] = goal_dates['date'] + pd.Timedelta(days=1)
goal_dates_list = goal_dates['date'].dt.date.tolist() + goal_dates['next'].dt.date.tolist()

goal_dates_list = ["{:%Y-%m-%d}".format(d) for d in goal_dates_list]

In [47]:
q = '''select t.id, t.form, f.org, t.enddate
        from production.thermometers as t
        left join production.form as f on t.form=f.id
        where t.enddate in ({})'''.format(",".join(["'{}'".format(d) for d in goal_dates_list]))
goals = redshift_query_read(q, schema='production')

In [48]:
goals['enddate'] = pd.to_datetime(goals['enddate'])
goals['year'] = goals['enddate'].dt.year
goals.tail(3)

Unnamed: 0,id,form,org,enddate,year
29,7230,995883,449392,2022-11-29,2022
30,1377,930481,442065,2017-11-29,2017
31,7332,966270,442510,2022-11-30,2022


In [49]:
def org_has_goal(r):
    return len(goals[(goals['org']==r['org'])&(goals['year']==r['year'])])>0

org_years['has_goal'] = org_years.apply(org_has_goal, axis=1)
new_data['has_goal'] = new_data.apply(org_has_goal, axis=1)

In [50]:
org_years['has_goal'].value_counts()

False    7489
True       16
Name: has_goal, dtype: int64

In [None]:
org_years.groupby('has_goal')[['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median']).reset_index()

In [32]:
new_data.replace([np.inf, -np.inf], np.nan).dropna().groupby('has_goal')[['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median']).reset_index()

Unnamed: 0_level_0,has_goal,retention,retention,acquisition,acquisition,acquisition_growth,acquisition_growth
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median
0,False,0.057584,0.0,0.897704,1.0,0.017761,0.0
1,True,0.09171,0.07125,0.889046,0.943357,0.000231,0.0


### 8. Is there a difference in “primary metrics” between organisations with who use social media payment plugins/transactions looped through a social media platform with other orgs that do not use social media plugins? Which platforms?

In [33]:
q = '''select org, year, payment_type
        from production.transactions
        where 
            date in ({}) and
            status='A'
        group by org, year, payment_type'''.format(', '.join(["'{}'".format(d) for d in gt_dates]))
payment_types = redshift_query_read(q, schema='production')

In [34]:
fb = payment_types[payment_types['payment_type']=='FB']
fb.tail(3)

Unnamed: 0,org,year,payment_type
21457,444668,2020,FB
23978,445561,2021,FB
24527,441701,2020,FB


In [35]:
def org_used_fb(r):
    return len(fb[(fb['org']==r['org'])&(fb['year']==r['year'])]) > 0

org_years['used_fb'] = org_years.apply(org_used_fb, axis=1)
new_data['used_fb'] = new_data.apply(org_used_fb, axis=1)

In [36]:
print("Orgs that used Facebook fundraising")
org_years.groupby('used_fb')[['retention', 'acquisition']].agg(['mean', 'median']).reset_index()

Orgs that used Facebook fundraising


Unnamed: 0_level_0,used_fb,retention,retention,acquisition,acquisition
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median
0,False,0.057159,0.0,0.895328,1.0
1,True,0.081199,0.052946,0.701291,0.749446


In [38]:
print("Orgs that used Facebook fundraising")
new_data.replace([np.inf, -np.inf], np.nan).dropna().groupby('used_fb')[['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median']).reset_index()

Orgs that used Facebook fundraising


Unnamed: 0_level_0,used_fb,retention,retention,acquisition,acquisition,acquisition_growth,acquisition_growth
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median
0,False,0.057616,0.0,0.898218,1.0,0.017828,0.0
1,True,0.072298,0.050336,0.73866,0.789474,-0.014356,-0.134412


### 9. What is the difference in “primary metrics” between organisations that use mobile payment methods vs those who do not?

In [39]:
q = '''select org, year
        from production.transactions
        where
            date in ('2022-11-29', '2021-11-30', '2020-12-01', '2019-12-03', '2018-11-27', '2017-11-28', '2016-11-29', '2015-12-01', '2014-12-02', '2013-12-03', '2012-11-27') and
            status='A' and
            source='mobile'
        group by org, year'''
mobile = redshift_query_read(q, schema='production')

In [40]:
def used_mobile(r):
    return len(mobile[(mobile['org']==r['org'])&(mobile['year']==r['year'])])>0

org_years['used_mobile'] = org_years.apply(used_mobile, axis=1)
new_data['used_mobile'] = new_data.apply(used_mobile, axis=1)

In [41]:
org_years.groupby('used_mobile')[['retention', 'acquisition']].agg(['mean', 'median']).reset_index()

Unnamed: 0_level_0,used_mobile,retention,retention,acquisition,acquisition
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median
0,False,0.050704,0.0,0.893859,1.0
1,True,0.068064,0.0,0.896255,0.952381


In [42]:
new_data.replace([np.inf, -np.inf], np.nan).dropna().groupby('used_mobile')[['retention', 'acquisition', 'acquisition_growth']].agg(['mean', 'median']).reset_index()

Unnamed: 0_level_0,used_mobile,retention,retention,acquisition,acquisition,acquisition_growth,acquisition_growth
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median
0,False,0.049236,0.0,0.898527,1.0,0.010117,0.0
1,True,0.068511,0.0,0.896599,0.947368,0.027503,0.0


### 10. What is the difference in “primary metrics” between organisations that use non-mobile payment methods vs those who do not?

In [44]:
for p in payment_types['payment_type'].unique():
    def used_pm(r):
        return len(payment_types[(payment_types['payment_type']==p)&(payment_types['year']==r['year'])&(payment_types['org']==r['org'])])>0
    new_data['used_pm'] = new_data.apply(used_pm, axis=1)
    cols = ['retention', 'acquisition', 'acquisition_growth']
    print(p)
    print(new_data.replace([np.inf, -np.inf], np.nan).dropna().groupby('used_pm')[cols].agg(['mean', 'median']).reset_index().transpose())
    print()
    print("-"*40)

AM
                                  0         1
used_pm                       False      True
retention          mean    0.050396  0.068424
                   median       0.0       0.0
acquisition        mean    0.900236  0.893906
                   median       1.0  0.933333
acquisition_growth mean    0.008645  0.031152
                   median       0.0       0.0

----------------------------------------
MC
                                  0         1
used_pm                       False      True
retention          mean    0.043487  0.065689
                   median       0.0       0.0
acquisition        mean    0.905427  0.893301
                   median       1.0  0.947368
acquisition_growth mean    0.006699  0.023957
                   median       0.0       0.0

----------------------------------------
VS
                                  0         1
used_pm                       False      True
retention          mean    0.027943  0.061608
                   median       0