Calculate year over year donor growth for P2P forms

In [1]:
# Print the current working directory via a shell escape.
!pwd

/home/ubuntu/recommendation


In [23]:
import datetime

# Stamp the notebook with the local date of this run (YYYY-MM-DD).
run_date = datetime.datetime.now().strftime("%Y-%m-%d")
print(f"last run: {run_date}")

last run: 2025-01-17


In [2]:
import sys

# Imported explicitly so the later `pd.` calls in this notebook do not depend on
# whatever names `s3_support` happens to expose through its wildcard import.
import pandas as pd

# Make the project's helper scripts importable from this notebook.
sys.path.insert(1, "/home/ubuntu/recommendation/scripts/")
# NOTE(review): `redshift_query_read` is presumably supplied by this wildcard
# import; consider importing it by name once that is confirmed.
from s3_support import *

# By form

In [112]:
# One result row per (form, email, year): the summed donation count and amount
# for transactions with status 'A', source 'p2p', at least one donation, and a
# date on or after 2018-01-01.
q = """select
        form,
        year,
        sum(donations_count) as donations_count,
        sum(donations_amt) as donations_vol,
        email
    from transactions
    where
        status='A' and
        source='p2p' and
        donations_count>0 and
        date>='2018-01-01'
    group by form, email, year
"""
# NOTE(review): `redshift_query_read` is defined outside this notebook; the later
# cells treat its result as a pandas DataFrame — confirm.
trans = redshift_query_read(q, schema='production')

In [125]:
# Headline figures for the form-level extract. Each row of `trans` is one
# (form, email, year) aggregate, as produced by the query's GROUP BY.
row_count = len(trans)
first_year = trans['year'].min()
last_year = trans['year'].max()
form_count = len(trans['form'].unique())
donor_count = len(trans['email'].unique())
print(f"{row_count:,} transactions")
print(f"{first_year} to {last_year}")
print(f"{form_count:,} unique forms")
print(f"{donor_count:,} unique donors")

# Average number of distinct years in which each form has data.
years_per_form = trans.groupby('form')['year'].nunique().mean()
print(f"{years_per_form:,.2f} years per form")

by_form_year = trans.groupby(['form', 'year'])

# Unique donors within each (form, year) pair.
donors_per_form_year = by_form_year['email'].nunique()
mn_donors = donors_per_form_year.mean()
mdn_donors = donors_per_form_year.median()
print(f"{mn_donors:,.2f} mean donors; {mdn_donors:,.2f} median donors")

# Total donation count within each (form, year) pair.
donations_per_form_year = by_form_year['donations_count'].sum()
mn_trans = donations_per_form_year.mean()
mdn_trans = donations_per_form_year.median()
print(f"{mn_trans:,.2f} mean donations per form; {mdn_trans:,.2f} median donations per form")

1,768,560 transactions
2018 to 2025
10,791 unique forms
1,432,855 unique donors
1.28 years per form
128.43 mean donors; 30.00 median donors
151.59 mean donations per form; 34.00 median donations per form


We have an average of 1.28 years of data per form. This indicates that most forms are not reused for the same event from one year to the next. _We should re-run these numbers keying off of organization rather than form. This may give a better representation of the data point we're looking for here._

Presuming we cannot rely upon the year over year change per form, the next most reliable indicator of donor growth would be the average number of donors per form by year.

In [129]:
print("Donors per form, by year:")
print("-"*40)

# For every year present in `trans`, summarise how many unique donors each
# form had and how many forms were active.
annual_data = []
for year in sorted(trans['year'].unique()):
    in_year = trans.loc[trans['year'] == year]
    donors_per_form = in_year.groupby('form')['email'].nunique()
    annual_data.append(dict(
        year=year,
        donors_mean=donors_per_form.mean(),
        donors_median=donors_per_form.median(),
        forms=len(in_year['form'].unique()),
    ))

ad_df = pd.DataFrame(annual_data).sort_values('year', ascending=True)
# Relative change of the median against the previous row (the previous year).
ad_df = ad_df.assign(donors_median_change=ad_df['donors_median'].pct_change())
ad_df

Donors per form, by year:
----------------------------------------


Unnamed: 0,year,donors_mean,donors_median,forms,donors_median_change
0,2018,135.438938,44.0,1130,
1,2019,133.350379,42.0,1584,-0.045455
2,2020,121.882916,28.0,1879,-0.333333
3,2021,119.016004,26.0,1937,-0.071429
4,2022,127.456174,29.0,2065,0.115385
5,2023,143.022779,31.0,2195,0.068966
6,2024,145.139001,36.0,2482,0.16129
7,2025,14.765531,2.0,499,-0.944444


Now let's calculate the average growth by form. We apply a minimum-donor filter, at several different values, to omit events that were not seriously implemented.

In [137]:
# Year-over-year change in unique donors for each form.
#
# Result: `form_data`, one row per (form, year) that has a growth value, with
# columns year, email (unique donors), growth (fractional change) and form.
min_donor_threshold = 1  # a form-year is kept only when its unique donors are strictly above this
partial_year = 2025  # excluded from the growth calculation

# Unique donors per (form, year), computed in a single pass rather than
# re-scanning `trans` once per form.
donors_by_form_year = (
    trans[trans['year'] != partial_year]
    .groupby(['form', 'year'])['email']
    .nunique()
    .reset_index()
)

# Only forms observed in more than one year are considered; the donor
# threshold is then applied to the individual form-years.
years_observed = donors_by_form_year.groupby('form')['year'].transform('count')
qualifying = donors_by_form_year[
    (years_observed > 1) & (donors_by_form_year['email'] > min_donor_threshold)
].sort_values(['form', 'year'])

# NOTE(review): growth is measured against the previous *remaining* row for
# the form, so when a year is absent or removed by the threshold filter the
# figure spans more than one year — confirm this is intended for a
# year-over-year metric.
form_data = (
    qualifying
    .assign(growth=qualifying.groupby('form')['email'].pct_change())
    .dropna()  # drops each form's first remaining year, which has no prior value
    .loc[:, ['year', 'email', 'growth', 'form']]
    .reset_index(drop=True)
)

In [138]:
# Overall growth figures for the current threshold, shown as percentages.
print(f"Forms with minimum donor threshold of {min_donor_threshold}")
growth_rows = len(form_data)
print(f"{growth_rows:,} rows")

form_growth = form_data['growth']
mean_growth_pct = form_growth.mean() * 100.
print(f"{mean_growth_pct:,.2f}% mean growth")
median_growth_pct = form_growth.median() * 100.
print(f"{median_growth_pct:,.2f}% median growth")

Forms with minimum donor threshold of 1
1,698 rows
887.85% mean growth
-34.10% median growth


`emails` = unique donors; `rows` = form-year growth observations (a form contributes one row for each year that has a growth value)

| donor filter | rows  | mean growth | median growth |
|-------------|-------|-------------|---------------|
| emails>1    | 1,283 |   591.34%   |    -47.53%    |
| emails>5    |   715 |   364.10%   |    -20.83%    | 
| emails>10   |   544 |   276.49%   |    -17.66%    | 
| emails>15   |   634 |   282.79%   |    -10.65%    | 
| emails>25   |   469 |   237.43%   |    -13.68%    | 
| emails>50   |   275 |   211.83%   |    -10.77%    | 
| emails>1, removing 2025  | 1,698  | 887.85% | -34.10% |
| emails>10, removing 2025 |   750  | 375.21% |  -4.08% |
| emails>50, removing 2025 |   278  | 234.69% |  -1.38% |
| emails>100, removing 2025 |   153  | 279.29% |  8.99% |

In [139]:
print(f"Donor growth by year, minimum donor threshold of {min_donor_threshold}")
print("-"*40)

# Mean and median growth across forms for each year; shown as the cell output.
growth_by_year = form_data.groupby('year')['growth']
growth_by_year.agg(['mean', 'median']).reset_index()

Donor growth by year, minimum donor threshold of 1
----------------------------------------


Unnamed: 0,year,mean,median
0,2019,36.699471,0.344
1,2020,8.089181,0.06991
2,2021,4.483434,-0.644444
3,2022,4.554727,-0.5
4,2023,7.057821,-0.355556
5,2024,9.291306,-0.259804


In [140]:
# Number of growth rows (years) each form contributes, averaged over forms.
rows_per_form = form_data.groupby('form')['year'].count()
mean_years = rows_per_form.mean()
form_total = len(form_data['form'].unique())
print(f"This filtered dataset represents {form_total:,} forms with {mean_years:,.2f} mean years per form")

This filtered dataset represents 1,275 forms with 1.33 mean years per form


# By org

In [141]:
# One result row per (org, email, year): the summed donation count and amount
# for transactions with status 'A', source 'p2p', at least one donation, and a
# date on or after 2018-01-01.
q = """select
        org,
        year,
        sum(donations_count) as donations_count,
        sum(donations_amt) as donations_vol,
        email
    from transactions
    where
        status='A' and
        source='p2p' and
        donations_count>0 and
        date>='2018-01-01'
    group by org, email, year
"""
# Rebinds `trans`: from here on it holds the org-level extract, not the
# form-level one loaded earlier.
trans = redshift_query_read(q, schema='production')

In [142]:
# Headline figures for the org-level extract. Each row of `trans` is one
# (org, email, year) aggregate, as produced by the query's GROUP BY.
row_count = len(trans)
first_year = trans['year'].min()
last_year = trans['year'].max()
org_count = len(trans['org'].unique())
donor_count = len(trans['email'].unique())
print(f"{row_count:,} transactions")
print(f"{first_year} to {last_year}")
print(f"{org_count:,} unique orgs")
print(f"{donor_count:,} unique donors")

# Average number of distinct years in which each org has data.
years_per_org = trans.groupby('org')['year'].nunique().mean()
print(f"{years_per_org:,.2f} years per org")

by_org_year = trans.groupby(['org', 'year'])

# Unique donors within each (org, year) pair.
donors_per_org_year = by_org_year['email'].nunique()
mn_donors = donors_per_org_year.mean()
mdn_donors = donors_per_org_year.median()
print(f"{mn_donors:,.2f} mean donors; {mdn_donors:,.2f} median donors")

# Total donation count within each (org, year) pair.
donations_per_org_year = by_org_year['donations_count'].sum()
mn_trans = donations_per_org_year.mean()
mdn_trans = donations_per_org_year.median()
print(f"{mn_trans:,.2f} mean donations per org; {mdn_trans:,.2f} median donations per org")

1,750,766 transactions
2018 to 2025
1,590 unique orgs
1,433,134 unique donors
2.97 years per org
370.30 mean donors; 140.50 median donors
441.64 mean donations per org; 157.00 median donations per org


We have nearly 3 years per org on average, which means we could very well see a much better year over year representation in the _by org_ dataset than we saw in the _by form_ dataset.

In [143]:
print("Donors per org, by year:")
print("-"*40)

# For every year present in `trans`, summarise how many unique donors each
# org had and how many orgs were active.
annual_data = []
for year in sorted(trans['year'].unique()):
    in_year = trans.loc[trans['year'] == year]
    donors_per_org = in_year.groupby('org')['email'].nunique()
    annual_data.append(dict(
        year=year,
        donors_mean=donors_per_org.mean(),
        donors_median=donors_per_org.median(),
        orgs=len(in_year['org'].unique()),
    ))

ad_df = pd.DataFrame(annual_data).sort_values('year', ascending=True)
# Relative change of the median against the previous row (the previous year).
ad_df = ad_df.assign(donors_median_change=ad_df['donors_median'].pct_change())
ad_df

Donors per org, by year:
----------------------------------------


Unnamed: 0,year,donors_mean,donors_median,orgs,donors_median_change
0,2018,412.233062,146.0,369,
1,2019,419.924,164.5,500,0.126712
2,2020,376.515,143.0,600,-0.130699
3,2021,355.729688,146.5,640,0.024476
4,2022,363.921788,151.0,716,0.030717
5,2023,414.217333,168.5,750,0.115894
6,2024,413.577236,164.0,861,-0.026706
7,2025,26.688356,4.5,292,-0.972561


In [160]:
# Year-over-year change in unique donors for each org.
#
# Result: `org_data`, one row per (org, year) that has a growth value, with
# columns year, email (unique donors), growth (fractional change) and org.
min_donor_threshold = 300  # an org-year is kept only when its unique donors are strictly above this
partial_year = 2025  # excluded from the growth calculation

# Unique donors per (org, year), computed in a single pass rather than
# re-scanning `trans` once per org.
donors_by_org_year = (
    trans[trans['year'] != partial_year]
    .groupby(['org', 'year'])['email']
    .nunique()
    .reset_index()
)

# Only orgs observed in more than one year are considered; the donor
# threshold is then applied to the individual org-years.
years_observed = donors_by_org_year.groupby('org')['year'].transform('count')
qualifying = donors_by_org_year[
    (years_observed > 1) & (donors_by_org_year['email'] > min_donor_threshold)
].sort_values(['org', 'year'])

# NOTE(review): growth is measured against the previous *remaining* row for
# the org, so when a year is absent or removed by the threshold filter the
# figure spans more than one year — confirm this is intended for a
# year-over-year metric.
org_data = (
    qualifying
    .assign(growth=qualifying.groupby('org')['email'].pct_change())
    .dropna()  # drops each org's first remaining year, which has no prior value
    .loc[:, ['year', 'email', 'growth', 'org']]
    .reset_index(drop=True)
)

In [161]:
# Overall growth figures for the current threshold, shown as percentages.
print(f"Orgs with minimum donor threshold of {min_donor_threshold}")
growth_rows = len(org_data)
print(f"{growth_rows:,} rows")

org_growth = org_data['growth']
mean_growth_pct = org_growth.mean() * 100.
print(f"{mean_growth_pct:,.2f}% mean growth")
median_growth_pct = org_growth.median() * 100.
print(f"{median_growth_pct:,.2f}% median growth")

# Mean and median growth across orgs for each year; shown as the cell output.
growth_by_year = org_data.groupby('year')['growth']
growth_by_year.agg(['mean', 'median']).reset_index()

Orgs with minimum donor threshold of 300
847 rows
13.99% mean growth
-2.54% median growth


Unnamed: 0,year,mean,median
0,2019,0.039283,-0.025169
1,2020,0.398512,-0.15271
2,2021,0.239828,-0.004412
3,2022,0.11136,-0.002976
4,2023,0.077901,-0.017945
5,2024,0.074126,-0.02439


`emails` = unique donors; `rows` = org-year growth observations (an org contributes one row for each year that has a growth value)

|        donor filter        | rows  | mean growth | median growth |
|----------------------------|-------|-------------|---------------|
| emails>1, omitting 2025    | 2,742 |   232.16%   |     -7.05%    |
| emails>10, omitting 2025   | 2,556 |    48.28%   |     -6.25%    |
| emails>25, omitting 2025   | 2,362 |    29.96%   |     -5.71%    |
| emails>50, omitting 2025   | 2,146 |    21.57%   |     -5.50%    |
| emails>100, omitting 2025  | 1,752 |    15.32%   |     -4.77%    |
| emails>200, omitting 2025  | 1,193 |    11.56%   |     -3.22%    |
| emails>300, omitting 2025  |   847 |    13.99%   |     -2.54%    |

In [157]:
# Number of growth rows (years) each org contributes, averaged over orgs.
rows_per_org = org_data.groupby('org')['year'].count()
mean_years = rows_per_org.mean()
org_total = len(org_data['org'].unique())
print(f"This filtered dataset represents {org_total:,} orgs with {mean_years:,.2f} mean years per org")

This filtered dataset represents 626 orgs with 2.80 mean years per org


1. Who should create that minimum table list? _kyle ebberly_ non-issue if read replicas work out.