In [19]:
import sys, datetime
import pandas as pd
import numpy as np

sys.path.insert(1, '../../../scripts/')
from s3_support import *

# Build transaction dataset

In [20]:
# query for transactions grouped by organization, source, is_recurring
q_onetime = "select org, date_trunc('month', date) as month, source, count(id) as trans_count from transactions where events_count=0 and recurring=0 and status='A' group by org, date, source"
data_onetime = redshift_query_read(q_onetime)

q_rec = "select org, date_trunc('month', date) as month, source, count(id) as trans_count from transactions where events_count=0 and recurring!=0 and status='A' group by org, date, source"
data_rec = redshift_query_read(q_rec)

q_events = "select org, date_trunc('month', date) as month, source, count(id) as trans_count from transactions where events_count!=0 and status='A' group by org, date, source"
data_events = redshift_query_read(q_events)

In [21]:
data_onetime['is_recurring'] = False
data_onetime['is_events'] = False

data_rec['is_recurring'] = True
data_rec['is_events'] = False

data_events['is_recurring'] = False
data_events['is_events'] = True

In [22]:
data = data_onetime.append(data_rec).append(data_events)

def is_yearround(row):
    source_filter = row['source'] in ['don_form', 'mobile', 'fb']
    return source_filter and not row['is_events'] and not row['is_recurring']

data['is_yearround'] = data.apply(is_yearround, axis=1)
data = data[data['month'].isin(data['month'].sort_values().unique()[-3:])]

data.head(3)

Unnamed: 0,org,month,source,trans_count,is_recurring,is_events,is_yearround
341837,378883,2021-03-01,don_form,1,False,False,True
341838,541,2021-03-01,don_form,8,False,False,True
341839,444400,2021-03-01,p2p,4,False,False,False


In [23]:
data.groupby('is_yearround')['org'].count()

is_yearround
False    77588
True     39974
Name: org, dtype: int64

In [24]:
df_orgs = get_dataframe_from_file("qgiv-stats-data", "orgs.csv")

In [25]:
orgs = df_orgs[df_orgs['status']==1]['id'].unique()
pivoted_data = []

print("iterating through {} orgs".format(len(orgs)))
counter = 0

for org in orgs:
    # mask and pivot
    ex = data[data['org']==org].groupby(['org', 'month', 'is_yearround'])['trans_count'].sum().reset_index()
    ex = ex.pivot(index='month', columns='is_yearround', values='trans_count').fillna(0).reset_index()
    
    if len(ex) == 0:
        continue
    
    # verify columns
    if True not in ex.columns:
        ex[True] = 0
    elif False not in ex.columns:
        ex[False] = 0
    ex = ex[['month', True, False]]
    
    # rename columns and add org
    ex.columns = ['month', 'yearround', 'not_yearround']
    
    ex = ex.sort_values('month', ascending=False)[:6]
    
    ex_data = {
        'org': org,
        'last_month': ex['month'].iloc[0],
        'yearround': ex['yearround'].sum(),
        'not_yearround': ex['not_yearround'].sum()
    }
    
    # append to list
    pivoted_data.append(ex_data)
        
    counter += 1
    if counter % 1000 == 0:
        print("\tdone with {} orgs".format(counter))

iterating through 4641 orgs
	done with 1000 orgs
	done with 2000 orgs


In [26]:
len(pivoted_data), len(orgs)

(2781, 4641)

In [27]:
pvt_df = pd.DataFrame(pivoted_data)
pvt_df['yearround/not'] = pvt_df['yearround'] / pvt_df['not_yearround']
pvt_df.head()

Unnamed: 0,org,last_month,yearround,not_yearround,yearround/not
0,6,2021-05-01,1.0,524.0,0.001908
1,13,2021-05-01,37.0,30.0,1.233333
2,31,2021-05-01,33.0,34.0,0.970588
3,33,2021-05-01,0.0,13.0,0.0
4,39,2021-05-01,22.0,3.0,7.333333


In [28]:
# save_dataframe_to_file("trans-records", "year-round-month.csv", pvt_df)

# Filtering for low year round

In [29]:
# pvt_df = get_dataframe_from_file("trans-records", "year-round-month.csv")

In [30]:
zero_orgs = pvt_df[pvt_df['yearround/not']==0]
len(zero_orgs), "{:.2f}%".format((float(len(zero_orgs)) / float(len(pvt_df))) * 100.)

(555, '19.96%')

In [31]:
prior_preds = pd.read_csv("prior_sends.csv")
pred_orgs = zero_orgs[~zero_orgs['org'].isin(prior_preds['org'].tolist())].sort_values('not_yearround', ascending=False).head(50)['org'].tolist()

In [32]:
org_names = get_dataframe_from_file("qgiv-stats-data", "organizations.clean.csv")

In [33]:
for _, r in org_names[org_names['id'].isin(pred_orgs)].iterrows():
    print("{} ({})".format(r['org_name'], r['id']))

A Piece of My Heart Foundation (442516)


In [34]:
pred_orgs

[445489, 442516, 446765]

In [17]:
#pred_orgs = org_names[org_names['id'].isin(pred_orgs)]['id'].tolist()

In [35]:
prior_data = []
for p in pred_orgs:
    prior_data.append({
        'date': '2021-05-17',
        'org': p
    })
prior_preds.append(pd.DataFrame(prior_data)).to_csv("prior_sends.csv", index=False)

In [18]:
pd.read_csv("prior_sends.csv")['date'].unique()[-3:]

array(['2021-04-05', '2021-04-19', '2021-05-10'], dtype=object)

# Spot checking

In [40]:
org = 226307
df = redshift_query_read('''select 
            count(id) as count,
            sum(amount) as amount,
            date_trunc('month', date) as month, 
            source
        from transactions where org={} and status='A' 
        group by date, source'''.format(org))

In [41]:
df.head()

Unnamed: 0,count,amount,month,source
0,1,10.0,2015-01-01,sms
1,3,145.0,2015-01-01,sms
2,3,40.0,2015-02-01,sms
3,1,250.0,2015-02-01,sms
4,1,10.0,2015-03-01,sms


In [35]:
df['status'] = df['status'].astype(str)
df.groupby('source')['id'].count()

source
don_form     77
mobile       56
sms         553
vt            1
Name: id, dtype: int64