In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import pickle as pkl
from sklearn.model_selection import train_test_split
import datetime

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *
sys.path.insert(1, '../code/')
from support import *

  from numpy.core.umath_tests import inner1d


# data prep

In [3]:
print("\treading in logs, integrations, transactions, and orgs")
df_orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")
df_logs = redshift_query_read("select * from logs")
df_integrations = get_dataframe_from_file("qgiv-stats-data", 'integrations.csv')
df_trans = redshift_query_read("select * from transactions where status='A' and date>=DATEADD('month', -6, CURRENT_DATE)")

print("\tisolating churned orgs")
df_churned_orgs = df_orgs[~df_orgs['date_closed'].isnull()]

print("\t\tbuilding volume growth data")
orgs_growth = get_diff_mean_growth_churned(df_churned_orgs['id'].tolist())
print("\t\tbuilding login history")
logged_in_3_months = get_orgs_logged_in_last_3_months()

print("\tgetting P2P org list from transaction history")
p2p_orgs = df_trans[df_trans.source=='p2p']['org'].unique().tolist()

print("\tfiltering orgs with fewer than 100 transactions")
orgs_trans_counts = df_trans.groupby('org')['id'].count().reset_index()
orgs_never_viable = orgs_trans_counts[orgs_trans_counts['id']<100]['org'].tolist()

	reading in logs, integrations, transactions, and orgs
	isolating churned orgs
		building volume growth data
		building login history
	getting P2P org list from transaction history
	filtering orgs with fewer than 100 transactions


In [4]:
print("\tprepping logs data")
print("\t\tset message labels and date values")
df_logs['created'] = pd.to_datetime(df_logs['created'])
df_logs['month'] = df_logs['created'].dt.month
df_logs['year'] = df_logs['created'].dt.year
df_logs = df_logs[~df_logs['org'].isin(orgs_never_viable)]
df_logs['monthyear'] = df_logs.apply(lambda x: str(x['year'])+'/'+str(x['month']), axis=1)
df_logs['message_label'] = df_logs['message'].apply(label_log_entry)
df_logs = df_logs.merge(pd.get_dummies(df_logs['message_label'],prefix='label'), left_index=True, right_index=True)

print("\t\taggregate log label values per org per month")
message_label_cols = [c for c in df_logs.columns if 'label_' in c]
log_agg = df_logs.groupby(['org', 'monthyear'])[message_label_cols].mean().reset_index()

print("\t\textract last 12 months of log entries per organization")
agged_org_data = []
log_agg['monthyear'] = pd.to_datetime(log_agg['monthyear'])

for o in log_agg['org'].unique():
    _agg = log_agg[log_agg['org']==o].copy()
    _agg.sort_values('monthyear', ascending=False, inplace=True)
    _this_data = _agg.iloc[-12:].copy()
    _this_data['reindex'] = 0
    counter = 0
    for _, r in _this_data.iterrows():
        r['reindex'] = counter
        agged_org_data.append(r.to_dict())
        counter += 1

df_agged = pd.DataFrame(agged_org_data)
df_agged['churned'] = df_agged['org'].isin(df_churned_orgs['id'].tolist())
df_agged = df_agged[df_agged['org']!=0]

	prepping logs data
		set message labels and date values
		aggregate log label values per org per month
		extract last 12 months of log entries per organization


In [5]:
print("\tcompiling integrations with logs")
df_agged['integrations'] = df_agged['org'].isin(df_integrations['org'].unique())

print("\tcompiling growth mean difference and log ins")
df_agged = df_agged.merge(orgs_growth, on="org")
df_agged['logged_in_recently'] = df_agged['org'].isin(logged_in_3_months['org'].tolist())

	compiling integrations with logs
	compiling growth mean difference and log ins


# feature selection modeling

In [10]:
df_agged.dropna(inplace=True)

In [11]:
ftr_cols = [c for c in df_agged.columns if 'label_' in c] + ['integrations', 'logged_in_recently', 'mean_diff_growth_churned']
target_col = 'churned'

print("\ttrain simple model for feature selection")
train_X, test_X, train_y, test_y = train_test_split(df_agged[ftr_cols], df_agged[target_col], test_size=0.33)

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(train_X, train_y)

feats = {} # a dict to hold feature_name: feature_importance
for feature, importance in zip(train_X.columns, rf.feature_importances_):
    feats[feature] = importance #add the name/value pair 

# isolate the most important features from the simple model for the real model
importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
important_ftrs = importances.sort_values(by='Gini-importance').iloc[-10:].reset_index()["index"]
# list formatting the important features output
important_ftrs = list(important_ftrs)

	train simple model for feature selection


# full modeling

In [12]:
print("\tfull model training with best features from simple model")
print("\t\treformatting training data for optimized features")
# reformat dataset to include the most important features for the last 12 months of logs so that we have 1 row per organization
reformatted_data = []
for o in df_agged['org'].unique():
    _df = df_agged[df_agged['org']==o][important_ftrs+['org', 'reindex', 'churned']]
    
    _this_org_data = {}
    for _, r in _df.sort_values('reindex', ascending=True).iterrows():
        for c in _df.columns:
            if 'label_' in c:
                _this_org_data["month_{}_{}".format(r['reindex'], c.replace('.0', ''))] = r[c]
            elif c not in _this_org_data:
                _this_org_data[c] = r[c]
    reformatted_data.append(_this_org_data)
                
df_reformatted = pd.DataFrame(reformatted_data)
df_reformatted['integrations'] = df_reformatted['org'].isin(df_integrations['org'].unique())

print("\t\ttraining full model")
train_X, test_X, train_y, test_y = train_test_split(df_reformatted.drop(['org', 'churned', 'reindex'], axis=1).fillna(0.0), df_reformatted[target_col], test_size=0.33)

rf = RandomForestClassifier(n_estimators=1000)
rf.fit(train_X, train_y)

	full model training with best features from simple model
		reformatting training data for optimized features
		training full model


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
print("\tfiltering to orgs with activity within the past 6 months")
target_active_orgs = df_orgs[df_orgs['status']=='active']['id'].tolist()

six_months_ago = datetime.date.today() - datetime.timedelta(6*365/12)
df_trans['date'] = pd.to_datetime(df_trans['date'])
df_trans_orgdate = df_trans[df_trans['date']>=pd.Timestamp(six_months_ago)][['date', 'org']]
df_trans_agg = df_trans_orgdate.groupby('org')['date'].count().reset_index()
# narrow org list to active orgs with more than 10 transactions in the last 6 months
target_active_orgs = df_trans_agg[(df_trans_agg['date']>=10)&(df_trans_agg['org'].isin(target_active_orgs))]['org'].tolist()

# filter to active orgs
try:
    df_priorpreds = get_dataframe_from_file("qgiv-stats-data", "preds.churn.csv")
    print("\t{} prior preds found".format(len(df_priorpreds)))
except:
    df_priorpreds = pd.DataFrame(columns=['org', 'date_predicted'])
    print("\tno prior preds found")
    
df_reformatted = df_reformatted[df_reformatted['org'].isin(target_active_orgs)]
ftrs = df_reformatted[(~df_reformatted['churned'])&(~df_reformatted['org'].isin(df_priorpreds['org'].tolist()))].drop(['churned', 'org', 'reindex'], axis=1).fillna(0.)

print("\tperform prediction")
y_pred = rf.predict_proba(ftrs)
y_pred_df = pd.DataFrame(y_pred)
y_pred_df['org'] = df_reformatted[~df_reformatted['churned']]['org']

print("\t\t{} unfiltered predicions".format(len(y_pred_df)))
top_preds = y_pred_df[~y_pred_df['org'].isin(df_priorpreds['org'].tolist())].sort_values(1, ascending=False).head(20).dropna()

predicted_orgs = top_preds['org'].tolist()

	filtering to orgs with activity within the past 6 months
	313 prior preds found
	perform prediction
		955 unfiltered predicions


# report

In [15]:
labels = []
for f in ftrs.columns:
    if 'month_0' in f:
        labels.append(f.replace('month_0_', ''))

for o in predicted_orgs:
    inp = df_reformatted[df_reformatted['org']==o]
    
    try:
        org_name = df_orgs[df_orgs['id']==int(o)]['org_name'].iloc[0]
    except:
        org_name = '(Not found)'
    
    print("{} ({})".format(org_name, int(o)))
    label_diffs = {}
    for c in inp.drop(['org', 'churned', 'reindex'], axis=1).columns:
        if c in ['integrations', 'logged_in_recently', 'mean_diff_growth_churned']:
            this_input = inp[c].iloc[0]
            if c == 'mean_diff_growth_churned':
                if this_input > 0:
                    this_input = 'positive'
                else:
                    this_input = 'negative'
                    
            print("\t{}: {}".format(c.replace('_', ' '), this_input))
        else:
            label_diffs[c] = inp[c].fillna(0).iloc[0] - df_reformatted[c].mean()
            
    these_vals_means = {}
    for l in labels:
        these_vals = []
        for k in label_diffs.keys():
            if l in k:
                these_vals.append(label_diffs[k])
        
        these_vals_means[entry_labels[int(l.replace('label_', ''))]] = np.mean(these_vals)
    
    decision_statement = []
    for e in sorted(these_vals_means.items(), key=lambda kv: kv[1]):
        if abs(e[1]) > 0.015:
            if e[1] > 0.:
                decision_statement.append("{} (up)".format(e[0]))
            else:
                decision_statement.append("{} (down)".format(e[0]))
    print("\t{}".format(", ".join(decision_statement)))

San Francisco City Impact (1519)
	integrations: True
	logged in recently: True
	mean diff growth churned: negative
	cloned a new form (down), set form (down), added recipient (down), changed organization (down)
All Things Possible Ministries (537)
	integrations: True
	logged in recently: True
	mean diff growth churned: negative
	cloned a new form (down), set form (down), added recipient (down), changed organization (down)
Kansas City Pet Project (29758)
	integrations: False
	logged in recently: True
	mean diff growth churned: negative
	cloned a new form (down), set form (down), changed organization (down), added recipient (down)
Immerse Arkansas (1445)
	integrations: True
	logged in recently: True
	mean diff growth churned: negative
	cloned a new form (down), changed organization (down), set form (up)
Kansas Food Bank (31764)
	integrations: False
	logged in recently: True
	mean diff growth churned: negative
	cloned a new form (down), changed organization (down), set form (up), added re

# scratch

In [2]:
q = '''select
            org,
            count(distinct form) as forms,
            date_trunc('month', date) as month,
            count(id) as count,
            sum(amount) as volume
        from transactions
            where status='A'
            group by org, date_trunc('month', date)
            order by date_trunc('month', date) desc;'''
df_trans_agg = redshift_query_read(q)
df_trans_agg['month'] = pd.to_datetime(df_trans_agg['month'])
df_trans_agg.sort_values('month', ascending=True, inplace=True)

In [29]:
df_orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")
df_orgs['churned'] = ~df_orgs['date_closed'].isnull()
churned_orgs = df_orgs[df_orgs['churned']]['id'].tolist()
churned_growth_data = []
for o in churned_orgs:
    _df = df_trans_agg[df_trans_agg['org']==o].copy()
    if len(_df) < 12:
        continue
    _df['growth'] = _df['volume'].diff() / _df['volume'].shift(1)
    churned_growth_data.append({
        'org': o,
        'growth': _df.tail(6)['growth'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
        'count': len(_df.tail(6))
    })
churned_growth_df = pd.DataFrame(churned_growth_data)
mean_churned_growth_rate = churned_growth_df['growth'].mean()
print("mean churned growth rate: {:.2f}".format(mean_churned_growth_rate))
print("median: {:.2f}".format(churned_growth_df['growth'].median()))

mean churned growth rate: 7.94
median: 0.34


Source data: month over month growth rate for the last 6 months for a given organization, only looking at those with at least 12 months of processing history. Churned org median month over month growth is 34% so there's a healthy, positive growth rate. For non-churned orgs, the median is 57%.

In [35]:
nonchurned_growth_data = []
for o in df_orgs[~df_orgs['id'].isin(churned_orgs)]['id'].tolist():
    _df = df_trans_agg[df_trans_agg['org']==o].copy()
    if len(_df) < 12:
        continue
    _df['growth'] = _df['volume'].diff() / _df['volume'].shift(1)
    nonchurned_growth_data.append({
        'org': o,
        'growth': _df.tail(6)['growth'].replace([np.inf, -np.inf], np.nan).dropna().mean(),
        'count': len(_df.tail(6))
    })
nonchurned_growth_df = pd.DataFrame(nonchurned_growth_data)
print("mean churned growth rate: {:.2f}".format(nonchurned_growth_df['growth'].mean()))
print("median: {:.2f}".format(nonchurned_growth_df['growth'].median()))

mean churned growth rate: 5.81
median: 0.57


In [36]:
len(churned_growth_df), len(nonchurned_growth_df)

(665, 2267)

In [32]:
ex_org = df_trans_agg[df_trans_agg['org']==430217].copy()
ex_org['growth'] = ex_org['volume'].diff() / ex_org['volume'].shift(1)
ex_org['mean_diff_growth_churned'] = ex_org['growth'] - mean_churned_growth_rate
ex_org.tail(6)

Unnamed: 0,org,forms,month,count,volume,growth,mean_diff_growth_churned
11330,430217,7,2020-04-01,185,7759.99,0.998252,-6.94445
9907,430217,7,2020-05-01,153,4974.89,-0.358905,-8.301606
6993,430217,6,2020-06-01,289,10322.34,1.074888,-6.867813
5049,430217,6,2020-07-01,302,13800.14,0.33692,-7.605781
2588,430217,6,2020-08-01,361,15637.14,0.133115,-7.809587
14,430217,6,2020-09-01,365,20627.95,0.319164,-7.623537
