Re-creating churn script with our thumb on the scales. We have defined top influencing factors to be extreme rareties in churned organizations so we will first filter orgs that do not match these factors and then sort by growth mean diff similarity to historical churned organization growth trends.

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# data load

In [28]:
def get_orgs_logged_in_last_3_months():
    q = '''select
                distinct(users.org) as org
            from login
                left join users on login.user_id=users._id
            where
                login.original_timestamp >= add_months(current_date, -3);'''
    
    return redshift_query_read(q, schema="secure")


def get_diff_mean_growth_churned(churned_orgs_ids):
    q = '''select
                org,
                count(distinct form) as forms,
                date_trunc('month', date) as month,
                count(id) as count,
                sum(amount) as volume
            from transactions
                where status='A'
                group by org, date_trunc('month', date)
                order by date_trunc('month', date) desc;'''
    df_trans_agg = redshift_query_read(q)
    df_trans_agg['month'] = pd.to_datetime(df_trans_agg['month'])
    df_trans_agg.sort_values('month', ascending=True, inplace=True)

    org_growth_data = []

    for org in df_trans_agg['org'].unique():
        this_df = df_trans_agg[df_trans_agg['org']==org].copy()
        if len(this_df) <= 1:
            continue
        this_df['growth'] = this_df['volume'].diff() / this_df['volume'].shift(1)

        org_growth_data.append({
            'org': org,
            'growth': this_df['growth'].replace([np.inf, -np.inf], np.nan).dropna().mean()
        })

    growth_df = pd.DataFrame(org_growth_data)
    growth_df['churned'] = growth_df['org'].isin(churned_orgs_ids)
    
    mean_churned_growth_rate = growth_df[growth_df['churned']]['growth'].mean()
    
    growth_df['mean_diff_growth_churned'] = growth_df['growth'] - mean_churned_growth_rate
    
    return growth_df[['org', 'mean_diff_growth_churned']]


def get_diff_mean_growth_churned_six_months(churned_orgs_ids):
    q = '''select
                org,
                count(distinct form) as forms,
                date_trunc('month', date) as month,
                count(id) as count,
                sum(amount) as volume
            from transactions
                where status='A'
                group by org, date_trunc('month', date)
                order by date_trunc('month', date) desc;'''
    df_trans_agg = redshift_query_read(q)
    df_trans_agg['month'] = pd.to_datetime(df_trans_agg['month'])
    df_trans_agg.sort_values('month', ascending=True, inplace=True)

    org_growth_data = []

    for org in df_trans_agg['org'].unique():
        this_df = df_trans_agg[df_trans_agg['org']==org].copy()
        if len(this_df) <= 1:
            continue
        this_df['growth'] = this_df['volume'].diff() / this_df['volume'].shift(1)

        org_growth_data.append({
            'org': org,
            'growth': this_df['growth'].tail(6).replace([np.inf, -np.inf], np.nan).dropna().mean()
        })

    growth_df = pd.DataFrame(org_growth_data)
    growth_df['churned'] = growth_df['org'].isin(churned_orgs_ids)
    
    mean_churned_growth_rate = growth_df[growth_df['churned']]['growth'].mean()
    
    growth_df['mean_diff_growth_churned_six_months'] = growth_df['growth'] - mean_churned_growth_rate
    
    return growth_df[['org', 'mean_diff_growth_churned_six_months']]


def get_orgs_created_users():
    q = "select org, timestamp 'epoch' + created_at * interval '1 second' as created_at, status from users order by created_at desc"
    df_users = redshift_query_read(q, schema='secure')
    df_users['created_at'] = pd.to_datetime(df_users['created_at']).dt.date
    return df_users.groupby('org')['created_at'].count().reset_index()

In [21]:
# load orgs, integrations, transactions, recent logins, recent account creations, growth trend
df_orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")
df_integrations = get_dataframe_from_file("qgiv-stats-data", 'integrations.csv')
df_trans = redshift_query_read("select * from transactions where status='A' and date>=DATEADD('month', -6, CURRENT_DATE)")
logged_in_3_months = get_orgs_logged_in_last_3_months()
created_users = get_orgs_created_users()

In [22]:
zd = get_dataframe_from_file("qgiv-stats-data", "zendesk.mrgd.csv")
zd = zd[['external_id', 'created_at', 'updated_at']]
zd = zd[zd['external_id'].apply(lambda x: str(x).isnumeric())]
zd['org'] = zd['external_id'].astype(int, errors='ignore')
zd.drop('external_id', axis=1, inplace=True)
zd['created_at'] = pd.to_datetime(zd['created_at'])

In [29]:
# prep data
orgs_trans_counts = df_trans.groupby('org')['id'].count().reset_index()
inactive_orgs = df_orgs[(df_orgs['status']=='active')&(~df_orgs['id'].isin(orgs_trans_counts['org'].tolist()))]['id'].tolist()
orgs_never_viable = orgs_trans_counts[orgs_trans_counts['id']<100]['org'].tolist() + inactive_orgs

df_orgs['churned'] = ~df_orgs['date_closed'].isnull()

orgs_growth = get_diff_mean_growth_churned(df_orgs[df_orgs['churned']]['id'].tolist())
df_orgs = df_orgs.merge(orgs_growth, right_on="org", left_on="id")
orgs_growth_six_month = get_diff_mean_growth_churned_six_months(df_orgs[df_orgs['churned']]['id'].tolist())
df_orgs = df_orgs.merge(orgs_growth_six_month, right_on="org", left_on="id")

df_orgs['integrations'] = df_orgs['id'].isin(df_integrations['org'].tolist())
df_orgs['recent_logins'] = df_orgs['id'].isin(logged_in_3_months['org'].tolist())
df_orgs['recent_created_users'] = df_orgs['id'].isin(created_users['org'].tolist())
df_orgs['zendesk_active'] = df_orgs['id'].isin(zd['org'].tolist())
df_orgs['feature_sum'] = df_orgs[['zendesk_active', 'integrations', 'recent_logins', 'recent_created_users']].sum(axis=1)
df_orgs['never_viable'] = df_orgs['id'].isin(orgs_never_viable)

# limit cols
cols = ['id', 'org_name', 'status', 'churned', 'mean_diff_growth_churned', 
        'integrations', 'recent_logins', 'recent_created_users', 'zendesk_active', 
        'feature_sum', 'never_viable', 'mean_diff_growth_churned_six_months']
df_orgs = df_orgs[cols]
# omit never viable orgs
df_orgs = df_orgs[~df_orgs['never_viable']]

**Sum the critical factors (recent logins, recent account creations, active integrations) and sort by this sum and absolute mean churned growth diff simultaneously in order to prioritize churn targets.**

In [12]:
len(df_orgs[(df_orgs['status']=='active')&(~df_orgs['churned'])&(df_orgs['feature_sum']==0)])

4

In [13]:
potential_churns['mean_diff_growth_churned_diff'] = np.abs(potential_churns['mean_diff_growth_churned'].abs() - df_orgs[df_orgs['churned']]['mean_diff_growth_churned'].mean())
potential_churns.sort_values(['mean_diff_growth_churned_diff', 'feature_sum'], ascending=True).head()

NameError: name 'potential_churns' is not defined

In [30]:
churned_mean = df_orgs[df_orgs['churned']]['mean_diff_growth_churned'].mean()
nonchurned_mean = df_orgs[~df_orgs['churned']]['mean_diff_growth_churned'].mean()
print("Mean growth for churned orgs: {:.2f}".format(churned_mean))
print("Mean growth for non-churned orgs: {:.2f}".format(nonchurned_mean))

Mean growth for churned orgs: 2.19
Mean growth for non-churned orgs: 210.06


In [31]:
churned_mean = df_orgs[df_orgs['churned']]['mean_diff_growth_churned_six_months'].mean()
nonchurned_mean = df_orgs[~df_orgs['churned']]['mean_diff_growth_churned_six_months'].mean()
print("6 months mean growth diff churned:")
print("Mean growth for churned orgs: {:.2f}".format(churned_mean))
print("Mean growth for non-churned orgs: {:.2f}".format(nonchurned_mean))

6 months mean growth diff churned:
Mean growth for churned orgs: 2.11
Mean growth for non-churned orgs: 174.65


# integrity checks

In [22]:
df_trans['date'].min(), df_trans['date'].max()

(Timestamp('2020-03-19 00:00:00'), Timestamp('2020-09-14 00:00:00'))

In [14]:
prior_preds = [445054, 442539, 430217, 1486, 444475, 445355]

In [15]:
df_orgs[df_orgs['id'].isin(prior_preds)]

Unnamed: 0,id,org_name,status,churned,mean_diff_growth_churned,integrations,recent_logins,recent_created_users,zendesk_active,feature_sum,never_viable
3047,445054,National Vietnam War Museum,active,False,-88.990329,False,False,False,False,0,False
3136,442539,Noahs Lost Ark Inc,active,False,-89.81863,False,False,False,True,1,False
4049,430217,Stars and Stripes Forever PAC,active,False,6.618124,False,False,False,False,0,False
4872,1486,"World Outreach Church of Des Moines, Inc.",active,False,-88.275824,False,False,False,False,0,False
4873,444475,World War II Veterans Committee,active,False,-83.055505,False,False,False,False,0,False
4878,445355,Wounded Paw Project,active,False,143.697249,False,False,False,True,1,False


In [16]:
orgs_trans_counts[orgs_trans_counts['org'].isin(prior_preds)]

Unnamed: 0,org,id
232,1486,121
924,430217,1666
1531,442539,2921
2208,444475,381
2423,445054,330
2542,445355,239


In [26]:
inactive_orgs = df_orgs[(df_orgs['status']=='active')&(~df_orgs['id'].isin(orgs_trans_counts['org'].tolist()))]['id'].tolist()