Here we look at how the most influential factors are distributed between churned and active organizations:

- integrations
- recent logins
- growth volumes

In [19]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# data load

In [20]:
def get_orgs_logged_in_last_3_months():
    """Fetch the distinct orgs that have at least one login in the last 3 months.

    Logins are joined to users on ``login.user_id = users._id`` and the
    window is evaluated by the database (``add_months(current_date, -3)``).
    Because the join is a LEFT join, logins without a matching user row
    contribute a NULL ``org``.

    Returns:
        Whatever ``redshift_query_read`` returns for the query, run against
        the "secure" schema; the query yields a single ``org`` column.
    """
    recent_login_orgs_sql = '''select
                distinct(users.org) as org
            from login
                left join users on login.user_id=users._id
            where
                login.original_timestamp >= add_months(current_date, -3);'''
    return redshift_query_read(recent_login_orgs_sql, schema="secure")


def _mean_growth_per_org(monthly):
    """Average month-over-month volume growth for each org with >= 2 months of data.

    Args:
        monthly: DataFrame with one row per (org, month) and at least the
            columns ``org``, ``month`` (datetime-like) and ``volume``.

    Returns:
        DataFrame with columns ``org`` and ``growth``. ``growth`` is the mean
        of ``(volume - previous volume) / previous volume`` over the org's
        months in chronological order, ignoring infinite and undefined steps
        (e.g. a previous volume of 0). Orgs present in only one month are
        left out; an org whose every step is undefined gets NaN.
    """
    # Rows without an org cannot be attributed to anyone; coercing volume to
    # float keeps the ratio arithmetic uniform for int / float / Decimal input.
    monthly = monthly.dropna(subset=['org'])
    monthly = monthly.assign(volume=monthly['volume'].astype(float))
    monthly = monthly.sort_values('month', ascending=True)

    # An org seen in a single month has no month-over-month change to measure.
    monthly = monthly[monthly.duplicated('org', keep=False)]
    if monthly.empty:
        return pd.DataFrame({'org': [], 'growth': []})

    volume_by_org = monthly.groupby('org', sort=False)['volume']
    growth = volume_by_org.diff() / volume_by_org.shift(1)
    # Growth from a zero-volume month is infinite; treat it as missing so it
    # does not swamp the per-org mean.
    growth = growth.replace([np.inf, -np.inf], np.nan)

    return (
        growth.groupby(monthly['org'], sort=False)
        .mean()
        .rename('growth')
        .reset_index()
    )


def get_diff_mean_growth_churned(churned_orgs_ids):
    """Compare each org's mean monthly volume growth with the churned-org average.

    Monthly transaction volume per org (``status='A'`` transactions only) is
    read from the ``transactions`` table, each org's mean month-over-month
    growth rate is computed, and the mean growth rate of the churned orgs is
    subtracted from it.

    Args:
        churned_orgs_ids: iterable of org ids that count as churned.

    Returns:
        DataFrame with columns ``org`` and ``mean_diff_growth_churned``
        (org growth minus mean churned growth). Orgs with fewer than two
        months of data are omitted. The frame is empty (but has both
        columns) when no org has enough data. If none of the returned orgs
        is in ``churned_orgs_ids`` the churned mean is NaN and so is every
        difference.
    """
    q = '''select
                org,
                count(distinct form) as forms,
                date_trunc('month', date) as month,
                count(id) as count,
                sum(amount) as volume
            from transactions
                where status='A'
                group by org, date_trunc('month', date)
                order by date_trunc('month', date) desc;'''
    df_trans_agg = redshift_query_read(q)
    df_trans_agg['month'] = pd.to_datetime(df_trans_agg['month'])

    growth_df = _mean_growth_per_org(df_trans_agg)
    if growth_df.empty:
        # Nothing to compare; return the expected schema instead of failing
        # on a column lookup against a column-less frame.
        return pd.DataFrame(columns=['org', 'mean_diff_growth_churned'])

    growth_df['churned'] = growth_df['org'].isin(churned_orgs_ids)
    mean_churned_growth_rate = growth_df.loc[growth_df['churned'], 'growth'].mean()
    growth_df['mean_diff_growth_churned'] = growth_df['growth'] - mean_churned_growth_rate

    return growth_df[['org', 'mean_diff_growth_churned']]


def get_orgs_created_users(since=None):
    """Count the users created per org, optionally only those created recently.

    Reads every row of ``users`` (schema "secure"), converting the epoch
    ``created_at`` to a timestamp in SQL.

    Args:
        since: optional cutoff (anything ``pd.Timestamp`` accepts). When
            given, only users with ``created_at >= since`` are counted, so
            orgs without such users are absent from the result. The default
            ``None`` applies no date filter and counts all users.
            NOTE(review): the comparison assumes ``created_at`` comes back
            timezone-naive — confirm against the reader.

    Returns:
        DataFrame with columns ``org`` and ``created_at``, where
        ``created_at`` holds the number of users with a non-null creation
        time for that org.
    """
    q = "select org, timestamp 'epoch' + created_at * interval '1 second' as created_at, status from users order by created_at desc"
    df_users = redshift_query_read(q, schema='secure')
    # Counting non-null values does not need date-level truncation, so the
    # column is only normalised to datetimes (needed for the cutoff below).
    df_users['created_at'] = pd.to_datetime(df_users['created_at'])
    if since is not None:
        df_users = df_users[df_users['created_at'] >= pd.Timestamp(since)]
    return df_users.groupby('org')['created_at'].count().reset_index()

In [34]:
# Load the raw inputs: organizations, integrations, the last six months of
# status 'A' transactions, orgs with a login in the last three months, and
# per-org created-user counts.
STATS_BUCKET = "qgiv-stats-data"
RECENT_TRANSACTIONS_SQL = "select * from transactions where status='A' and date>=DATEADD('month', -6, CURRENT_DATE)"

df_orgs = get_dataframe_from_file(STATS_BUCKET, "organizations.names.csv")
df_integrations = get_dataframe_from_file(STATS_BUCKET, 'integrations.csv')
df_trans = redshift_query_read(RECENT_TRANSACTIONS_SQL)
logged_in_3_months = get_orgs_logged_in_last_3_months()
created_users = get_orgs_created_users()

In [35]:
# Number of transactions (non-null ids) per org within the loaded window.
orgs_trans_counts = (
    df_trans
    .groupby('org')['id']
    .count()
    .reset_index()
)
# Orgs with fewer than 100 transactions in that window. Orgs with no
# transactions at all are not in df_trans, so they never appear in this list.
orgs_never_viable = orgs_trans_counts.loc[orgs_trans_counts['id'] < 100, 'org'].tolist()

In [36]:
# prep data
orgs_growth = get_diff_mean_growth_churned(df_churned_orgs['id'].tolist())

df_orgs['churned'] = ~df_orgs['date_closed'].isnull()
df_orgs = df_orgs.merge(orgs_growth, right_on="org", left_on="id")
df_orgs['integrations'] = df_orgs['id'].isin(df_integrations['org'].tolist())
df_orgs['recent_logins'] = df_orgs['id'].isin(logged_in_3_months['org'].tolist())
df_orgs['recent_created_users'] = df_orgs['id'].isin(created_users['org'].tolist())
df_orgs['never_viable'] = df_orgs['id'].isin(orgs_never_viable)

In [38]:
# Peek at the engineered flags next to the growth metric.
preview_columns = [
    'id',
    'churned',
    'integrations',
    'recent_logins',
    'recent_created_users',
    'mean_diff_growth_churned',
]
df_orgs[preview_columns].head(3)

Unnamed: 0,id,churned,integrations,recent_logins,recent_created_users,mean_diff_growth_churned
0,1045,True,False,False,False,-79.739528
1,444449,False,False,True,True,322.537552
2,442134,False,False,False,True,-91.050222


In [39]:
# For each boolean flag: among orgs where the flag is True, how many churned
# versus not.
for flag in ('integrations', 'recent_logins', 'recent_created_users'):
    print(flag)
    print(df_orgs.loc[df_orgs[flag], 'churned'].value_counts())
    print("-"*20)

# The growth metric is continuous, so compare its mean across churn status.
print('mean_diff_growth_churned')
print(df_orgs.groupby('churned')['mean_diff_growth_churned'].mean())
print("-"*20)

integrations
False    829
True      57
Name: churned, dtype: int64
--------------------
recent_logins
False    2355
True       42
Name: churned, dtype: int64
--------------------
recent_created_users
False    3439
True       94
Name: churned, dtype: int64
--------------------
mean_diff_growth_churned
churned
False    1.361681e+02
True     2.658609e-14
Name: mean_diff_growth_churned, dtype: float64
--------------------


In [49]:
# How many churned orgs nevertheless carry all three boolean signals?
is_churned = df_orgs['churned']
has_every_signal = (
    df_orgs['integrations']
    & df_orgs['recent_logins']
    & df_orgs['recent_created_users']
)
len_all = int((is_churned & has_every_signal).sum())
len_churned = int(is_churned.sum())

print("{} of {} churned orgs had integrations, recent logins, and recent users created".format(len_all, len_churned))

10 of 1170 churned orgs had integrations, recent logins, and recent users created
