Refocusing the churn modeling on only actionable features that will be more useful to CX

previous features
- log types
- growth volume
- integrations
- recent logins

changes
- try removing log entries, they don't seem particularly useful/actionable

In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# data prep

In [4]:
def get_orgs_logged_in_last_3_months():
    q = '''select
                distinct(users.org) as org
            from login
                left join users on login.user_id=users._id
            where
                login.original_timestamp >= add_months(current_date, -3);'''
    
    return redshift_query_read(q, schema="secure")


def get_diff_mean_growth_churned(churned_orgs_ids):
    q = '''select
                org,
                count(distinct form) as forms,
                date_trunc('month', date) as month,
                count(id) as count,
                sum(amount) as volume
            from transactions
                where status='A'
                group by org, date_trunc('month', date)
                order by date_trunc('month', date) desc;'''
    df_trans_agg = redshift_query_read(q)
    df_trans_agg['month'] = pd.to_datetime(df_trans_agg['month'])
    df_trans_agg.sort_values('month', ascending=True, inplace=True)

    org_growth_data = []

    for org in df_trans_agg['org'].unique():
        this_df = df_trans_agg[df_trans_agg['org']==org].copy()
        if len(this_df) <= 1:
            continue
        this_df['growth'] = this_df['volume'].diff() / this_df['volume'].shift(1)

        org_growth_data.append({
            'org': org,
            'growth': this_df['growth'].replace([np.inf, -np.inf], np.nan).dropna().mean()
        })

    growth_df = pd.DataFrame(org_growth_data)
    growth_df['churned'] = growth_df['org'].isin(churned_orgs_ids)
    
    mean_churned_growth_rate = growth_df[growth_df['churned']]['growth'].mean()
    
    growth_df['mean_diff_growth_churned'] = growth_df['growth'] - mean_churned_growth_rate
    
    return growth_df[['org', 'mean_diff_growth_churned']]


def get_orgs_created_users_in_last_3_months():
    q = '''select 
                org, 
                timestamp 'epoch' + created_at * interval '1 second' as created_at, 
                status 
            from users 
            where created_at >= add_months(current_date, -3)
            order by created_at desc'''
    df_users = redshift_query_read(q, schema='secure')
    df_users['created_at'] = pd.to_datetime(df_users['created_at']).dt.date
    return df_users.groupby('org')['created_at'].count().reset_index()

In [5]:
# load orgs, integrations, transactions, recent logins, growth trend
df_orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")
df_integrations = get_dataframe_from_file("qgiv-stats-data", 'integrations.csv')
df_trans = redshift_query_read("select * from transactions where status='A' and date>=DATEADD('month', -6, CURRENT_DATE)")
logged_in_3_months = get_orgs_logged_in_last_3_months()
created_users = get_orgs_created_users_in_last_3_months()

In [4]:
# prep data
df_churned_orgs = df_orgs[~df_orgs['date_closed'].isnull()]
orgs_growth = get_diff_mean_growth_churned(df_churned_orgs['id'].tolist())

In [5]:
orgs_trans_counts = df_trans.groupby('org')['id'].count().reset_index()
orgs_never_viable = orgs_trans_counts[orgs_trans_counts['id']<100]['org'].tolist()

In [6]:
# compiling data into single dataframe
ftrs = ['org', 'has_integrations', 'recently_logged_in', 'churned', 
        'diff_mean_growth_churned']
orgs = df_orgs['id'].tolist() + df_integrations['org'].tolist() + df_trans['org'].tolist() + logged_in_3_months['org'].tolist()
df = pd.DataFrame(list(set(orgs)), columns=['org']).drop_duplicates()
df['has_integrations'] = df['org'].apply(lambda x: x in df_integrations['org'].tolist())
df['recently_logged_in'] = df['org'].apply(lambda x: x in logged_in_3_months['org'].tolist())
df['churned'] = df['org'].apply(lambda x: x in df_churned_orgs['id'].tolist())
df['recently_created_users'] = df['org'].apply(lambda x: x in created_users['org'].tolist())

df['org'] = df['org'].astype(int)
orgs_growth['org'] = orgs_growth['org'].astype(int)

df = df.merge(orgs_growth, on='org')

In [7]:
print(len(df), len(df['org'].unique()), len(df.drop_duplicates('org')))
df.head(3)

7310 4931 4931


Unnamed: 0,org,has_integrations,recently_logged_in,churned,recently_created_users,mean_diff_growth_churned
0,393219,False,False,True,False,-85.93613
1,196612,False,False,True,False,-88.050778
2,6,True,False,False,False,-89.560004


In [8]:
df['churned'].value_counts()

False    6138
True     1172
Name: churned, dtype: int64

In [9]:
df.groupby(['churned', 'recently_logged_in'])['org'].count().reset_index()

Unnamed: 0,churned,recently_logged_in,org
0,False,False,3759
1,False,True,2379
2,True,False,1172


In [10]:
df.groupby(['churned', 'has_integrations'])['org'].count().reset_index()

Unnamed: 0,churned,has_integrations,org
0,False,False,5301
1,False,True,837
2,True,False,1114
3,True,True,58


# modeling

In [11]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score

  from numpy.core.umath_tests import inner1d


In [12]:
# trying full dataset for benchmark
accuracies = []
for i in range(100):
    data = df.fillna(0.)

    rf = RandomForestClassifier()
    rf.fit(data.drop(['churned', 'org'], axis=1), data['churned'])
    y_pred = rf.predict(data.drop(['churned', 'org'], axis=1))
    (tn, fp), (fn, tp) = confusion_matrix(df['churned'], y_pred)
    accuracies.append(accuracy_score(data['churned'], y_pred))

print("FULL DATASET:")
print("-"*20)
print("true negative: {}".format(tn))
print("false positive: {}".format(fp))
print("false negative: {}".format(fn))
print("true positive: {}".format(tp))
print("absolute accuracy: {:.2f}%".format(np.mean(accuracies) * 100.))

FULL DATASET:
--------------------
true negative: 6108
false positive: 30
false negative: 210
true positive: 962
absolute accuracy: 96.77%


In [13]:
print("FEATURE IMPORTANCES:")
print("-"*20)
for f, v in zip(data.drop(['churned', 'org'], axis=1).columns, rf.feature_importances_):
    print("{}: {:.4f}".format(f, v))

FEATURE IMPORTANCES:
--------------------
has_integrations: 0.0249
recently_logged_in: 0.0608
recently_created_users: 0.0419
mean_diff_growth_churned: 0.8724


In [14]:
# trying w/ test set
accuracies = []
for i in range(100):
    data = df.fillna(0.)
    X_train, X_test, y_train, y_test = train_test_split(data.drop(['churned', 'org'], axis=1), data['churned'], stratify=data['churned'], test_size=0.25)

    rf = RandomForestClassifier()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    (tn, fp), (fn, tp) = confusion_matrix(y_test, y_pred)
    accuracies.append(accuracy_score(y_test, y_pred))

print("TRAIN/TEST SPLIT")
print("-"*20)
print("true negative: {}".format(tn))
print("false positive: {}".format(fp))
print("false negative: {}".format(fn))
print("true positive: {}".format(tp))
print("absolute accuracy: {:.2f}%".format(np.mean(accuracies) * 100.))

TRAIN/TEST SPLIT
--------------------
true negative: 1348
false positive: 187
false negative: 225
true positive: 68
absolute accuracy: 78.31%
