In [1]:
import pandas as pd
import numpy as np
import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# Load data

In [2]:
# Input table is produced by: /form health v2/jeremy/notebooks/build - analytics qgiv.py
analytics_store = "qgivmodelsdata"
analytics_file = "analytics_qgiv.change.2019.csv"
df = get_dataframe_from_file(analytics_store, analytics_file)

In [3]:
# Organization records; only the id and the close date are kept.
orgs_all = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")
orgs = orgs_all.loc[:, ['id', 'date_closed']]

# An organization is treated as churned when its close date is populated.
has_close_date = orgs['date_closed'].notna()
churned_orgs = orgs.loc[has_close_date, 'id'].tolist()

# Show total organizations vs. churned organizations.
len(orgs), len(churned_orgs)

(8143, 1701)

In [4]:
# Flag each row whose organization id appears in the churned list.
is_churned_org = df['org'].isin(churned_orgs)
df['churned'] = is_churned_org

# Correlations

In [5]:
# Columns whose names mark them as transaction, recurring, or registration counts.
# (Not referenced again within this cell.)
trans_cols = [
    c for c in df.columns
    if c == 'reg_count' or '_trans_' in c or '_rec_' in c
]

# Pairwise correlation matrix over everything except the date/form/org columns.
corrs = df.drop(columns=['date', 'form', 'org']).corr()

In [6]:
# Correlation of every column with the churn flag, weakest (most negative) first.
# Columns whose correlation is undefined (NaN) are left out.
churn_corr = corrs['churned'].round(4)
churn_corr.dropna().sort_values()

sms_trans_count            -0.0056
restrictions               -0.0052
kiosk_trans_count          -0.0027
don_form_trans_vol         -0.0006
one_time_trans_vol         -0.0006
kiosk_trans_vol            -0.0004
mobilevt_trans_vol         -0.0004
max_amount                 -0.0004
sms_trans_vol              -0.0002
vt_trans_vol               -0.0001
min_amount                 -0.0000
rec_trans_vol               0.0010
dl_trans_volume             0.0014
multirestriction_system     0.0022
fb_trans_vol                0.0023
fb_trans_count              0.0028
mobile_trans_vol            0.0031
new_rec_volume              0.0036
collect_address_mobile      0.0040
req_ded_flds                0.0041
enable_sms                  0.0041
amounts                     0.0051
ded_types                   0.0054
collect_optin               0.0060
pledges_count               0.0060
mobilevt_trans_count        0.0065
opt_fields                  0.0066
pledge_active               0.0078
permit_anonymous    

In [7]:
# Compare group means for the metrics with the strongest churn correlations.
cols = [
    'new_rec_count',
    'mobile_trans_count',
    'dl_trans_count',
    'dl_new_rec_count',
    'one_time_trans_count',
    'don_form_trans_count',
]

# Treat +/-inf as missing, then keep only rows with no missing value in ANY column.
# NOTE(review): dropna() here is not restricted to `cols`; confirm that is intended.
finite_rows = df.replace([np.inf, -np.inf], np.nan).dropna()
churn_means = finite_rows.groupby('churned')[cols].mean()

# One row per metric, one column per churn group.
churn_means.reset_index().T

Unnamed: 0,0,1
churned,False,True
new_rec_count,-0.0574325,-0.0301551
mobile_trans_count,-0.0600902,-0.0210392
dl_trans_count,-0.0532635,-0.0274495
dl_new_rec_count,-0.0178681,-0.00875714
one_time_trans_count,-0.177207,-0.0946777
don_form_trans_count,-0.177207,-0.0946777


Nothing striking here. There are some weak correlations, but the group means do not change sign between churned and non-churned organizations. Ideally we would see a positive mean on one side and a negative mean on the other; that would be a strong indicator of an influencing factor. Here we simply see varying degrees of the same direction. This is not meaningless, but the differences do not currently appear strong enough to justify the additional model complexity.

## lag means

Look at how the rolling means of the settings and processing stats, taken over 1-month and 3-month windows, correlate with churn.

In [12]:
# Build per-form rolling means for every metric column.
#
# For each form, every column other than the identifying columns gets two
# derived columns: `<col>_lag_30` (rolling mean over 4 rows) and
# `<col>_lag_90` (rolling mean over 12 rows). Rows that do not yet have a
# full window (or have any other missing value) are dropped per form.
#
# NOTE(review): the 30/90 labels on 4/12-row windows assume roughly weekly
# rows per form - TODO confirm.
# NOTE(review): rolling() runs in the frame's existing row order; confirm the
# data is already sorted by date within each form.
idx_cols = ['date', 'form', 'org', 'churned']
metric_cols = [c for c in df.columns if c not in idx_cols]

form_frames = []
# sort=False keeps forms in order of first appearance, matching a loop over
# df['form'].unique(), while splitting the frame in a single pass.
for _, form_df in df.groupby('form', sort=False):
    # Collect the derived columns first and attach them in one step instead
    # of inserting them into the frame one at a time.
    lagged = {}
    for c in metric_cols:
        lagged['{}_lag_30'.format(c)] = form_df[c].rolling(window=4).mean()
        lagged['{}_lag_90'.format(c)] = form_df[c].rolling(window=12).mean()

    form_lags = pd.concat([form_df[idx_cols], pd.DataFrame(lagged)], axis=1)
    form_frames.append(form_lags.dropna())

# Concatenate once at the end; DataFrame.append in a loop is quadratic and is
# no longer available in pandas 2.x. The original row index is preserved.
lag_data = pd.concat(form_frames) if form_frames else None

In [21]:
# Correlation of every lag feature with the churn flag (form/org excluded),
# rounded to 4 decimals with undefined values removed.
lag_corr_matrix = lag_data.drop(columns=['form', 'org']).corr()
corrs = lag_corr_matrix['churned'].dropna().round(4)

# Keep only correlations whose magnitude is at least 0.02, weakest first.
corrs[corrs.abs() >= .02].sort_values()

sms_trans_count_lag_30        -0.0203
sms_trans_vol_lag_30          -0.0203
sms_trans_count_lag_90        -0.0202
reg_count_lag_30               0.0213
reg_count_lag_90               0.0216
mobile_trans_count_lag_90      0.0245
mobile_trans_count_lag_30      0.0246
mobile_trans_vol_lag_90        0.0256
mobile_trans_vol_lag_30        0.0259
don_form_trans_count_lag_30    0.0271
one_time_trans_count_lag_30    0.0271
don_form_trans_count_lag_90    0.0273
one_time_trans_count_lag_90    0.0273
don_form_trans_vol_lag_30      0.0274
one_time_trans_vol_lag_30      0.0274
churned                        1.0000
Name: churned, dtype: float64

In [24]:
# Compare group means for the lag features with the strongest churn correlations.
cols = [
    'sms_trans_count_lag_30',
    'sms_trans_vol_lag_30',
    'sms_trans_count_lag_90',
    'reg_count_lag_30',
    'reg_count_lag_90',
    'mobile_trans_count_lag_90',
    'mobile_trans_count_lag_30',
    'mobile_trans_vol_lag_90',
    'mobile_trans_vol_lag_30',
    'don_form_trans_count_lag_30',
    'one_time_trans_count_lag_30',
    'don_form_trans_count_lag_90',
    'one_time_trans_count_lag_90',
    'don_form_trans_vol_lag_30',
    'one_time_trans_vol_lag_30',
]

# Treat +/-inf as missing, then keep only rows with no missing value in ANY column.
# NOTE(review): dropna() here is not restricted to `cols`; confirm that is intended.
finite_lag_rows = lag_data.replace([np.inf, -np.inf], np.nan).dropna()
lag_churn_means = finite_lag_rows.groupby('churned')[cols].mean().round(4)

# One row per feature, one column per churn group.
lag_churn_means.reset_index().T

Unnamed: 0,0,1
churned,False,True
sms_trans_count_lag_30,-0.0096,-0.0214
sms_trans_vol_lag_30,-0.0096,-0.0214
sms_trans_count_lag_90,-0.0095,-0.0214
reg_count_lag_30,-0.0516,-0.0223
reg_count_lag_90,-0.0503,-0.0217
mobile_trans_count_lag_90,-0.037,-0.01
mobile_trans_count_lag_30,-0.0373,-0.01
mobile_trans_vol_lag_90,-0.034,-0.0067
mobile_trans_vol_lag_30,-0.0343,-0.0068


Not much difference between the lag correlations and the mean correlations. There are some interesting data points that I would like to explore further, but they do not show much promise for predicting churn.