In [3]:
import pandas as pd
%matplotlib inline

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *

# load data

In [23]:
# Load the organization roster and keep only the id and closure-date columns.
orgs = get_dataframe_from_file("qgiv-stats-data", "organizations.names.csv")[['id', 'date_closed']]

# An organization counts as churned when its date_closed is populated.
has_close_date = orgs['date_closed'].notna()
churned_orgs = orgs.loc[has_close_date, 'id'].tolist()

# Display: total orgs, churned orgs, churned share as a percentage string.
n_orgs, n_churned = len(orgs), len(churned_orgs)
n_orgs, n_churned, "{:.2f}%".format((float(n_churned) / float(n_orgs)) * 100.)

(8143, 1701, '20.89%')

In [6]:
# Pull every row and column of the logs table through redshift_query_read.
q = "select * from logs"
logs = redshift_query_read(q)

# Display the number of log rows returned.
logs.shape[0]

In [9]:
# Coerce the identifier columns to plain integers, mapping missing values to 0.
for id_col in ('systemid', 'systemtype', 'org'):
    logs[id_col] = logs[id_col].fillna(0).astype('int')

# Flag each log row whose organization id is in the churned list.
# NOTE(review): rows with a missing org become org 0 above, so they are flagged
# as not churned unless 0 appears in churned_orgs -- confirm that is intended.
logs['churned'] = logs['org'].isin(churned_orgs)

# grouping by system type, system wide

In [18]:
# Count log rows for every (churned flag, system type) combination, system wide.
type_counts = logs.groupby(['churned', 'systemtype'])['systemid'].count()

grpd_counts = type_counts.reset_index()
grpd_counts.columns = ['churned', 'type', 'count']

# Largest groups first.
grpd_counts = grpd_counts.sort_values('count', ascending=False)
grpd_counts.head()

Unnamed: 0,churned,type,count
15,False,27,914465
13,False,25,443611
28,False,44,418385
6,False,13,410534
16,False,28,320542


In [35]:
# Put the churned / not-churned counts side by side, one row per system type.
churn_pivot = grpd_counts.pivot(index="type", columns="churned", values="count")

# Pin the column order to (False, True) so the positional relabel below is
# always correct, even when one churn class has no log rows at all; a class
# that is absent simply shows up as a column of zeros.
churn_pivot = churn_pivot.reindex(columns=[False, True]).fillna(0)

grpd = churn_pivot.reset_index()
grpd.columns = ['type', 'Not Churned', 'Churned']

# Relative gap between the two counts, expressed against the not-churned count.
# These are raw, system-wide message counts, so the gap also reflects how many
# more not-churned log rows there are overall.
grpd['perc_diff'] = (grpd['Not Churned'] - grpd['Churned']) / grpd['Not Churned']
grpd

Unnamed: 0,type,Not Churned,Churned,perc_diff
0,0,2673.0,2.0,0.999252
1,2,6.0,0.0,1.0
2,4,3158.0,304.0,0.903737
3,8,970.0,15.0,0.984536
4,11,43382.0,6731.0,0.844843
5,12,242809.0,11831.0,0.951274
6,13,410534.0,12042.0,0.970667
7,15,1082.0,94.0,0.913124
8,18,4018.0,717.0,0.821553
9,20,294.0,54.0,0.816327


We see that about 20% of organizations have churned, but the gap between churned and not-churned organizations in the system type messages present in the logs is considerably larger than that. There are clear, strong differences between these figures, but we are still looking at system-wide log message presence. It would be better to group by organization to examine the within-organization distributions.

In [36]:
# Count log rows for every (organization, system type) combination.
org_type_counts = logs.groupby(['org', 'systemtype'])['systemid'].count()

grpd_counts = org_type_counts.reset_index()
grpd_counts.columns = ['org', 'type', 'count']

# Largest groups first.
grpd_counts = grpd_counts.sort_values('count', ascending=False)
grpd_counts.head()

Unnamed: 0,org,type,count
9,0,44,409982
15038,441565,27,41182
3960,14268,27,36278
4695,27639,13,30921
921,381,25,27630


In [39]:
# Per-organization share of each system type.
#
# For every (org, type) row in grpd_counts, attach the organization's total
# message count and the fraction of that total contributed by the row's type.
# A grouped transform broadcasts each org's sum back onto its own rows, which
# avoids a per-org Python loop that grows a frame with DataFrame.append
# (quadratic copying, and the method no longer exists in pandas 2.x). It also
# works on an empty grpd_counts instead of leaving grpd_org_data as None.
grpd_org_data = grpd_counts.copy()
grpd_org_data['total'] = grpd_org_data.groupby('org')['count'].transform('sum')
grpd_org_data['perc'] = grpd_org_data['count'] / grpd_org_data['total']

# Flag rows that belong to organizations listed in churned_orgs.
grpd_org_data['churned'] = grpd_org_data['org'].isin(churned_orgs)

In [52]:
# Average within-organization share of each system type, split by churn flag.
# NOTE(review): the mean runs over the rows present in grpd_org_data only; an
# organization with no row for a type does not contribute a zero -- confirm
# that is the intended definition.
share_by_type = (
    grpd_org_data
    .groupby(['type', 'churned'])['perc']
    .mean()
    .reset_index()
    .pivot(index="type", columns="churned", values="perc")
    .fillna(0)  # a type never seen for one churn class gets a share of 0
)

grpd_org = share_by_type.reset_index()
grpd_org.columns = ['type', 'not churned', 'churned']

# Absolute gap between the two average shares.
gap = grpd_org['not churned'] - grpd_org['churned']
grpd_org['diff'] = gap.abs()

In [54]:
# (system type id, human-readable label) pairs for the types named in this
# analysis. Types without an entry get an empty label.
LABELS = [
    (8, "SETTINGS"),
    (11, "DONORLOGINS"),
    (18, "REPORTING"),
    (23, "RECURRING"),
    (24, "FORMS"),
    (29, "RECIPIENTS"),
    (32, "PROMOS"),
    (35, "ORGANIZATION"),
    (41, "ORG_SMS"),
    (49, "PRICING_TEMPLATE")
]

def get_label_for_type(type_enum):
    """Return the label registered in LABELS for a system type id.

    Args:
        type_enum: system type id to look up (must be hashable).

    Returns:
        The matching label string, or "" when the id has no entry in LABELS.
        If an id were listed more than once, the last entry wins.
    """
    # Build the mapping at call time so it always reflects the current LABELS;
    # dict() keeps the last value for a repeated key.
    return dict(LABELS).get(type_enum, "")

# Attach a readable label to each system type (empty when none is registered).
type_labels = grpd_org['type'].apply(get_label_for_type)
grpd_org['label'] = type_labels

# Show the ten system types with the largest churned / not-churned gap.
by_gap = grpd_org.sort_values('diff', ascending=False)
by_gap.head(10)

Unnamed: 0,type,not churned,churned,diff,label
20,35,0.332408,0.504513,0.172105,ORGANIZATION
31,49,0.166733,0.0,0.166733,PRICING_TEMPLATE
8,18,0.092552,0.238364,0.145813,REPORTING
11,23,0.198197,0.325602,0.127405,RECURRING
3,8,0.12742,0.004969,0.122451,SETTINGS
25,41,0.08775,0.200934,0.113184,ORG_SMS
4,11,0.228393,0.322237,0.093844,DONORLOGINS
12,24,0.227409,0.310119,0.08271,FORMS
17,29,0.12015,0.190274,0.070124,RECIPIENTS
18,32,0.069068,0.135231,0.066162,PROMOS


Here we're seeing log message system types as a percentage of the organization's total log messages. There are some rather strong distinctions here that should prove to be good features for modeling. Regardless of the efficacy of specific features, __I think it would be a good idea to shift the logs model to focus on percentage of the organization's logs rather than count as it presently does__.