In [1]:
import pandas as pd

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

In [2]:
# Inclusive lower bound on `created`; it is substituted into the syslog_logs
# integrations query in the "integrations" section.
start_date = "2019-10-01"

# load data

### integrations 

In [3]:
# Pull every syslog_logs row whose message contains "integration" and whose
# `created` value is on or after start_date (formatted into the query text).
# NOTE(review): redshift_query_read arrives via the star-import of s3_support;
# presumably it returns a pandas DataFrame, since later cells call
# .drop/.drop_duplicates/.head on the result — confirm.
q = "select * from syslog_logs where message like '%integration%' and created>='{}'"
integrations = redshift_query_read(q.format(start_date), schema='production')

In [4]:
# Columns of the syslog query result that this analysis does not use.
drop_cols = ['entitytype', 'systemid', 'systemtype', 'type', 'userid',
             'hidden', 'access', 'ack', 'ghost', 'count', 'entity', 'id']

# Rebind instead of dropping in place, and tolerate columns that are already
# gone, so the cell is idempotent: the original `inplace=True` drop raised
# KeyError when the cell was executed a second time on the same kernel.
# Because `id` is removed first, rows that differ only in the dropped columns
# collapse into one during de-duplication.
integrations = (
    integrations
    .drop(columns=drop_cols, errors='ignore')
    .drop_duplicates()
)

In [5]:
# Preview the first three rows of the cleaned integrations frame.
# NOTE(review): the `message` values shown include user email addresses;
# clear or redact this cell's output before sharing the notebook.
integrations.head(3)

Unnamed: 0,org,form,created,message
0,541,541,2021-03-01 12:24:00,info@asianpacificfund.org activated Salesforce...
1,443731,966440,2021-03-03 19:03:43,rwfabercpa@gmail.com activated QuickBooks Onli...
2,442025,442025,2021-04-06 12:46:52,erin@mtryrapecrisis.org activated Emma service...


In [6]:
# Headline numbers for the cleaned integrations frame.
n_entries = len(integrations)
n_orgs = len(integrations['org'].unique())
n_forms = len(integrations['form'].unique())
# Non-null `form` values counted per org; the mean is reported as entries per org.
forms_per_org = integrations.groupby('org')['form'].count()

print("{:,} entries".format(n_entries))
print("{:,} orgs, {:,} forms".format(n_orgs, n_forms))
print("{:.2f} entries per org".format(forms_per_org.mean()))

872 entries
366 orgs, 367 forms
2.38 entries per org


### org status

In [7]:
# Load the full organization table.
# NOTE(review): `orgs` is not referenced by any cell visible in this section —
# confirm it is used further down, otherwise this query can be removed.
q = "select * from organization"
orgs = redshift_query_read(q, schema='production')

In [8]:
# Load every logs row whose message starts with "changed organization".
q = "select * from logs where message like 'changed organization%'"
logs = redshift_query_read(q, schema='production')

In [9]:
# Total "changed organization" log rows, and how many of those messages
# contain the text 'status from active'.
# `na=False` treats rows with a missing message as non-matches and returns a
# boolean mask directly, replacing the separate `.fillna(False)` pass over an
# object-dtype result (recent pandas versions may warn about that downcast).
from_active = logs['message'].str.contains('status from active', na=False)
len(logs), len(logs[from_active])

(42055, 2243)

### transactions

In [10]:
# Aggregate transactions with status 'A' into one row per (form, year):
#   trans_count   - count of transaction ids
#   trans_vol     - sum of amount
#   onetime_count - transactions where recurring = 0
#   rec_count     - transactions where recurring != 0
# NOTE(review): a row whose `recurring` is NULL satisfies neither CASE branch,
# so it is included in trans_count but in neither onetime_count nor rec_count.
q = '''select
            form,
            year,
            count(id) as trans_count,
            sum(amount) as trans_vol,
            count(case when recurring=0 then 1 else null end) as onetime_count,
            count(case when recurring!=0 then 1 else null end) as rec_count
        from transactions
        where status='A' 
        group by form, year'''
trans = redshift_query_read(q, schema='production')

In [11]:
# Flag each form/year row whose form shows up in the integrations frame,
# then tally flagged vs. unflagged rows.
integration_forms = integrations['form'].unique().tolist()
trans['uses_integration'] = trans['form'].isin(integration_forms)
trans['uses_integration'].value_counts()

False    102312
True         70
Name: uses_integration, dtype: int64

In [12]:
# Distinct forms that have at least one row in `trans` and are flagged as
# appearing in the integrations frame.
flagged_forms = trans.loc[trans['uses_integration'], 'form']
len_processing_forms = len(flagged_forms.unique())
print("{} forms w/ transactions & integrations".format(len_processing_forms))

18 forms w/ transactions & integrations


In [13]:
# Mean and median of the per form/year transaction metrics, split by whether
# the row's form is flagged as using an integration.
metric_cols = ['trans_count', 'trans_vol', 'onetime_count', 'rec_count']
summary_stats = ['mean', 'median']
trans.groupby('uses_integration')[metric_cols].agg(summary_stats).reset_index()

Unnamed: 0_level_0,uses_integration,trans_count,trans_count,trans_vol,trans_vol,onetime_count,onetime_count,rec_count,rec_count
Unnamed: 0_level_1,Unnamed: 1_level_1,mean,median,mean,median,mean,median,mean,median
0,False,158.319141,24.0,20010.643752,2400.0,97.795439,14.0,60.523702,0.0
1,True,128.057143,61.0,17495.979714,6785.75,102.028571,53.0,26.028571,2.0
