In [135]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

%matplotlib inline

In [136]:
start_date = "2021-04-01"

# load & prep data

## load data

In [137]:
# reporting page views
q = '''select * from syslog_logs 
        where 
            created>'{}' and 
            message LIKE '%Page view%' and
            message LIKE '%: reports/%' '''.format(start_date)
df_syslogs = redshift_query_read(q, schema='production')

In [138]:
print("{:,} page view entries".format(len(df_syslogs)))
print("{:,} page view entries - unique".format(len(df_syslogs.drop_duplicates())))

1,727,068 page view entries
589,427 page view entries - unique


In [139]:
# processing
q = "select org, date, sum(amount) as volume, count(id) as count from transactions where date>'{}' and status='A' group by org, date".format(start_date)
org_performance = redshift_query_read(q, schema='public')

In [140]:
print("{:,} orgs".format(len(org_performance['org'].unique())))
print("${:,.2f} mean volume per org".format(org_performance['volume'].mean()))
print("{:,.2f} mean count per org".format(org_performance['count'].mean()))

3,332 orgs
$1,105.15 mean volume per org
7.15 mean count per org


## prep data

### syslogs

In [141]:
df_syslogs = df_syslogs.drop_duplicates()
drop_cols = ['count', 'entitytype', 'entity', 'systemid', 'type', 'hidden',
            'access', 'ack']
df_syslogs = df_syslogs.drop(drop_cols, axis=1)

In [142]:
print("syslogs:")
print("{:,} total entries".format(len(df_syslogs)))
print("{:,} ghost entries".format(len(df_syslogs[df_syslogs['ghost']!=0])))
print("{:,} nonghost entries".format(len(df_syslogs[df_syslogs['ghost']==0])))

syslogs:
589,427 total entries
39,836 ghost entries
549,591 nonghost entries


In [143]:
df_syslogs = df_syslogs[df_syslogs['ghost']==0]

In [144]:
df_syslogs['week'] = df_syslogs['created'].dt.isocalendar().week
df = df_syslogs.groupby(['org', 'week'])['id'].count().reset_index()
df.columns = ['org', 'week', 'views']

### org performance (transactions)

In [145]:
org_performance['week'] = org_performance['date'].dt.isocalendar().week
org_performance = org_performance.groupby(['org', 'week'])[['volume', 'count']].sum().reset_index()

In [146]:
org_performance.sort_values('week', ascending=True, inplace=True)
org_data = []
for o in org_performance['org'].unique():
    this_data = org_performance[org_performance['org']==o].copy()
    for _, r in this_data.iterrows():
        r['forward_month'] = this_data[(this_data['week']>r['week'])&(this_data['week']<=r['week']+4)]['volume'].sum()
        r['backward_month'] = this_data[(this_data['week']<r['week'])&(this_data['week']>=r['week']-4)]['volume'].sum()
        org_data.append(r)
org_data = pd.DataFrame(org_data)

In [147]:
org_data['forward_perc_diff'] = (org_data['forward_month'] - org_data['backward_month']) / org_data['backward_month']

In [148]:
increasing_forward = len(org_data[org_data['forward_month']>org_data['backward_month']])
increasing_forward_perc = (increasing_forward / len(org_data)) * 100.

print("{:,} entries; {:,} orgs".format(len(org_data), len(org_data['org'].unique())))
print("{:,} ({:.2f}%) entries increasing moving forward".format(increasing_forward, increasing_forward_perc))

50,340 entries; 3,332 orgs
24,990 (49.64%) entries increasing moving forward


### merge data

In [149]:
rep_perf = org_data.merge(df, on=['org', 'week'])

In [150]:
rep_perf_data = None
for o in rep_perf['org'].unique():
    this_df = rep_perf[rep_perf['org']==o].copy()
    this_df.sort_values('week', ascending=True, inplace=True)
    this_df['views_diff'] = this_df['views'].diff()
    if rep_perf_data is None:
        rep_perf_data = this_df
    else:
        rep_perf_data = rep_perf_data.append(this_df)

# analysis

In [151]:
rep_perf_data.tail(3)

Unnamed: 0,org,week,volume,count,forward_month,backward_month,forward_perc_diff,views,views_diff
11545,447391.0,38.0,1.0,1.0,3425.55,0.0,inf,2,
11546,447376.0,38.0,1.0,4.0,10.0,0.0,inf,7,
11547,445027.0,38.0,60.0,7.0,544.75,0.0,inf,4,


In [152]:
rep_perf_data['forward_perc_diff'] = rep_perf_data['forward_perc_diff'].replace(np.inf, np.nan)
rep_perf_data['views_diff'] = rep_perf_data['views_diff'].replace(np.inf, np.nan)

In [153]:
len(rep_perf), rep_perf['forward_perc_diff'].isna().sum()

(11548, 62)

In [154]:
len_inc_rep_views = len(rep_perf_data[rep_perf_data['views_diff']>0])
perc_inc_rep_views = (len_inc_rep_views / len(rep_perf_data)) * 100.

print("{:,} ({:.2f}%) entries with increasing reporting views".format(len_inc_rep_views, perc_inc_rep_views))

len_inc_vol = len(rep_perf_data[rep_perf_data['forward_perc_diff']>0])
perc_inc_vol = (len_inc_vol / len(rep_perf_data)) * 100.

print("{:,} ({:.2f}%) entries with increasing processing".format(len_inc_vol, perc_inc_vol))

4,731 (40.97%) entries with increasing reporting views
5,752 (49.81%) entries with increasing processing


In [155]:
len_inc_rep_proc = len(rep_perf_data[(rep_perf_data['views_diff']>0)&(rep_perf_data['forward_perc_diff']>0)])
perc = (len_inc_rep_proc / len(rep_perf_data[rep_perf_data['views_diff']>0])) * 100.

print("{:,} ({:.2f}%) entries with increasing processing of increasing reporting views".format(len_inc_rep_proc, perc))

len_dec_rep_proc = len(rep_perf_data[(rep_perf_data['views_diff']>0)&(rep_perf_data['backward_month']>rep_perf_data['forward_month'])])
perc = (len_dec_rep_proc / len(rep_perf_data[rep_perf_data['views_diff']>0])) * 100.

print("{:,} ({:.2f}%) entries w/ increasing views and backward month > forward month processing".format(len_dec_rep_proc, perc))

2,464 (52.08%) entries with increasing processing of increasing reporting views
2,231 (47.16%) entries w/ increasing views and backward month > forward month processing


In [156]:
rep_perf_data[['forward_perc_diff', 'views_diff']].corr()

Unnamed: 0,forward_perc_diff,views_diff
forward_perc_diff,1.0,0.001468
views_diff,0.001468,1.0


In [157]:
print("PROCESSING AVERAGES")
print("-"*40)
print("all:")
print("\tmean: {:.2f}%".format(rep_perf_data['forward_perc_diff'].mean() * 100.))
print("\tmedian: {:.2f}%".format(rep_perf_data['forward_perc_diff'].median() * 100.))

print("increasing rep views:")
mn_proc_diff_inc_rep = rep_perf_data[rep_perf_data['views_diff']>0]['forward_perc_diff'].mean()
mdn_proc_diff_inc_rep = rep_perf_data[rep_perf_data['views_diff']>0]['forward_perc_diff'].median()

print("\tmean: {:.2f}%".format(mn_proc_diff_inc_rep * 100.))
print("\tmedian: {:.2f}%".format(mdn_proc_diff_inc_rep * 100.))

print("decreasing rep views:")
mn_proc_diff_dec_rep = rep_perf_data[rep_perf_data['views_diff']<0]['forward_perc_diff'].mean()
mdn_proc_diff_dec_rep = rep_perf_data[rep_perf_data['views_diff']<0]['forward_perc_diff'].median()

print("\tmean: {:.2f}%".format(mn_proc_diff_dec_rep * 100.))
print("\tmedian: {:.2f}%".format(mdn_proc_diff_dec_rep * 100.))

PROCESSING AVERAGES
----------------------------------------
all:
	mean: 5505.14%
	median: 1.71%
increasing rep views:
	mean: 2201.20%
	median: 4.56%
decreasing rep views:
	mean: 675.35%
	median: -0.98%


In [158]:
print("REPORTING VISIT AVERAGES")
print("-"*40)
print("all:")
print("\tmean: {:.2f}".format(rep_perf_data['views_diff'].mean()))
print("\tmedian: {:.2f}".format(rep_perf_data['views_diff'].median()))

print("increasing rep views:")
mn_rep_diff_inc_proc = rep_perf_data[rep_perf_data['forward_perc_diff']>0]['views_diff'].mean()
mdn_rep_diff_inc_proc = rep_perf_data[rep_perf_data['forward_perc_diff']>0]['views_diff'].median()

print("\tmean: {:.2f}".format(mn_rep_diff_inc_proc))
print("\tmedian: {:.2f}".format(mdn_rep_diff_inc_proc))

print("decreasing rep views:")
mn_rep_diff_dec_proc = rep_perf_data[rep_perf_data['forward_perc_diff']<0]['views_diff'].mean()
mdn_rep_diff_dec_proc = rep_perf_data[rep_perf_data['forward_perc_diff']<0]['views_diff'].median()

print("\tmean: {:.2f}".format(mn_rep_diff_dec_proc))
print("\tmedian: {:.2f}".format(mdn_rep_diff_dec_proc))

REPORTING VISIT AVERAGES
----------------------------------------
all:
	mean: 2.29
	median: 0.00
increasing rep views:
	mean: 4.12
	median: 1.00
decreasing rep views:
	mean: 0.38
	median: 0.00


# which segments of reporting are used by the highest performing orgs?

In [159]:
# query for org aggregates to identify top performers
q = "select org, sum(amount) as volume from transactions where status='A' group by org"
top_performers = redshift_query_read(q, schema='public')

In [160]:
q = "select id from organization where status=1"
active_orgs = redshift_query_read(q, schema='production')

In [161]:
top_performers = top_performers[top_performers['org'].isin(active_orgs['id'].unique())]

In [162]:
print("{:,} active orgs".format(len(top_performers)))
print("${:,.2f} mean volume".format(top_performers['volume'].mean()))
print("${:,.2f} median volume".format(top_performers['volume'].median()))

4,667 active orgs
$321,429.24 mean volume
$26,369.35 median volume


In [163]:
top_20_orgs = top_performers.sort_values('volume', ascending=False).head(20)['org'].tolist()
top_50_orgs = top_performers.sort_values('volume', ascending=False).head(50)['org'].tolist()
top_100_orgs = top_performers.sort_values('volume', ascending=False).head(100)['org'].tolist()

In [164]:
def cat_reporting_page(url):
    if '/mappings/' in url:
        return 'Mappings'
    elif 'donors/[ID]' in url:
        return 'Donors'
    elif 'export' in url:
        return 'Exports'
    elif '/invoice/' in url:
        return 'Invoices'
    elif 'reports/' in url:
        return 'Reports'
    elif 'transaction/[ID]' in url:
        return 'Transactions'
    else:
        return None

df_syslogs['rep cat'] = df_syslogs['message'].apply(cat_reporting_page)

In [165]:
print("All:")
print(df_syslogs['rep cat'].value_counts(normalize=True))
print("Not top performers:")
print(df_syslogs[~df_syslogs['org'].isin(top_100_orgs)]['rep cat'].value_counts(normalize=True))
print("Top 100 performers:")
print(df_syslogs[df_syslogs['org'].isin(top_100_orgs)]['rep cat'].value_counts(normalize=True))
print("Top 50 performers:")
print(df_syslogs[df_syslogs['org'].isin(top_50_orgs)]['rep cat'].value_counts(normalize=True))
print("Top 20 performers:")
print(df_syslogs[df_syslogs['org'].isin(top_20_orgs)]['rep cat'].value_counts(normalize=True))

All:
Reports     0.927109
Exports     0.067758
Mappings    0.005091
Invoices    0.000042
Name: rep cat, dtype: float64
Not top performers:
Reports     0.923052
Exports     0.073218
Mappings    0.003723
Invoices    0.000008
Name: rep cat, dtype: float64
Top 100 performers:
Reports     0.937195
Exports     0.054186
Mappings    0.008492
Invoices    0.000127
Name: rep cat, dtype: float64
Top 50 performers:
Reports     0.947941
Exports     0.043312
Mappings    0.008571
Invoices    0.000176
Name: rep cat, dtype: float64
Top 20 performers:
Reports     0.935363
Exports     0.053053
Mappings    0.011345
Invoices    0.000239
Name: rep cat, dtype: float64
