In [2]:
import pandas as pd

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# load data

### sys logs

In [3]:
q = '''select 
            org,
            userid,
            date_trunc('week', created) as week,
            count(id) as activities
        from syslog_logs
        where ghost=0
        group by org, userid, date_trunc('week', created)'''
df = redshift_query_read(q, schema='production')

In [4]:
print(len(df))
df.tail(3)

90225


Unnamed: 0,org,userid,week,activities
90222,443521,11379.0,2021-10-11,2
90223,0,1611478.0,2021-09-27,2
90224,0,1608691.0,2021-10-18,2


In [5]:
print("{:,} unique orgs".format(len(df['org'].unique())))
print("{:,} unique users".format(len(df['userid'].unique())))
print("{:,.2f} mean unique users per org".format(df.groupby('org')['userid'].nunique().mean()))
print("{:,.2f} median unique users per org".format(df.groupby('org')['userid'].nunique().median()))

5,042 unique orgs
18,382 unique users
5.09 mean unique users per org
2.00 median unique users per org


In [6]:
df['week'].min()

Timestamp('2020-12-07 00:00:00')

### org processing from transactions

In [7]:
q = '''select
            org,
            date_trunc('week', date) as week,
            count(id) as trans_count,
            sum(amount) as trans_vol
        from transactions
        where
            status='A' and
            date>='{}' 
        group by org, date_trunc('week', date)'''
trans = redshift_query_read(q.format(df['week'].min()), schema='public')

In [8]:
print("{:,} entries".format(len(trans)))
print("{:,} orgs".format(len(trans['org'].unique())))
print("${:,.2f} mean weekly vol per org".format(trans['trans_vol'].mean()))
print("${:,.2f} median weekly vol per org".format(trans['trans_vol'].median()))
print("{:,.2f} mean weekly count per org".format(trans['trans_count'].mean()))
print("{:,.2f} median weekly count per org".format(trans['trans_count'].median()))

82,725 entries
3,703 orgs
$4,947.93 mean weekly vol per org
$505.00 median weekly vol per org
25.64 mean weekly count per org
5.00 median weekly count per org


# analysis

## activity to performance

### average user activity per org

In [9]:
org_user_aggs = df.groupby(['org', 'userid'])['activities'].sum().reset_index()
org_user_aggs = org_user_aggs[org_user_aggs['org']!=0]
org_user_aggs['activities'].agg(['mean', 'median'])

mean      1915.387958
median     110.000000
Name: activities, dtype: float64

In [10]:
org_aggs = org_user_aggs.groupby('org')['activities'].agg(['mean', 'median', 'count']).reset_index()
org_aggs.columns = ['org', 'mean activities', 'median activities', 'users']
org_aggs = org_aggs.merge(trans.groupby('org')[['trans_count', 'trans_vol']].sum().reset_index(), on='org')
org_aggs.head(3)

Unnamed: 0,org,mean activities,median activities,users,trans_count,trans_vol
0,6,9825.333333,1366.0,9,1403,232426.41
1,9,33.136364,18.0,44,7,155.0
2,13,1098.0,257.0,8,786,89175.58


In [11]:
org_aggs[['mean activities', 'median activities', 'trans_count', 'trans_vol']].corr()[['trans_count', 'trans_vol']].head(2)

Unnamed: 0,trans_count,trans_vol
mean activities,0.223002,0.151904
median activities,0.097311,0.023282


_Overall weekly user activity is not strongly correlated to org performance._

### activity to performance for week and proceeding week

In [12]:
org_weekly = df.merge(trans, on=['org', 'week'])
org_weekly = org_weekly[org_weekly['org']!=0]
org_weekly = org_weekly.groupby(['org', 'week'])[['activities', 'trans_count', 'trans_vol']].sum().reset_index()
org_weekly.head(3)

Unnamed: 0,org,week,activities,trans_count,trans_vol
0,6,2020-12-14,5,23,2612.83
1,6,2021-01-04,6,13,1114.73
2,6,2021-01-18,6,8,228.52


In [13]:
org_weekly_data = None
for o in org_weekly['org'].unique():
    this_df = org_weekly[org_weekly['org']==o].copy().sort_values('week', ascending=True)
    this_df['trans_count_forward'] = this_df['trans_count'].shift(-1)
    this_df['trans_vol_forward'] = this_df['trans_vol'].shift(-1)
    
    if org_weekly_data is None:
        org_weekly_data = this_df
    else:
        org_weekly_data = org_weekly_data.append(this_df)

In [14]:
org_weekly_data.drop(['org', 'week'], axis=1).corr()['activities']

activities             1.000000
trans_count            0.354009
trans_vol              0.091870
trans_count_forward    0.350002
trans_vol_forward      0.099736
Name: activities, dtype: float64

There is a stronger correlation between weekly activities and transaction count on a weekly basis than overall aggregates. This correlation weakens very slightly moving forward to the following week. Interestingly, while both correlations are rather weak, activities has a stronger correlation to forward volume than the current volume.

## super users vs average

### active users per week per org

In [15]:
org_weekly_users = df.groupby(['org', 'week'])['userid'].nunique().reset_index()
org_weekly_users = org_weekly_users.merge(trans, on=['org', 'week'])
org_weekly_users.tail(3)

Unnamed: 0,org,week,userid,trans_count,trans_vol
35020,447702,2021-10-11,1,1,1.0
35021,447707,2021-10-11,3,1,1.0
35022,447722,2021-10-18,1,1,1.0


In [16]:
print("{:,.2f} mean active users per week per org".format(org_weekly_users['userid'].mean()))
print("{:,.2f} median active users per week per org".format(org_weekly_users['userid'].median()))

1.80 mean active users per week per org
1.00 median active users per week per org


In [17]:
org_weekly_users[['userid', 'trans_count', 'trans_vol']].corr()

Unnamed: 0,userid,trans_count,trans_vol
userid,1.0,0.224591,0.01473
trans_count,0.224591,1.0,0.029747
trans_vol,0.01473,0.029747,1.0


In [18]:
for r in [(0, 1), (2, 4), (5, 7), (8, 10), (11)]:
    try:
        this_data = org_weekly_users[(org_weekly_users['userid']>=r[0])&(org_weekly_users['userid']<=r[1])]
    except TypeError as e:
        this_data = org_weekly_users[org_weekly_users['userid']>=r]

    print("Range: {} ({:,} orgs)".format(r, len(this_data['org'].unique())))
    print("Count mean: {:,.2f}; median: {:,.2f}".format(this_data['trans_count'].mean(), this_data['trans_count'].median()))
    print("Vol mean: ${:,.2f}; median: ${:,.2f}".format(this_data['trans_vol'].mean(), this_data['trans_vol'].median()))
    print("-"*40)

Range: (0, 1) (2,824 orgs)
Count mean: 26.74; median: 7.00
Vol mean: $3,572.56; median: $760.00
----------------------------------------
Range: (2, 4) (2,052 orgs)
Count mean: 59.26; median: 13.00
Vol mean: $14,157.80; median: $1,456.85
----------------------------------------
Range: (5, 7) (326 orgs)
Count mean: 127.07; median: 47.00
Vol mean: $57,866.78; median: $5,506.97
----------------------------------------
Range: (8, 10) (57 orgs)
Count mean: 253.39; median: 148.00
Vol mean: $25,376.97; median: $17,471.23
----------------------------------------
Range: 11 (23 orgs)
Count mean: 1,178.07; median: 220.00
Vol mean: $65,213.68; median: $26,614.56
----------------------------------------


There is certainly a relationship between active users and performance. The median performance rises steadily with the increase in active users. Given the weak correlation and the fact that this relationship does not hold for mean performance, one would assume that greater active users is a consequence of greater performance rather than the other way around.

### activity by user

In [19]:
user_agg = df.groupby(['org', 'userid'])['activities'].agg(['sum', 'count', 'mean', 'median']).reset_index()
user_agg.columns = ['org', 'user', 'activities', 'weeks', 'mean activities', 'median activities']
user_agg.tail()

Unnamed: 0,org,user,activities,weeks,mean activities,median activities
25675,447714,1619864.0,60,1,60.0,60.0
25676,447715,1619875.0,86,1,86.0,86.0
25677,447722,1621650.0,50,1,50.0,50.0
25678,447723,1621789.0,260,1,260.0,260.0
25679,447724,1621843.0,7,1,7.0,7.0


### users lifetime

In [20]:
user_agg['weeks'].mean(), user_agg['weeks'].median()

(3.508411214953271, 1.0)

In [25]:
ranges = [(1, 10), (11, 20), (21, 30), (31, 40), (41, 50), 50]
for r in ranges:
    try:
        these_users = user_agg[(user_agg['activities']>=r[0])&(user_agg['activities']<=r[1])]
    except Exception as e:
        these_users = user_agg[user_agg['activities']>r]
    print("{}: mean weeks: {:.2f}; median weeks: {:.2f}; orgs: {:,}; total users: {:,}".format(r, these_users['weeks'].mean(), these_users['weeks'].median(), len(these_users['org'].unique()), len(these_users)))

(1, 10): mean weeks: 1.06; median weeks: 1.00; orgs: 3,679; total users: 11,852
(11, 20): mean weeks: 1.36; median weeks: 1.00; orgs: 805; total users: 3,292
(21, 30): mean weeks: 1.84; median weeks: 2.00; orgs: 401; total users: 998
(31, 40): mean weeks: 2.17; median weeks: 2.00; orgs: 248; total users: 463
(41, 50): mean weeks: 2.74; median weeks: 2.00; orgs: 239; total users: 386
50: mean weeks: 7.96; median weeks: 6.00; orgs: 3,296; total users: 8,689


### user activity distribution per processing

In [80]:
users_activity_per_org = []
for o in user_agg['org'].unique():
    if o == 0:
        continue
    this_data = user_agg[user_agg['org']==o]
    users_activity_per_org.append({
        'org': o,
        '1 - 10': len(this_data[this_data['mean activities']<10]),
        '11 - 20': len(this_data[(this_data['mean activities']>10)&(this_data['mean activities']<=20)]),
        '21 - 30': len(this_data[(this_data['mean activities']>20)&(this_data['mean activities']<=30)]),
        '31 - 40': len(this_data[(this_data['mean activities']>30)&(this_data['mean activities']<=40)]),
        '41 - 50': len(this_data[(this_data['mean activities']>40)&(this_data['mean activities']<=50)]),
        '50+': len(this_data[this_data['mean activities']>50])
    })

In [87]:
pd.DataFrame(users_activity_per_org).mean()

org        369667.597779
1 - 10          2.539270
11 - 20         0.647560
21 - 30         0.167394
31 - 40         0.081119
41 - 50         0.079334
50+             1.452797
dtype: float64

We see here that the original assumption that there are a small number of super users per org does not necessarily hold true. There are clearly more on the lowest end (1 - 10 activities per week) but the second greatest group by activity is in the highest grouping (50+). It appears by the averages that there are 1 or 2 users that are doing quite a lot, 2 or 3 that do very little, and not much in the middle.

In [90]:
org_users = pd.DataFrame(users_activity_per_org)
org_users = org_users.merge(trans.groupby('org')[['trans_count', 'trans_vol']].sum().reset_index(), on='org')
org_users.head()

Unnamed: 0,org,1 - 10,11 - 20,21 - 30,31 - 40,41 - 50,50+,trans_count,trans_vol
0,6,2,0,0,0,0,7,1403,232426.41
1,9,9,16,9,1,1,8,7,155.0
2,13,2,0,1,1,0,4,786,89175.58
3,31,1,0,0,0,0,1,244,53649.51
4,33,1,0,0,0,0,1,68,3059.0


In [91]:
org_users.corr()

Unnamed: 0,org,1 - 10,11 - 20,21 - 30,31 - 40,41 - 50,50+,trans_count,trans_vol
org,1.0,-0.014072,-0.013603,0.017173,0.002819,0.004895,-0.006116,-0.024125,0.000696
1 - 10,-0.014072,1.0,0.041229,0.093564,0.003202,0.086798,0.105304,0.065986,0.001306
11 - 20,-0.013603,0.041229,1.0,0.245814,0.068848,0.060876,0.169846,0.056221,0.002525
21 - 30,0.017173,0.093564,0.245814,1.0,0.076327,0.082949,0.153426,0.017744,-0.000899
31 - 40,0.002819,0.003202,0.068848,0.076327,1.0,0.001613,0.112225,0.041417,0.055632
41 - 50,0.004895,0.086798,0.060876,0.082949,0.001613,1.0,0.268181,0.100473,0.091103
50+,-0.006116,0.105304,0.169846,0.153426,0.112225,0.268181,1.0,0.320555,0.160203
trans_count,-0.024125,0.065986,0.056221,0.017744,0.041417,0.100473,0.320555,1.0,0.192437
trans_vol,0.000696,0.001306,0.002525,-0.000899,0.055632,0.091103,0.160203,0.192437,1.0


Given we already generally know that organizations more engaged in the system process more, it is unsurprising that we see the strongest correlations to transaction counts and volume in the 50+ activity grouping. Only one other group (41-50 to transaction count) has a correlation that breaks 10%, and even here it is just barely.

In [93]:
org_users['trans_vol'].quantile([0.25, 0.5, 0.75])

0.25     1775.1650
0.50    11598.2000
0.75    51899.3575
Name: trans_vol, dtype: float64

In [96]:
org_users[org_users['trans_vol']>51899.36].mean()

org            322554.923588
1 - 10              1.218162
11 - 20             0.450720
21 - 30             0.171650
31 - 40             0.114064
41 - 50             0.137320
50+                 3.770764
trans_count      2009.330011
trans_vol      419471.248505
dtype: float64

The organizations in third quantile of processing have a very different user activity profile than the system average. We saw in all organizations that there were more users in the 1 - 10 activities group than the 50+. The organizations in the third quantile on the other hand have on average 1 person doing very littel and nearly 4 people doing quite a lot, every week.

This further dispels the hypothesis that successful organizations have a single super user that is primarily interacting with the system, acting as a resident expert/primary operator. It would appear that the more successful organizations have several high frequency, high engement users regularly interacting with the system.