The segment widget data is unreliable (beyond a historical point) so I am exploring changing the source to the embed table (schema: production). I need to confirm that the properties are alike, specifically that the use of widgets (as recorded in the production:embed table) is the same across both sources.

In [2]:
import sys
sys.path.insert(1, '../../../scripts/')
from s3_support import *

In [6]:
# pull transactions for performance metric
q = "select org, date, amount, recurring from transactions where date>'2018-01-01' and status='A'"
trans = redshift_query_read(q)

In [7]:
trans.head(3)

Unnamed: 0,org,date,amount,recurring
0,438488,2018-01-02,15.0,999812
1,1775,2018-01-02,1650.0,0
2,438465,2018-01-02,160.0,0


In [20]:
org_trans_sums = trans.groupby('org')['amount'].sum().reset_index()

In [8]:
# pull segment data
q = '''select
            users.org as org,
            count(created_widget.id) as widgets_created,
            date_trunc('day', created_widget.original_timestamp) as day
        from created_widget
            left join users on created_widget.uuid=users.uuid
        group by day, org'''
df_widget_created = redshift_query_read(q, schema="secure")

q = '''select
            users.org as org,
            count(deleted_widget.id) as widgets_deleted,
            date_trunc('day', deleted_widget.original_timestamp) as day
        from deleted_widget
            left join users on deleted_widget.uuid=users.uuid
        group by day, org'''
df_widget_deleted = redshift_query_read(q, schema="secure")

df_widget = df_widget_created.merge(df_widget_deleted, on=['org', 'day'])
df_widget['widgets'] = df_widget

In [13]:
df_widget.sort_values('day', ascending=True, inplace=True)
widget_data = None
for org in df_widget['org'].unique():
    _df = df_widget[df_widget['org']==org].copy()
    _df['created_cumsum'] = _df['widgets_created'].cumsum()
    _df['deleted_cumsum'] = _df['widgets_deleted'].cumsum()
    _df['widgets_cumsum'] = _df['created_cumsum'] - _df['deleted_cumsum']
    if widget_data is None:
        widget_data = _df
    else:
        widget_data = widget_data.append(_df)
widget_data.head(3)

Unnamed: 0,org,widgets_created,day,widgets_deleted,created_cumsum,deleted_cumsum,widgets_cumsum
104,443134,1,2020-02-11,2,1,2,-1
107,443134,2,2020-03-05,1,3,3,0
6,443134,1,2020-03-11,1,4,4,0


In [14]:
segment_widget_orgs = widget_data['org'].unique().tolist()

In [37]:
# pull embed table data 
q = "select entity as org from embed"
embeds = redshift_query_read(q, schema="production")['org'].unique().tolist()

In [38]:
embeds[:3]

[753, 9, 292867]

In [15]:
production_widget_orgs = embeds['entity'].unique().tolist()

In [32]:
org_trans_sums['widgets_production'] = org_trans_sums['org'].apply(lambda x: x in production_widget_orgs)
org_trans_sums['widgets_segment'] = org_trans_sums['org'].apply(lambda x: x in df_widget_created['org'].unique().tolist())

In [33]:
org_trans_sums.head(3)

Unnamed: 0,org,amount,widgets_production,widgets_segment
0,6,427082.59,True,False
1,9,6.96,True,False
2,13,262823.36,True,False


In [34]:
org_trans_sums['widgets_production'].value_counts()

True     2329
False    1679
Name: widgets_production, dtype: int64

In [35]:
org_trans_sums['widgets_segment'].value_counts()

False    4008
Name: widgets_segment, dtype: int64

In [36]:
org_trans_sums.groupby('widgets_production')['amount'].mean()

widgets_production
False     79653.856700
True     238772.731876
Name: amount, dtype: float64