In [1]:
import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *

import pandas as pd
import numpy as np
from datetime import timedelta

# Local CSV export of transactions; read below to derive each org's first transaction date.
TRANS_PATH = "../transactions/transactions.csv"
# Destination CSV for the per-org start date / milestone table built below.
ORG_START_DATES = "../org_start_dates.csv"
# File name of the daily analytics extract.
ANALYTICS_PATH = "analytic_base_daily.csv"

# Getting org start dates from transactions

In [11]:
# Load the raw transaction export. low_memory=False makes pandas process the
# file in a single pass, which avoids mixed-dtype inference between chunks.
trans = pd.read_csv(TRANS_PATH, low_memory=False)

In [5]:
# Derive each org's start date as the date of its earliest status-'A' transaction.
# NOTE(review): 'A' presumably means approved/accepted — confirm against the export schema.
#
# The filtered frame gets its own name instead of rebinding `trans`: later cells
# still index trans[['org', 'form']], which raised KeyError on a top-to-bottom
# run once `trans` had been narrowed to just ['org', 'date'].
trans_status_a = trans.loc[trans['status'] == 'A', ['org', 'date']].copy()
trans_status_a['date'] = pd.to_datetime(trans_status_a['date'])
org_start_dates = trans_status_a.groupby('org')['date'].min().reset_index()

In [7]:
# Rename 'date' -> 'start_date'. The resulting columns are the same as the
# previous copy-then-drop(inplace=True) sequence, but rename is idempotent:
# re-running this cell no longer fails with KeyError once 'date' is gone.
org_start_dates = org_start_dates.rename(columns={'date': 'start_date'})

In [10]:
# week 1, 4, 12, 24, 52
# Each milestone column is the org's start date shifted by a fixed number of days.
# NOTE(review): the 30/90/180-day offsets are month-style approximations rather
# than exact multiples of 7 (28/84/168) — confirm this is the intended definition.
milestone_offsets = [
    ('week_1', 7),
    ('week_4', 30),
    ('week_12', 90),
    ('week_24', 180),
    ('week_52', 364),
]
for milestone_col, offset_days in milestone_offsets:
    org_start_dates[milestone_col] = org_start_dates['start_date'] + timedelta(days=offset_days)

In [12]:
# Spot-check the last rows of the table and their derived milestone dates.
org_start_dates.tail()

Unnamed: 0,org,start_date,week_1,week_4,week_12,week_24,week_52
4859,444888,2019-11-20,2019-11-27,2019-12-20,2020-02-18,2020-05-18,2020-11-18
4860,444895,2019-11-21,2019-11-28,2019-12-21,2020-02-19,2020-05-19,2020-11-19
4861,444896,2019-11-22,2019-11-29,2019-12-22,2020-02-20,2020-05-20,2020-11-20
4862,444897,2019-11-22,2019-11-29,2019-12-22,2020-02-20,2020-05-20,2020-11-20
4863,444902,2019-11-22,2019-11-29,2019-12-22,2020-02-20,2020-05-20,2020-11-20


In [13]:
# Persist the start-date table; index=False keeps the positional row index out of the file.
org_start_dates.to_csv(ORG_START_DATES, index=False)

In [14]:
# Build a form -> org lookup: for every form, take the org of the first
# transaction row that references it (one output row per form).
first_org_per_form = trans.groupby('form')['org'].first()
org_forms = first_org_per_form.reset_index()

In [13]:
# Persist the form -> org lookup. This reuses the `org_forms` frame built in
# the cell above instead of repeating the same groupby over `trans`.
org_forms.to_csv("org_forms.csv", index=False)

# Tagging analytics entries

In [2]:
# Resolve a readable URL for the daily analytics extract in the "ingest-records"
# bucket and load it. The file name comes from ANALYTICS_PATH (defined with the
# other paths at the top of the notebook) rather than a second hard-coded copy
# of the same string.
url = get_file_url("ingest-records", ANALYTICS_PATH)
df_analytics = pd.read_csv(url, low_memory=False)

## NA org values

In [12]:
# Report how many analytics rows have no org attached, as a count and a percentage.
len_entries = len(df_analytics)
org_is_na = df_analytics['org'].isna().sum()
na_fraction = float(org_is_na) / float(len_entries)

summary_template = "{} entries; {} org is NA ({:.2f}%)"
print(summary_template.format(len_entries, org_is_na, na_fraction * 100.))

8330598 entries; 2551994 org is NA (30.63%)


In [15]:
# Distribution of form_type across analytics rows (value_counts omits NaN by default).
df_analytics['form_type'].value_counts()

4.0    6729820
0.0    1600778
Name: form_type, dtype: int64

In [17]:
# Reload the form -> org lookup from the org_forms.csv file written earlier in this notebook.
org_forms = pd.read_csv("org_forms.csv")

def get_org_for_form(form_id, lookup=None):
    """Return the org associated with ``form_id`` in a form -> org table.

    Parameters
    ----------
    form_id
        Form identifier to look up; compared with ``==`` against the
        table's 'form' column.
    lookup : pandas.DataFrame, optional
        Table with 'form' and 'org' columns. When omitted, the module-level
        ``org_forms`` frame is used, so existing one-argument callers
        (e.g. ``Series.apply(get_org_for_form)``) behave as before.

    Returns
    -------
    The 'org' value of the first row whose 'form' equals ``form_id``, or
    ``np.nan`` when no row matches.
    """
    if lookup is None:
        lookup = org_forms

    # Evaluate the boolean mask once; the previous version scanned the table
    # twice per call (once for the length check, once for the value).
    matches = lookup.loc[lookup['form'] == form_id, 'org']
    if matches.empty:
        # np.nan is the canonical lowercase spelling and exists in every
        # NumPy version, unlike the legacy upper-case aliases.
        return np.nan
    return matches.iloc[0]

In [18]:
# Re-derive each analytics row's org from its form id.
df_analytics['form'] = df_analytics['form'].astype('int')

# Vectorised lookup: Series.map aligns every form id against a form-indexed
# Series in one pass. The previous apply() ran a full boolean scan of
# org_forms for each analytics row. drop_duplicates keeps the first row per
# form, matching the "first match wins" rule of get_org_for_form; forms
# absent from the lookup map to NaN.
# NOTE(review): this overwrites any org value already present on the row, as
# the original cell did — confirm that filling only the missing values is not
# what was intended.
form_to_org = org_forms.drop_duplicates(subset='form').set_index('form')['org']
df_analytics['org'] = df_analytics['form'].map(form_to_org)

In [19]:
# Re-run the missing-org report after the form-based tagging above.
len_entries = len(df_analytics)
org_is_na = df_analytics['org'].isna().sum()
na_fraction = float(org_is_na) / float(len_entries)

summary_template = "{} entries; {} org is NA ({:.2f}%)"
print(summary_template.format(len_entries, org_is_na, na_fraction * 100.))

8330598 entries; 2551994 org is NA (30.63%)


## Org start date tagging

In [3]:
# Check that the org start-date table is present in the qgiv-stats-data bucket
# (presumably search_key filters the listing by file name — confirm in s3_support).
list_files("qgiv-stats-data", search_key="org_start")

org_start_dates.csv (0MB)
--------------------------------------------------
Matched files: 1 files (0.0GB)
Bucket qgiv-stats-data contains 8 files (0.3GB)


In [3]:
# Load the org start-date / milestone table from the bucket.
# NOTE(review): if the helper reads the CSV without date parsing, the date
# columns come back as plain strings — confirm before comparing them with dates.
org_start_dates = get_dataframe_from_file("qgiv-stats-data", "org_start_dates.csv")

In [5]:
# Preview the first rows to confirm the expected org / milestone columns loaded.
org_start_dates.head(3)

Unnamed: 0,org,start_date,week_1,week_4,week_12,week_24,week_52
0,0,2009-02-02,2009-02-09,2009-03-04,2009-05-03,2009-08-01,2010-02-01
1,6,2006-05-18,2006-05-25,2006-06-17,2006-08-16,2006-11-14,2007-05-17
2,9,2009-04-28,2009-05-05,2009-05-28,2009-07-27,2009-10-25,2010-04-27


In [3]:
# List the available columns; the metric columns aggregated below are taken from this set.
df_analytics.columns

Index(['id', 'ids', 'date', 'visits', 'ein', 'org', 'form', 'form_type',
       'path', 'product', 'sic', 'don_form_trans_count', 'don_form_trans_vol',
       'fb_trans_count', 'fb_trans_vol', 'givi_trans_count', 'givi_trans_vol',
       'kiosk_trans_count', 'kiosk_trans_vol', 'mobile_trans_count',
       'mobile_trans_vol', 'mobile_visits', 'mobilevt_trans_count',
       'mobilevt_trans_vol', 'one_time_trans_count', 'one_time_trans_vol',
       'p2p_trans_count', 'p2p_trans_vol', 'rec_trans_count', 'rec_trans_vol',
       'sms_trans_count', 'sms_trans_vol', 'vt_trans_count', 'vt_trans_vol'],
      dtype='object')

In [None]:
# Aggregate every org's analytics metrics over five consecutive windows that
# follow its start date (wk1, wk4, wk12, wk24, wk52) and store one wide row
# per org with "<window>_<metric>_mean" and "<window>_<metric>_sum" columns.

# Only rows attributed to an org can be assigned to that org's windows.
df_analytics = df_analytics[df_analytics['org'].notna()]
df_new = []
counter = 0

cols = ['visits', 'don_form_trans_count', 'don_form_trans_vol',
       'fb_trans_count', 'fb_trans_vol', 'givi_trans_count', 'givi_trans_vol',
       'kiosk_trans_count', 'kiosk_trans_vol', 'mobile_trans_count',
       'mobile_trans_vol', 'mobile_visits', 'mobilevt_trans_count',
       'mobilevt_trans_vol', 'one_time_trans_count', 'one_time_trans_vol',
       'p2p_trans_count', 'p2p_trans_vol', 'rec_trans_count', 'rec_trans_vol',
       'sms_trans_count', 'sms_trans_vol', 'vt_trans_count', 'vt_trans_vol']

# (window label, lower-bound column, upper-bound column). The upper bound is
# always inclusive; the lower bound is inclusive for wk1 only, so the windows
# do not overlap.
week_windows = [
    ('wk1', 'start_date', 'week_1'),
    ('wk4', 'week_1', 'week_4'),
    ('wk12', 'week_4', 'week_12'),
    ('wk24', 'week_12', 'week_24'),
    ('wk52', 'week_24', 'week_52'),
]
boundary_cols = ['start_date', 'week_1', 'week_4', 'week_12', 'week_24', 'week_52']

# Parse both sides to datetimes once, so the window tests compare dates
# chronologically regardless of how the CSVs were read. Parsed values are
# kept in separate objects; the 'date' column of df_analytics is untouched.
analytics_dates = pd.to_datetime(df_analytics['date'])
org_bounds = org_start_dates[['org'] + boundary_cols].copy()
for boundary in boundary_cols:
    org_bounds[boundary] = pd.to_datetime(org_bounds[boundary])

# Locate every org's rows in a single pass instead of scanning the whole
# analytics frame once per org inside the loop.
positions_by_org = df_analytics.groupby('org').indices
no_positions = np.array([], dtype=np.intp)

print("Tagging org analytics entries")

for o in org_bounds.itertuples(index=False):
    # NOTE(review): org 0 is skipped, presumably a placeholder id — confirm.
    if o.org == 0:
        continue

    # Orgs without analytics rows still get an output row (NaN means, zero sums).
    positions = positions_by_org.get(o.org, no_positions)
    org_rows = df_analytics.iloc[positions]
    org_dates = analytics_dates.iloc[positions]

    # Filter each window once and reuse it for every metric.
    windows = {}
    for wk, lower_col, upper_col in week_windows:
        lower_bound = getattr(o, lower_col)
        upper_bound = getattr(o, upper_col)
        if wk == 'wk1':
            after_lower = org_dates >= lower_bound
        else:
            after_lower = org_dates > lower_bound
        in_window = after_lower & (org_dates <= upper_bound)
        windows[wk] = org_rows[in_window.values]

    new_data = {
        'org': o.org
    }
    # Metric-major, window-minor order keeps the output column order unchanged.
    for c in cols:
        for wk, _, _ in week_windows:
            new_data["{}_{}_mean".format(wk, c)] = windows[wk][c].mean()
            new_data["{}_{}_sum".format(wk, c)] = windows[wk][c].sum()

    df_new.append(new_data)
    counter += 1
    if counter % 500 == 0:
        print("\tdone with {} orgs".format(counter))

print("Storing tagged dataframe with {} rows".format(len(df_new)))

save_dataframe_to_file("qgiv-stats-data", "org_week_tagged_analytics.csv", pd.DataFrame(df_new))

Tagging org analytics entries
	done with 500 orgs
	done with 1000 orgs
	done with 1500 orgs
	done with 2000 orgs
	done with 2500 orgs
	done with 3000 orgs
	done with 3500 orgs
	done with 4000 orgs
	done with 4500 orgs
