Need to rebuild data to contain all necessary data points within a single file. Median amounts need to be made available for by the following groupings:

- state
- zip (and none for all state)
- is christmas, is new years, neither

In [1]:
import pandas as pd

import sys
sys.path.insert(1, '../../scripts/')
from s3_support import *
%matplotlib inline

# one time

## localize data

In [161]:
q = '''select
            zip,
            state,
            platform,
            median(amount) as amount,
            count(id) as count,
            case 
                when date_part('month', date) = 12 and date_part('day', date) = 25
                    then True
                    else False
            end as is_christmas,
            case 
                when date_part('month', date) = 12 and (date_part('day', date) = 30 or date_part('day', date) = 31)
                    then True
                    else False
            end as is_newyears
        from transactions
        where
            status='A' and
            recurring=0 and
            source in ('don_form', 'mobile', 'sms') and
            donations_amt>0
        group by zip, state, platform, is_christmas, is_newyears;'''
trans = redshift_query_read(q, schema='production')
trans['query'] = 'trans'

In [162]:
# query for state, platform, xmas & new years
q = '''select
            state,
            platform,
            median(amount) as amount,
            count(id) as count,
            case 
                when date_part('month', date) = 12 and date_part('day', date) = 25
                    then True
                    else False
            end as is_christmas,
            case 
                when date_part('month', date) = 12 and (date_part('day', date) = 30 or date_part('day', date) = 31)
                    then True
                    else False
            end as is_newyears
        from transactions
        where
            status='A' and
            recurring=0 and
            source in ('don_form', 'mobile', 'sms') and
            donations_amt>0
        group by state, platform, is_christmas, is_newyears;'''
trans_nozip = redshift_query_read(q, schema='production')
trans_nozip['query'] = 'trans_nozip'

In [163]:
trans_nozip['zip'] = 'None'

trans = trans.append(trans_nozip)

In [164]:
platforms = ['iPhone', 'iPad', 'Android', 'Mac', 'Windows']
trans = trans[trans['platform'].isin(platforms)]

In [165]:
q = '''select
            state,
            median(amount) as amount,
            count(id) as count,
            case 
                when date_part('month', date) = 12 and date_part('day', date) = 25
                    then True
                    else False
            end as is_christmas,
            case 
                when date_part('month', date) = 12 and (date_part('day', date) = 30 or date_part('day', date) = 31)
                    then True
                    else False
            end as is_newyears
        from transactions
        where
            status='A' and
            recurring=0 and
            source in ('don_form', 'mobile', 'sms') and
            donations_amt>0
        group by state, is_christmas, is_newyears;'''
trans_nozipplatform = redshift_query_read(q, schema='production')
trans_nozipplatform['query'] = 'trans_nozipplatform'

In [166]:
trans_nozipplatform['zip'] = 'None'
trans_nozipplatform['platform'] = 'None'

trans = trans.append(trans_nozipplatform)

In [12]:
def state_fix(s):
    state_truncs = {'geor': 'GA', 'ohio': 'OH', 'cali': 'CA', 'fla': 'FL',
                    'flor': 'FL', 'hawa': 'HI', 'virg': 'VA', 'texa': 'TX',
                    'newy': 'NY', 'mich': 'MI', 'arka': 'AR', 'mass': 'MA',
                    'okla': 'OK', 'idah': 'ID', 'loui': 'LA', 'illi': 'IL',
                    'conn': 'CT', 'alas': 'AK', 'colo': 'CO', 'iowa': 'IA',
                    'kans': 'KS', 'wyom': 'WY', 'wisc': 'WI', 'wash': 'WA',
                    'verm': 'VT', 'utah': 'UT', 'tenn': 'TN', 'rhod': 'RI',
                    'oreg': 'OR', 'neva': 'NV', 'nebr': 'NE', 'indi': 'IN',
                    'mont': 'MT', 'alab': 'AL', 'miss': 'MS', 'penn': 'PA',
                    'dela': 'DE'}
    if str(s).lower() in state_truncs.keys():
        return state_truncs[str(s).lower()]
    
    return str(s).upper()

In [None]:
trans['state'] = trans['state'].apply(state_fix)

In [13]:
us_states = [
    'AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DC', 
    'DE', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 
    'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 
    'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 
    'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 
    'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 
    'WV', 'WI', 'WY'
]

In [None]:
trans = trans[trans['state'].isin(us_states)]

## compiling data

In [None]:
# de-duping from malformed state names
grp = ['zip', 'state', 'platform', 'is_christmas', 'is_newyears']
trans = trans.groupby(grp)[['count', 'amount']].max().reset_index()

In [169]:
trans.tail(2)

Unnamed: 0,zip,state,platform,amount,count,is_christmas,is_newyears,query
3057,,CT,,5.0,5,False,False,trans_nozipplatform
3063,,HI,,5.0,1,False,False,trans_nozipplatform


In [170]:
trans.head(2)

Unnamed: 0,zip,state,platform,amount,count,is_christmas,is_newyears,query
14,0,LA,Windows,3.0,1,False,False,trans
69,683,CT,Mac,52.5,1,False,False,trans


In [171]:
len(trans)

230155

In [172]:
#trans.to_dict(orient='records')

In [173]:
print("{:,} original".format(len(trans)))
print("{:,} de-duped".format(len(trans.drop_duplicates())))

230,155 original
230,155 de-duped


In [174]:
filters = (trans['platform'].isna())&(trans['zip'].isna())&(trans['state']=='NY')

trans[filters]

Unnamed: 0,zip,state,platform,amount,count,is_christmas,is_newyears,query


In [175]:


filters = (trans_grp['platform'].isna())&(trans_grp['zip'].isna())&(trans_grp['state']=='NY')
trans_grp

Unnamed: 0,zip,state,platform,is_christmas,is_newyears,count,amount
0,0,CA,iPhone,False,False,1,10.00
1,00000,AL,Android,False,False,1,10.00
2,00000,AL,Mac,False,False,1,0.69
3,00000,AL,Windows,False,False,1,0.75
4,00000,AL,iPhone,False,False,2,65.00
...,...,...,...,...,...,...,...
229937,VG1110,VA,Windows,False,False,2,28.10
229938,Vg1101,VA,iPhone,False,False,1,10.50
229939,ky11104,GA,iPad,False,False,1,50.00
229940,ky11106,GA,Mac,False,False,1,51.50


In [179]:
filters = (trans_grp['platform']=='None')&(trans_grp['zip']=='None')&(trans_grp['state']=='NY')

trans_grp[filters]

Unnamed: 0,zip,state,platform,is_christmas,is_newyears,count,amount
229639,,NY,,False,False,145531,55.0
229640,,NY,,False,True,7387,100.0
229641,,NY,,True,False,572,100.0


# recurring

## localize data

In [2]:
q = '''select
            recurring,
            id,
            date,
            amount,
            zip,
            state,
            platform
        from transactions
        where
            status='A' and
            recurring!=0 and
            source in ('don_form', 'mobile', 'sms')'''
rec = redshift_query_read(q, schema='production')

In [3]:
rec_counts = rec.groupby('recurring')['id'].count().reset_index()
rec_counts.columns = ['recurring', 'count']

In [4]:
rec_data = []

for r in rec_counts[rec_counts['count']>1]['recurring'].tolist():
    this_rec = rec[rec['recurring']==r].sort_values('date', ascending=True)
    
    date_diff = this_rec['date'].diff().mean()

    rec_data.append({
        'recurring': r,
        'date_diff': date_diff
    })

In [5]:
freq_df = pd.DataFrame(rec_data)

In [6]:
print("original length: {:,}".format(len(rec)))

rec = rec.merge(freq_df, on='recurring')

print("merged length: {:,}".format(len(rec)))

original length: 4,490,278
merged length: 4,467,136


In [7]:
freq_df.merge(rec[['recurring', 'amount', 'zip', 'state', 'platform']], on='recurring').drop_duplicates('recurring', keep='last').head()

Unnamed: 0,recurring,date_diff,amount,zip,state,platform
3,65,31 days 00:00:00,5.0,33803,FL,
5,234,370 days 00:00:00,50.0,11238,NY,
10,244,182 days 18:00:00,50.0,33510,FL,
21,269,91 days 12:00:00,20.0,49079,MI,
30,283,30 days 06:00:00,25.0,80302,CO,


In [8]:
def get_frequency(date_diff):
    if date_diff != pd.Timedelta(0, 'd'):
        if date_diff - pd.Timedelta(7, "d") < pd.Timedelta(1, "d"):
            return 'week'
        elif date_diff - pd.Timedelta(14, "d") < pd.Timedelta(1, "d"):
            return 'bimonth'
        elif date_diff - pd.Timedelta(30, "d") < pd.Timedelta(2, "d"):
            return 'month'
        elif date_diff - pd.Timedelta(90, "d") < pd.Timedelta(3, "d"):
            return 'quarter'
        elif date_diff - pd.Timedelta(180, "d") < pd.Timedelta(4, "d"):
            return 'biannual'
        elif date_diff - pd.Timedelta(365, "d") < pd.Timedelta(5, "d"):
            return 'annual'
    
    return None

In [9]:
rec['frequency'] = rec['date_diff'].apply(get_frequency)

In [10]:
rec['frequency'].value_counts()

month       3881715
week         315265
quarter      105271
bimonth       69057
annual        26882
biannual      14811
Name: frequency, dtype: int64

In [14]:
rec['state'] = rec['state'].apply(state_fix)

In [15]:
grp = ['zip', 'state', 'platform', 'frequency']
rec_grp = rec.groupby(grp)['amount'].median().reset_index()

In [16]:
print("{:,} full set".format(len(rec_grp)))
print("{:,} limited to US states".format(len(rec_grp[rec_grp['state'].isin(us_states)])))

125,027 full set
106,100 limited to US states


In [17]:
zip_state_platform = rec_grp[rec_grp['state'].isin(us_states)].pivot(index=['zip', 'state', 'platform'], columns='frequency', values='amount').reset_index()

In [18]:
grp = ['state', 'platform', 'frequency']
rec_grp = rec.groupby(grp)['amount'].median().reset_index()
rec_grp['zip'] = None

state_platform = rec_grp[rec_grp['state'].isin(us_states)].pivot(index=['zip', 'state', 'platform'], columns='frequency', values='amount').reset_index()

In [19]:
grp = ['platform', 'frequency']
rec_grp = rec.groupby(grp)['amount'].median().reset_index()
rec_grp['state'] = None
rec_grp['zip'] = None

platform = rec_grp[rec_grp['state'].isin(us_states)].pivot(index=['zip', 'state', 'platform'], columns='frequency', values='amount').reset_index()

In [20]:
rec_all = pd.concat([zip_state_platform, state_platform, platform])

In [21]:
len(rec_all)

78490

In [22]:
rec_all.tail()

frequency,zip,state,platform,annual,biannual,bimonth,month,quarter,week
300,,WY,Android,,,,25.0,30.0,
301,,WY,Mac,50.0,,,40.6,1000.0,10.095
302,,WY,Windows,25.0,35.0,20.0,25.0,1000.0,
303,,WY,iPad,,,,33.0,25.0,
304,,WY,iPhone,365.0,,,25.0,,


# api testing

In [1]:
import requests, json
import pandas as pd

In [2]:
url = 'https://analyticsapi.qgiv.com/amounts_rec_data/'
key = 'tIHLM2vNlBwvlZlqdKy8'

In [3]:
# test null or bad input, error expected
r = requests.get(url)
print("GET request:")
print(r.text)
print()

r = requests.post(url)
print("POST empty:")
print(r.text)

r = requests.post(url, json={'key': 'asjdfh'})
print("POST wrong key:")
print(r.text)

GET request:
<!doctype html>
<html lang=en>
<title>405 Method Not Allowed</title>
<h1>Method Not Allowed</h1>
<p>The method is not allowed for the requested URL.</p>


POST empty:
<!doctype html>
<html lang=en>
<title>400 Bad Request</title>
<h1>Bad Request</h1>
<p>Did not attempt to load JSON data because the request Content-Type was not &#39;application/json&#39;.</p>

POST wrong key:
{
  "errors": "Incorrect input",
  "success": "0"
}



In [4]:
# test correct input
r = requests.post(url, json={'key': key}, timeout=None)

In [9]:
r.text

'<html>\r\n<head><title>504 Gateway Time-out</title></head>\r\n<body>\r\n<center><h1>504 Gateway Time-out</h1></center>\r\n</body>\r\n</html>\r\n'

In [5]:
d = json.loads(r.text)
df = pd.DataFrame(d['data'])

print("len(df): {:,}".format(len(df)))
print("len(df[state]): {:,}".format(len(df['state'].unique())))
print("len(df[platform]): {:,}".format(len(df['platform'].unique())))

len(df): 13,128
len(df[state]): 51
len(df[platform]): 5


In [6]:
df.tail()

Unnamed: 0,annual,biannual,bimonth,count,is_christmas,is_newyears,month,onetime,platform,quarter,state,week,zip
13123,,,,4,False,False,40.0,42.5,Android,,AK,,99780
13124,,,,6,False,False,19.0,63.0,Android,,AK,,99801
13125,,,,2,False,False,52.0,35.625,Mac,,AK,,99827
13126,,,,1,False,False,5.0,50.0,iPad,,AK,,99835
13127,,,,13,False,False,25.0,50.0,Windows,,AK,,99901
