In [22]:
import sys
sys.path.insert(1, '../scripts/')
from s3_support import *

import pandas as pd
import numpy as np

# Transactions

In [53]:
list_files("trans-records")

transactions.2018-12-28.csv (33MB)
transactions.csv (1301MB)
transactions_2019-02-13.csv (2MB)
transactions_clean.csv (1087MB)
--------------------------------------------------
Bucket trans-records contains 4 files (2.4GB)


In [54]:
df = get_dataframe_from_file("trans-records", "transactions.csv")

In [55]:
cols = ["id", "events_tickets", "donations_amt", "zip", "purchases_amt", "donations_count", 
        "source", "state", "events_amt", "registrations_amt", "email", "status", "form",
        "transdonationentity", "org", "purchases_quantity", "events_count", "recurring",
        "amount", "purchases_count", "transdonationentitytype", "registrations_count",
        "hour", "day", "month", "is_fraud", "form_amount_mean_diff", "year",
        "form_day_mean_diff", "form_hour_mean_diff"]

In [56]:
for c in ['transDonationEntity', 'transDonationEntityType']:
    df[c.lower()] = df[c]
    df.drop(c, axis=1, inplace=True)
for c in df.columns:
    if c not in cols:
        df.drop(c, axis=1, inplace=True)

In [60]:
# fill NA's
df = df[cols].fillna(0)

# cast INT columns
int_cols = ['id', 'events_tickets', 'donations_count', 'registrations_count', 'hour', 'day', 'month', 'year',
           'transdonationentity', 'transdonationentitytype']
for c in int_cols:
    df[c] = df[c].astype('int')
    
# cast FLOAT columns
float_cols = ['donations_amt', 'events_amt', 'purchases_amt', 'registrations_amt',
             'amount']
for c in float_cols:
    df[c] = df[c].astype('float')
    
# sanitize state & zip values
df['state'] = df['state'].apply(sanitize_state_values)
df['zip'] = df['zip'].apply(lambda x: str(x)[:12].replace(',', '').replace('[^a-zA-Z]', ''))

In [61]:
save_dataframe_to_file("trans-records", "transactions_clean.csv", df, columns=cols)

uploading to S3
Done


In [62]:
df[df['id']==173719][['id', 'form', 'org', 'day', 'month', 'year']]

Unnamed: 0,id,form,org,day,month,year
165984,173719,722,33,6,10,2010


In [9]:
#df.to_csv("transactions_clean.csv", columns=cols, index=False)
#upload_file("transactions_clean.csv", "trans-records")

In [58]:
def ascii_encode_or_drop(str_val):
    try:
        str_val.encode('ascii')
        return str_val
    except:
        return ''

df['state'].iloc[165980:165990]

165980    TX
165981    CA
165982    VA
165983    WA
165984      
165985    AZ
165986      
165987      
165988    KS
165989    MD
Name: state, dtype: object

In [51]:
def sanitize_state_values(val):
    us_state_abbrev = {
        'Alabama': 'AL',
        'Alaska': 'AK',
        'Arizona': 'AZ',
        'Arkansas': 'AR',
        'California': 'CA',
        'Colorado': 'CO',
        'Connecticut': 'CT',
        'Delaware': 'DE',
        'Florida': 'FL',
        'Georgia': 'GA',
        'Hawaii': 'HI',
        'Idaho': 'ID',
        'Illinois': 'IL',
        'Indiana': 'IN',
        'Iowa': 'IA',
        'Kansas': 'KS',
        'Kentucky': 'KY',
        'Louisiana': 'LA',
        'Maine': 'ME',
        'Maryland': 'MD',
        'Massachusetts': 'MA',
        'Michigan': 'MI',
        'Minnesota': 'MN',
        'Mississippi': 'MS',
        'Missouri': 'MO',
        'Montana': 'MT',
        'Nebraska': 'NE',
        'Nevada': 'NV',
        'New Hampshire': 'NH',
        'New Jersey': 'NJ',
        'New Mexico': 'NM',
        'New York': 'NY',
        'North Carolina': 'NC',
        'North Dakota': 'ND',
        'Ohio': 'OH',
        'Oklahoma': 'OK',
        'Oregon': 'OR',
        'Pennsylvania': 'PA',
        'Rhode Island': 'RI',
        'South Carolina': 'SC',
        'South Dakota': 'SD',
        'Tennessee': 'TN',
        'Texas': 'TX',
        'Utah': 'UT',
        'Vermont': 'VT',
        'Virginia': 'VA',
        'Washington': 'WA',
        'West Virginia': 'WV',
        'Wisconsin': 'WI',
        'Wyoming': 'WY',
    }

    if val in us_state_abbrev.values():
        return val
    elif val in us_state_abbrev:
        return us_state_abbrev[val]
    else:
        try:
            str(val).encode('ascii')
            return str(val).replace(',', '').replace('"', '').replace("'", '').replace('.', '').replace('[^a-zA-Z]', '')[:4]
        except:
            return ''

# Logs

In [2]:
list_buckets()

elasticbeanstalk-us-east-1-637885584661
ingest-records
qgiv-stats-data
qgivmodelsdata
sagemaker-us-east-1-637885584661
trainingflowtestbucket
trans-records


In [24]:
df = get_dataframe_from_file("qgiv-stats-data", "logs.csv")

In [30]:
if 'data' in df.columns:
    df.drop(['data', 'id'], axis=1, inplace=True)

int_cols = ['userId', 'user', 'org', 'systemId', 'form', 'entity', 'entityType', 'ghost']
for c in int_cols:
    if c in df.columns:
        df[c] = df[c].fillna(0).astype('int')
    
for c in ["entityType", "systemId", "systemType", "userId"]:
    if c in df.columns:
        df[c.lower()] = df[c]
        df.drop(c, axis=1, inplace=True)
    
df['message'] = df['message'].str.replace(',', '')

In [31]:
cols = ["org", "form", "entity", "entitytype", "systemid", "systemtype",
        "type", "created", "userid", "ghost", "hidden", "access", "ack", "count", "message"]

df[cols].head()

Unnamed: 0,org,form,entity,entitytype,systemid,systemtype,type,created,userid,ghost,hidden,access,ack,count,message
0,441530,944710,839477,2,174838,25,3,2019-05-01 21:11:32,0,0,0,30,0,1,%team_839477% has earned the %badge_174838%
1,441530,944710,784742,10,6393269,27,3,2019-05-01 21:11:32,0,0,0,30,0,1,%contact_4883929% donated %amount_37.1% to %re...
2,443692,942649,772113,10,6393267,27,3,2019-05-01 21:10:05,0,0,0,30,0,1,%contact_4883927% donated %amount_25.00% to %r...
3,427080,944828,944828,4,6393266,27,3,2019-05-01 21:09:21,0,0,0,30,0,1,%contact_4883925% donated %amount_35.00% to %f...
4,0,0,1295717,12,1295717,44,3,2019-05-01 21:08:38,1295712,0,0,2,0,1,(1295717) has been registered


In [32]:
save_dataframe_to_file("qgiv-stats-data", "logs.csv", df, columns=cols)

uploading to S3
Done


In [33]:
df.columns

Index(['org', 'form', 'entity', 'entitytype', 'systemid', 'systemtype', 'type',
       'created', 'userid', 'ghost', 'hidden', 'access', 'ack', 'count',
       'message'],
      dtype='object')

# Analytics

In [6]:
list_files("ingest-records", search_key="daily")

analytic_base_daily.csv (1261MB)
--------------------------------------------------
Matched files: 1 files (1.2GB)
Bucket ingest-records contains 1331 files (1.7GB)


In [11]:
# df = get_dataframe_from_file('ingest-reocrds', 'analytic_base_daily.csv')
url = get_file_url("ingest-records", "analytic_base_daily.csv")
url

'https://ingest-records.s3.amazonaws.com/analytic_base_daily.csv?AWSAccessKeyId=AKIAZJBHRSUKXQYUHQ45&Signature=9pjWLI6QTGWEU5X4HQ7TxUaGTfo%3D&Expires=1575242785'

In [12]:
df = pd.read_csv(url, low_memory=False)

In [14]:
cols = {
    'id': 'int',
    'date': 'timestamp',
    'org': 'int',
    'form': 'int',
    'sic': 'int',
    'ein': 'int',
    'vt_trans_count': 'int',
    'don_form_trans_count': 'int',
    'kiosk_trans_count': 'int',
    'p2p_trans_count': 'int',
    'mobile_trans_count': 'int',
    'mobilevt_trans_count': 'int',
    'sms_trans_count': 'int',
    'fb_trans_count': 'int',
    'vt_trans_vol': 'float',
    'don_form_trans_vol': 'float',
    'kiosk_trans_vol': 'float',
    'p2p_trans_vol': 'float',
    'mobile_trans_vol': 'float',
    'mobilevt_trans_vol': 'float',
    'sms_trans_vol': 'float',
    'fb_trans_vol': 'float',
    'one_time_trans_vol': 'float',
    'one_time_trans_count': 'int',
    'rec_trans_vol': 'float',
    'rec_trans_count': 'int',
    'product': 'int'
}

In [29]:
df['ein'] = df['ein'].apply(lambda x: 0 if x == 'OMIT' or x == 'nan' else str(x).replace('-', ''))

In [30]:
for c in list(cols.keys()):
    if c == 'date':
        df[c] = pd.to_datetime(df[c])
    else:
        df[c] = df[c].fillna(0).astype(cols[c])

In [33]:
df[list(cols.keys())].head()

Unnamed: 0,id,date,org,form,sic,ein,vt_trans_count,don_form_trans_count,kiosk_trans_count,p2p_trans_count,...,p2p_trans_vol,mobile_trans_vol,mobilevt_trans_vol,sms_trans_vol,fb_trans_vol,one_time_trans_vol,one_time_trans_count,rec_trans_vol,rec_trans_count,product
0,47131755,2018-06-27,0,934771,8398,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,1
1,44787400,2018-05-19,0,934771,8398,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,1
2,44137209,2018-05-08,0,934771,8398,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,1
3,45558379,2018-06-01,0,934771,8398,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,1
4,44860872,2018-05-20,0,934771,8398,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0,0.0,0,1


In [34]:
save_dataframe_to_file("qgiv-stats-data", "analytics_clean.csv", df[list(cols.keys())], columns=list(cols.keys()))

uploading to S3
Done
