In [7]:
import pandas as pd
import sys

sys.path.insert(1, '/home/ubuntu/Recommendation/projects/form health v2/justin/lib')
sys.path.insert(1, '../../../../scripts/')
from s3_support import *
from data_loader import load_transactions, load_qgiv_analytics, load_transactions_in_range

from typing import List
from datetime import datetime, timedelta

In [8]:
# Notebook configuration

# Window sizes for the rolling-mean features; one `rolling_mean_<window>`
# column is produced per entry.
rolling_mean_windows = [
    2, 4, 7,
    14, 21, 30,
]

In [9]:
# Fetch the transaction records, restricted to the columns listed below
transactions = load_transactions(
    columns=['id', 'date', 'org', 'form', 'donations_amt', 'donations_count']
)

In [10]:
# Fetch the form analytics records.
# The filter string is handed to the loader unchanged.
form_analytics = load_qgiv_analytics(
    base_columns=['id', 'org', 'form', 'date', 'don_form_trans_count', 'don_form_trans_vol'],
    qgiv_columns=[
        'pledges_count',
        'events_count',
        'events_priv_count',
        'restrictions',
        'amounts',
        'ded_types',
        'opt_ded_flds',
        'req_ded_flds',
        'opt_fields',
        'req_fields',
        'pledge_active',
        'donation_active',
        'multirestriction_system',
        'show_amount',
        'permit_anonymous',
        'permit_recurring',
        'permit_other_amount',
        'permit_create_own_pledge',
        'collect_company',
        'collect_phone',
        'collect_optin',
        'collect_captcha',
        'collect_address_mobile',
        'enable_donorlogins',
    ],
    filters=['A.don_form_trans_count > 0'],
)

In [11]:
def get_rolling_means(windows: List, data: pd.DataFrame = None) -> pd.DataFrame:
    """Build per-form rolling means of the summed donation amounts.

    Donation amounts are first summed per (form, date). Each requested window
    is then applied to every form's own series, so the first ``window - 1``
    rows of a form are NaN rather than being averaged together with the
    totals of the form that happens to precede it.

    Args:
        windows: Rolling window sizes, counted in rows of a form's series.
        data: Frame with `form`, `date` and `donations_amt` columns. When
            omitted, the notebook-level `transactions` frame is used.

    Returns:
        DataFrame with `form`, `date` and one `rolling_mean_<window>` column
        per distinct entry in `windows`, ordered by form and then date.
    """
    if data is None:
        data = transactions

    # One total per (form, date); groupby sorts the keys, so each form's rows
    # are contiguous and in date order.
    daily_totals = data.groupby(['form', 'date'])['donations_amt'].sum()

    def calculate_rolling_mean(window):
        # Roll inside each form. Rolling over the whole series would let a
        # window straddle the boundary between two different forms.
        return daily_totals \
            .groupby(level='form') \
            .transform(lambda amounts: amounts.rolling(window=window).mean())

    rolling_columns = {
        'rolling_mean_{}'.format(window): calculate_rolling_mean(window)
        for window in windows
    }

    return pd.DataFrame(rolling_columns, index=daily_totals.index).reset_index()

# Compute every rolling mean once up front; the feature-engineering step
# below looks values up in this frame.
rolling_means = get_rolling_means(windows=rolling_mean_windows)

In [12]:
def engineer_features(
    transactions: pd.DataFrame,
    form_analytics: pd.DataFrame,
    save_when_finished: bool
) -> pd.DataFrame:
    """Add volume, count, average-size and rolling-mean features to `form_analytics`.

    The new columns are written onto the `form_analytics` frame that was
    passed in, and that same frame is returned.

    Args:
        transactions: Frame with `form`, `donations_amt` and `donations_count`
            columns.
        form_analytics: Frame with `form` and `date` columns; receives the new
            feature columns.
        save_when_finished: When True, the finished frame is handed to
            `save_dataframe_to_file` under the name
            `form-health-training-data.csv`.

    Returns:
        `form_analytics` with `yearly_volume`, `yearly_donation_count`,
        `average_donation_size` and one `rolling_mean_<window>` column for
        each entry of the notebook-level `rolling_mean_windows`. Rolling-mean
        values are read from the notebook-level `rolling_means` frame.
    """

    # Per-form totals. Only the two amount/count columns are summed; summing
    # the whole frame would also try to aggregate every other column.
    form_totals = transactions.groupby('form')[['donations_amt', 'donations_count']].sum()

    # Add the form's total transaction volume to the dataframe.
    # Forms without any transaction get 0 instead of failing the lookup.
    form_analytics['yearly_volume'] = form_analytics['form'] \
        .map(form_totals['donations_amt']) \
        .fillna(0)

    print('Added `yearly_volume`')

    # Add the form's total transaction count to the dataframe
    form_analytics['yearly_donation_count'] = form_analytics['form'] \
        .map(form_totals['donations_count']) \
        .fillna(0)

    print('Added `yearly_donation_count`')

    # Add the form's average donation size to the dataframe.
    # The average is only defined when both totals are positive; otherwise 0.
    volume = form_analytics['yearly_volume']
    count = form_analytics['yearly_donation_count']
    has_donations = (volume > 0) & (count > 0)
    form_analytics['average_donation_size'] = (volume / count).where(has_donations, 0)

    print('Added `average_donation_size`')

    # Look up all rolling means with a single left join on (form, date)
    # instead of scanning `rolling_means` once per row and per window.
    rolling_columns = list(dict.fromkeys(
        'rolling_mean_{}'.format(window) for window in rolling_mean_windows
    ))
    lookup = rolling_means[['form', 'date'] + rolling_columns] \
        .drop_duplicates(subset=['form', 'date'])  # first match wins
    matched = form_analytics[['form', 'date']].merge(
        lookup,
        on=['form', 'date'],
        how='left',
        indicator=True
    )
    # Distinguishes "no (form, date) row at all" from "row exists but the
    # rolling mean itself is NaN": only the former is replaced by 0.
    found = (matched['_merge'] == 'both').values

    for window in rolling_mean_windows:
        column = 'rolling_mean_{}'.format(window)
        # The left join keeps the row order of `form_analytics`, so the values
        # are assigned positionally.
        form_analytics[column] = matched[column].where(found, 0).values
        print('Added `rolling_mean` for window {}'.format(window))

    if save_when_finished:
        print('Saving file `form-health-training-data.csv` in bucket `form-health-v2`')
        save_dataframe_to_file(
            'form-health-v2', 
            'form-health-training-data.csv', 
            form_analytics
        )

    return form_analytics
    
# Build the feature set and save it; the name `form_analytics` is rebound to
# the frame returned by `engineer_features`.
form_analytics = engineer_features(
    transactions,
    form_analytics,
    save_when_finished=True,
)

Added `yearly_volume`
Added `yearly_donation_count`
Added `average_donation_size`
Added `rolling_mean` for window 2
Added `rolling_mean` for window 4
Added `rolling_mean` for window 7
Added `rolling_mean` for window 14
Added `rolling_mean` for window 21
Added `rolling_mean` for window 30
Saving file `form-health-training-data.csv` in bucket `form-health-v2`
uploading to S3
Done
