In [1]:
import pandas as pd
import sys

sys.path.insert(1, '/home/ubuntu/Recommendation/projects/form health v2/justin/lib')
sys.path.insert(1, '../../../../scripts/')
from s3_support import *
from data_loader import load_transactions, load_qgiv_analytics, load_transactions_in_range
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from typing import List
from datetime import datetime, timedelta

# Preprocessing

In [2]:
def create_training_data(
    form_analytics: pd.DataFrame,
    y_column: str,
):
    """Split a form-analytics frame into unshuffled train/test features and targets.

    Args:
        form_analytics: Frame containing the target column plus the
            'id', 'org', 'form' and 'date' columns, which are dropped from
            the feature set. The caller's frame is not modified.
        y_column: Name of the column to predict.

    Returns:
        The result of ``train_test_split(X, y, shuffle=False)``, unpacked by
        callers as ``X_train, X_test, y_train, y_test``. Row order is
        preserved, so the test split is the tail of the frame.

    Raises:
        KeyError: If ``y_column`` or any of the dropped columns is missing.
    """
    # dropna returns a NEW frame, so the result must be kept; the original code
    # discarded it and rows without a target leaked into the split. Because
    # nothing here mutates in place, no defensive copy of the input is needed.
    labelled = form_analytics.dropna(subset=[y_column])

    # Keep the target as a single-column frame (callers flatten it with .values.ravel()).
    form_analytics_y = labelled[[y_column]]

    # Remove identifier/date columns and the target itself from the features.
    form_analytics_X = labelled.drop(
        columns=['id', 'org', 'form', 'date', y_column]
    )

    return train_test_split(
        form_analytics_X,
        form_analytics_y,
        shuffle=False
    )

In [3]:
def create_training_data_from_saved_file(
    y_column
):
    """Build the train/test split from the saved form-health training CSV.

    Loads 'form-health-training-data.csv' under 'form-health-v2' via
    ``get_dataframe_from_file`` and passes it to ``create_training_data``.
    NOTE(review): ``get_dataframe_from_file`` presumably comes from the
    ``s3_support`` star import, with 'form-health-v2' as the bucket -- confirm.

    Args:
        y_column: Name of the target column to predict.

    Returns:
        Whatever ``create_training_data`` returns for the loaded frame.
    """
    saved_form_analytics = get_dataframe_from_file(
        'form-health-v2',
        'form-health-training-data.csv'
    )
    return create_training_data(
        form_analytics=saved_form_analytics,
        y_column=y_column
    )

In [4]:
def training_data(
    from_saved_file,
    y_column,
    form_analytics=None
):
    """Return the train/test split, from the saved file or an in-memory frame.

    Args:
        from_saved_file: When truthy, load the saved training CSV through
            ``create_training_data_from_saved_file``.
        y_column: Name of the target column to predict.
        form_analytics: Optional frame to split when ``from_saved_file`` is
            falsy. If omitted, the notebook-level ``form_analytics`` variable
            is used, as before this parameter existed.

    Returns:
        Whatever ``create_training_data`` returns; callers unpack it as
        ``X_train, X_test, y_train, y_test``.

    Raises:
        NameError: If ``from_saved_file`` is falsy and no frame was passed
            and no notebook-level ``form_analytics`` exists.
    """
    if from_saved_file:
        return create_training_data_from_saved_file(
            y_column=y_column
        )

    if form_analytics is None:
        # Backward compatibility: this branch used to read a notebook-level
        # `form_analytics` implicitly. Look it up explicitly so a fresh kernel
        # fails with an actionable message instead of relying on hidden state.
        if 'form_analytics' not in globals():
            raise NameError(
                "name 'form_analytics' is not defined: pass form_analytics=... "
                "or define it before calling training_data(from_saved_file=False)"
            )
        form_analytics = globals()['form_analytics']

    return create_training_data(
        form_analytics=form_analytics,
        y_column=y_column
    )

# Split the saved training set, using the 'don_form_trans_count' column as the target.
X_train, X_test, y_train, y_test = training_data(from_saved_file=True, y_column='don_form_trans_count')

# Training

In [5]:
# NOTE: despite its name, `classifier` is a regressor; the name is kept
# because later cells refer to it.
classifier = GradientBoostingRegressor(
    n_estimators=160,
    max_depth=5,
    min_impurity_decrease=0.9,
    random_state=42,  # fixed seed so a Restart & Run All reproduces the same model
    verbose=1
)

# The target is a single-column frame; flatten it to the 1-D array fit() expects.
classifier.fit(
    X_train,
    y_train.values.ravel()
)

prediction = classifier.predict(X_test)
# mean_squared_error takes (y_true, y_pred); the arguments were previously swapped.
mse = mean_squared_error(y_test, prediction)

      Iter       Train Loss   Remaining Time 
         1         118.2917            2.19m
         2          97.2654            2.18m
         3          80.2207            2.22m
         4          66.4133            2.23m
         5          55.2070            2.23m
         6          46.1137            2.22m
         7          38.7154            2.24m
         8          32.7045            2.26m
         9          27.8293            2.22m
        10          23.7873            2.19m
        20           8.1241            2.05m
        30           5.7463            1.96m
        40           5.0876            1.86m
        50           4.7783            1.67m
        60           4.5502            1.52m
        70           4.3311            1.36m
        80           4.1674            1.22m
        90           4.0138            1.07m
       100           3.9055           55.36s


# Testing

In [6]:
# Mean squared error of the model's predictions on the held-out test split
mse

19.914383078741746

In [7]:
def print_feature_importance(
    classifier: GradientBoostingRegressor,
    training_data: pd.DataFrame
):
    """Print the fitted model's feature importances, highest first.

    Args:
        classifier: Fitted estimator exposing ``feature_importances_``.
        training_data: Frame whose columns label the importances. It must
            have the columns the model was fitted on, in the same order.

    Returns:
        None. The table is written to stdout.
    """
    feature_importances = pd.DataFrame(
        classifier.feature_importances_,
        # Label rows from the frame that was passed in; the original read the
        # notebook-level X_train and silently ignored this parameter.
        index=training_data.columns,
        columns=['Importance']
    ).sort_values(
        'Importance',
        ascending=False
    )

    print("Feature Importances")
    print(feature_importances)

In [8]:
# Rank the features of the fitted model, labelled by the training columns.
print_feature_importance(classifier=classifier, training_data=X_train)

Feature Importances
                          Importance
don_form_trans_vol          0.829879
average_donation_size       0.052772
yearly_volume               0.044254
ded_types                   0.034666
yearly_donation_count       0.027206
req_fields                  0.004543
enable_donorlogins          0.004519
restrictions                0.000959
collect_phone               0.000341
show_amount                 0.000206
opt_fields                  0.000144
amounts                     0.000136
events_count                0.000095
permit_anonymous            0.000066
collect_optin               0.000058
permit_other_amount         0.000046
donation_active             0.000033
collect_address_mobile      0.000024
pledge_active               0.000015
permit_create_own_pledge    0.000010
collect_company             0.000009
events_priv_count           0.000007
req_ded_flds                0.000005
pledges_count               0.000003
multirestriction_system     0.000002
rolling_mean_4    