In [2]:
import pandas as pd
import sys

sys.path.insert(1, '/home/ubuntu/Recommendation/projects/form health v2/justin/lib')
from data_loader import load_transactions, load_qgiv_analytics
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

from typing import List

# Load Data

In [3]:
# Load transaction data
transactions = load_transactions(
    columns=[
        'id', 'org', 'form', 'donations_amt', 'donations_count', 
    ]
)

In [4]:
form_analytics = load_qgiv_analytics(
    base_columns=[
        'id', 'org', 'form', 'date', 'don_form_trans_count'
    ],
    qgiv_columns=[
        'events_count', 'events_priv_count', 'restrictions', 'amounts', 'ded_types', 
        'opt_fields', 'req_fields', 'pledge_active', 'donation_active', 'multirestriction_system', 'show_amount',
        'permit_anonymous', 'permit_other_amount', 'permit_create_own_pledge', 'collect_company', 'collect_phone',
        'collect_optin', 'collect_address_mobile', 'enable_donorlogins', 'enable_sms'
    ],
    filters=['A.don_form_trans_count > 0']
)

# Feature Engineering

In [5]:
def engineer_features(
    transactions: pd.DataFrame,
    form_analytics: pd.DataFrame
) -> pd.DataFrame:
    
    # Add the orgs total yearly transaction volume to the dataframe
    total_donation_amounts = transactions.groupby('org').sum().reset_index()
    
    form_analytics['yearly_volume'] = form_analytics.apply(
        lambda row: total_donation_amounts.loc[total_donation_amounts['org'] == row['org']]['donations_amt'].values[0],
        axis=1
    )
    
    # Add the orgs total yearly transaction counts to the dataframe
    form_analytics['yearly_donation_count'] = form_analytics.apply(
        lambda row: total_donation_amounts.loc[total_donation_amounts['org'] == row['org']]['donations_count'].values[0],
        axis=1
    )
    
    # Add the orgs average fonation size to the dataframe
    form_analytics['average_donation_size'] = form_analytics.apply(
        lambda row: row["yearly_volume"] / row["yearly_donation_count"] if (row["yearly_volume"] > 0 and row["yearly_donation_count"] > 0) else 0,
        axis=1
    )
    
    return form_analytics
    
form_analytics = engineer_features(
    transactions=transactions,
    form_analytics=form_analytics
)

SyntaxError: invalid syntax (<ipython-input-5-831c18aaf70b>, line 29)

# Feature Scaling

In [None]:
def normalize_org_scale_features(
    form_analytics: pd.DataFrame,
    columns_to_normalize: List[str]
) -> pd.DataFrame:
    min_max_scaler = MinMaxScaler()
    
    form_analytics[columns_to_normalize] = min_max_scaler.fit_transform(
        form_analytics[columns_to_normalize]
    )
    
    return form_analytics

form_analytics = normalize_org_scale_features(
    form_analytics=form_analytics,
    columns_to_normalize=[
        'yearly_volume',
        'yearly_donation_count',
        'average_donation_size'
    ]
)

# Pre Processing

In [None]:
def create_training_data(
    form_analytics: pd.DataFrame,
    y_column: str,
):
    # Drop all of the rows that do not have a target value
    form_analytics.dropna(subset=[y_column])

    # Get the target
    form_analytics_y = form_analytics[[y_column]]
    
    # Remove irrelevant features
    form_analytics_X = form_analytics.drop(['id', 'org', 'form', 'date', y_column], axis=1)

    return train_test_split(
        form_analytics_X,
        form_analytics_y,
        shuffle=False
    )

X_train, X_test, y_train, y_test = create_training_data(
    form_analytics=form_analytics,
    y_column='don_form_trans_count'
)

In [None]:
# Train a RandomForest
classifier = RandomForestRegressor(
    bootstrap=False,
    n_jobs=-1,
    verbose=True
)

classifier.fit(
    X_train,
    y_train.values.ravel()
)

prediction = classifier.predict(X_test)
mse = mean_squared_error(prediction, y_test)

print('Number of samples: {}'.format(len(X_train)))
print('Mean Squared Error: {}'.format(mse))

In [None]:
def print_feature_importance(
    classifier: RandomForestRegressor,
    training_data: pd.DataFrame
):
    feature_importances = pd.DataFrame(
        classifier.feature_importances_,
        index = X_train.columns,
        columns=['Importance']
    ).sort_values(
        'Importance',
        ascending=False
    )
    
    print("Feature Importances")
    print(feature_importances)

In [None]:
print_feature_importance(
    classifier=classifier,
    training_data=X_train
)