In [67]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *
%matplotlib inline

# load & prep data

### google traffic

In [68]:
q = '''select
            date_trunc('week', date) as date,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2018
        group by date_trunc('week', date), org, form;'''
pageviews = redshift_query_read(q)

In [69]:
pageviews = pageviews[pageviews['form']!=0]
pageviews['date'] = pd.to_datetime(pageviews['date'])
len(pageviews), len(pageviews['form'].unique()), pageviews['date'].min(), pageviews['date'].max()

(677307,
 14382,
 Timestamp('2018-01-01 00:00:00'),
 Timestamp('2021-04-19 00:00:00'))

### transactions

In [70]:
q = '''select 
            form, 
            date_trunc('week', date) as date,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2018 and source='p2p'
        group by form, date_trunc('week', date)
    '''
trans = redshift_query_read(q)

In [71]:
trans['date'] = pd.to_datetime(trans['date'])
len(trans), len(trans['form'].unique()), trans['date'].min(), trans['date'].max()

(45274,
 4965,
 Timestamp('2018-01-01 00:00:00'),
 Timestamp('2021-04-26 00:00:00'))

### analytics

In [72]:
q = "select * from analytics_weekly where date>=2018"
df_base = redshift_query_read(q)
print("done with analytics")

q = "select * from analyticsp2p_weekly where date>=2018"
df_p2p = redshift_query_read(q)
print("done with analyticsqgiv")

done with analytics
done with analyticsqgiv


In [73]:
p2p_forms = df_p2p['form'].tolist()
df_base = df_base[df_base['form'].isin(p2p_forms)]

df_analytics = df_base.merge(df_p2p, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()

df_analytics['date'] = pd.to_datetime(df_analytics['date'])

In [74]:
df_analytics['date'].min(), df_analytics['date'].max()

(Timestamp('2018-01-01 00:00:00'), Timestamp('2021-04-05 00:00:00'))

### merge data

In [75]:
# narrow dataset to forms in p2p analytics
pageviews = pageviews[pageviews['form'].isin(p2p_forms)]
trans = trans[trans['form'].isin(p2p_forms)]

# merge traffic and transactions
trans_n_views = trans.merge(pageviews, on=['form', 'date'])
trans_n_views.columns = ['form', 'date', 'trans_count', 'trans_vol', 'org', 'pageviews']

# add conversion & average trans value
trans_n_views['conversion'] = trans_n_views['trans_count'] / trans_n_views['pageviews']
trans_n_views['avg_trans'] = trans_n_views['trans_vol'] / trans_n_views['trans_count']

In [76]:
df_an = df_analytics.merge(trans_n_views, on=['form', 'date'])

In [77]:
len(df_an), len(df_an['form'].unique()), df_an['date'].min(), df_an['date'].max()

(1842, 229, Timestamp('2018-01-15 00:00:00'), Timestamp('2021-04-05 00:00:00'))

# clustering & modeling

In [78]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, r2_score

CLUSTERS = [2, 3, 4, 5, 9, 12, 15, 24]

In [79]:
drop_cols = ['avg_trans', 'date', 'form', 'trans_count', 'trans_vol', 
             'org', 'pageviews', 'reg_count', 'sub_reg_count', 
             'don_count']
targets = 'conversion'
ftrs = [c for c in df_an.columns if c not in targets and c not in drop_cols and '_trans_' not in c and '_volume' not in c]

In [81]:
for cluster_count in CLUSTERS:
    print("Modeling w/ {} clusters".format(cluster_count))
    print("-"*40)
    
    # set clusters
    cluster_training_data = df_an.groupby('form')[ftrs].mean().reset_index()
    kmeans = KMeans(n_clusters=cluster_count).fit(cluster_training_data)
    cluster_training_data['meta_cluster'] = kmeans.labels_
    df_an['meta_cluster'] = df_an['form'].apply(lambda x: cluster_training_data[cluster_training_data['form']==x]['meta_cluster'].iloc[0])
    
    # modelcluster

    score_explained_variance = []
    score_mse = []
    score_mae = []
    score_r2 = []
    
    for i in range(50):
        # train/test data split
        X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs + ['meta_cluster']], df_an['conversion'])
        
        rf = RandomForestRegressor()
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_test)
        
        score_explained_variance.append(explained_variance_score(y_test, y_pred))
        score_mse.append(mean_squared_error(y_test, y_pred))
        score_mae.append(mean_absolute_error(y_test, y_pred))
        score_r2.append(r2_score(y_test, y_pred))
        
    print("\tvariance explained: {}".format(np.mean(score_explained_variance)))
    print("\tmse: {}".format(np.mean(score_mse)))
    print("\tmae: {}".format(np.mean(score_mae)))
    print("\tr2: {}".format(np.mean(score_r2)))
    print()

Modeling w/ 2 clusters
----------------------------------------
	variance explained: -8.206219247908267
	mse: 61.06092963163971
	mae: 0.7970638004916989
	r2: -8.306221562763204

Modeling w/ 3 clusters
----------------------------------------
	variance explained: -6.746315316669204
	mse: 56.35581391492565
	mae: 0.761756636913821
	r2: -6.799354418413347

Modeling w/ 4 clusters
----------------------------------------
	variance explained: -9.588833586158957
	mse: 61.29351267218627
	mae: 0.7619990856135372
	r2: -9.679860242209038

Modeling w/ 5 clusters
----------------------------------------
	variance explained: -8.057701709724531
	mse: 58.77278780775065
	mae: 0.7658130338990848
	r2: -8.18068961581176

Modeling w/ 9 clusters
----------------------------------------
	variance explained: -4.6500176925029395
	mse: 53.058638452257895
	mae: 0.7318839991900917
	r2: -4.691780869668669

Modeling w/ 12 clusters
----------------------------------------
	variance explained: -5.540370855284468
	mse: