In [2]:
import pandas as pd
import numpy as np

import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *
%matplotlib inline

# load & prep data

### google traffic

In [3]:
# Weekly page views per (org, form) from googleanalytics_traffic: each row's
# date is truncated to its week and `views` is summed within that bucket.
# NOTE(review): `date>=2018` compares the date column against a bare integer.
# The recorded output for this frame starts at 2018-01-01, so it presumably
# acts as "on or after 2018-01-01" -- confirm, and consider an explicit date
# literal.
q = '''select
            date_trunc('week', date) as date,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2018
        group by date_trunc('week', date), org, form;'''
# redshift_query_read is not imported by name in this notebook; presumably it
# comes from the star import of s3_support -- confirm.
pageviews = redshift_query_read(q)

In [4]:
# Drop rows whose form id is 0 and parse the weekly bucket into timestamps.
# The filtered frame is rebuilt with .assign so the date column is never
# written onto a slice of the original frame (avoids SettingWithCopyWarning
# and makes the cell safe to re-run).
pageviews = (
    pageviews[pageviews['form'] != 0]
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
)

# Summary: row count, distinct forms, first and last week.
len(pageviews), len(pageviews['form'].unique()), pageviews['date'].min(), pageviews['date'].max()

(677307,
 14382,
 Timestamp('2018-01-01 00:00:00'),
 Timestamp('2021-04-19 00:00:00'))

### transactions

In [5]:
# Weekly transaction count and summed amount per form from `transactions`,
# restricted to rows with status='A' and source='p2p'.
# NOTE(review): the meaning of status 'A' is not visible here (presumably an
# accepted/approved state) -- confirm.
# NOTE(review): `date>=2018` compares the date column against a bare integer;
# the recorded output starts at 2018-01-01, so it presumably acts as
# "on or after 2018-01-01" -- confirm.
q = '''select 
            form, 
            date_trunc('week', date) as date,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2018 and source='p2p'
        group by form, date_trunc('week', date)
    '''
# redshift_query_read is presumably provided by the s3_support star import.
trans = redshift_query_read(q)

In [6]:
# Parse the weekly bucket into pandas timestamps.
trans = trans.assign(date=pd.to_datetime(trans['date']))

# Summary: row count, distinct forms, first and last week.
trans_weeks = trans['date']
(len(trans), len(trans['form'].unique()), trans_weeks.min(), trans_weeks.max())

(45831,
 5024,
 Timestamp('2018-01-01 00:00:00'),
 Timestamp('2021-05-03 00:00:00'))

### analytics

In [7]:
# Load the two weekly analytics tables (all columns). These reads are slow --
# the recorded run of this cell ended in a KeyboardInterrupt -- so a progress
# line is printed after each one finishes.
# NOTE(review): `date>=2018` compares the date column against a bare integer;
# presumably it acts as "on or after 2018-01-01" -- confirm.
q = "select * from analytics_weekly where date>=2018"
df_base = redshift_query_read(q)
print("done with analytics")

q = "select * from analyticsp2p_weekly where date>=2018"
df_p2p = redshift_query_read(q)
# The message now names the table that was actually read (it said "analyticsqgiv").
print("done with analyticsp2p")

KeyboardInterrupt: 

In [None]:
# Forms present in the p2p analytics table. The table is weekly, so a form
# presumably appears once per week; deduplicate so the membership filters
# below (and in later cells) work from a short list instead of one entry per row.
p2p_forms = df_p2p['form'].unique().tolist()

# Keep only base-analytics rows for those forms.
df_base = df_base[df_base['form'].isin(p2p_forms)]

# Join base and p2p analytics per (org, form, week), discard incomplete rows,
# then collapse to one row per (week, form) by summing the remaining columns.
df_analytics = (
    df_base
    .merge(df_p2p, on=["org", "form", "date"])
    .dropna()
    .drop(columns=['org', 'product'])
    .groupby(['date', 'form'])
    .sum()
    .reset_index()
)

df_analytics['date'] = pd.to_datetime(df_analytics['date'])

In [None]:
# First and last week covered by the merged analytics frame.
analytics_weeks = df_analytics['date']
(analytics_weeks.min(), analytics_weeks.max())

### merge data

In [None]:
# narrow dataset to forms in p2p analytics
pageviews = pageviews[pageviews['form'].isin(p2p_forms)]
trans = trans[trans['form'].isin(p2p_forms)]

# merge traffic and transactions
# Columns are renamed and selected by name rather than by overwriting
# `.columns` positionally, so a change in either query's column order raises a
# KeyError instead of silently mislabelling the data.
trans_n_views = (
    trans
    .merge(pageviews, on=['form', 'date'])
    .rename(columns={'count': 'trans_count', 'vol': 'trans_vol'})
    [['form', 'date', 'trans_count', 'trans_vol', 'org', 'pageviews']]
)

# add conversion & average trans value
# NOTE(review): a week with 0 recorded pageviews but some transactions would
# presumably give an infinite conversion -- confirm whether such rows exist.
trans_n_views['conversion'] = trans_n_views['trans_count'] / trans_n_views['pageviews']
trans_n_views['avg_trans'] = trans_n_views['trans_vol'] / trans_n_views['trans_count']

In [None]:
# Attach traffic/transaction metrics to the analytics rows; only (form, week)
# pairs present on both sides survive the default inner join.
merge_keys = ['form', 'date']
df_an = pd.merge(df_analytics, trans_n_views, on=merge_keys)

# Summary: row count, distinct forms, first and last week.
n_rows = len(df_an)
n_forms = len(df_an['form'].unique())
(n_rows, n_forms, df_an['date'].min(), df_an['date'].max())

# training

In [None]:
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import explained_variance_score, mean_squared_error, mean_absolute_error, r2_score

In [None]:
# Modelling configuration.
targets = 'conversion'   # name of the target column
cluster_count = 9        # number of KMeans clusters built over the forms

# Form-level feature columns, in the order they are fed to the models.
ftrs = [
    'teams_count',
    'class_count',
    'cat_count',
    'promo_count',
    'fields',
    'allows_reg_ind',
    'allows_teams',
    'allows_sub_reg',
    'allows_pfp_off_don',
    'allows_tfp_off_don',
    'allows_soc_post_pfp_tcp',
    'share_home',
    'share_pfp',
    'share_tfp',
    'allows_social',
]
# ftrs + ['meta_cluster']

In [None]:
# Split the feature names by whether they contain 'allows', then list each group.
allows_ftrs = [name for name in ftrs if 'allows' in name]
other_ftrs = [name for name in ftrs if 'allows' not in name]

print("allows_* features:")
print(", ".join(allows_ftrs))
print()
print("not allows_* features:")
print(", ".join(other_ftrs))

In [None]:
# Fixed seed so the clustering and the repeated train/test evaluation produce
# the same numbers on every Restart & Run All.
RANDOM_SEED = 42
N_RUNS = 50

print("Modeling w/ {} clusters".format(cluster_count))
print("-"*40)

# set clusters: one row per form holding the mean of every feature
cluster_training_data = df_an.groupby('form')[ftrs].mean().reset_index()
# Fit on the feature columns only. reset_index() puts the `form` identifier
# back into the frame, and fitting on the whole frame would feed that id to
# KMeans as if it were a feature.
kmeans = KMeans(n_clusters=cluster_count, random_state=RANDOM_SEED).fit(cluster_training_data[ftrs])
cluster_training_data['meta_cluster'] = kmeans.labels_
# Look each row's cluster up through a form -> cluster mapping instead of
# scanning cluster_training_data once per row of df_an.
form_to_cluster = cluster_training_data.set_index('form')['meta_cluster']
df_an['meta_cluster'] = df_an['form'].map(form_to_cluster)

# model
score_explained_variance = []
score_mse = []
score_mae = []
score_r2 = []

model_ftrs = ftrs + ['meta_cluster']
for i in range(N_RUNS):
    # train/test data split (a different but reproducible split per run)
    # NOTE(review): rows are split individually, so weeks of the same form can
    # land in both train and test -- confirm this is intended.
    X_train, X_test, y_train, y_test = train_test_split(
        df_an[model_ftrs], df_an[targets], random_state=RANDOM_SEED + i
    )

    rf = RandomForestRegressor(random_state=RANDOM_SEED + i)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)

    score_explained_variance.append(explained_variance_score(y_test, y_pred))
    score_mse.append(mean_squared_error(y_test, y_pred))
    score_mae.append(mean_absolute_error(y_test, y_pred))
    score_r2.append(r2_score(y_test, y_pred))

# Report the mean of each metric across the runs. `rf` is left bound to the
# model fitted in the final run.
print("\tvariance explained: {}".format(np.mean(score_explained_variance)))
print("\tmse: {}".format(np.mean(score_mse)))
print("\tmae: {}".format(np.mean(score_mae)))
print("\tr2: {}".format(np.mean(score_r2)))
print()

# storing

In [None]:
import joblib

In [None]:
# store cluster
# Preview the form -> cluster assignment that would be written out.
cluster_training_data.loc[:, ['form', 'meta_cluster']].head()

In [None]:
# Target file for the form -> meta_cluster table. The write itself is
# disabled (commented out) below; uncomment it to persist the clusters.
fname = "p2p_clusters.csv"
#cluster_training_data[['form', 'meta_cluster']].to_csv(fname, index=False)

In [None]:
# store model
# The dump is disabled by wrapping it in a bare string literal, so running
# this cell only evaluates (and displays) the string; nothing is written.
# If enabled it would save `rf` together with the feature list.
# NOTE(review): the file name ends in "jobilb" -- presumably a typo for
# "joblib"; confirm what any consumer expects before enabling or renaming.
'''
joblib.dump({
    "model": rf,
    "features": ftrs + ['meta_cluster']
}, "model.p2p.jobilb")
'''

In [None]:
# Full list of model input columns: the base features plus the cluster label.
[*ftrs, 'meta_cluster']