In [11]:
import pandas as pd
import sys
sys.path.insert(1, '../../../../scripts/')
from s3_support import *
%matplotlib inline

Based upon the stats and the nature of the product, it would seem that the best approach would be to train separate models for each relevant target: conversion, transaction count, & page views.

# load & prep data

### google traffic

In [12]:
q = '''select
            date_trunc('week', date) as date,
            org,
            form,
            sum(views) as pageviews
        from googleanalytics_traffic
            where date>=2018
        group by date_trunc('week', date), org, form;'''
pageviews = redshift_query_read(q)

In [13]:
pageviews = pageviews[pageviews['form']!=0]
pageviews['date'] = pd.to_datetime(pageviews['date'])
len(pageviews), len(pageviews['form'].unique())

(665076, 20572)

### transactions

In [14]:
q = '''select 
            form, 
            date_trunc('week', date) as date,
            count(id) as count, 
            sum(amount) as vol
        from transactions
        where status='A' and date>=2018 and source='p2p'
        group by form, date_trunc('week', date)
    '''
trans = redshift_query_read(q)

In [15]:
trans['date'] = pd.to_datetime(trans['date'])
len(trans), len(trans['form'].unique())

(41835, 4579)

### analytics data

In [16]:
q = "select * from analytics_weekly where date>=2018"
df_base = redshift_query_read(q)
print("done with analytics")

q = "select * from analyticsp2p_weekly where date>=2018"
df_p2p = redshift_query_read(q)
print("done with analyticsqgiv")

done with analytics
done with analyticsqgiv


In [17]:
p2p_forms = df_p2p['form'].tolist()
df_base = df_base[df_base['form'].isin(p2p_forms)]

df_analytics = df_base.merge(df_p2p, on=["org", "form", "date"]).dropna()

df_analytics = df_analytics.drop(['org', 'product'], axis=1).groupby(['date', 'form']).sum().reset_index()

df_analytics['date'] = pd.to_datetime(df_analytics['date'])

### merge data

In [35]:
# narrow dataset to forms in p2p analytics
pageviews = pageviews[pageviews['form'].isin(p2p_forms)]
trans = trans[trans['form'].isin(p2p_forms)]

# merge traffic and transactions
trans_n_views = trans.merge(pageviews, on=['form', 'date'])
trans_n_views.columns = ['form', 'date', 'trans_count', 'trans_vol', 'org', 'pageviews']

# add conversion & average trans value
trans_n_views['conversion'] = trans_n_views['trans_count'] / trans_n_views['pageviews']
trans_n_views['avg_trans'] = trans_n_views['trans_vol'] / trans_n_views['trans_count']

In [36]:
df_an = df_analytics.merge(trans_n_views, on=['form', 'date'])

# modeling

In [56]:
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import numpy as np
import math

## modelling all features

In [60]:
drop_cols = ['date', 'form', 'trans_count', 'trans_vol', 'org', 'pageviews']
targets = ['conversion', 'avg_trans']
ftrs = [c for c in df_an.columns if c not in targets and c not in drop_cols]

In [67]:
print("-"*40)
print("Targeting conversion")
print("-"*40)

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['conversion'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['conversion'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    if float("{:.4f}".format(f[1])) > 0.:
        print("\t{}: {:.4f}".format(f[0], f[1]))

----------------------------------------
Targeting conversion
----------------------------------------
Random Forest:
	R2: -80.0971
	MSE: 54.7695
	RMSE: 6.8000
GBM
	R2: -23.8027
	MSE: 117.7714
	RMSE: 9.4515

Feature importances:
	vt_trans_count: 0.0002
	p2p_trans_count: 0.2842
	p2p_trans_vol: 0.1951
	sms_trans_vol: 0.0090
	sub_reg_count: 0.1364
	teams_count: 0.0131
	don_volume: 0.0284
	don_count: 0.0509
	class_count: 0.0435
	cat_count: 0.0153
	promo_count: 0.0025
	amt_count: 0.0295
	ded_count: 0.0315
	fields: 0.0461
	opt_fields: 0.0223
	req_fields: 0.0338
	allows_reg_ind: 0.0133
	allows_teams: 0.0005
	allows_reg_team_create: 0.0092
	allows_reg_team_join: 0.0002
	allows_opt_reg_donation: 0.0055
	allows_pfp_off_don: 0.0082
	allows_tfp_off_don: 0.0014
	social_auto: 0.0006
	sponsors_count: 0.0192


In [68]:
print("-"*40)
print("Targeting average transaction")
print("-"*40)

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['avg_trans'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['avg_trans'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    if float("{:.4f}".format(f[1])) > 0.:
        print("\t{}: {:.4f}".format(f[0], f[1]))

----------------------------------------
Targeting average transaction
----------------------------------------
Random Forest:
	R2: 0.6574
	MSE: 41518.6146
	RMSE: 121.0853
GBM
	R2: 0.6756
	MSE: 82546.9944
	RMSE: 203.6237

Feature importances:
	vt_trans_count: 0.0001
	p2p_trans_count: 0.1843
	vt_trans_vol: 0.0001
	p2p_trans_vol: 0.3008
	teams_count: 0.0003
	don_volume: 0.2420
	don_count: 0.0282
	class_count: 0.0004
	cat_count: 0.0012
	promo_count: 0.0382
	amt_count: 0.0230
	ded_count: 0.0002
	fields: 0.0738
	opt_fields: 0.0002
	req_fields: 0.0781
	allows_reg_ind: 0.0003
	allows_teams: 0.0083
	allows_reg_team_create: 0.0002
	allows_reg_team_join: 0.0001
	allows_opt_reg_donation: 0.0140
	allows_pfp_off_don: 0.0002
	allows_tfp_off_don: 0.0002
	sponsors_count: 0.0058


In [69]:
print("-"*40)
print("Targeting pageviews")
print("-"*40)

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['pageviews'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['pageviews'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    if float("{:.4f}".format(f[1])) > 0.:
        print("\t{}: {:.4f}".format(f[0], f[1]))

----------------------------------------
Targeting pageviews
----------------------------------------
Random Forest:
	R2: 0.5916
	MSE: 232741.8900
	RMSE: 472.6694
GBM
	R2: 0.6416
	MSE: 217900.5564
	RMSE: 460.8585

Feature importances:
	vt_trans_count: 0.0025
	p2p_trans_count: 0.6328
	mobilevt_trans_count: 0.0009
	vt_trans_vol: 0.0021
	p2p_trans_vol: 0.0579
	mobilevt_trans_vol: 0.0004
	sms_trans_vol: 0.0002
	sub_reg_count: 0.0115
	teams_count: 0.0108
	don_volume: 0.0486
	don_count: 0.0533
	class_count: 0.0108
	cat_count: 0.0158
	promo_count: 0.0125
	amt_count: 0.0226
	ded_count: 0.0059
	fields: 0.0063
	opt_fields: 0.0029
	req_fields: 0.0087
	allows_reg_ind: 0.0018
	allows_teams: 0.0017
	allows_reg_team_create: 0.0029
	allows_reg_team_join: 0.0042
	allows_opt_reg_donation: 0.0202
	allows_pfp_off_don: 0.0025
	allows_tfp_off_don: 0.0028
	social_auto: 0.0008
	count_posts: 0.0002
	sponsors_count: 0.0561


## modelling selected features

selected by correlations and distributions

In [38]:
# classifications, promos, allow teams, categories, 
# allow reg team join, fields, allow sub regs
ftrs = ['sub_reg_count', 'teams_count', 'class_count',
             'promo_count', 'allows_teams', 'trans_count',
             'trans_vol', 'pageviews', 'cat_count',
             'allows_reg_team_join', 'fields']
targets = ['conversion', 'avg_trans']

In [53]:
print("-"*40)
print("Targeting conversion")
print("-"*40)

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['conversion'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['conversion'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    print("\t{}: {:.4f}".format(f[0], f[1]))
    

print()
print("Excluding other targets")
print("-"*40)

excl_cols = ['trans_count', 'trans_vol', 'pageviews']

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    these_ftrs = [c for c in ftrs if c not in excl_cols]
    X_train, X_test, y_train, y_test = train_test_split(df_an[these_ftrs], df_an['conversion'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    these_ftrs = [c for c in ftrs if c not in excl_cols]
    X_train, X_test, y_train, y_test = train_test_split(df_an[these_ftrs], df_an['conversion'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    print("\t{}: {:.4f}".format(f[0], f[1]))

----------------------------------------
Targeting conversion
----------------------------------------
Random Forest:
	R2: -28.2333
	MSE: 31.5314
	RMSE: 5.0857
GBM
	R2: -230.9148
	MSE: 140.4867
	RMSE: 11.2583

Feature importances:
	sub_reg_count: 0.0271
	teams_count: 0.0120
	class_count: 0.0198
	promo_count: 0.0209
	allows_teams: 0.0001
	trans_count: 0.4519
	trans_vol: 0.2857
	pageviews: 0.1062
	cat_count: 0.0382
	allows_reg_team_join: 0.0013
	fields: 0.0368

Excluding other targets
----------------------------------------
Random Forest:
	R2: -0.4725
	MSE: 83.1284
	RMSE: 8.3535
GBM
	R2: -28.7386
	MSE: 120.3510
	RMSE: 10.3729

Feature importances:
	sub_reg_count: 0.2498
	teams_count: 0.1753
	class_count: 0.1693
	promo_count: 0.0478
	allows_teams: 0.0003
	cat_count: 0.0503
	allows_reg_team_join: 0.0006
	fields: 0.3066


In [54]:
print("-"*40)
print("Targeting average transaction")
print("-"*40)

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['avg_trans'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['avg_trans'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    print("\t{}: {:.4f}".format(f[0], f[1]))
    

print()
print("Excluding other targets")
print("-"*40)

excl_cols = ['trans_count', 'trans_vol', 'pageviews']

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    these_ftrs = [c for c in ftrs if c not in excl_cols]
    X_train, X_test, y_train, y_test = train_test_split(df_an[these_ftrs], df_an['avg_trans'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    these_ftrs = [c for c in ftrs if c not in excl_cols]
    X_train, X_test, y_train, y_test = train_test_split(df_an[these_ftrs], df_an['avg_trans'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    print("\t{}: {:.4f}".format(f[0], f[1]))

----------------------------------------
Targeting average transaction
----------------------------------------
Random Forest:
	R2: 0.4523
	MSE: 65718.1811
	RMSE: 171.5065
GBM
	R2: 0.5673
	MSE: 145722.3956
	RMSE: 302.5947

Feature importances:
	sub_reg_count: 0.0001
	teams_count: 0.0004
	class_count: 0.0005
	promo_count: 0.0651
	allows_teams: 0.0007
	trans_count: 0.1801
	trans_vol: 0.5530
	pageviews: 0.0491
	cat_count: 0.0009
	allows_reg_team_join: 0.0005
	fields: 0.1495

Excluding other targets
----------------------------------------
Random Forest:
	R2: -1.7852
	MSE: 166780.0506
	RMSE: 347.5048
GBM
	R2: -1.7369
	MSE: 29458.0628
	RMSE: 169.5221

Feature importances:
	sub_reg_count: 0.0248
	teams_count: 0.1131
	class_count: 0.1035
	promo_count: 0.4915
	allows_teams: 0.0164
	cat_count: 0.1294
	allows_reg_team_join: 0.0251
	fields: 0.0962


In [55]:
print("-"*40)
print("Targeting pageviews")
print("-"*40)

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['pageviews'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    X_train, X_test, y_train, y_test = train_test_split(df_an[ftrs], df_an['pageviews'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    print("\t{}: {:.4f}".format(f[0], f[1]))
    

print()
print("Excluding other targets")
print("-"*40)

excl_cols = ['trans_count', 'trans_vol', 'pageviews']

print("Random Forest:")
rf_r2 = []
rf_mse = []
rf_rmse = []
for i in range(10):
    these_ftrs = [c for c in ftrs if c not in excl_cols]
    X_train, X_test, y_train, y_test = train_test_split(df_an[these_ftrs], df_an['pageviews'])
    rf = RandomForestRegressor()
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    rf_r2.append(rf.score(X_test, y_test))
    rf_mse.append(mse)
    rf_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(rf_r2)))
print("\tMSE: {:.4f}".format(np.mean(rf_mse)))
print("\tRMSE: {:.4f}".format(np.mean(rf_rmse)))

print("GBM")
gb_r2 = []
gb_mse = []
gb_rmse = []
for i in range(10):
    these_ftrs = [c for c in ftrs if c not in excl_cols]
    X_train, X_test, y_train, y_test = train_test_split(df_an[these_ftrs], df_an['pageviews'])
    gb = GradientBoostingRegressor()
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    
    gb_r2.append(gb.score(X_test, y_test))
    gb_mse.append(mse)
    gb_rmse.append(math.sqrt(mse))
    
print("\tR2: {:.4f}".format(np.mean(gb_r2)))
print("\tMSE: {:.4f}".format(np.mean(gb_mse)))
print("\tRMSE: {:.4f}".format(np.mean(gb_rmse)))

print()
print("Feature importances:")
ftr_importances = [c for c in zip(X_test.columns, rf.feature_importances_)]
for f in ftr_importances:
    print("\t{}: {:.4f}".format(f[0], f[1]))

----------------------------------------
Targeting pageviews
----------------------------------------
Random Forest:
	R2: 0.9697
	MSE: 22025.5411
	RMSE: 115.2869
GBM
	R2: 0.9884
	MSE: 5852.8446
	RMSE: 63.6094

Feature importances:
	sub_reg_count: 0.0006
	teams_count: 0.0015
	class_count: 0.0003
	promo_count: 0.0005
	allows_teams: 0.0001
	trans_count: 0.0014
	trans_vol: 0.0004
	pageviews: 0.9939
	cat_count: 0.0004
	allows_reg_team_join: 0.0000
	fields: 0.0007

Excluding other targets
----------------------------------------
Random Forest:
	R2: 0.1610
	MSE: 533578.7596
	RMSE: 712.9679
GBM
	R2: 0.2229
	MSE: 433503.0506
	RMSE: 646.0105

Feature importances:
	sub_reg_count: 0.2011
	teams_count: 0.1723
	class_count: 0.1303
	promo_count: 0.1432
	allows_teams: 0.0166
	cat_count: 0.1817
	allows_reg_team_join: 0.0423
	fields: 0.1125


## modelling selected features and form embedding

try form or org embedding?