# Train Random Forest model

In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

## Setup

In [2]:
from tep.utils import load_array
X = load_array('data/meta_features_v3.bc')
X.shape

(1293005, 24)

In [3]:
y = load_array('data/log_labels_v3.bc')
y.shape

(1293005,)

In [4]:
import numpy as np

In [5]:
np.nan_to_num(X, copy=False)

array([[0., 1., 1., ..., 0., 0., 1.],
       [0., 0., 2., ..., 0., 0., 1.],
       [0., 0., 2., ..., 0., 0., 1.],
       ...,
       [1., 3., 0., ..., 0., 0., 0.],
       [1., 3., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 5., 0., 0.]])

In [6]:
X_train = X[:-10000]
y_train = y[:-10000]
X_valid = X[-10000:]
y_valid = y[-10000:]
print(X_train.shape)
print(X_valid.shape)
print(y_train.shape)
print(y_valid.shape)

(1283005, 24)
(10000, 24)
(1283005,)
(10000,)


In [7]:
import pandas as pd
from tep.featureGenerator import FeatureGenerator
fg = FeatureGenerator()
columns = fg.structured_feature_map()[1]
columns

array(['urls', 'hashtags', 'mentions', 'length', 'sentiment', 'followers',
       'friends', 'follower_friend_ratio', 'verified', 'listings',
       'tweets', 'tweet_freq', 'favorites', 'favorite_freq',
       'account_age', 'month', 'day', 'weekday', 'hour', 'minute',
       'quote', 'quoted_popularity', 'quoted_sentiment', 'reply'],
      dtype='<U31')

In [8]:
df = pd.DataFrame(data=X, columns=columns)
df.head()

Unnamed: 0,urls,hashtags,mentions,length,sentiment,followers,friends,follower_friend_ratio,verified,listings,...,account_age,month,day,weekday,hour,minute,quote,quoted_popularity,quoted_sentiment,reply
0,0.0,1.0,1.0,103.0,0.3125,443717.0,781.0,568.139565,1.0,2751.0,...,3963.0,9.0,1.0,3.0,9.0,9.0,0.0,0.0,0.0,1.0
1,0.0,0.0,2.0,37.0,0.0,79503.0,2136.0,37.220506,1.0,1020.0,...,3648.0,11.0,13.0,0.0,23.0,49.0,0.0,0.0,0.0,1.0
2,0.0,0.0,2.0,119.0,0.61,1524959.0,1353.0,1127.094605,1.0,14295.0,...,4429.0,11.0,28.0,1.0,20.0,5.0,0.0,0.0,0.0,1.0
3,1.0,0.0,1.0,139.0,-0.5,600213.0,7984.0,75.176979,1.0,4723.0,...,3753.0,2.0,23.0,4.0,18.0,11.0,0.0,0.0,0.0,1.0
4,1.0,1.0,0.0,132.0,0.112121,144853.0,937.0,154.592316,1.0,3463.0,...,3522.0,7.0,6.0,3.0,11.0,26.0,0.0,0.0,0.0,0.0


In [9]:
df.isnull().sum()

urls                     0
hashtags                 0
mentions                 0
length                   0
sentiment                0
followers                0
friends                  0
follower_friend_ratio    0
verified                 0
listings                 0
tweets                   0
tweet_freq               0
favorites                0
favorite_freq            0
account_age              0
month                    0
day                      0
weekday                  0
hour                     0
minute                   0
quote                    0
quoted_popularity        0
quoted_sentiment         0
reply                    0
dtype: int64

## Define metrics

In [14]:
import math
def rmse(x,y): return math.sqrt(((x-y)**2).mean())

def print_score(m, ret_scores=False):
    res = [rmse(m.predict(X_train), y_train), rmse(m.predict(X_valid), y_valid),
                m.score(X_train, y_train), m.score(X_valid, y_valid)]
    if hasattr(m, 'oob_score_'): res.append(m.oob_score_)
    print(res)
    if ret_scores: return res

## Train benchmark

In [11]:
from sklearn.ensemble import RandomForestRegressor
m = RandomForestRegressor(n_estimators=10, n_jobs=-1, min_samples_leaf=8, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

  warn("Some inputs do not have OOB scores. "


[0.6732642146135293, 0.8343643673603437, 0.9009733940704928, 0.8466501400357503, 0.8169242288243781]


## Define configurations

We vary RF training along three dimensions:
* Number of estimators: 20, 40, 80 estimators
* Minimum number of samples per leaf: 8, 16, 32 samples
* Max number of features: all, 0.4, 0.7

In [19]:
configurations = [
    {"n_estimators": 20, "min_samples_leaf": 8, "max_features": "auto"},
    {"n_estimators": 20, "min_samples_leaf": 8, "max_features": 0.4},
    {"n_estimators": 20, "min_samples_leaf": 8, "max_features": 0.7},
    {"n_estimators": 20, "min_samples_leaf": 16, "max_features": "auto"},
    {"n_estimators": 20, "min_samples_leaf": 16, "max_features": 0.4},
    {"n_estimators": 20, "min_samples_leaf": 16, "max_features": 0.7},
    {"n_estimators": 20, "min_samples_leaf": 32, "max_features": "auto"},
    {"n_estimators": 20, "min_samples_leaf": 32, "max_features": 0.4},
    {"n_estimators": 20, "min_samples_leaf": 32, "max_features": 0.7},
    {"n_estimators": 40, "min_samples_leaf": 8, "max_features": "auto"},
    {"n_estimators": 40, "min_samples_leaf": 8, "max_features": 0.4},
    {"n_estimators": 40, "min_samples_leaf": 8, "max_features": 0.7},
    {"n_estimators": 40, "min_samples_leaf": 16, "max_features": "auto"},
    {"n_estimators": 40, "min_samples_leaf": 16, "max_features": 0.4},
    {"n_estimators": 40, "min_samples_leaf": 16, "max_features": 0.7},
    {"n_estimators": 40, "min_samples_leaf": 32, "max_features": "auto"},
    {"n_estimators": 40, "min_samples_leaf": 32, "max_features": 0.4},
    {"n_estimators": 40, "min_samples_leaf": 32, "max_features": 0.7},
    {"n_estimators": 80, "min_samples_leaf": 8, "max_features": "auto"},
    {"n_estimators": 80, "min_samples_leaf": 8, "max_features": 0.4},
    {"n_estimators": 80, "min_samples_leaf": 8, "max_features": 0.7},
    {"n_estimators": 80, "min_samples_leaf": 16, "max_features": "auto"},
    {"n_estimators": 80, "min_samples_leaf": 16, "max_features": 0.4},
    {"n_estimators": 80, "min_samples_leaf": 16, "max_features": 0.7},
    {"n_estimators": 80, "min_samples_leaf": 32, "max_features": "auto"},
    {"n_estimators": 80, "min_samples_leaf": 32, "max_features": 0.4},
    {"n_estimators": 80, "min_samples_leaf": 32, "max_features": 0.7},
]

In [20]:
for config in configurations:
    print(config)
    m = RandomForestRegressor(n_estimators=config["n_estimators"], 
                              n_jobs=-1, 
                              min_samples_leaf=config["min_samples_leaf"],
                              max_features=config["max_features"],
                              oob_score=True)
    m.fit(X_train, y_train)
    scores = print_score(m, ret_scores=True)
    config["train_loss"] = scores[0]
    config["val_loss"] = scores[1]
    config["train_r2"] = scores[2]
    config["val_r2"] = scores[3]

{'n_estimators': 20, 'min_samples_leaf': 8, 'max_features': 'auto'}


  warn("Some inputs do not have OOB scores. "


[0.6644953729789213, 0.8289956564477136, 0.9035361139055367, 0.8486172478577517, 0.8432041236316468]
{'n_estimators': 20, 'min_samples_leaf': 8, 'max_features': 0.4}


  warn("Some inputs do not have OOB scores. "


[0.6988899017237078, 0.8246181788533794, 0.8932916604476471, 0.8502117676540385, 0.8450427354976453]
{'n_estimators': 20, 'min_samples_leaf': 8, 'max_features': 0.7}


  warn("Some inputs do not have OOB scores. "


[0.6754629229374203, 0.8212605650449298, 0.9003255469275495, 0.851429075623028, 0.8447850790279372]
{'n_estimators': 20, 'min_samples_leaf': 16, 'max_features': 'auto'}


  warn("Some inputs do not have OOB scores. "


[0.7440982700035548, 0.83011866344908, 0.8790400994931913, 0.8482068258297765, 0.8438385814212778]
{'n_estimators': 20, 'min_samples_leaf': 16, 'max_features': 0.4}


  warn("Some inputs do not have OOB scores. "


[0.7651122838378301, 0.8280647556937328, 0.8721115921966548, 0.848957040218177, 0.8444486897442407]
{'n_estimators': 20, 'min_samples_leaf': 16, 'max_features': 0.7}


  warn("Some inputs do not have OOB scores. "


[0.7493958231293695, 0.8241622651568133, 0.8773116381890312, 0.8503773512666336, 0.8449470948547959]
{'n_estimators': 20, 'min_samples_leaf': 32, 'max_features': 'auto'}


  warn("Some inputs do not have OOB scores. "


[0.7963266685048559, 0.837352795350232, 0.8614637742127192, 0.8455496718723237, 0.8419650989433641]
{'n_estimators': 20, 'min_samples_leaf': 32, 'max_features': 0.4}


  warn("Some inputs do not have OOB scores. "


[0.8092195619487844, 0.8377285364289625, 0.8569415297058095, 0.8454110293461983, 0.8413690588359064]
{'n_estimators': 20, 'min_samples_leaf': 32, 'max_features': 0.7}


  warn("Some inputs do not have OOB scores. "


[0.7979572489163573, 0.8325500183347296, 0.8608958521805952, 0.8473163421655923, 0.8428330433785984]
{'n_estimators': 40, 'min_samples_leaf': 8, 'max_features': 'auto'}
[0.6606339498997391, 0.823431114951085, 0.9046539715048811, 0.8506427070339878, 0.8479019159351454]
{'n_estimators': 40, 'min_samples_leaf': 8, 'max_features': 0.4}
[0.6952800676078044, 0.8197053913711642, 0.8943911315016161, 0.8519912232134204, 0.8493311770456898]
{'n_estimators': 40, 'min_samples_leaf': 8, 'max_features': 0.7}
[0.6711274185382097, 0.8176748330866098, 0.9016009749588446, 0.8527236039067001, 0.8494946882121621]
{'n_estimators': 40, 'min_samples_leaf': 16, 'max_features': 'auto'}
[0.7411644528911357, 0.827075781175803, 0.879992056094845, 0.8493176121056777, 0.8468678996106658]
{'n_estimators': 40, 'min_samples_leaf': 16, 'max_features': 0.4}
[0.7627009737138717, 0.8257726731914513, 0.8729164222315046, 0.849792056509205, 0.8471800423483218]
{'n_estimators': 40, 'min_samples_leaf': 16, 'max_features': 0.7}

In [21]:
configurations

[{'n_estimators': 20,
  'min_samples_leaf': 8,
  'max_features': 'auto',
  'train_loss': 0.6644953729789213,
  'val_loss': 0.8289956564477136,
  'train_r2': 0.9035361139055367,
  'val_r2': 0.8486172478577517},
 {'n_estimators': 20,
  'min_samples_leaf': 8,
  'max_features': 0.4,
  'train_loss': 0.6988899017237078,
  'val_loss': 0.8246181788533794,
  'train_r2': 0.8932916604476471,
  'val_r2': 0.8502117676540385},
 {'n_estimators': 20,
  'min_samples_leaf': 8,
  'max_features': 0.7,
  'train_loss': 0.6754629229374203,
  'val_loss': 0.8212605650449298,
  'train_r2': 0.9003255469275495,
  'val_r2': 0.851429075623028},
 {'n_estimators': 20,
  'min_samples_leaf': 16,
  'max_features': 'auto',
  'train_loss': 0.7440982700035548,
  'val_loss': 0.83011866344908,
  'train_r2': 0.8790400994931913,
  'val_r2': 0.8482068258297765},
 {'n_estimators': 20,
  'min_samples_leaf': 16,
  'max_features': 0.4,
  'train_loss': 0.7651122838378301,
  'val_loss': 0.8280647556937328,
  'train_r2': 0.87211159219

In [26]:
import json
with open("models/rf_v1.txt", 'w') as f:
    for c in configurations:
        json.dump(c, f, sort_keys=True)
        f.write("\n")