In [None]:
# Install the extra packages only when /content exists (presumably a Google Colab
# runtime — confirm); elsewhere the `[ -e ... ]` test fails and pip is never run.
! [ -e /content ] && pip install -Uqq fastbook kaggle waterfallcharts treeinterpreter dtreeviz

In [None]:
from fastbook import *
from pandas.api.types import is_string_dtype, is_numeric_dtype, is_categorical_dtype
from fastai.tabular.all import *
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, make_scorer
from sklearn.model_selection import cross_val_score
from IPython.display import Image, display_svg, SVG
import matplotlib.pyplot as plt
import joblib
import numpy as np

# Experiment 1 vs. legacy
# 1.0.0 (loss = 95)
# 1.0.1 (loss = 71)
# 1.0.2 (loss = 68)
# 1.0.3 (loss = 90) (range: -10,000 - 10,000)

# Experiment 2 vs. legacy-1 (accurate item price readings)
# 1.0.4 (loss = 142000, loss / mean_pred = 2.76) (range: -1000 - 1500) (dep_var = weighted_profit)

# Experiment 3 - (1.0.5 vs 1.1.0) (weighted_profit vs gst)
# 1.0.5 - (dep_var = weighted_profit) (range: [-400, 700]) (loss = 92000, loss / mean_pred = 2.06)
# 1.1.0 - (dep_var = gst) (range: [-400, 700]) (loss = 54, loss / mean_pred = 1.74)
# 1.1.0 wins

# Experiment 4 - (1.1.1 vs 1.2.0) (gst RF vs. gst NN)
# 1.1.1 - (dep_var = gst) (range: [-400, 700]) (loss = 70, loss / mean_pred = 1.43)
# 1.2.0 - (dep_var = gst) (range: [-400, 700]) (loss = 77, loss / mean_pred = 1.56)

# Experiment 5 - (1.2.1 - 1.2.2) (NN short lookback vs. NN long lookback)

# Experiment 6 - (legacy-2 vs 1.1.2 vs 1.2.3) (baseline vs RF GST vs NN GST)
# 63 day lookback and 14 day test set
# 1.1.2 - (dep_var = gst) (range: [-180, 400]) (loss = 53, loss / mean_pred = 1.31)
# 1.2.3 - (dep_var = gst) (range: [-180, 400]) (loss = 53, loss / mean_pred = 1.31)

# --- Run configuration ---
# Version tag; used as the file name of the model dumped in the last cell.
model_version = '1.1.2'
# When False, the last cell skips joblib.dump and only prints a message.
model_can_be_dumped = True
# Target column, passed to cont_cat_split / TabularPandas as the dependent variable.
dep_var = 'gst'
# Default hyper-parameters of the rf() helper (bound when rf is defined, so rf must
# be re-defined after changing them).
min_samples_leaf_param = 5
max_features_param = 'sqrt'
num_est = 80
# Rows whose timestampDayofyear is within the last `validation_lookback` days of the
# newest day in the data form the validation set.
validation_lookback = 14
# Training rows must have timestampDayofyear greater than
# `max_day - train_start_days_past`.
train_start_days_past = 63
# Feature columns kept for the final model; all other columns are dropped.
wanted_columns = [
    "high_price_1h",
    "high_price_5m",
    "high_volume_1h",
    "high_volume_5m",
    "low_price_1h",
    "low_price_5m",
    "low_volume_1h",
    "low_volume_5m",
    "player_count",
    "timestampElapsed"
  ]

In [None]:
# All locations are resolved relative to the kernel's current working directory.
path = Path.cwd()
# Sets BASE_PATH on Path — presumably so fastcore displays paths relative to it (confirm).
Path.BASE_PATH = path
# Datasets are expected two directory levels up, under fastai/datasets.
dsets_path = path.parent.parent/'fastai/datasets'
# Cell output: listing of the working directory.
path.ls()

In [None]:
# Load the raw trade records.
df = pd.read_csv(dsets_path/'osrs_trades.csv', low_memory=False)
# Expand 'timestamp' into derived date columns. Later cells rely on two of them:
# `timestampDayofyear` (train/valid split) and `timestampElapsed` (model feature).
# Presumably make_date converts the column to datetime first — confirm against fastai.
make_date(df, 'timestamp')
df = add_datepart(df, 'timestamp')

In [None]:
# Preprocessing steps handed to TabularPandas.
procs = [Categorify, FillMissing]
# The split is expressed in day-of-year, relative to the largest value in the data.
# NOTE(review): day-of-year restarts every January, so this windowing looks like it
# would mis-assign rows if the data ever spans a year boundary — confirm the data range.
max_day = df["timestampDayofyear"].max()
# minCond: row is newer than the start of the training window.
minCond = (df.timestampDayofyear>(max_day - train_start_days_past))
# maxCond: row is at least `validation_lookback` days older than the newest day.
maxCond = (df.timestampDayofyear<=(max_day - validation_lookback))
# Training rows satisfy both conditions; validation rows are everything newer than the
# training cut-off. Rows older than the training window end up in neither split.
train_idx = np.where((maxCond) & (minCond))[0]
valid_idx = np.where(~maxCond)[0]

splits = (list(train_idx),list(valid_idx))
# Split the columns into continuous / categorical lists, excluding the target. The
# second positional argument is 1 (presumably fastai's max_card — confirm).
cont,cat = cont_cat_split(df, 1, dep_var=dep_var)
to = TabularPandas(df, procs, cat, cont, y_names=dep_var, splits=splits)
# Persist the processed object; the next cell reloads it from this file.
save_pickle(path/'preprocessed_train_valid.pkl', to)

In [None]:
# Reload the preprocessed object written by the previous cell — presumably so the
# notebook can be resumed from here without repeating the preprocessing.
to = load_pickle(path/'preprocessed_train_valid.pkl')

In [None]:
# Features and target of the training split, then of the validation split.
xs,y = to.train.xs,to.train.y
valid_xs,valid_y = to.valid.xs,to.valid.y

In [None]:
def r_mse(pred,y):
    """Root mean squared error between `pred` and `y`, rounded to 6 decimals."""
    squared_error = (pred - y) ** 2
    return round(math.sqrt(squared_error.mean()), 6)
def m_rmse(m, xs, y):
    """RMSE of model `m`'s predictions for `xs`, measured against targets `y`."""
    predictions = m.predict(xs)
    return r_mse(predictions, y)

In [None]:
def rf(xs, y, n_estimators=num_est,
       max_features=max_features_param, min_samples_leaf=min_samples_leaf_param, **kwargs):
    """Fit and return a RandomForestRegressor on features `xs` and target `y`.

    Parameters
    ----------
    xs, y : training features and target, passed straight to `.fit`.
    n_estimators, max_features, min_samples_leaf : forest hyper-parameters; their
        defaults are the notebook's configuration constants, captured when this
        function is defined.
    **kwargs : any further RandomForestRegressor keyword arguments (for example
        `random_state` for reproducible runs, or `max_samples`). These used to be
        accepted and silently dropped; they are now forwarded to the estimator and
        may also override the `n_jobs=-1` / `oob_score=True` defaults.

    Returns
    -------
    The fitted RandomForestRegressor.
    """
    # Start from the helper's fixed defaults and let explicit kwargs win, which avoids
    # a "multiple values for keyword argument" error when a caller overrides them.
    extra_params = dict(n_jobs=-1, oob_score=True)
    extra_params.update(kwargs)
    return RandomForestRegressor(n_estimators=n_estimators, max_features=max_features,
        min_samples_leaf=min_samples_leaf, **extra_params).fit(xs, y)

In [None]:
# First fit: random forest on every available feature column (used below for the
# initial feature-importance table).
m = rf(xs, y)

In [None]:
def rf_feat_importance(m, df):
    """Table of `df`'s columns with model `m`'s feature importances, highest first.

    Returns a DataFrame with columns 'cols' and 'imp', sorted by 'imp' descending.
    """
    importance_table = pd.DataFrame({
        'cols': df.columns,
        'imp': m.feature_importances_,
    })
    return importance_table.sort_values('imp', ascending=False)

In [None]:
# Feature importances of the all-columns model fitted above.
fi = rf_feat_importance(m, xs)

In [None]:
def plot_fi(fi):
    """Horizontal bar chart of a feature-importance table ('cols' vs 'imp')."""
    return fi.plot(x='cols', y='imp', kind='barh', figsize=(12,7), legend=False)

In [None]:
# Start the reduced feature sets as aliases of the full train / validation features;
# they are rebound to column subsets further down in this cell.
xs_imp = xs
valid_xs_imp = valid_xs

def drop_unwanted_columns(df, required_columns):
    """Return only the `required_columns` of `df`, in the order given."""
    return df[required_columns]
    
# Restrict both feature sets to the configured `wanted_columns`.
xs_imp = drop_unwanted_columns(xs_imp, wanted_columns)
valid_xs_imp = drop_unwanted_columns(valid_xs_imp, wanted_columns)
    
# Order the columns by label so train and validation frames share one fixed column
# order (presumably the order the prediction API must reproduce — confirm).
xs_imp = xs_imp.sort_index(axis=1)
valid_xs_imp = valid_xs_imp.sort_index(axis=1)

# Rebinds `m`: from here on `m` is the reduced-feature model, not the all-columns one.
m = rf(xs_imp, y)
mean_rmse = m_rmse(m, valid_xs_imp, valid_y)
# Cell output: validation RMSE and RMSE relative to the mean of the validation targets.
# NOTE(review): the experiment log at the top calls this ratio "loss / mean_pred", but
# the denominator here is the mean of the actual targets, not of the predictions.
mean_rmse, mean_rmse / valid_y.mean()

In [None]:
# Per-tree validation predictions, stacked with one row per estimator in the forest.
preds = np.stack([t.predict(valid_xs_imp) for t in m.estimators_])

In [None]:
# Validation RMSE as a function of forest size: point i averages the predictions of
# the first i+1 trees. The loop is bounded by the number of trees actually present in
# `preds` rather than by the `num_est` config constant, so the curve stays correct if
# the forest was fitted with a different n_estimators.
plt.plot([r_mse(preds[:i+1].mean(0), valid_y) for i in range(len(preds))]);

In [None]:
# Feature importances of the reduced-feature model as a horizontal bar chart.
plot_fi(rf_feat_importance(m, xs_imp));

In [None]:
# fastbook helper — presumably draws a dendrogram of how similar the kept columns are
# to each other (confirm against fastbook).
cluster_columns(xs_imp)

In [None]:
# Spot check: predictions for the first 10 validation rows (compare with the next cell).
m.predict(valid_xs_imp.head(n=10))

In [None]:
# Spot check: actual targets for the same first 10 validation rows.
valid_y.head(n=10)

In [None]:
# Predictions of the reduced-feature model on the whole validation set.
y_pred = m.predict(valid_xs_imp)

# Least-squares straight line of predicted vs. actual values, evaluated at every
# actual value so it can be drawn over the scatter plot.
slope, intercept = np.polyfit(valid_y, y_pred, 1)
reg_line = slope * np.array(valid_y) + intercept

plt.figure(figsize=(10, 6))
plt.scatter(valid_y, y_pred, color='blue', alpha=0.6, label='Predicted vs Actual')
# Diagonal y = x across the range of the actual values: where perfect predictions lie.
plt.plot([min(valid_y), max(valid_y)], [min(valid_y), max(valid_y)], color='red', linestyle='--', label='Ideal Fit')
plt.plot(valid_y, reg_line, color='green', linestyle='-', linewidth=2, label='Regression Line')
plt.xlabel('Actual Labels')
plt.ylabel('Predicted Labels')
plt.title('Predictions vs Actual Labels')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Cell output: the exact feature columns, in order, that the model was trained on.
xs_imp.columns

In [None]:
# Persist the fitted model under its version tag, unless dumping is disabled in the
# configuration cell.
# NOTE(review): the destination is a hard-coded absolute path on one machine; consider
# moving it into the configuration cell so the notebook runs elsewhere.
if model_can_be_dumped:
    joblib.dump(m, f'/home/tristan/Documents/dev/ge_bot/prediction_api/{model_version}.pkl')
    print("dumped new model")
else:
    print("did not dump new model")