### Introduction to Machine Learning: Lesson 3 

In [None]:
#importing necessary libraries
%load_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

#loading preprocessed file
# NOTE(review): PATH is assigned here but never read in the visible notebook; the feather
# file below is loaded from a hard-coded relative 'tmp/' location instead.
PATH = "data/bulldozers/"

# Load the saved raw frame, then hand it to proc_df with 'SalePrice' as the target column.
# proc_df comes from the fastai star import; presumably it returns
# (processed features, target values, NA bookkeeping) - confirm against fastai.structured.
df_raw = pd.read_feather('tmp/bulldozers-raw')
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')

In [None]:
#creating a validation set

def split_vals(a, n):
    """Cut ``a`` at position ``n`` and return the (leading, trailing) slices."""
    leading = a[:n]
    trailing = a[n:]
    return leading, trailing
# Hold out the last n_valid rows for validation; everything before that is training data.
n_valid = 12000
n_trn = len(df_trn)-n_valid
# The same cut point is used for features, targets and the raw frame so rows stay aligned.
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)

#define function to calculate rmse and print score
def rmse(x, y):
    """Root of the mean squared difference between ``x`` and ``y``.

    The difference must expose ``.mean()`` (e.g. a numpy array or pandas Series).
    """
    squared_err = (x - y) ** 2
    return math.sqrt(squared_err.mean())

def print_score(m):
    """Print a list of fit metrics for model ``m``.

    The list holds, in order: RMSE on the training split, RMSE on the validation
    split, ``m.score`` on the training split, ``m.score`` on the validation split,
    and - only if the model exposes it - ``m.oob_score_``.

    Reads the notebook-level X_train / y_train / X_valid / y_valid variables.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    res = [train_rmse, valid_rmse, train_score, valid_score]
    # The out-of-bag score is optional; append it only when the attribute is there.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
# set_rf_samples is a fastai helper pulled in by the star import; presumably it limits
# how many rows each tree is trained on (here 50000) - confirm against fastai.structured.
set_rf_samples(50000)

In [None]:
#building a random forest model

# oob_score=True is requested so that m.oob_score_ exists and print_score appends it.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Worker: predictions of one estimator `t` for the whole validation split.
def get_preds(t): return t.predict(X_valid)
# parallel_trees is a fastai helper; presumably it applies get_preds to every tree of m - confirm.
# np.stack turns the per-tree results into one 2-D array (one row per returned result).
%time preds = np.stack(parallel_trees(m, get_preds))
# Mean and standard deviation, across the stacked results, for the first validation row.
np.mean(preds[:,0]), np.std(preds[:,0])

In [None]:
# Work on a copy so the columns added to x in the following cells do not touch raw_valid.
x = raw_valid.copy()

In [None]:
# Collapse the stacked predictions along axis 0: spread and mean for each validation row.
x['pred_std'] = np.std(preds, axis=0)
x['pred'] = np.mean(preds, axis=0)

### Confidence based on Tree Variance 

In [None]:
# Horizontal bar chart of how many validation rows fall in each Enclosure value.
x.Enclosure.value_counts().plot.barh()

In [None]:
# Per-Enclosure means of the actual price, the prediction and the prediction spread.
flds = ['Enclosure', 'SalePrice', 'pred', 'pred_std']
# as_index=False keeps 'Enclosure' as an ordinary column (it is used as the plot axis later).
enc_summ = x[flds].groupby('Enclosure', as_index=False).mean()
enc_summ

In [None]:
# Keep only the groups whose mean SalePrice is not null.
enc_summ = enc_summ[~pd.isnull(enc_summ.SalePrice)]
# Mean prediction per Enclosure, with pred_std drawn as horizontal error bars.
enc_summ.plot('Enclosure', 'pred', 'barh', xerr='pred_std', alpha=0.6, xlim=(0,11));

In [None]:
# Number of validation rows per ProductSize category (read from raw_valid, not the copy x).
raw_valid.ProductSize.value_counts().plot.barh();

In [None]:
# Per-ProductSize means of sale price, prediction and prediction standard deviation.
flds = ['ProductSize', 'SalePrice', 'pred', 'pred_std']
# flds[0] is 'ProductSize'; here the group key becomes the index of the result.
summ = x[flds].groupby(flds[0]).mean()
summ

### Feature Importance 

In [None]:
# rf_feat_importance is a fastai helper; the cells below read 'cols' and 'imp' columns from
# its result. Presumably rows are sorted by descending importance - confirm.
fi = rf_feat_importance(m, df_trn)
# Show the first ten rows.
fi[:10]

In [None]:
# Line plot of 'imp' against 'cols' for every row of fi; trailing ';' hides the Axes repr.
fi.plot('cols', 'imp', figsize=(10,6), legend=False);

In [None]:
def plot_fi(fi):
    """Draw a horizontal bar chart of importance ('imp') per feature ('cols').

    fi: object whose ``.plot`` accepts (x, y, kind) positionally, e.g. the frame
        returned by rf_feat_importance.
    Returns whatever ``fi.plot`` returns.
    """
    # The body must be indented under the def; otherwise the cell fails with IndentationError.
    return fi.plot('cols','imp','barh', figsize=(12,7), legend=False)
# Chart only the first 30 rows of fi; trailing ';' suppresses the returned object's repr.
plot_fi(fi[:30]);

In [None]:
# Names of the features whose importance exceeds the 0.005 cut-off.
to_keep = fi[fi.imp>0.005].cols
# How many features survive the cut.
len(to_keep)

In [None]:
# Restrict the processed frame to the selected columns (copied, so df_trn is left alone)
# and rebuild the train/validation split at the same cut point.
df_keep = df_trn[to_keep].copy()
X_train, X_valid = split_vals(df_keep, n_trn)

# Same hyper-parameters as the earlier forest, now fitted on the reduced feature set.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5,
n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Recompute importances for the refitted model and chart all remaining features.
fi = rf_feat_importance(m, df_keep)
plot_fi(fi)

### Introduction to Machine Learning: Lesson 4

In [None]:
# Re-run proc_df with max_n_cat=7. Presumably categorical columns with at most 7 levels are
# one-hot encoded - a later cell does use names like 'Enclosure_EROPS' - confirm against fastai.
# Note that y_trn and nas are rebound by this call.
df_trn2, y_trn, nas = proc_df(df_raw, 'SalePrice', max_n_cat=7)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3,
     max_features=0.6, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Importances for the model fitted on df_trn2; show the first 25 rows.
fi = rf_feat_importance(m, df_trn2)
fi[:25]

In [None]:
from scipy.cluster import hierarchy as hc
# Import the pieces that are used explicitly instead of relying on `scipy` leaking in from a
# star import (with scipy.stats already loaded) and on the incidental `hc.distance` attribute.
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

# Rank correlation between every pair of columns in df_keep, rounded to 4 decimals.
corr = np.round(spearmanr(df_keep).correlation, 4)
# 1 - correlation is used as the distance; squareform converts the square matrix to the
# condensed vector that linkage expects.
corr_condensed = squareform(1-corr)
z = hc.linkage(corr_condensed, method='average')
fig = plt.figure(figsize=(16,10))
# Labels are passed as a plain list rather than a pandas Index.
dendrogram = hc.dendrogram(z, labels=df_keep.columns.tolist(),
    orientation='left', leaf_font_size=16)
plt.show()

In [None]:
#define function to calculate oob score
def get_oob(df):
    """Fit a 30-tree forest on the training part of ``df`` and return its ``oob_score_``.

    The first ``n_trn`` rows of ``df`` (via split_vals) are fitted against the
    notebook-level ``y_train``.
    """
    forest = RandomForestRegressor(
        n_estimators=30,
        min_samples_leaf=5,
        max_features=0.6,
        n_jobs=-1,
        oob_score=True,
    )
    train_part = split_vals(df, n_trn)[0]
    forest.fit(train_part, y_train)
    return forest.oob_score_

In [None]:
# Baseline out-of-bag score with every column of df_keep present.
get_oob(df_keep)

In [None]:
# Remove one candidate column at a time and print the resulting out-of-bag score.
# df_keep itself is not modified: drop() returns a new frame here.
for c in ('saleYear', 'saleElapsed', 'fiModelDesc', 'fiBaseModel', 'Grouser_Tracks', 'Coupler_System'):
  print(c, get_oob(df_keep.drop(c, axis=1)))

In [None]:
# Out-of-bag score with these three columns removed together (still on a temporary frame).
to_drop = ['saleYear', 'fiBaseModel', 'Grouser_Tracks']
get_oob(df_keep.drop(to_drop, axis=1))

In [None]:
# Remove the to_drop columns for good. Rebinding with errors='ignore' (instead of an
# in-place drop) lets the cell be re-run without a KeyError once the columns are gone.
df_keep = df_keep.drop(columns=to_drop, errors='ignore')
X_train, X_valid = split_vals(df_keep, n_trn)
# reset_rf_samples is a fastai helper; presumably it undoes set_rf_samples - confirm.
reset_rf_samples()

m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
from pdpbox import pdp
from plotnine import *

# Re-apply the 50000 setting via the fastai helper (see its earlier use in this notebook).
set_rf_samples(50000)

# Rebuild the max_n_cat=7 frame and refit. After this cell X_train, X_valid and m all
# refer to df_trn2, not df_keep. No oob_score is requested for this model.
df_trn2, y_trn, nas = proc_df(df_raw, 'SalePrice', max_n_cat=7)
X_train, X_valid = split_vals(df_trn2, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.6, n_jobs=-1)
m.fit(X_train, y_train);

# Bar chart of the first ten rows of the importance table.
plot_fi(rf_feat_importance(m, df_trn2)[:10]);

In [None]:
# Scatter of saleElapsed against YearMade; alpha=0.01 keeps overlapping points readable.
df_raw.plot('YearMade', 'saleElapsed', 'scatter', alpha=0.01, figsize=(10,8));

In [None]:
# Keep rows with YearMade after 1930, then take a sample of 500 via fastai's get_sample
# (presumably a random sample - confirm).
x_all = get_sample(df_raw[df_raw.YearMade>1930], 500)
# Smoothed SalePrice-vs-YearMade trend (loess) with its confidence band (se=True).
ggplot(x_all, aes('YearMade', 'SalePrice'))+stat_smooth(se=True, method='loess')

In [None]:
# 500 training rows with YearMade after 1930; plot_pdp below reads this module-level `x`.
# Note: this rebinds the name `x` used earlier for the raw_valid copy.
x = get_sample(X_train[X_train.YearMade>1930], 500)

def plot_pdp(feat, clusters=None, feat_name=None):
    """Plot the partial dependence of model ``m`` on ``feat`` over the sample ``x``.

    feat:      feature name (or list of names) passed to pdp.pdp_isolate.
    clusters:  when not None, clustering is switched on with this many centers.
    feat_name: label for the plot; falls back to ``feat`` when falsy.
    Returns whatever pdp.pdp_plot returns.
    """
    if not feat_name:
        feat_name = feat
    isolated = pdp.pdp_isolate(m, x, feat)
    use_clusters = clusters is not None
    return pdp.pdp_plot(
        isolated,
        feat_name,
        plot_lines=True,
        cluster=use_clusters,
        n_cluster_centers=clusters,
    )

# Partial dependence on YearMade, without clustering (clusters defaults to None).
plot_pdp('YearMade')

In [None]:
# Partial dependence on the three Enclosure indicator columns together, with 5 cluster
# centers, labelled 'Enclosure'.
plot_pdp(['Enclosure_EROPS w AC', 'Enclosure_EROPS', 'Enclosure_OROPS'], 5, 'Enclosure')

In [None]:
from treeinterpreter import treeinterpreter as ti
# Raw (unprocessed) values for the df_keep columns, split at the same cut point.
df_train, df_valid = split_vals(df_raw[df_keep.columns], n_trn)
# First validation row; the None index keeps it 2-D (one row by n columns).
# NOTE(review): in top-to-bottom order X_valid (and m) were last built from df_trn2, not
# df_keep, so this row's columns may not match df_keep.columns - confirm the intended frame.
row = X_valid.values[None,0]
row

In [None]:
# treeinterpreter splits the model output for `row` into a bias term plus per-feature
# contributions (presumably prediction = bias + sum of contributions - confirm in its docs).
prediction, bias, contributions = ti.predict(m, row)
prediction[0], bias[0]

In [None]:
# Order features from the most negative to the most positive contribution for this row.
idxs = np.argsort(contributions[0])
# Names and values are taken from X_valid - the frame `row` was sliced from - so they always
# line up with the columns that `m` produced contributions for. Labelling with
# df_keep.columns / df_valid goes out of range whenever X_valid was built from a different
# frame (e.g. df_trn2). Values shown are the processed ones the model actually saw.
list(zip(X_valid.columns[idxs], X_valid.iloc[0].values[idxs], contributions[0][idxs]))

### Introduction to Machine Learning: Lesson 5

In [None]:
df_ext = df_keep.copy()
# Label rows: 0 for the first n_trn (training) rows, 1 for the validation rows.
# A single whole-column assignment replaces the chained `df_ext.is_valid[:n_trn] = 0`,
# which triggers SettingWithCopyWarning and does not write through under copy-on-write.
df_ext['is_valid'] = (np.arange(len(df_ext)) >= n_trn).astype('int64')
x, y, nas = proc_df(df_ext, 'is_valid')

# Classifier that tries to tell validation rows from training rows; a high out-of-bag
# score means the two sets are easy to tell apart.
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(x, y);
m.oob_score_

In [None]:
# Which features the is_valid classifier relies on; show the first ten rows.
fi = rf_feat_importance(m, x)
fi[:10]

In [None]:
# Summary statistics (values divided by 1000) of three features on the training split.
feats=['SalesID', 'saleElapsed', 'MachineID']
(X_train[feats]/1000).describe()

In [None]:
# The same summary on the validation split, for side-by-side comparison.
(X_valid[feats]/1000).describe()

In [None]:
# Remove the three features and refit the is_valid classifier. Rebinding with
# errors='ignore' (instead of an in-place drop) keeps the cell re-runnable: a second run
# no longer raises KeyError for columns that are already gone.
x = x.drop(columns=feats, errors='ignore')
m = RandomForestClassifier(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(x, y);
m.oob_score_

In [None]:
# Importances of the refitted classifier, i.e. the next features that separate the two sets.
fi = rf_feat_importance(m, x)
fi[:10]

In [None]:
# Re-apply the 50000 setting via the fastai helper before the per-feature experiments.
set_rf_samples(50000)
# Candidate features to remove one at a time in the next cell.
# NOTE(review): 'age' is not created anywhere in the visible notebook - confirm it is a
# column of df_keep before relying on it.
feats=['SalesID', 'saleElapsed', 'MachineID', 'age', 'YearMade', 'saleDayofyear']
# Reference score on the full df_keep feature set.
X_train, X_valid = split_vals(df_keep, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Refit without each candidate feature in turn and print the scores.
# Note: the loop rebinds the notebook-level X_train, X_valid and m on every iteration.
for f in feats:
    # A name listed in feats may be absent from df_keep (never engineered, or removed by an
    # earlier cell); report it and carry on instead of aborting the loop with a KeyError.
    if f not in df_keep.columns:
        print(f'{f}: not in df_keep, skipped')
        continue
    df_subs = df_keep.drop(f, axis=1)
    X_train, X_valid = split_vals(df_subs, n_trn)
    m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
    m.fit(X_train, y_train)
    print(f)
    print_score(m)

In [None]:
# reset_rf_samples is a fastai helper; presumably it undoes set_rf_samples - confirm.
reset_rf_samples()
# Drop the three chosen features on a new frame (df_keep stays intact) and refit.
df_subs = df_keep.drop(['SalesID', 'MachineID', 'saleDayofyear'],axis=1)
X_train, X_valid = split_vals(df_subs, n_trn)
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Larger forest (160 trees) with min_samples_leaf left at its default; %time reports how
# long the fit takes.
m = RandomForestRegressor(n_estimators=160, max_features=0.5, n_jobs=-1, oob_score=True)
%time m.fit(X_train, y_train)
print_score(m)

### Random Forest from Scratch 

In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

from fastai.imports import *
from fastai.structured import *
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

from IPython.display import display
from sklearn import metrics

In [None]:
# NOTE(review): PATH is assigned but not read in the visible code; the feather file is
# loaded from a hard-coded relative 'tmp/' location.
PATH = "data/bulldozers/"

# Reload the raw frame and split off the 'SalePrice' target with fastai's proc_df.
df_raw = pd.read_feather('tmp/bulldozers-raw')
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
def split_vals(a, n):
    """Return ``(a[:n], a[n:])``: the first ``n`` items and the remainder."""
    return (a[:n], a[n:])

# Last n_valid rows are the validation set; the rest is training data.
n_valid = 12000
n_trn = len(df_trn)-n_valid

# The same cut point is applied to features, targets and the raw frame.
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
raw_train, raw_valid = split_vals(df_raw, n_trn)
# Two-column subset of the training features (not referenced again in the visible code).
x_sub = X_train[['YearMade', 'MachineHoursCurrentMeter']]

In [None]:
class TreeEnsemble():
    """A collection of DecisionTree objects, each built on a random subset of rows.

    x:         feature frame; rows are selected positionally with ``.iloc``.
    y:         target array, indexed with the same positions.
    n_trees:   how many trees to build.
    sample_sz: number of rows drawn (without replacement) for each tree.
    min_leaf:  forwarded to every DecisionTree.
    """

    def __init__(self, x, y, n_trees, sample_sz, min_leaf=5):
        # Seeds numpy's global generator, so the same row subsets are drawn on every run.
        np.random.seed(42)
        self.x = x
        self.y = y
        self.sample_sz = sample_sz
        self.min_leaf = min_leaf
        grown = []
        for _ in range(n_trees):
            grown.append(self.create_tree())
        self.trees = grown

    def create_tree(self):
        """Build one DecisionTree on ``sample_sz`` rows taken from a random permutation."""
        shuffled = np.random.permutation(len(self.y))
        picked = shuffled[:self.sample_sz]
        return DecisionTree(self.x.iloc[picked], self.y[picked], min_leaf=self.min_leaf)

    def predict(self, x):
        """Average the predictions of all trees (element-wise, along axis 0)."""
        per_tree = [tree.predict(x) for tree in self.trees]
        return np.mean(per_tree, axis=0)



In [None]:
class DecisionTree():
   """Single node of a decision tree over the rows ``idxs`` of ``x`` / ``y``.

   In this version the split search is a stub: ``find_better_split`` does nothing,
   so ``score`` stays at infinity and every node reports itself as a leaf.
   ``var_idx`` and ``split`` are read by ``split_name``, ``split_col`` and
   ``__repr__`` but are not assigned anywhere in this class yet.
   """
   def __init__(self, x, y, idxs=None, min_leaf=5):
       # With no explicit subset, the node covers every row.
       idxs = np.arange(len(y)) if idxs is None else idxs
       self.x = x
       self.y = y
       self.idxs = idxs
       self.min_leaf = min_leaf
       self.n = len(idxs)           # number of rows in this node
       self.c = x.shape[1]          # number of feature columns
       self.val = np.mean(y[idxs])  # node prediction: mean target of its rows
       self.score = float('inf')    # infinity marks "no split found"
       self.find_varsplit()

   # Only a single decision is attempted here; recursion is to be added later.
   def find_varsplit(self):
       for col in range(self.c):
           self.find_better_split(col)

   # Placeholder, to be written later: currently leaves the node untouched.
   def find_better_split(self, var_idx):
       pass

   @property
   def split_name(self):
       return self.x.columns[self.var_idx]

   @property
   def split_col(self):
       return self.x.values[self.idxs, self.var_idx]

   @property
   def is_leaf(self):
       return self.score == float('inf')

   def __repr__(self):
       text = f'n: {self.n}; val:{self.val}'
       if self.is_leaf:
           return text
       return text + f'; score:{self.score}; split:{self.split}; var:{self.split_name}'