## Introduction to Machine Learning: Lesson 1 

### Importing necessary libraries 

In [None]:
%load ext_autoreload
%autoreload 2
%matplotlib inline
from fastai.imports import*
from fastai.structured import *
from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import display
from sklearn import metrics

### Downloading the Dataset 

In [None]:
# Directory holding the bulldozers dataset. The trailing slash matters because
# file names are appended directly to it (e.g. f'{PATH}Train.csv').
# Plain ASCII quotes are required; typographic quotes are a syntax error.
PATH = "data/bulldozers/"

In [None]:
# List the files in the dataset directory via a shell command.
!ls data/bulldozers/

Or, equivalently, by interpolating the `PATH` variable into the shell command:

# Same listing, with the directory taken from the Python variable PATH.
!ls {PATH}

### Reading the files 

In [None]:
# Load the training data from PATH; the "saledate" column is parsed as datetimes.
df_raw = pd.read_csv(f'{PATH}Train.csv', low_memory=False, parse_dates=["saledate"])

In [None]:
# Preview the first rows of the raw frame.
df_raw.head()

In [None]:
def display_all(df):
    """Display `df` with pandas' row and column display limits temporarily raised to 1000."""
    wide_view = ("display.max_rows", 1000, "display.max_columns", 1000)
    # The options only apply inside the context manager and are restored afterwards.
    with pd.option_context(*wide_view):
        display(df)

In [None]:
# Transpose the preview so every column of df_raw is shown as a row.
display_all(df_raw.head().transpose())

In [None]:
# Replace SalePrice with its natural log.
# NOTE: this overwrites the column, so re-running this cell applies the log a second time.
df_raw.SalePrice = np.log(df_raw.SalePrice)

### Introduction to Random Forest 

In [None]:
# First attempt: fit directly on the unprocessed frame. At this point df_raw still
# contains the parsed datetime column and any columns that have not been converted
# yet, so the fit is expected to fail; the error is caught and shown so the notebook
# keeps running under "Run All". The preprocessing cells that follow address this.
m = RandomForestRegressor(n_jobs=-1)
try:
    m.fit(df_raw.drop('SalePrice', axis=1), df_raw.SalePrice)
except (ValueError, TypeError) as err:
    print(f'Fitting on the raw data failed: {err}')

### Data Preprocessing 

In [None]:
# add_datepart is a fastai helper; presumably it replaces 'saledate' with derived
# date-part columns, modifying df_raw in place — verify against fastai.structured.
add_datepart(df_raw, 'saledate')
# Show the column names after the call.
df_raw.columns

In [None]:
# train_cats is a fastai helper; presumably it converts string columns of df_raw
# to pandas categoricals in place — verify against fastai.structured.
train_cats(df_raw)

In [None]:
# Give UsageBand an explicit category order, following the listed sequence.
# The result is assigned back instead of passing inplace=True: that argument was
# deprecated in pandas 1.3 and removed in 2.0, while assignment works on every version.
df_raw.UsageBand = df_raw.UsageBand.cat.set_categories(['High', 'Medium', 'Low'], ordered=True)

### Missing Value Treatment 

In [None]:
# Fraction of missing values in each column, ordered by column name.
display_all(df_raw.isnull().sum().sort_index()/len(df_raw))

In [None]:
# Save the frame built so far. The frame to save is df_raw; `df` is not defined
# until proc_df runs in a later cell.
os.makedirs('tmp', exist_ok=True)
df_raw.to_feather('tmp/bulldozers-raw')

# Read the saved frame back.
df_raw = pd.read_feather('tmp/bulldozers-raw')

In [None]:
# proc_df is a fastai helper. It returns three values, used below as the feature
# frame (df), the target (y) and `nas`; presumably 'SalePrice' names the target
# column that is split out of df_raw — verify against fastai.structured.
df, y, nas = proc_df(df_raw, 'SalePrice')

### Model Building 

In [None]:
# Fit on every row and score on those same rows: this is a training-set score,
# not an estimate of how the model performs on unseen data.
m = RandomForestRegressor(n_jobs=-1)
m.fit(df, y)
m.score(df,y)

In [None]:
def split_vals(a,n):
    """Split `a` at position `n`; return copies of the first `n` items and of the rest."""
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

# Hold out the last n_valid rows as the validation set; all earlier rows are training data.
n_valid = 12000  # same as Kaggle's test set size
n_trn = len(df)-n_valid
# Split the raw frame, the processed features and the target at the same position.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

# Show the resulting shapes as the cell output.
X_train.shape, y_train.shape, X_valid.shape

In [None]:
# Root-mean-squared error helper used when scoring models.
def rmse(x,y):
    """Return the square root of the mean squared difference between `x` and `y`."""
    squared_error = (x - y) ** 2
    return math.sqrt(squared_error.mean())

In [None]:
def print_score(m):
    """Print a list of scores for the fitted model `m`.

    The list holds, in order: training RMSE, validation RMSE, `m.score` on the
    training data and `m.score` on the validation data, followed by
    `m.oob_score_` when the model has that attribute.

    Reads X_train, y_train, X_valid and y_valid from the notebook's global namespace.
    """
    train_rmse = rmse(m.predict(X_train), y_train)
    valid_rmse = rmse(m.predict(X_valid), y_valid)
    train_score = m.score(X_train, y_train)
    valid_score = m.score(X_valid, y_valid)
    res = [train_rmse, valid_rmse, train_score, valid_score]
    # oob_score_ only exists on models fitted with oob_score=True.
    if hasattr(m, 'oob_score_'):
        res.append(m.oob_score_)
    print(res)

In [None]:
# Fit on the training split only, timing the fit, then report train/validation scores.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

## Introduction to Machine Learning: Lesson 2 

### Creating a Validation set 

In [None]:
def split_vals(a,n):
    """Split `a` at position `n`; return copies of the first `n` items and of the rest."""
    head = a[:n].copy()
    tail = a[n:].copy()
    return head, tail

# Hold out the last n_valid rows as the validation set; all earlier rows are training data.
n_valid = 12000  
n_trn = len(df)-n_valid
# Split the raw frame, the processed features and the target at the same position.
raw_train, raw_valid = split_vals(df_raw, n_trn)
X_train, X_valid = split_vals(df, n_trn)
y_train, y_valid = split_vals(y, n_trn)

In [None]:
# Baseline: default random forest on the training split, timed, then scored.
m = RandomForestRegressor(n_jobs=-1)
%time m.fit(X_train, y_train)
print_score(m)

In [None]:
# Build a smaller training set for fast experimentation.
# proc_df is given the raw frame, as in the other proc_df calls in this notebook:
# `df` is the feature frame proc_df already returned, with the target split out
# into `y`, so it is not a valid input for extracting 'SalePrice' again.
# subset=30000 presumably draws a 30,000-row sample — verify against fastai.structured.
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice', subset=30000)
# Keep the first 20,000 rows of the subset for training; X_valid / y_valid are left unchanged.
X_train, _ = split_vals(df_trn, 20000)
y_train, _ = split_vals(y_trn, 20000)

### Building a single tree

In [None]:
# A "forest" of exactly one tree, limited to depth 3 and with bootstrapping turned
# off, so the tree is small enough to inspect.
m = RandomForestRegressor(n_estimators=1, max_depth=3, bootstrap=False, n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Visualise the forest's only tree. draw_tree is a fastai helper; presumably df_trn
# supplies the feature names and precision=3 controls number formatting — TODO confirm.
draw_tree(m.estimators_[0], df_trn, precision=3)

### Introduction to Bagging 

In [None]:
# Back to a full forest with scikit-learn's default settings (n_estimators is left
# at the library default, which differs between scikit-learn versions).
m = RandomForestRegressor(n_jobs=-1)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# Collect each individual tree's validation predictions; stacking along the first
# axis gives one row per tree.
preds = np.stack([t.predict(X_valid) for t in m.estimators_])

In [None]:
# Shape of the stacked predictions (first dimension: one entry per tree).
preds.shape

In [None]:
# For the first validation row: every tree's prediction, their mean, and the actual target.
preds[:,0], np.mean(preds[:,0]), y_valid[0]

In [None]:
# Validation R² of the averaged prediction as trees are added one at a time.
# Point i averages the first i+1 trees. The range follows the number of trees
# actually present in `preds` instead of assuming a 10-tree forest.
plt.plot([metrics.r2_score(y_valid, np.mean(preds[:i+1], axis=0)) for i in range(len(preds))])
plt.xlabel('index of last tree included in the average')
plt.ylabel('validation R²');

### Out-of-Bag (OOB) Score 

In [None]:
# 40 trees with out-of-bag scoring enabled; print_score appends m.oob_score_
# to its output when the model has that attribute.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

### Subsampling 

In [None]:
# Re-process the full raw frame and redo the train/validation split at n_trn
# (X_train / y_train were cut down to 20,000 rows in an earlier cell).
df_trn, y_trn, nas = proc_df(df_raw, 'SalePrice')
X_train, X_valid = split_vals(df_trn, n_trn)
y_train, y_valid = split_vals(y_trn, n_trn)
# set_rf_samples is a fastai helper; presumably it makes each tree train on a
# random sample of 20,000 rows — verify against fastai.structured.
set_rf_samples(20000)

In [None]:
# 40-tree forest with OOB scoring, fitted after the set_rf_samples call above.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

In [None]:
# reset_rf_samples is a fastai helper; presumably it undoes set_rf_samples so
# trees use the default sampling again — TODO confirm.
reset_rf_samples()
# Same 40-tree configuration, refitted for comparison.
m = RandomForestRegressor(n_estimators=40, n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

### Other Hyperparameters to Experiment with and Tune 

### Minimum samples per leaf (`min_samples_leaf`)

In [None]:
# Same forest, but every leaf must contain at least 3 training samples.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3,n_jobs=-1, oob_score=True)
m.fit(X_train, y_train)
print_score(m)

### Maximum features per split (`max_features`)

In [None]:
# Additionally consider only half of the features (max_features=0.5) at each split.
m = RandomForestRegressor(n_estimators=40, min_samples_leaf=3, max_features=0.5, n_jobs=-1, oob_score=True) 
m.fit(X_train, y_train)
print_score(m)