# Supervised Learning

## Non-parametric models

### Tree methods

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import scipy.stats
import matplotlib.pyplot as plt
%matplotlib inline

&nbsp;

Preprocessing steps of the dataset from file `homes.csv`     

In [None]:
# Path to the raw CSV file, relative to the notebook's working directory.
filepath = 'data/homes.csv'

# Read the file: fields are comma-separated and the first row holds the column names.
dataset = pd.read_csv(
    filepath,
    sep = ',',
    header = 0,
)

In [None]:
# Columns that are removed from the modelling table.
unused_columns = ['id', 'date', 'sqft_living15', 'lat', 'long', 'sqft_lot15', 'zipcode']
dataset = dataset.drop(unused_columns, axis = 1)

In [None]:
# Show the (rows, columns) dimensions of the table after the column drop.
dataset.shape

In [None]:
# Show the dtype of each remaining column.
dataset.dtypes

In [None]:
# Re-express both year columns as "years before 2015".
reference_year = 2015
dataset.yr_built = reference_year - dataset.yr_built

# A `yr_renovated` value of 0 is kept as 0 instead of being converted
# (presumably 0 marks a house that was never renovated -- TODO confirm).
renovated = dataset.yr_renovated
dataset.yr_renovated = renovated.mask(renovated != 0, reference_year - renovated)

In [None]:
# Reorder the columns: the first six stay in place, the last four are moved
# directly behind them, and the columns at positions 6-9 go to the end.
# NOTE: the reorder is positional, so running this cell twice changes the order again.
all_cols = dataset.columns.tolist()
leading, middle, trailing = all_cols[:6], all_cols[6:10], all_cols[-4:]
new_cols = leading + trailing + middle
dataset = dataset[new_cols]

In [None]:
# Preview the first ten rows to check the new column order.
dataset.head(10)

&nbsp;

We know from workshop_1 that the variables `condition` and `grade` start at level `1`. 

To avoid the additional `LabelEncoder()` step, we can simply subtract 1 from every element of these two columns so that their levels start at `0`. 

In [None]:
# Shift every `condition` level down by one (vectorised over the whole column).
dataset.condition = dataset.condition - 1

In [None]:
# Shift every `grade` level down by one (vectorised over the whole column).
dataset.grade = dataset.grade - 1

In [None]:
# Preview the first ten rows to check the shifted `condition` and `grade` values.
dataset.head(10)

In [None]:
# Convert the four discrete-valued columns to the pandas `category` dtype in one call.
categorical_columns = ['waterfront', 'view', 'grade', 'condition']
dataset = dataset.astype({name: 'category' for name in categorical_columns})

In [None]:
# Confirm that the four converted columns now report the `category` dtype.
dataset.dtypes

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Column 0 is the target; every other column is a feature.
features = dataset.iloc[:, 1:]
target = dataset.iloc[:, 0]

# Hold out 20% of the rows for testing; the seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size = .2,
    random_state = 43,
)

In [None]:
# One-hot encode the categorical columns of `dataset`.
#
# The columns are selected by name instead of through
# `OneHotEncoder(categorical_features = [9,10,11,12])`:
#   * the `categorical_features` argument is no longer accepted by current
#     scikit-learn releases;
#   * the positional indices look like they were counted on the feature matrix
#     rather than on `dataset`, whose column 0 is the target, which would shift
#     the selection by one column -- NOTE(review): confirm against the column order.
categorical_cols = ['waterfront', 'view', 'condition', 'grade']

enc = OneHotEncoder()
# The encoder returns a sparse matrix by default; densify it so it can be stacked.
encoded_part = enc.fit_transform(dataset[categorical_cols]).toarray()

# All remaining columns (the target included) are kept unchanged and placed
# to the right of the dummy columns.
passthrough_part = dataset.drop(categorical_cols, axis = 1).values

dataset_ = np.hstack([encoded_part, passthrough_part])
dataset_

&nbsp;

&nbsp;


# Tree Regression  

<a href = 'http://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html'>link</a>

`DecisionTreeRegressor(criterion='mse', 
                        max_depth=None, 
                        min_samples_split=2, 
                        min_samples_leaf=1, 
                        max_features=None, 
                        random_state=None, 
                        min_impurity_decrease=0.0)`
                        
                        
`criterion`: `mse` or `mae` (named `squared_error` and `absolute_error` from scikit-learn 1.0 onwards)      
`max_depth`: the maximum depth of a tree. If not specified, nodes are expanded until all leaves are pure or contain fewer than `min_samples_split` samples      
`min_samples_split`: the minimum number of samples required to split an internal node     
`min_samples_leaf`: the minimum number of samples required to be at a leaf node    
`max_features`: the number of features to consider when looking for the best split    
`random_state`: seed of the random number generator; fix it to make the fitted tree reproducible      
`min_impurity_decrease`: a node will be split if this split induces a decrease of the impurity greater than or equal to this value

&nbsp;


In [None]:
# Regression tree limited to depth 10; a node needs at least 70 samples to be split.
#
# `criterion` is left at its default, which is mean squared error in every
# scikit-learn version. The literal 'mse' is rejected by recent releases
# (where it is spelled 'squared_error').
#
# `random_state` is fixed so that refitting reproduces the same tree: ties
# between equally good splits are otherwise broken at random.
tree = DecisionTreeRegressor(splitter = 'best',
                             max_depth = 10,
                             min_samples_split = 70,
                             random_state = 43)

In [None]:
# The tree is trained on the natural log of the target, not on the raw target.
log_y_train = np.log(y_train)
tree.fit(x_train, log_y_train)

In [None]:
# In-sample predictions. The tree was fitted on log(y_train), so these values
# are on the log scale.
y_train_pred_tree = tree.predict(x_train)

In [None]:
# Bokeh provides the interactive scatter plots used for the residual diagnostics.
from bokeh.io import output_notebook, show
from bokeh.plotting import figure
# Configure Bokeh to render figures inside the notebook.
output_notebook()

In [None]:
# Training residuals of the tree, plotted in sample order.
#
# The tree was fitted on log(y_train), so its predictions are already on the
# log scale and the residual is log(y_train) - prediction. Taking the log of
# `y_train - prediction` would subtract a log-scale value from a raw one.
residuals_tree = np.log(y_train) - y_train_pred_tree

p = figure(plot_width = 800, plot_height = 400,
           x_axis_label = 'training sample',
           y_axis_label = 'residual (log scale)')
p.scatter(range(x_train.shape[0]), residuals_tree, color = 'purple')
show(p)

In [None]:
# Residuals against fitted values for the tree.
#
# Both axes are on the log scale the tree was trained on: the fitted values are
# used as returned by `predict` (no second log), and the residual is
# log(y_train) - prediction rather than the log of a raw-minus-log difference.
residuals_tree = np.log(y_train) - y_train_pred_tree

p = figure(plot_width = 700, plot_height = 400,
           x_axis_label = 'fitted value (log scale)',
           y_axis_label = 'residual (log scale)')
p.scatter(y_train_pred_tree, residuals_tree, color = 'purple')
show(p)

In [None]:
# Raw importance scores of the fitted tree, one per feature column.
tree.feature_importances_

In [None]:
# Importance table for the tree: one row per feature, labelled with the
# column names after the target, ordered from least to most important.
df_tree = (
    pd.DataFrame({'value': tree.feature_importances_}, index = dataset.columns[1:])
    .sort_values(by = 'value', ascending = True)
)

In [None]:
# Horizontal bar chart of the tree's feature importances (tall, narrow canvas).
fig, ax = plt.subplots(figsize = (5, 8))
ax.barh(y = df_tree.index, width = df_tree.value)

In [None]:
# Predictions of the tree on the held-out test features.
y_pred_tree = tree.predict(x_test)

In [None]:
# Display the test-set predictions of the tree.
y_pred_tree

In [None]:
# Test-set mean squared error of the tree, measured against the log of the
# test target so that both arguments are on the same scale.
log_y_test = np.log(y_test)
mean_squared_error(log_y_test, y_pred_tree)

&nbsp;

&nbsp;

# Random Forest Regression

`RandomForestRegressor(n_estimators=10, 
                        criterion='mse', 
                        max_depth=None, 
                        min_samples_split=2, 
                        min_samples_leaf=1, 
                        max_features='auto', 
                        max_leaf_nodes=None, 
                        min_impurity_decrease=0.0, 
                        min_impurity_split=None, 
                        bootstrap=True, 
                        n_jobs=1, 
                        random_state=None, 
                        verbose=0)`
                        
`n_estimators`: number of trees in the forest    
`criterion`: `mse` or `mae` (named `squared_error` and `absolute_error` from scikit-learn 1.0 onwards)      
`max_depth`: the maximum depth of a tree. If not specified, nodes are expanded until all leaves are pure or contain fewer than `min_samples_split` samples      
`min_samples_split`: the minimum number of samples required to split an internal node     
`min_samples_leaf`: the minimum number of samples required to be at a leaf node    
`max_features`: the number of features to consider when looking for the best split    
`random_state`: seed of the random number generator; fix it to make the fitted forest reproducible      
`min_impurity_decrease`: a node will be split if this split induces a decrease of the impurity greater than or equal to this value      
`n_jobs`: the number of jobs to run in parallel for both fit and predict     

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Random forest of 150 shallow trees (depth <= 6); a node needs at least 20
# samples to be split, and each tree is grown on a bootstrap sample.
#
# `criterion` is left at its default, which is mean squared error in every
# scikit-learn version. The literal 'mse' is rejected by recent releases
# (where it is spelled 'squared_error').
#
# `random_state` is fixed because bootstrapping makes the forest stochastic:
# without a seed, importances and errors change on every run.
rf = RandomForestRegressor(n_estimators = 150,
                           max_depth = 6,
                           min_samples_split = 20,
                           bootstrap = True,
                           n_jobs = 10,
                           random_state = 43)

In [None]:
# The forest is trained on the natural log of the target, not on the raw target.
log_y_train = np.log(y_train)
rf.fit(x_train, log_y_train)

In [None]:
# Raw importance scores of the fitted forest, one per feature column.
rf.feature_importances_

In [None]:
# Importance table for the forest: one row per feature, labelled with the
# column names after the target, ordered from least to most important.
df_rf = (
    pd.DataFrame({'value': rf.feature_importances_}, index = dataset.columns[1:])
    .sort_values(by = 'value', ascending = True)
)

In [None]:
# Horizontal bar chart of the forest's feature importances (tall, narrow canvas).
fig, ax = plt.subplots(figsize = (5, 8))
ax.barh(y = df_rf.index, width = df_rf.value)

In [None]:
# In-sample predictions. The forest was fitted on log(y_train), so these values
# are on the log scale.
y_train_pred_rf = rf.predict(x_train)

In [None]:
# Training residuals of the forest, plotted in sample order.
#
# The forest was fitted on log(y_train), so its predictions are already on the
# log scale and the residual is log(y_train) - prediction. Taking the log of
# `y_train - prediction` would subtract a log-scale value from a raw one.
residuals_rf = np.log(y_train) - y_train_pred_rf

p = figure(plot_width = 800, plot_height = 400,
           x_axis_label = 'training sample',
           y_axis_label = 'residual (log scale)')
p.scatter(range(x_train.shape[0]), residuals_rf, color = 'purple')
show(p)

In [None]:
# Residuals against fitted values for the forest.
#
# The x-axis uses the forest's own predictions (`y_train_pred_rf`), not the
# predictions of the single tree. Both axes are on the log scale the forest was
# trained on: the fitted values are used as returned by `predict` (no second
# log), and the residual is log(y_train) - prediction.
residuals_rf = np.log(y_train) - y_train_pred_rf

p = figure(plot_width = 700, plot_height = 400,
           x_axis_label = 'fitted value (log scale)',
           y_axis_label = 'residual (log scale)')
p.scatter(y_train_pred_rf, residuals_rf, color = 'purple')
show(p)

In [None]:
# Predictions of the forest on the held-out test features.
y_pred_rf = rf.predict(x_test)

In [None]:
# Test-set mean squared error of the forest, measured against the log of the
# test target so that both arguments are on the same scale.
log_y_test = np.log(y_test)
mean_squared_error(log_y_test, y_pred_rf)