# Nonlinear Relationships

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
% matplotlib inline
import pandas as pd

In [None]:
# Load the Boston housing data as an object exposing .data, .target and
# .feature_names, which the cells below read.
try:
    from sklearn.datasets import load_boston
    boston_data = load_boston()
except ImportError:
    # scikit-learn 1.2 removed load_boston. Rebuild the same arrays from the
    # original StatLib file, following the recipe published in scikit-learn's
    # deprecation notice. This path needs network access.
    from sklearn.utils import Bunch
    raw_df = pd.read_csv("http://lib.stat.cmu.edu/datasets/boston",
                         sep=r"\s+", skiprows=22, header=None)
    boston_data = Bunch(
        # Each record is spread over two physical lines in the raw file.
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM',
                                'AGE', 'DIS', 'RAD', 'TAX', 'PTRATIO', 'B',
                                'LSTAT']),
    )

# One column per feature; the target is kept separately (not in df).
df = pd.DataFrame(boston_data.data, columns=boston_data.feature_names)

In [None]:
# Preview the first rows of the feature table.
df.head()

In [None]:
# Regression target taken from the dataset object (plotted below on the 'MEDV' axis).
y = boston_data.target

## Brief Introduction to Decision Tree

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Single-feature design matrix: the double brackets select a one-column frame,
# so X stays two-dimensional rather than becoming a 1-D vector.
X = df[['LSTAT']].values

In [None]:
# Regression tree capped at depth 5.
tree = DecisionTreeRegressor(max_depth=5)

In [None]:
# Fit the tree on the LSTAT-only matrix X against the target y.
tree.fit(X, y)

In [None]:
# Row order that sorts the samples by ascending LSTAT, so the fitted curve
# can be drawn left to right.
sort_idx = np.argsort(X.ravel())

In [None]:
# Observations as a scatter, with the tree's predictions overlaid as a black
# line; both are drawn in ascending-LSTAT order.
fig, ax = plt.subplots(figsize=(10, 8))
x_sorted = X[sort_idx]
ax.scatter(x_sorted, y[sort_idx])
ax.plot(x_sorted, tree.predict(x_sorted), color='k')

ax.set_xlabel('LSTAT')
ax.set_ylabel('MEDV');

***

Using `max_depth` of 5 led to overfitting. Let's try 2 instead.

In [None]:
# Refit with a shallower tree (depth 2) and redraw the same figure.
tree = DecisionTreeRegressor(max_depth=2)
tree.fit(X, y)

# Ascending-LSTAT ordering for plotting.
sort_idx = np.argsort(X.ravel())
x_sorted = X[sort_idx]

fig, ax = plt.subplots(figsize=(10, 8))
ax.scatter(x_sorted, y[sort_idx])
ax.plot(x_sorted, tree.predict(x_sorted), color='k')

ax.set_xlabel('LSTAT')
ax.set_ylabel('MEDV');

## Brief Introduction to Random Forest

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Use every feature column this time (X is rebound from the LSTAT-only matrix).
X = df.values
# y is not rebuilt here: df holds only the feature columns (there is no 'MEDV'
# column), so the target array taken earlier from boston_data.target is reused.

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.3, 
                                                    random_state=42)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# 500-tree random forest, seeded for reproducibility and using all CPU cores.
# The split criterion is left at its default (squared error): the explicit
# 'mse' spelling was renamed to 'squared_error' in scikit-learn 1.0 and is
# rejected from 1.2 on, whereas omitting it behaves the same on every release.
forest = RandomForestRegressor(n_estimators=500,
                               random_state=42, n_jobs=-1)

In [None]:
# Train the forest on the training split only.
forest.fit(X_train, y_train)

In [None]:
# Forest predictions on the rows it was trained on.
y_train_pred = forest.predict(X_train)

In [None]:
# Forest predictions on the held-out rows.
y_test_pred = forest.predict(X_test)

In [None]:
# Mean squared error on the training and held-out splits, to four decimals.
print(
    "MSE train: {0:.4f}, test: {1:.4f}".format(
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred),
    )
)

In [None]:
# Coefficient of determination on the training and held-out splits.
print(
    "R^2 train: {0:.4f}, test: {1:.4f}".format(
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    )
)

## Brief Introduction to AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor

In [None]:
# AdaBoost ensemble of up to 500 depth-4 regression trees, seeded for
# reproducibility. The base learner is passed positionally.
ada = AdaBoostRegressor(DecisionTreeRegressor(max_depth=4), 
                        n_estimators=500, random_state=42)

In [None]:
# Train AdaBoost on the same training split used for the random forest.
ada.fit(X_train, y_train)

In [None]:
# AdaBoost predictions on the training rows. This rebinds y_train_pred,
# replacing the random-forest predictions stored under the same name.
y_train_pred = ada.predict(X_train)

In [None]:
# AdaBoost predictions on the held-out rows. This rebinds y_test_pred,
# replacing the random-forest predictions stored under the same name.
y_test_pred = ada.predict(X_test)

In [None]:
# Mean squared error for the predictions currently held in
# y_train_pred / y_test_pred, to four decimals.
print(
    "MSE train: {0:.4f}, test: {1:.4f}".format(
        mean_squared_error(y_train, y_train_pred),
        mean_squared_error(y_test, y_test_pred),
    )
)

In [None]:
# Coefficient of determination for the predictions currently held in
# y_train_pred / y_test_pred.
print(
    "R^2 train: {0:.4f}, test: {1:.4f}".format(
        r2_score(y_train, y_train_pred),
        r2_score(y_test, y_test_pred),
    )
)

***

# Revisiting Feature Importance

The dataset has 13 features.

Are they all equally important?

Which features are more important?

Can scikit-learn help us with this?

## According to AdaBoost

In [None]:
# Raw importance scores from the fitted AdaBoost model, one per column of X.
ada.feature_importances_

In [None]:
# Feature names, in the same column order as X (X was taken from df.values).
df.columns

In [None]:
# AdaBoost importances indexed by feature name. The single column is labelled
# 'feature' although it holds the importance scores; the cells below sort and
# plot by that label, so it is kept as is.
result = pd.DataFrame({'feature': ada.feature_importances_}, index=df.columns)

In [None]:
# Importances ranked from largest to smallest ('feature' is the score column).
result.sort_values(by='feature', ascending=False)

In [None]:
# Bar chart of the ranked importances; the trailing ';' suppresses the
# text repr of the returned plot object.
result.sort_values(by='feature', ascending=False).plot(kind='bar');

So, according to AdaBoost, LSTAT is the most important feature we have.

***

## According to Random Forest

In [None]:
# Raw importance scores from the fitted random forest, one per column of X.
forest.feature_importances_

In [None]:
# Random-forest importances indexed by feature name (the score column is
# labelled 'feature'), ranked largest first and drawn as a bar chart.
result = pd.DataFrame({'feature': forest.feature_importances_}, index=df.columns)
ranked = result.sort_values(by='feature', ascending=False)
ranked.plot.bar();

***