In [None]:
from IPython.core.interactiveshell import InteractiveShell
# Ask IPython to display the value of every top-level expression in a cell,
# not only the last one.  This is why plotting cells below end statements with
# ';' to keep matplotlib return values out of the output.
InteractiveShell.ast_node_interactivity = "all"

![NASA](http://www.nasa.gov/sites/all/themes/custom/nasatwo/images/nasa-logo.svg)

<center>
<h1><font size="+3">NCCS Training Course Series</font></h1>
</center>

---

<center>
    <h1><font color="red">Machine Learning with Scikit-Learn</font></h1>
</center>

## Useful Links

- <a href="https://scikit-learn.org/stable/tutorial/index.html">scikit-learn Tutorials</a>
- <a href="https://medium.com/@amitg0161/sklearn-linear-regression-tutorial-with-boston-house-dataset-cde74afd460a">Sklearn Linear Regression Tutorial with Boston House Dataset</a>



## <font color="red">Scikit-Learn</font>

- Scikit-learn is a free machine learning library for Python. 
- Features various algorithms like support vector machine, random forests, and k-neighbours.
- Supports Python numerical and scientific libraries like NumPy and SciPy.

![FIG_AXES](https://scikit-learn.org/stable/_static/ml_map.png)
Image Source: scikit-learn.org

## Package Requirements

- Numpy
- scipy
- matplotlib
- pandas
- scikit-learn
- seaborn

In [None]:
%matplotlib inline
import numpy as np
import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns

import pandas as pd

import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics

## <font color="red">Boston Dataset</font>
- Contains information about different houses in Boston.
- There are 506 samples and 13 feature variables in this dataset. 
- Maintained at Carnegie Mellon University.
- <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/housing/">This is a copy of UCI ML housing dataset</a>.

We want to predict house prices using the given features.

### Obtain the Dataset

In [None]:
# Load the Boston housing data as a Bunch exposing .data, .target,
# .feature_names and .DESCR (the attributes used by the rest of the notebook).
try:
    # Available only in older scikit-learn releases.
    from sklearn.datasets import load_boston
    boston = load_boston()
except ImportError:
    # load_boston was removed in scikit-learn 1.2.  Rebuild an equivalent
    # Bunch from the original StatLib copy of the data, where each record
    # is spread over two physical lines after a 22-line header.
    from sklearn.utils import Bunch

    _boston_url = "http://lib.stat.cmu.edu/datasets/boston"
    _raw = pd.read_csv(_boston_url, sep=r"\s+", skiprows=22, header=None)
    boston = Bunch(
        # Even rows hold the first 11 features, odd rows the last 2 + target.
        data=np.hstack([_raw.values[::2, :], _raw.values[1::2, :2]]),
        target=_raw.values[1::2, 2],
        # Kept as an ndarray so it can be fancy-indexed later on.
        feature_names=np.array(["CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE",
                                "DIS", "RAD", "TAX", "PTRATIO", "B", "LSTAT"]),
        DESCR="Boston house prices dataset (506 samples, 13 features), "
              "loaded from " + _boston_url,
    )

In [None]:
# Print the description text bundled with the dataset object.
print(boston.DESCR)

### Features of the Dataset

In [None]:
# List the fields exposed by the loaded dataset object.
print("Keys: ", boston.keys())

In [None]:
# Dimensions of the feature array.
print("Shape: ", boston.data.shape)

In [None]:
# Column acronyms; their meaning is given in the attribute table below.
print("Feature Names: ", boston.feature_names)

#### Attribute Information:
| Acronym | Description |
| --- | --- |
| **CRIM** |    Per capita crime rate by town |
|**ZN** |   Proportion of residential land zoned for lots over 25,000 sq.ft. |
| **INDUS** | Proportion of non-retail business acres per town |
| **CHAS** |  Charles River dummy variable (= 1 if tract bounds river; 0 otherwise) |
| **NOX** |  Nitric oxides concentration (parts per 10 million) |
| **RM** |    Average number of rooms per dwelling |
| **AGE** |   Proportion of owner-occupied units built prior to 1940 |
| **DIS** |  weighted distances to five Boston employment centres |
| **RAD** |   index of accessibility to radial highways |
| **TAX** |  full-value property-tax rate per \$10,000 |
| **PTRATIO** |  pupil-teacher ratio by town |
| **B** |       1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town |
| **LSTAT** |    % lower status of the population |
| **MEDV** |    Median value of owner-occupied homes in $1000's |

## <font color="red">Extract Data</font>

**Pass the data into a Pandas dataframe**

In [None]:
# Wrap the raw feature array in a DataFrame; no column names are supplied yet
# (they are assigned in the next step).
bos_pd = pd.DataFrame(boston.data)
# Preview the first rows.
bos_pd.head()

#### Relabel the columns using the Boston dataset feature names

In [None]:
# Replace the default column labels with the dataset's feature acronyms.
bos_pd.columns = boston.feature_names
bos_pd.head()

#### Add home prices to the Pandas dataframe

In [None]:
# Peek at the first five target values.
boston.target[:5]

In [None]:
# The target array's shape, to compare against the feature array's row count.
print("Shape of the target data: ", boston.target.shape)

In [None]:
# Append the target as a new 'PRICE' column (mutates bos_pd in place).
bos_pd['PRICE']=boston.target
bos_pd.head()

## <font color="red">Data Pre-Processing</font>

#### Check Missing Values
It is a good practice to see if there are any missing values in the data. 

In [None]:
# Per-column tally of missing entries (all zeros means nothing to impute).
bos_pd.isna().sum(axis=0)

#### Obtain basic statistics on the data

In [None]:
# Bare expression: render the DataFrame with its rich notebook representation.
bos_pd

In [None]:
# Summary statistics, flipped so each feature is a row and each statistic a column.
bos_pd.describe().T

## <font color="red">Exploratory Data Analysis</font>

- Important step before training the model. 
- We use visualizations to understand the relationship of the target variable with other features.

#### Distribution of the target variable

In [None]:
# Histogram of the target variable, drawn through the explicit Axes interface.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(bos_pd['PRICE'])
ax.set_title('Boston Housing Prices and Count Histogram')
ax.set_xlabel('price ($1000s)')
ax.set_ylabel('count')
plt.show();

In [None]:
# Histogram plus density estimate of the target on an explicitly created Axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.distplot(bos_pd['PRICE'], ax=ax)
plt.show();

From the above output we can see that the values of PRICE are approximately normally distributed, with a few outliers.

#### Heatmap: Two-Dimensional Graphical Representation
- Represent the individual values that are contained in a matrix as colors.
- Create a correlation matrix that measures the linear relationships between the variables.

In [None]:
# Pairwise correlations, rounded to two decimals so the cell annotations stay readable.
correlation_matrix = bos_pd.corr().round(2)

fig, ax = plt.subplots(figsize=(12, 9))
sns.heatmap(correlation_matrix, annot=True, cmap="YlGnBu", ax=ax)
plt.show();

- **RM** has a strong positive correlation with **PRICE** (0.7), whereas **LSTAT** has a strong negative correlation with **PRICE** (-0.74).
- The features **RAD**, **TAX** have a correlation of 0.91. These feature pairs are strongly correlated to each other. This can affect the model. Same goes for the features **DIS** and **AGE** which have a correlation of -0.75.
- The predictor variables such as **CRIM**, **INDUS**, **NOX**, **AGE**, **RAD**, **TAX**, **PTRATIO**, **LSTAT** have a negative correlation with the target. An increase in any of them is associated with a decrease in the price of the house.
- The predictor variables such as **ZN**, **RM**, **DIS**, **B** have good positive correlation with the target. Increase in any of them leads to the increase in the price of the house.

In [None]:
# One small scatter plot per feature against the target price.
for feature_name in boston.feature_names:
    fig, ax = plt.subplots(figsize=(5, 4))
    ax.scatter(bos_pd[feature_name], bos_pd['PRICE'])
    ax.set_xlabel(feature_name, size=12)
    ax.set_ylabel('Price', size=12)
# Render all the figures created above.
plt.show();

- The prices increase as the value of RM increases linearly. There are few outliers and the data seems to be capped at 50.
- The prices tend to decrease with an increase in LSTAT. Though it doesn’t look to be following exactly a linear line.

Based on the above observations we will plot an `lmplot` between **RM** and **PRICE** to see the relationship between the two more clearly.

In [None]:
# seaborn lmplot of PRICE against RM; the trailing ';' hides the returned object.
sns.lmplot(x = 'RM', y = 'PRICE', data = bos_pd);

## <font color="red">Simple Linear Model</font>
- It is difficult to visualize the multiple features.
- We want to predict the house price with just one variable and then move to the regression with all features.
- Because **RM** shows positive correlation with the **House Prices**, we will use **RM** for the model.

In [None]:
# Copy the single predictor and the target out of the DataFrame as column
# vectors: a trailing axis turns each 1-D array into shape (n_samples, 1).
X_rooms = np.array(bos_pd['RM'])[:, np.newaxis]
y_price = np.array(bos_pd['PRICE'])[:, np.newaxis]

print(X_rooms.shape, y_price.shape, sep="\n")

#### Splitting the data into training and testing sets
- We split the data into training and testing sets. 
- We train the model with 80% of the samples and test with the remaining 20%. 
- We do this to assess the model’s performance on unseen data.

In [None]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split repeatable.
X_train_1, X_test_1, Y_train_1, Y_test_1 = train_test_split(
    X_rooms, y_price, test_size=0.2, random_state=5
)

# Shapes in the order: train X, train y, test X, test y.
print(X_train_1.shape, Y_train_1.shape, X_test_1.shape, Y_test_1.shape, sep="\n")

#### Training and testing the model
- We use scikit-learn’s LinearRegression to train our model on the training set and then check it on the test set.
- We check the model performance on the train dataset.

In [None]:
# Fit the single-feature model on the training split.
reg_1 = LinearRegression()
reg_1.fit(X_train_1, Y_train_1)

# In-sample fit quality: RMSE from the predictions, R^2 from the estimator itself.
y_train_predict_1 = reg_1.predict(X_train_1)
rmse = np.sqrt(metrics.mean_squared_error(Y_train_1, y_train_predict_1))
r2 = round(reg_1.score(X_train_1, Y_train_1), 2)

# Emit the report in a single call; the final "\n" entry reproduces the blank lines.
print("\n".join([
    "The model performance for training set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    "\n",
]))

#### Model Evaluation for Test Set

In [None]:
# Score the single-feature model on the held-out test split.
y_pred_1 = reg_1.predict(X_test_1)
rmse = np.sqrt(metrics.mean_squared_error(Y_test_1, y_pred_1))
r2 = round(reg_1.score(X_test_1, Y_test_1), 2)

# These are test-set metrics, so the header says so (it previously read
# "training set").
print("The model performance for test set")
print("----------------------------------")
print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))

In [None]:
# Coefficient of determination (R^2) on the test split, to four decimals;
# a value of 1 would be a perfect prediction.
print('Coefficient of determination: {:.4f}'.format(metrics.r2_score(Y_test_1, y_pred_1)))

#### 45-Degree Plot

In [None]:
# Actual vs. predicted prices for the single-feature model.  Points on the
# dashed diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(Y_test_1, y_pred_1)
ax.plot([0, 50], [0, 50], '--k')
ax.axis('tight')
ax.set_xlabel("Actual House Prices ($1000)")
ax.set_ylabel("Predicted House Prices: ($1000)")
ax.set_title("Actual Prices vs Predicted prices")
fig.tight_layout();

## <font color="red">Linear Regression Model with All Variables</font>
- We want to create a model considering all the features in the dataset.

### Create the Model

In [None]:
# Target column on one side, every remaining column as predictors on the other.
y = bos_pd['PRICE']
X = bos_pd.drop(columns='PRICE')

# 80/20 split with a fixed seed so the evaluation below is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least squares on all features.
reg_all = LinearRegression()
reg_all.fit(X_train, y_train)

### Model Evaluation for Training Set

In [None]:
# In-sample fit quality of the all-features model.
y_train_predict = reg_all.predict(X_train)
rmse = np.sqrt(metrics.mean_squared_error(y_train, y_train_predict))
r2 = round(reg_all.score(X_train, y_train), 2)

# Emit the report in a single call; the final "\n" entry reproduces the blank lines.
print("\n".join([
    "The model performance for training set",
    "--------------------------------------",
    'RMSE is {}'.format(rmse),
    'R2 score is {}'.format(r2),
    "\n",
]))

### Model Evaluation for Test Set

In [None]:
# Score the all-features model on the held-out test split.
y_pred = reg_all.predict(X_test)
rmse = np.sqrt(metrics.mean_squared_error(y_test, y_pred))
r2 = round(reg_all.score(X_test, y_test), 2)

# These are test-set metrics, so the header says so (it previously read
# "training set").
print("The model performance for test set")
print("----------------------------------")
print("Root Mean Squared Error: {}".format(rmse))
print("R^2: {}".format(r2))

In [None]:
# Coefficient of determination (R^2) on the test split, to four decimals;
# a value of 1 would be a perfect prediction.
print('Coefficient of determination: {:.4f}'.format(metrics.r2_score(y_test, y_pred)))

#### Error Distribution

In [None]:
# Distribution of the test residuals (actual minus predicted).
sns.distplot(y_test - y_pred);

#### 45-Degree Plot

In [None]:
# Actual vs. predicted prices for the all-features model.  Points on the
# dashed diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(y_test, y_pred)
ax.plot([0, 50], [0, 50], '--k')
ax.axis('tight')
ax.set_xlabel("Actual House Prices ($1000)")
ax.set_ylabel("Predicted House Prices: ($1000)")
ax.set_title("Actual Prices vs Predicted prices")
fig.tight_layout();

In [None]:
# Root-mean-square of the test residuals, computed by hand with NumPy.
print("RMS: %r " % np.sqrt(np.mean((y_test - y_pred) ** 2)))

In [None]:
# Side-by-side table of actual and predicted prices for the test split.
df1 = pd.DataFrame(data={'Actual': y_test, 'Predicted': y_pred})
# Keep only the first ten rows for a compact display and the bar chart below.
df2 = df1.iloc[:10]
df2

In [None]:
# Grouped bars: actual vs. predicted for the ten rows selected above.
df2.plot(kind='bar');

## <font color="red">Choosing the Best Model:</font> k-Fold Cross-Validation

- Cross-validation is a resampling procedure used to evaluate machine learning models on a limited data sample.
- It is primarily used in applied machine learning to estimate the skill of a machine learning model on unseen data.
- We use a limited sample in order to estimate how the model is expected to perform in general when used to make predictions on data not used during the training of the model.

The general procedure is as follows:

1. Shuffle the dataset randomly.
2. Split the dataset into **k** groups
3. For each unique group:
       3.1 Take the group as a hold out or test data set
       3.2 Take the remaining groups as a training data set
       3.3 Fit a model on the training set and evaluate it on the test set
       3.4 Retain the evaluation score and discard the model
4. Summarize the skill of the model using the sample of model evaluation scores

How to choose **k**?
- A poorly chosen value for **k** may result in a mis-representative idea of the skill of the model, such as a score with a high variance, or a high bias.
- The choice of **k** is usually 5 or 10, but there is no formal rule. As **k** gets larger, the difference in size between the training set and the resampling subsets gets smaller. As this difference decreases, the bias of the technique becomes smaller.
- A value of **k=10** is very common in the field of applied machine learning, and is recommended if you are struggling to choose a value for your dataset.

Below is the visualization of a k-fold validation when k=5.
![FIG_kFold](https://scikit-learn.org/stable/_images/grid_search_cross_validation.png)
Image Source: https://scikit-learn.org/



In [None]:
# Silence FutureWarning noise so the score table below stays readable.
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import Ridge
from sklearn.linear_model import BayesianRidge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor

# user variables to tune
seed    = 9
folds   = 10
metric  = "neg_mean_squared_error"

# Candidate regressors keyed by display name.  Every estimator that draws
# random numbers is seeded so repeated runs rank the models identically.
models = dict()
models["Linear"]        = LinearRegression()
models["Lasso"]         = Lasso()
models["ElasticNet"]    = ElasticNet()
models["Ridge"]         = Ridge()
models["BayesianRidge"] = BayesianRidge()
models["KNN"]           = KNeighborsRegressor()
models["DecisionTree"]  = DecisionTreeRegressor(random_state=seed)
models["SVR"]           = SVR()
models["AdaBoost"]      = AdaBoostRegressor(random_state=seed)
models["GradientBoost"] = GradientBoostingRegressor(random_state=seed)
models["RandomForest"]  = RandomForestRegressor(random_state=seed)
models["ExtraTrees"]    = ExtraTreesRegressor(random_state=seed)

# One splitter shared by every model so all are scored on the same folds.
# shuffle=True is what makes random_state meaningful; recent scikit-learn
# raises ValueError when random_state is given without it.
k_fold = KFold(n_splits=folds, shuffle=True, random_state=seed)

# k-fold cross validation for each model: print mean and spread of the scores
# (negated MSE, so values closer to zero are better).
model_results = list()
model_names   = list()
for model_name, model in models.items():
    results = cross_val_score(model, X_train, y_train, cv=k_fold, scoring=metric)

    model_results.append(results)
    model_names.append(model_name)
    print("{:>20}: {:.2f}, {:.2f}".format(model_name, results.mean(), results.std()))

# box-whisker plot to compare regression models
figure = plt.figure()
figure.suptitle('Regression models comparison')
ax = figure.add_subplot(111)
ax.boxplot(model_results)
ax.set_xticklabels(model_names, rotation = 45, ha="right")
# The scorer returns negated MSE, so label the axis accordingly.
ax.set_ylabel("Negative Mean Squared Error (closer to 0 is better)")
ax.margins(0.05, 0.1)
plt.show();

**Based on the above comparison, we can see that `Gradient Boosting Regression` model outperforms all the other regression models.**

## <font color="red">Model with Gradient Boosted Tree</font>


In [None]:
# Gradient-boosted trees with default hyper-parameters.  Seeded with the
# notebook's `seed` so a fresh run reproduces the same fitted model.
clf = GradientBoostingRegressor(random_state=seed)
clf.fit(X_train, y_train)

# Test-split predictions and ground truth, named for the cells that follow.
predicted = clf.predict(X_test)
expected = y_test

#### Error Distribution

In [None]:
# Distribution of the test residuals (actual minus predicted) for the boosted model.
sns.distplot(expected - predicted);

#### 45-Degree Plot

In [None]:
# Actual vs. predicted prices for the boosted model.  Points on the dashed
# diagonal are perfect predictions.
fig, ax = plt.subplots(figsize=(8, 5))
ax.scatter(expected, predicted)
ax.plot([0, 50], [0, 50], '--k')
ax.axis('tight')
ax.set_xlabel('True price ($1000s)')
ax.set_ylabel('Predicted price ($1000s)')
fig.tight_layout();

In [None]:
# Root-mean-square of the test residuals, computed by hand with NumPy.
print("RMS: %r " % np.sqrt(np.mean((predicted - expected) ** 2)))

In [None]:
# Coefficient of determination (R^2) on the test split, to four decimals;
# a value of 1 would be a perfect prediction.
print('Coefficient of determination: {:.4f}'.format(metrics.r2_score(expected, predicted)))

In [None]:
# Side-by-side table of actual and predicted prices for the boosted model.
df1 = pd.DataFrame(data={'Actual': expected, 'Predicted': predicted})
# Keep only the first ten rows for a compact display and the bar chart below.
df2 = df1.iloc[:10]
df2

In [None]:
# Grouped bars: actual vs. predicted for the ten rows selected above.
df2.plot(kind='bar');

In [None]:
# Plot training and test deviance per boosting stage.

# Take the stage count from the fitted model rather than hard-coding 100, so
# the plot stays correct if the estimator is configured differently.
n_estimators = len(clf.train_score_)

# Test-set deviance after each stage.  `clf` uses the default squared-error
# loss, so the deviance is the mean squared error; computing it directly avoids
# the estimator's private `loss_` attribute, which newer scikit-learn releases
# no longer provide.  The comprehension variable also keeps the earlier
# `y_pred` (linear-model predictions) from being overwritten.
test_score = np.array(
    [metrics.mean_squared_error(expected, staged_pred)
     for staged_pred in clf.staged_predict(X_test)],
    dtype=np.float64,
)

stages = np.arange(n_estimators) + 1

fig, ax = plt.subplots(figsize=(12, 6))
ax.set_title('Deviance')
ax.plot(stages, clf.train_score_, 'b-', label='Training Set Deviance')
ax.plot(stages, test_score, 'r-', label='Test Set Deviance')
ax.legend(loc='upper right')
ax.set_xlabel('Boosting Iterations')
ax.set_ylabel('Deviance');

#### Feature Importance
- Once we have a trained model, we can understand feature importance (or variable importance) of the dataset which tells us how important each feature is, to predict the target.

In [None]:
# Horizontal bar chart of the boosted model's feature importances, rescaled so
# the most important feature reads 100.
feature_importance = clf.feature_importances_
feature_importance = 100.0 * (feature_importance / feature_importance.max())

# Ascending order, so the most important feature ends up as the top bar.
sorted_idx = feature_importance.argsort()
pos        = np.arange(len(sorted_idx)) + .5

ax = plt.gca()
ax.barh(pos, feature_importance[sorted_idx], align='center')
ax.set_yticks(pos)
ax.set_yticklabels(boston.feature_names[sorted_idx])
ax.set_xlabel('Relative Importance')
ax.set_title('Variable Importance');