# Introduction to Scikit-Learn (sklearn)

This notebook demonstrates some of the most useful functions of the beautiful Scikit-Learn library.

What we're going to cover:

0. An end-to-end sklearn workflow
1. Getting the data ready
2. Choose the right estimators/algorithm for our problems
3. Fit the model/algorithm & use it to make predictions on our data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together

## 0. An end-to-end sklearn workflow

In [None]:
import numpy as np

In [None]:
# 1. Get the data ready
import pandas as pd
heartDisease = pd.read_csv('data/heart-disease.csv')
heartDisease

In [None]:
# Create X(feature matrix)
X = heartDisease.drop('target', axis=1)

# Create Y(labels)
y = heartDisease.target

In [None]:
# 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# we will keep default hyperparameters
clf.get_params()

In [None]:
# 3. Fit the model to the data
from sklearn.model_selection import train_test_split

XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.3)

In [None]:
clf.fit(XTrain, yTrain);

In [None]:
XTrain

In [None]:
# Make a prediction on a deliberately malformed input.
# `predict` expects a 2-D array (samples x features); this 1-D, 4-value array is
# rejected. Catch the error and show it so "Restart & Run All" keeps going.
try:
    ylabel = clf.predict(np.array([0, 2, 3, 4]))
except ValueError as err:
    ylabel = None
    print(f"Prediction failed: {err}")

In [None]:
XTest

In [None]:
y_preds = clf.predict(XTest)
y_preds

In [None]:
yTest

In [None]:
# 4, Evaluate the model on training data and test data
clf.score(XTrain, yTrain)

In [None]:
clf.score(XTest, yTest)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(yTest, y_preds))

In [None]:
confusion_matrix(yTest, y_preds)

In [None]:
accuracy_score(yTest, y_preds)

In [None]:
# 5. Improve a model
# Try different amounts of n_estimators (this range yields 20 and 70)
np.random.seed(42)
for i in range(20, 120, 50):
    print(f"Trying classifier for {i} ")
    # NOTE: `clf` is rebound on every pass, so after the loop it refers to the
    # last model trained here.
    clf = RandomForestClassifier(n_estimators=i).fit(XTrain, yTrain)
    print(f"Model accuracy on test set:{clf.score(XTest, yTest) * 100:.2f}%")
    print("")

In [None]:
# 6. Save the model and load it
import pickle

# Use a context manager so the file handle is flushed and closed,
# even if pickling fails part-way through.
with open("models/randomForestModel.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Read the pickled model back in and score it on the test split.
# NOTE: only unpickle files you created yourself; unpickling untrusted data can
# execute arbitrary code.
with open("models/randomForestModel.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(XTest, yTest)

## 1. Getting data ready to be used with M/L

There are 3 main things, we need to do:
    
    1. Split the data into features and labels usually 'X's & 'y's
    2. Filling or disregarding missing values
    3. Converting non-numerical values to numerical values(also called feature encoding)

In [None]:
heartDisease.head()

In [None]:
X = heartDisease.drop("target", axis=1) # Axis 1=age,sex, cp,trestbps,chol, fbs, restecg, thalach, exang, oldpeak, slope, ca,thal

X.head()

In [None]:
y = heartDisease.target
y.head()

In [None]:
# Re-split the X/y created just above 80/20. Without this, the shapes shown here
# would still describe the earlier 70/30 split from section 0.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)
XTrain.shape, XTest.shape, yTrain.shape, yTest.shape

In [None]:
X.shape[0] * 0.8

In [None]:
242 + 61

In [None]:
len(heartDisease)

## 1.1 Make sure it's all numerical

In [None]:
carSales = pd.read_csv("data/car-sales-extended.csv")

In [None]:
carSales.head()

In [None]:
carSales.Doors.value_counts()

In [None]:
len(carSales)

In [None]:
carSales.dtypes

In [None]:
# Split into X/y
X = carSales.drop("Price", axis=1)
y = carSales.Price

# Split into training and test
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                               y,
                                               test_size=0.2)

In [None]:
# Build M/L model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# The features still contain text columns (e.g. "Make", "Colour"), which the
# estimator cannot convert to numbers. Catch the error and display it so the
# notebook keeps running; the next cells encode the categories properly.
try:
    model.fit(XTrain, yTrain)
    print(model.score(XTest, yTest))
except ValueError as err:
    print(f"Could not fit on non-numerical data: {err}")

In [None]:
X.head()

In [None]:
# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ["Make", "Colour","Doors"]
oneHot = OneHotEncoder()
transformer = ColumnTransformer([("oneHot",
                                oneHot,
                                 categoricalFeatures)],
                               remainder="passthrough")
transformedX = transformer.fit_transform(X)
transformedX

In [None]:
pd.DataFrame(transformedX)

In [None]:
X.head()

In [None]:
dummies = pd.get_dummies(carSales[["Make", "Colour", "Doors"]])
dummies

In [None]:
# Let's refit the model
np.random.seed(42)
XTrain, XTest, yTrain, yTest = train_test_split(transformedX,
                                               y,
                                               test_size = 0.2)

model.fit(XTrain, yTrain)

In [None]:
X.head()

In [None]:
model.score(XTest, yTest)

## 1.2 What if there were missing values?

1. Fill them with some value (also known as imputation).
2. Remove samples with missing data altogether.

In [None]:
# Import carSales missing data
carSalesMissing = pd.read_csv("data/car-sales-extended-missing-data.csv")
carSalesMissing

In [None]:
carSalesMissing.isna().sum()

In [None]:
# Create X & y
X = carSalesMissing.drop("Price", axis= 1)
y = carSalesMissing.Price

In [None]:
# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ["Make", "Colour","Doors"]
oneHot = OneHotEncoder()
transformer = ColumnTransformer([("oneHot",
                                oneHot,
                                 categoricalFeatures)],
                               remainder="passthrough")
transformedX = transformer.fit_transform(X)
transformedX

In [None]:
carSalesMissing

#### Option 1: Fill missing data with Pandas

In [None]:
carSalesMissing.Doors.value_counts()

In [None]:
# Assign each filled column back to the frame. Calling
# `fillna(..., inplace=True)` on a column selection operates on an intermediate
# object and is not guaranteed to update `carSalesMissing` itself.

# Fill the 'Make' column
carSalesMissing["Make"] = carSalesMissing["Make"].fillna("missing")

# Fill the "Colour" column
carSalesMissing["Colour"] = carSalesMissing["Colour"].fillna("missing")

# Fill the "Odometer (KM)" column with the column mean
carSalesMissing["Odometer (KM)"] = carSalesMissing["Odometer (KM)"].fillna(
    carSalesMissing["Odometer (KM)"].mean()
)

# Fill the "Doors" column
carSalesMissing["Doors"] = carSalesMissing["Doors"].fillna(4)

In [None]:
# Checkout data frame again
carSalesMissing.isna().sum()

In [None]:
carSalesMissing.dropna(inplace=True)

In [None]:
carSalesMissing.isna().sum()

In [None]:
len(carSalesMissing)

In [None]:
X = carSalesMissing.drop("Price", axis=1)
y = carSalesMissing.Price

In [None]:
# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ["Make", "Colour","Doors"]
oneHot = OneHotEncoder()
transformer = ColumnTransformer([("oneHot",
                                oneHot,
                                 categoricalFeatures)],
                               remainder="passthrough")
transformedX = transformer.fit_transform(X)
transformedX

### Option 2: Fill missing values with Scikit-Learn

In [None]:
carSalesMissing = pd.read_csv("data/car-sales-extended-missing-data.csv")

In [None]:
carSalesMissing

In [None]:
# Drop the rows with no labels
carSalesMissing.dropna(subset=["Price"], inplace=True)
carSalesMissing.isna().sum()

In [None]:
# Split into X & y
X = carSalesMissing.drop("Price", axis=1)
y = carSalesMissing.Price

In [None]:
X.isna().sum()

In [None]:
# Fill missing values with Scikit-Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with 'missing' & numerical values with mean
catImputer = SimpleImputer(strategy='constant', fill_value="missing")
doorImputer = SimpleImputer(strategy='constant', fill_value=4)
numImputer= SimpleImputer(strategy='mean')

# Define Columns
catFeatures = ['Make', 'Colour']
doorFeature = ["Doors"]
numFeatures = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("catImputer", catImputer, catFeatures),
    ("doorImputer", doorImputer, doorFeature),
    ("numImputer", numImputer, numFeatures)
])

# Transform the data
filledX = imputer.fit_transform(X)
filledX

In [None]:
carSalesFilled = pd.DataFrame(filledX,
                             columns=["Make", "Colour", "Doors", "Odometer (KM)"])
carSalesFilled

In [None]:
carSalesFilled.isna().sum()

In [None]:
# Let's try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categoricalFeatures = ["Make", "Colour","Doors"]
oneHot = OneHotEncoder()
transformer = ColumnTransformer([("oneHot",
                                oneHot,
                                 categoricalFeatures)],
                               remainder="passthrough")
transformedX = transformer.fit_transform(carSalesFilled)
transformedX

In [None]:
# Now we 've' got our data as numbers and filled (no missing values)
# Let's fit the model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

XTrain, XTest, yTrain, yTest = train_test_split(transformedX,
                                               y,
                                               test_size = 0.2)
model = RandomForestRegressor()
model.fit(XTrain, yTrain)
model.score(XTest, yTest)

In [None]:
len(carSalesFilled), len(carSales)

## 2. Choosing the right estimator/algorithm for our problem

Scikit-learn uses estimator as another term for machine learning model or algorithm

* Classification - predicting whether sample is one thing or another
* Regression - predicting a number

Step 1- Check the Scikit-Learn machine learning map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

### 2.1 Picking a m/l model for a regression problem

In [None]:
# Import Boston housing dataset
# NOTE(review): `load_boston` was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell needs an older scikit-learn to run — confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()
# Trailing semicolon suppresses the (very long) repr of the returned object.
boston;

In [None]:
bostonDf = pd.DataFrame(boston["data"], columns=boston["feature_names"])
bostonDf["target"] = pd.Series(boston["target"])
bostonDf.head()

In [None]:
# How many samples?
len(bostonDf)

In [None]:
# Let's try the Ridge Regression model
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create the data 
X = bostonDf.drop("target", axis=1)
y = bostonDf.target

# Split into train and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                               y, 
                                               test_size=0.2)

# Instantiate the Ridge model
model = Ridge()
model.fit(XTrain, yTrain)

# Check the score of Ridge model on test data
model.score(XTest, yTest)

How do we improve score?

What if Ridge wasn't working?

Let's refer back to map... https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
# Let's try Random Forest Regressor
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global RNG so the split and the forest are repeatable
np.random.seed(42)

# Features are every column except the target; labels are the target column
X = bostonDf.drop(columns="target")
y = bostonDf["target"]

# Hold out 20% of the rows for testing
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)

# Build the forest and train it on the training split
rf = RandomForestRegressor(n_estimators=100).fit(XTrain, yTrain)

# Check the score of the Random Forest model on test data
rf.score(XTest, yTest)

In [None]:
# Check the Ridge model again
model.score(XTest, yTest)

### 2.2 Choosing an estimator for a classification problem

Let's go to the map...https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html

In [None]:
heartDisease.head()

In [None]:
len(heartDisease)

Consulting the map, it says to try `LinearSVC`.

In [None]:
# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Setup the random seed
np.random.seed(42)

# Make the data
X = heartDisease.drop("target", axis=1)
y = heartDisease.target

# Split the data
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                               y,
                                               test_size=0.2)

# Instantiate LinearSVC
clf = LinearSVC(max_iter=10000)
clf.fit(XTrain, yTrain)

# Evaluate the LinearSVC
clf.score(XTest, yTest)

In [None]:
heartDisease.target.value_counts()

In [None]:
# Import the `RandomForestClassifier` estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup the random seed
np.random.seed(42)

# Make the data
X = heartDisease.drop("target", axis=1)
y = heartDisease.target

# Split the data
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                               y,
                                               test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier()
clf.fit(XTrain, yTrain)

# Evaluate the RandomForestClassifier
clf.score(XTest, yTest)

Titbit:

    1. If you have structured data, use ensemble methods
    2. If you have unstructured data, use deep learning or transfer learning

In [None]:
heartDisease

## 3. Fit the model/ algorithm on our data and use it to make predictions

### 3.1 Fitting the model to the data

Different names for
* `X` = features, feature variables, data
* `y` = labels, targets, target variables

In [None]:
# Import the `RandomForestClassifier` estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup the random seed
np.random.seed(42)

# Make the data
X = heartDisease.drop("target", axis=1)
y = heartDisease.target

# Split the data
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                               y,
                                               test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier()
print(clf)
# Fit the model to the data(training machine learning model)
clf.fit(XTrain, yTrain)

# Evaluate the RandomForestClassifier(use the patterns the model has learned)
clf.score(XTest, yTest)

In [None]:
X.head(), X.tail(), y.head(), y.tail() 

# Random Forest model Deepdive

Resources for Random Forest Models

* [Random Forest Wikipedia][1]

* [Random Forest in python][2] by yhat

* [An Implementation and Explanation of the Random Forest in Python][3]

[1]:https://en.wikipedia.org/wiki/Random_forest
[2]:http://blog.yhat.com/posts/random-forests-in-python.html#:~:text=by%20yhat%20%7C%20June%205%2C%202013&text=Random%20forest%20is%20capable%20of,about%20random%20forests%20using%20Python
[3]:https://towardsdatascience.com/an-implementation-and-explanation-of-the-random-forest-in-python-77bf308a9b76

### 3.2 Make predictions using a machine learning model
2 ways to make predictions:
  1. `predict()`
  2. `predict_proba()`
  

In [None]:
# Use a trained model to make predictions
# A 1-D array is not a valid input for `predict` (it expects 2-D, samples x
# features), so catch the resulting error and show it instead of aborting the run.
try:
    clf.predict(np.array([1, 7, 8, 3, 4]))
except ValueError as err:
    print(f"Prediction failed: {err}")

In [None]:
XTest.head()

In [None]:
clf.predict(XTest)

In [None]:
np.array(yTest)

In [None]:
# Compare prediction to truth labels to evaluate the model
yPreds = clf.predict(XTest)
np.mean(yPreds == yTest)

In [None]:
clf.score(XTest, yTest)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(yTest, yPreds)

Make predictions with `predict_proba()`

In [None]:
# predict_proba() returns probabilities of classification label
clf.predict_proba(XTest[:5])

In [None]:
# Let's predict() on same data...
clf.predict(XTest[:5])

In [None]:
heartDisease.target.value_counts()

`predict()` can also be used with regression models

In [None]:
bostonDf.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create data
X = bostonDf.drop('target', axis=1)
y = bostonDf.target

# Split into train and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

# Instantiate and fit the model
model = RandomForestRegressor().fit(XTrain, yTrain)

# Make predictions
yPreds = model.predict(XTest)

In [None]:
yPreds[:10]

In [None]:
np.array(yPreds)

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error

mean_absolute_error(yTest, yPreds)

## 4. Evaluating a m/l Model

There are 3 ways to evaluate Scikit-Learn models/estimators:

1. Estimator `score` method
2. `Scoring` parameter
3. Metric functions

### 4.1 Evaluating with the estimator `score` method

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heartDisease.drop("target", axis=1)
y = heartDisease.target

XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

clf = RandomForestClassifier()

clf.fit(XTrain, yTrain)



In [None]:
clf.score(XTrain, yTrain)

In [None]:
clf.score(XTest, yTest)

Let's do same but for regression... 

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create data
X = bostonDf.drop('target', axis=1)
y = bostonDf.target

# Split into train and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

# Instantiate and fit the model
model = RandomForestRegressor().fit(XTrain, yTrain)

In [None]:
model.score(XTest, yTest)

### 4.2 Evaluating a model using `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heartDisease.drop("target", axis=1)
y = heartDisease.target

XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

clf = RandomForestClassifier()

clf.fit(XTrain, yTrain)

In [None]:
clf.score(XTest, yTest)

In [None]:
cross_val_score(clf, X, y, cv=5)

In [None]:
cross_val_score(clf, X, y, cv=10)

In [None]:
np.random.seed(42)

# Single training and test split score
clfSingleScore = clf.score(XTest, yTest)

# Take the mean of 5 cross-val scores
clfCrossValScore = (np.mean(cross_val_score(clf, X, y, cv=5)))

# Compare the two
clfSingleScore, clfCrossValScore

In [None]:
# Default scoring parameter of classifier = mean accuracy
# `score` needs the samples and their true labels; called with no arguments it
# raises a TypeError.
clf.score(XTest, yTest)

In [None]:
# Scoring parameter set to None by default
cross_val_score(clf, X, y, cv=5, scoring=None)

### 4.2.1 Classification model evaluation metrics

1. Accuracy
2. Area under ROC curve
3. Confusion matrix
4. Classification report

**Accuracy**

In [None]:
heartDisease.head()

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heartDisease.drop("target", axis=1)
y = heartDisease.target

clf = RandomForestClassifier()

CrossValScore = cross_val_score(clf, X, y, cv=5)

In [None]:
c = np.mean(CrossValScore)

In [None]:
print(f"Heart Disease Classifier Cross-Validated Accuracy:{c * 100:.2f}%")

**Area under the receiver operating characteristic curve (AUC/ROC)**
* Area under curve(AUC)
* ROC curve

ROC curves are a comparison of a model's true positive rate (tpr) versus its false positive rate (fpr).

* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* False negative = model predicts 0 when truth is 1

In [None]:
# Create XTest..., etc
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

# Fit the classifier
clf.fit(XTrain, yTrain)

# Make predictions with probabilities
yProb = clf.predict_proba(XTest)
yProb[:10], len(yProb)

In [None]:
yProbPositive = yProb[:, 1]
yProbPositive

In [None]:
yProbPositive[:10]

In [None]:
# Calculate fpr, tpr and thresholds
fpr, tpr, thresholds = roc_curve(yTest, yProbPositive)

# Check the false positive rates
fpr

In [None]:
# Create a function for plotting ROC curves
import matplotlib.pyplot as plt

def plotRocCurve(fpr, tpr):
    """
    Plot a ROC curve together with the chance-level diagonal for reference.

    Parameters
    ----------
    fpr : array-like
        False positive rates, drawn on the x-axis.
    tpr : array-like
        True positive rates, drawn on the y-axis.

    Returns
    -------
    None
        The figure is displayed via `plt.show()`.
    """
    # Draw on an explicit Axes rather than relying on pyplot's implicit state
    _, ax = plt.subplots()

    # Plot roc curve
    ax.plot(fpr, tpr, color="orange", label="ROC")
    # Plot line with no predictive power (baseline)
    ax.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Customise the plot
    ax.set_xlabel("False Positive Rate (fpr)")
    ax.set_ylabel("True Positive Rate (tpr)")
    ax.set_title("ROC curve")
    ax.legend()
    plt.show()

plotRocCurve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

roc_auc_score(yTest, yProbPositive)

In [None]:
# Plot perfect ROC curve and AUC score
fpr, tpr, thresholds = roc_curve(yTest, yTest)
plotRocCurve(fpr,tpr)

In [None]:
# Perfect AUC Score
roc_auc_score(yTest, yTest)

**Confusion Matrix**

A confusion matrix is a quick way to compare the labels a model predicts with the actual labels it was supposed to predict.

In essence, giving you an idea of where the model is getting confused.

In [None]:
from sklearn.metrics import confusion_matrix

yPreds = clf.predict(XTest)

confusion_matrix(yTest, yPreds)

In [None]:
# Visualize confusion matrix with pd.crosstab()
pd.crosstab(yTest,
           yPreds,
           rownames=["Actual Labels"],
           colnames=["Predicted Labels"])

In [None]:
24 + 5 + 4 + 28

In [None]:
# How to install a conda package into current environment from Jupyter notebook?
import sys
!conda install -y --prefix {sys.prefix} seaborn

In [None]:
# Make our confusion matrix more visual with Seaborn's heatmap()
import seaborn as sns

# Set the font scale
sns.set(font_scale=1.5)

# Create confusion matrix
confMat = confusion_matrix(yTest, yPreds)

# Plot it using seaborn heatmap
sns.heatmap(confMat);

In [None]:
def plotConfMat(confMat):
    """
    Draw `confMat` as an annotated Seaborn heatmap on a fresh figure.

    Each cell is labelled with its value and the colour bar is hidden.
    The x-axis is the predicted label and the y-axis is the true label.
    """
    _, axes = plt.subplots()
    sns.heatmap(confMat, annot=True, cbar=False, ax=axes)
    axes.set_xlabel("Predicted label")
    axes.set_ylabel("True label")
 
plotConfMat(confMat)

In [None]:
# NOTE(review): `plot_confusion_matrix` was removed in scikit-learn 1.2 — confirm
# the pinned version still provides it.
from sklearn.metrics import plot_confusion_matrix

# Plot on the held-out test split. The full X/y also contains the rows the
# classifier was fitted on, which makes the matrix look better than it really is
# and inconsistent with the test-set matrices above.
plot_confusion_matrix(clf, XTest, yTest);

**Classification Report** 

In [None]:
from sklearn.metrics import classification_report

print(classification_report(yTest, yPreds))

In [None]:
# Where precision and recall become valuable:
# build a heavily imbalanced example and a model that never predicts the positive class.
diseaseTrue = np.zeros(10000)
diseaseTrue[0] = 1 # only one positive case

diseasePreds = np.zeros(10000) # model predicts every case as 0


# output_dict=True returns the report as a dict, which is shown here as a DataFrame
pd.DataFrame(classification_report(diseaseTrue,
                                  diseasePreds,
                                  output_dict=True))

To summarize classification metrics:
    
   * **Accuracy**: is a good measure to start with if all classes are balanced(e.g. same amount of samples which are labelled with 0 or 1).
   * **Precision** and **recall** become more important when classes are imbalanced.
   * If false positives are worse than false negatives, aim for higher precision
   * If false negatives are worse than false positives, aim for higher recall
   * **F1-score** is a combination of precision and recall

### 4.2.2 Regression model evaluation metrics

Model Evaluation Metrics: https://scikit-learn.org/stable/modules/model_evaluation.html

1. R^2 (pronounced r-squared) or coefficient of determination
2. Mean absolute error (MAE)
3. Mean squared error (MSE)

**R^2**

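What R-squared does: Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, a model that only ever predicts the mean of the targets gets an R^2 of 0, and a model that predicts every target perfectly gets an R^2 of 1.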

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = bostonDf.drop("target",axis=1)
y = bostonDf.target

XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

model = RandomForestRegressor()
model.fit(XTrain, yTrain)


In [None]:
model.score(XTest, yTest)

In [None]:
from sklearn.metrics import r2_score

# Fill an array with yTest mean
yTestM = np.full(len(yTest), yTest.mean())

In [None]:
yTest.mean()

In [None]:
r2_score(yTest, yTestM)

In [None]:
r2_score(yTest, yTest)

**Mean absolute error (MAE)**

MAE is the average of the absolute differences between predictions and actual values. It gives you an idea of how wrong your model's predictions are.

In [None]:
# Mean absolute error
from sklearn.metrics import mean_absolute_error

yPreds = model.predict(XTest)
mae = mean_absolute_error(yTest, yPreds)

In [None]:
mae

In [None]:
df = pd.DataFrame(data={"actual values":yTest,
                       "predicted values": yPreds})

In [None]:
df['difference'] = df['predicted values'] - df['actual values']

In [None]:
df

**Mean Squared error**


In [None]:
# Mean Squared error
from sklearn.metrics import mean_squared_error

yPreds = model.predict(XTest)
mse = mean_squared_error(yTest,yPreds)
mse

In [None]:
# Calculate MSE by hand
squared = np.square(df['difference'])
squared.mean()

### 4.2.3 Finally using the `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heartDisease.drop("target", axis=1)
y = heartDisease.target

clf = RandomForestClassifier()

In [None]:
np.random.seed(42)
cvAcc = cross_val_score(clf, X, y, cv=5, scoring=None)
cvAcc

In [None]:
print(f'The cross validated accuracy is: {cvAcc.mean() * 100:.2f}%')

In [None]:
np.random.seed(42)
cvAcc = cross_val_score(clf, X, y, cv=5, scoring="accuracy")
cvAcc

In [None]:
print(f'The cross validated accuracy is: {cvAcc.mean() * 100:.2f}%')

In [None]:
# precision
cvPrecision = cross_val_score(clf, X, y, cv=5, scoring="precision")
np.mean(cvPrecision)

In [None]:
# recall
cvRecall = cross_val_score(clf, X, y, cv=5, scoring="recall")
np.mean(cvRecall)

In [None]:
cvF1 = cross_val_score(clf, X, y, cv=5, scoring="f1")
np.mean(cvF1)

How about regression model?

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = bostonDf.drop("target", axis=1)
y = bostonDf["target"]

model = RandomForestRegressor()

In [None]:
np.random.seed(42)
cvR2 = cross_val_score(model, X, y, cv=5, scoring=None)
np.mean(cvR2)

In [None]:
np.random.seed(42)
cvR2 = cross_val_score(model, X, y, cv=5, scoring="r2")
cvR2

In [None]:
# Mean absolute error
cvMAE = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
cvMAE

In [None]:
# Mean Square error
cvMSE = cross_val_score(model, X, y, cv=5, scoring="neg_mean_squared_error")
np.mean(cvMSE)

### 4.3 Using different evaluation metrics as Scikit-Learn functions

**Classification evaluation functions**

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Features are every column except the target; labels are the target column
X = heartDisease.drop('target', axis=1)
y = heartDisease.target

# Hold out 20% of the rows for testing
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

clf = RandomForestClassifier()
clf.fit(XTrain, yTrain)

# Make some predictions
yPreds = clf.predict(XTest)

# Evaluate the classifier.
# Accuracy is reported as a percentage; precision, recall and F1 as fractions
# rounded to two decimals, the same convention `evaluate_preds` uses later on.
print("Classifier metrics on test set")
print(f"Accuracy: {accuracy_score(yTest, yPreds) * 100:.2f}%")
print(f"Precision: {precision_score(yTest, yPreds):.2f}")
print(f"Recall: {recall_score(yTest, yPreds):.2f}")
print(f"F1: {f1_score(yTest, yPreds):.2f}")

In [None]:
len(yPreds)

**Regression evaluation functions**

In [None]:
# `mean_squared_error` is imported here because this cell calls it; previously
# it only resolved because an earlier cell happened to import it.
from sklearn.metrics import (r2_score, mean_absolute_error,
                             mean_squared_error, mean_squared_log_error)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Features are every column except the target; labels are the target column
X = bostonDf.drop('target', axis=1)
y = bostonDf.target

# Hold out 20% of the rows for testing
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

model = RandomForestRegressor()
model.fit(XTrain, yTrain)

# Make some predictions
yPreds = model.predict(XTest)

# Evaluate the regression model
print("Regression model metrics on test set")
print(f"R^2: {r2_score(yTest, yPreds)}")
print(f"MAE: {mean_absolute_error(yTest, yPreds)}")
print(f"MSE: {mean_squared_error(yTest, yPreds)}")

## 5. Improving a model

first predictions = baseline predictions
first model = baseline model

From a data perspective:
* Could we collect more data? (generally, the more data, the better) 
* Could we improve our data? 

From a model perspective:
* Is there a better model we could use?
* Could we improve the current model? 

Hyperparameter vs Parameters
* Parameter = patterns the model finds in the data
* Hyperparameter = settings on model you can adjust to improve its ability to find patterns

Three ways to adjust hyperparameters:
1. By hand
2. Randomly with RandomizedSearchCV
3. Exhaustively with GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [None]:
clf.get_params()

### 5.1 Tuning hyperparameters by hand

Let's make 3 sets training, validation and test.

In [None]:
clf.get_params()

We're going to try and adjust:

* `max_depth`
* `max_features`
* `min_samples_leaf`
* `min_samples_split`
* `n_estimators`

In [None]:
def evaluate_preds(yTrue, yPreds):
    """
    Compare true labels with predicted labels for a classification model.

    Prints accuracy (as a percentage), precision, recall and F1 score, then
    returns those four metrics rounded to two decimals in a dict keyed by
    "accuracy", "precision", "recall" and "f1".
    """
    scorers = (("accuracy", accuracy_score),
               ("precision", precision_score),
               ("recall", recall_score),
               ("f1", f1_score))
    # Unrounded values are kept for printing; rounding happens only on return
    rawScores = {metricName: scorer(yTrue, yPreds) for metricName, scorer in scorers}

    print(f"Acc: {rawScores['accuracy'] * 100:.2f}%")
    print(f"Precision: {rawScores['precision']:.2f}")
    print(f"Recall: {rawScores['recall']:.2f}")
    print(f"F1 score: {rawScores['f1']:.2f}")

    return {metricName: round(value, 2) for metricName, value in rawScores.items()}

In [None]:
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

# Randomise the row order before slicing out the three subsets
heartDiseaseShuffled = heartDisease.sample(frac=1)

# Features are every column except the target; labels are the target column
X = heartDiseaseShuffled.drop(columns="target")
y = heartDiseaseShuffled["target"]

# Row positions where the training (first 70%) and validation (next 15%)
# subsets end; whatever remains is the test subset
trainSplit = round(0.7 * len(heartDiseaseShuffled))
validsplit = round(trainSplit + 0.15 * len(heartDiseaseShuffled))

XTrain, XValid, XTest = (X.iloc[:trainSplit],
                         X.iloc[trainSplit:validsplit],
                         X.iloc[validsplit:])
yTrain, yValid, yTest = (y.iloc[:trainSplit],
                         y.iloc[trainSplit:validsplit],
                         y.iloc[validsplit:])

# Train a classifier with default hyperparameters to act as the baseline
clf = RandomForestClassifier()
clf.fit(XTrain, yTrain)

# Baseline predictions, scored on the validation set
yPreds = clf.predict(XValid)
baselineMetrics = evaluate_preds(yValid, yPreds)
baselineMetrics

In [None]:
clf.get_params()

In [None]:
np.random.seed(42)

# Second classifier, with n_estimators set explicitly
clf2 = RandomForestClassifier(n_estimators=100)
clf2.fit(XTrain, yTrain)

# Make predictions with the new classifier (`clf2`), not the baseline `clf`
yPreds2 = clf2.predict(XValid)

# Evaluate 2nd classifier on the validation set
clf2Metrics = evaluate_preds(yValid, yPreds2)

### 5.2 Hyperparameter tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Hyperparameter values RandomizedSearchCV will sample from.
# "auto" is not used for max_features: newer scikit-learn releases reject it,
# and for classifiers it meant the same as "sqrt" anyway.
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["sqrt", "log2"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]
       }

np.random.seed(42)

# Split into X & y
X = heartDiseaseShuffled.drop("target", axis=1)
y = heartDiseaseShuffled.target

# Split into train and test sets (cross-validation supplies the validation folds)
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)


# Instantiate RandomForestClassifier
clf = RandomForestClassifier()

# Setup RandomizedSearchCV
rsClf = RandomizedSearchCV(estimator=clf,
                           param_distributions=grid,
                           n_iter=10, # number of models to try
                           cv=5,
                           verbose=2)

# Fit the RandomizedSearchCV version of clf
rsClf.fit(XTrain, yTrain);

In [None]:
# Hyperparameter combination that scored best during the randomized search
rsClf.best_params_

In [None]:
# Predict on the held-out test set with the randomized-search model
rsYPreds = rsClf.predict(XTest)

# Score those predictions against the true test labels
rsMetrics = evaluate_preds(yTrue=yTest, yPreds=rsYPreds)

### 5.3 Hyperparameter tuning with GridSearchCV

In [None]:
# Display the search space used for the randomized search
grid

In [None]:
# Narrowed search space for the exhaustive grid search.
# "auto" is not used for max_features: newer scikit-learn releases reject it,
# and for classifiers it meant the same as "sqrt" anyway.
grid2 = {'n_estimators': [100, 200, 500],
         'max_depth': [None],
         'max_features': ['sqrt', 'log2'],
         'min_samples_split': [6],
         'min_samples_leaf': [1, 2]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split

np.random.seed(42)

# Features are every column except the label column "target"
X = heartDiseaseShuffled.drop(columns="target")
y = heartDiseaseShuffled["target"]

# Hold out 20% of the rows as a test set
XTrain, XTest, yTrain, yTest = train_test_split(X, y, test_size=0.2)

# Base estimator whose hyperparameters the search will vary
clf = RandomForestClassifier()

# Try every combination in grid2 with 5-fold cross-validation
gsClf = GridSearchCV(
    estimator=clf,
    param_grid=grid2,
    cv=5,
    verbose=2,
)

# Run the search on the training data
gsClf.fit(XTrain, yTrain);

In [None]:
# Hyperparameter combination that scored best during the grid search
gsClf.best_params_

In [None]:
# Predict on the held-out test set with the grid-search model
gsYpreds = gsClf.predict(XTest)

# Score those predictions against the true test labels
gsMetrics = evaluate_preds(yTrue=yTest, yPreds=gsYpreds)

In [None]:
# One column per model, one row per metric
compareMetrics = pd.DataFrame(
    data={
        "baseline": baselineMetrics,
        "clf2": clf2Metrics,
        "random search": rsMetrics,
        "grid search": gsMetrics,
    }
)

# Bar chart of every metric across the four models
compareMetrics.plot(kind="bar", figsize=(10, 8));

## 6. Saving and loading trained machine learning models

Two ways to save and load models:
1. With python's `pickle` module
2. With the `joblib` module

**Pickle**

In [None]:
import pickle

# Save an existing model to file.
# The context manager closes the file handle even if pickling raises.
with open('models/gsModel.pkl', "wb") as modelFile:
    pickle.dump(gsClf, modelFile)

In [None]:
# Load a saved model.
# NOTE: pickle.load can execute arbitrary code, so only un-pickle files you
# trust (here, the file written earlier in this notebook).
# The context manager closes the file handle once loading finishes.
with open('models/gsModel.pkl', "rb") as modelFile:
    loadedPickeModel = pickle.load(modelFile)

In [None]:
# Predict with the un-pickled model and score it on the test set
loadYPreds = loadedPickeModel.predict(XTest)
evaluate_preds(yTrue=yTest, yPreds=loadYPreds);

**Joblib** 

In [None]:
from joblib import dump, load

# Write the fitted grid-search object to disk with joblib
dump(value=gsClf, filename="models/gsModel(1).joblib")

In [None]:
# Read the joblib file back into a model object
loadedJobModel = load("models/gsModel(1).joblib")

In [None]:
# Predict with the joblib-loaded model and score it on the test set
joblibYPreds = loadedJobModel.predict(XTest)
evaluate_preds(yTrue=yTest, yPreds=joblibYPreds);

## 7. Putting it all together

In [None]:
# Read the car sales CSV into a DataFrame and display it
data = pd.read_csv(filepath_or_buffer="data/car-sales-extended-missing-data.csv")
data

In [None]:
# Data type of each column
data.dtypes

In [None]:
# Number of missing values in each column
data.isna().sum()

Steps we need to do (all in one cell):
1. Fill the missing data
2. Convert data to numbers
3. Build a model on the data

In [3]:
# Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling 
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Setup random seed
import numpy as np
np.random.seed(42)

# Import data and drop rows with missing labels.
# Reassign instead of using inplace=True so the step reads as a plain
# transformation of the freshly loaded frame.
data = pd.read_csv("data/car-sales-extended-missing-data.csv")
data = data.dropna(subset=["Price"])

# Define different features and transform pipeline.
# Categorical columns: fill gaps with the literal "missing", then one-hot
# encode; handle_unknown="ignore" avoids errors on categories not seen in fit.
categoricalFeatures = ["Make", "Colour"]
categoricalTransformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Doors: fill gaps with the constant 4
doorFeature = ["Doors"]
doorTranformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])

# Odometer: fill gaps with the column mean
numericFeatures = ["Odometer (KM)"]
numericTransformer = Pipeline(steps=[
    ("impute", SimpleImputer(strategy="mean"))
])

# Setup preprocess steps (fill missing values, then convert to numbers)
preprocessor = ColumnTransformer(
            transformers=[
                ("cat", categoricalTransformer, categoricalFeatures),
                ("door", doorTranformer, doorFeature),
                ("num", numericTransformer, numericFeatures)
            ])

# Creating a preprocessing and modelling pipeline
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("model", RandomForestRegressor())])

# Split data
X = data.drop("Price", axis=1)
y = data.Price
XTrain, XTest, yTrain, yTest = train_test_split(X,
                                                y,
                                                test_size=0.2)

# Fit on the training rows only, so the imputers and encoder are also fitted
# without seeing the test rows, then score on the held-out test set
model.fit(XTrain, yTrain)
model.score(XTest, yTest)

0.22188417408787875