In [None]:
# Environment sanity check: report the interpreter plus the versions of the key packages
import sys, platform

print("Python:", platform.python_version())
print("Executable:", sys.executable)

import ipykernel, jupyter_client, numpy as np, pandas as pd, sklearn, matplotlib, seaborn

# jupyter_server and notebook are optional here: any import problem is reported as "n/a"
try:
    import jupyter_server
    js_ver = getattr(jupyter_server, "__version__", "n/a")
except Exception:
    js_ver = "n/a"
try:
    import notebook
    nb_ver = getattr(notebook, "__version__", "n/a")
except Exception:
    nb_ver = "n/a"

# (label, version) pairs; print() flattens them into one space-separated line
version_report = [
    ("ipykernel", ipykernel.__version__),
    ("jupyter_client", jupyter_client.__version__),
    ("jupyter_server", js_ver),
    ("notebook", nb_ver),
    ("numpy", np.__version__),
    ("pandas", pd.__version__),
    ("sklearn", sklearn.__version__),
    ("matplotlib", matplotlib.__version__),
    ("seaborn", seaborn.__version__),
]
print("Versions:", *(field for pair in version_report for field in pair))


# Introduction to Scikit learn

## 0. An end-to-end Scikit-Learn Workflow

In [None]:
# standard imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
%matplotlib inline

In [None]:
# 1. Get The data ready
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("Data/heart-disease.csv")
heart_disease

In [None]:
# Create x (Feature Matrix)
x = heart_disease.drop("target", axis=1)

# Create y (labels)
y = heart_disease["target"]

In [None]:
## 2. Choose the right model and hyperparameters
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# We'll keep the default hyperparameters
clf.get_params()

In [None]:
# 3. Fit the model to the data
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
clf.fit(x_train, y_train)

In [None]:
# Make a prediction
x_train

In [None]:
x_test

In [None]:
y_preds = clf.predict(x_test)
y_preds

In [None]:
y_test

In [None]:
# 4. Evaluate the model on the training data and test data
clf.score(x_train, y_train)

In [None]:
clf.score(x_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

In [None]:
# 5. Improve the model
# Try different values of n_estimators (10, 20, ..., 90) and compare test accuracy.
# Note: `clf` is rebound on every pass, so after the loop it holds the last model tried.
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    # ".2f" = two decimal places; ":2f" only set a minimum width of 2 and
    # therefore printed the default six decimals
    print(f"Model accuracy on test set: {clf.score(x_test, y_test) * 100:.2f}%")
    print("")

In [None]:
# 6. Save the model and load it
import pickle

# The context manager closes (and flushes) the file even if pickling fails
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# The context manager closes the file handle once the model has been read.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(x_test, y_test)

## 1. Getting Data Ready to be used for Machine Learning

1. Split the data into features and labels x and y
2. filling (imputing) or disregarding missing values
3. Converting non numerical values to numerical values (Feature Encoding)

In [None]:
heart_disease.head()

In [None]:
x = heart_disease.drop("target", axis=1)
x

In [None]:
y = heart_disease["target"]
y

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size=0.2)

In [None]:
x_test.shape, x_train.shape, y_test.shape, y_train.shape

### 1.1 Make sure it's all numerical

In [None]:
car_sales = pd.read_csv("Data/car-sales-extended.csv")
car_sales.head()

In [None]:
len(car_sales)

In [None]:
car_sales.dtypes

In [None]:
car_sales["Doors"].value_counts()

In [None]:
# Split into x/y
x = car_sales.drop("Price", axis=1)
y = car_sales["Price"]

In [None]:
# Split into training and test
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,  y_test = train_test_split(x, y, test_size=0.2)

In [None]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()

# Expected failure: x has not been encoded yet, so any text columns cannot be converted
# to floats by the estimator. Catch and show the error instead of halting the notebook,
# so Restart & Run All still reaches the encoding cells below.
try:
    model.fit(x_train, y_train)
    print(model.score(x_test, y_test))
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                               remainder="passthrough")
transformed_x = transformer.fit_transform(x)
transformed_x

In [None]:
x.head()

In [None]:
pd.DataFrame(transformed_x)

In [None]:
# Passing `columns=` one-hot encodes every listed column. Without it, get_dummies only
# converts object/category columns, so a numeric "Doors" would pass through unencoded.
# NOTE(review): assumes "Doors" is read as a numeric dtype - confirm with car_sales.dtypes.
dummy_columns = ["Make", "Colour", "Doors"]
dummies = pd.get_dummies(car_sales[dummy_columns], columns=dummy_columns)
dummies

In [None]:
# Lets refit the model
np.random.seed(42)
x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)
model.fit(x_train, y_train)

In [None]:
model.score(x_test, y_test)

In [None]:
print(sklearn.__version__)

### 1.2 What if there were missing values?

1. Fill them with some value (also known as imputation)
2. Remove the samples with missing data altogether.

In [None]:
# Import car sales missing data
car_sales_missing = pd.read_csv("Data/car-sales-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

## Option 1

In [None]:
car_sales_missing["Make"] = car_sales_missing["Make"].fillna(value="missing")
car_sales_missing["Colour"] = car_sales_missing["Colour"].fillna(value="missing")
car_sales_missing.isna().sum()

In [None]:
car_sales_missing["Doors"].value_counts()

In [None]:
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(value=4)
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(value=car_sales_missing["Odometer (KM)"].mean())
car_sales_missing.isna().sum()

In [None]:
# Drop every row that still has a missing value (rebinding instead of mutating in place,
# so re-running the cell is harmless)
car_sales_missing = car_sales_missing.dropna()

# The original computed isna().sum() mid-cell, where its output was never displayed;
# assert the invariant instead so it is actually checked
assert car_sales_missing.isna().sum().sum() == 0, "missing values remain after dropna()"
car_sales_missing.head()

In [None]:
# Create x and y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Lets try and convert our data to numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                               remainder="passthrough")
transformed_x = transformer.fit_transform(x)
print(transformed_x)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

## Option 2: Fill missing values with Scikit learn

In [None]:
# Import car sales missing data
car_sales_missing = pd.read_csv("Data/car-sales-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

In [None]:
# Split into x and y
x = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

imputer = ColumnTransformer ([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

filled_x = imputer.fit_transform(x)
filled_x

In [None]:
car_sales_filled = pd.DataFrame(filled_x, columns=["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot", one_hot, categorical_features)],
                               remainder="passthrough")
transformed_x = transformer.fit_transform(car_sales_filled)
print(transformed_x)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
np.random.seed(42)

x_train, x_test, y_train, y_test = train_test_split(transformed_x, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

# 2. Choosing the right estimator

Some things to note:
* Sklearn refers to ML models, algos as estimators
* Classification problem: Predicting a category (Heart disease or not)
    * Sometimes I might see `clf` (Short for classifier)
* Regression Problem: Predicting a number, like the selling price (SP) of a car

Refer to the Sklearn ML Map: https://scikit-learn.org/stable/machine_learning_map.html

### 2.1 Picking a machine learning model for a regression problem

Using California Housing dataset

In [None]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

In [None]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

In [None]:
# Add the label column. housing_df was built from housing["feature_names"] only, so it
# never had a "MedHouseVal" column - the former drop("MedHouseVal") raised KeyError.
housing_df["target"] = housing["target"]
housing_df.head()

In [None]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = Ridge()

model.fit(x_train, y_train)

model.score(x_test, y_test)

What if `Ridge` didn't work or the score didn't fit our needs?

Well we can always try a different model

How about we try an ensemble model? It's a combination of smaller models rather than just a single model.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG like the other modelling cells do; without it the split and
# the forest (and therefore the score) change on every run
np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestRegressor()
model.fit(x_train, y_train)
model.score(x_test, y_test)

## 2.2 Choosing an estimator for a classification problem

In [None]:
heart_disease = pd.read_csv("Data/heart-disease.csv")
heart_disease.head()

In [None]:
heart_disease.isna().sum()

In [None]:
len(heart_disease)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = LinearSVC()
clf.fit(x_train, y_train)

clf.score(x_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier()
clf.fit(x_train, y_train)

clf.score(x_test, y_test)

## 3. Fitting the model/algo and use it to make predictions

### 3.1 Fitting the Model

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier()

clf.fit(x_train, y_train)

clf.score(x_test, y_test)

### 3.2 Make predictions using a ML model 

2 Ways to make predictions

1. predict()
2. predict_proba()

In [None]:
x.head()

In [None]:
y.head()

In [None]:
# Use a trained model to make predictions
# Expected failure: predict() needs a 2D array with the same columns the model was
# trained on, not a 1D array of five numbers. Catch and show the error so the notebook
# still runs end to end.
try:
    clf.predict(np.array([1, 7, 8, 3, 4]))
except ValueError as err:
    print(f"ValueError: {err}")

In [None]:
x_test.head()

In [None]:
clf.predict(x_test)

In [None]:
np.array(y_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_preds = clf.predict(x_test)
print(np.mean(y_preds == y_test))

In [None]:
clf.score(x_test, y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

Make predictions with `predict_proba()`

In [None]:
# Predict predict_proba() returns probabilities of a classification label

clf.predict_proba(x_test[:5])

In [None]:
# Lets predict on the same data...
clf.predict(x_test[:5])

In [None]:
np.array(y_test.head())

`predict()` can also be used for regression models

In [None]:
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

In [None]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

In [None]:
housing_df["target"] = housing["target"]
# housing_df = housing_df.drop("MedHouseVal", axis=1)
housing_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestRegressor()

model.fit(x_train, y_train)

model.score(x_test, y_test)

y_preds = model.predict(x_test)

In [None]:
y_preds[:10]

In [None]:
np.array(y_test.head(10))

In [None]:
# Compare the preds to the truth

from sklearn.metrics import mean_absolute_error
print(mean_absolute_error(y_test, y_preds))

## 4. Evaluating a ML Model

Three ways to evaluate Scikit-Learn models/estimators:
1. Estimator's built-in `score()` method
2. The `scoring` parameter
3. Problem-specific metric function

### 4.1 Evaluating a model with the `score` method

In [None]:
heart_disease = pd.read_csv("Data/heart-disease.csv")
heart_disease.head()

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestClassifier(n_estimators=130)

model.fit(x_train, y_train)

model.score(x_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=150)

model.fit(x_train, y_train)

model.score(x_test, y_test)

### 4.2 Evaluating a model using `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestClassifier(n_estimators=130)

model.fit(x_train, y_train);

In [None]:
model.score(x_test, y_test)

In [None]:
cross_val_score(model, x, y, cv=10)

In [None]:
np.random.seed(42)

model_single_score = model.score(x_test, y_test)

model_cross_val_score = np.mean(cross_val_score(model, x, y, cv=5))

model_single_score, model_cross_val_score

### 4.2.1 Classification model evaluation metrics

1. Accuracy
2. Area under ROC Curve
3. Confusion Matrix
4. Classification Report

In [None]:
np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier(n_estimators=130)

cross_val_score = cross_val_score(clf, x, y, cv=5)
np.mean(cross_val_score)

In [None]:
print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cross_val_score)*100:.2f}%")

**Area under the receiver operating characteristic curve (AUC/ROC)**

* Area Under Curve (AUC)
* ROC Curve

ROC curves are a comparison of a model's true positive rate (tpr) versus a model's false positive rate (fpr)

* True Positive= Model predicts 1 when truth is 1
* False Positive= Model predicts 1 when truth is 0
* True Negative= Model predicts 0 when truth is 0
* False Negative= Model predicts 0 when truth is 1

In [None]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

clf.fit(x_train, y_train)

y_probs = clf.predict_proba(x_test)

y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:, 1]
y_probs_positive[:10]

In [None]:
# Calculate fpr, tpr, and threshold

fpr, tpr, thresholds = roc_curve(y_test, y_probs_positive)

fpr

In [None]:
# Plotting ROC Curve

import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Plot a ROC curve from a model's false positive and true positive rates.

    Parameters
    ----------
    fpr : array-like
        False positive rates, used as the x values.
    tpr : array-like
        True positive rates, used as the y values (paired with `fpr`).

    Returns
    -------
    None
        The figure is drawn with `plt.show()`.
    """
    # The model's ROC curve
    plt.plot(fpr, tpr, color="orange", label="ROC")

    # Diagonal baseline: a model with no predictive power (guessing)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Axis labels, title and legend (spelling of the displayed text corrected)
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True positive rate (tpr)")
    plt.title("Receiver Operating Characteristic (ROC) Curve")
    plt.legend()
    plt.show()

plot_roc_curve (fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score

print(roc_auc_score(y_test, y_probs_positive))

In [None]:
# Plot perfect ROC Curve and AUC Score
fpr, tpr, thresholds = roc_curve(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC Score
print(roc_auc_score(y_test, y_test))

**Confusion Matrix**

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict.

In essence, giving you an idea of where the model is getting confused

In [None]:
from sklearn.metrics import confusion_matrix

# Predict with `clf`, the classifier fitted on the current x_train in the ROC cells above.
# `model` was fitted on an earlier, different split, so it may already have seen rows of
# this x_test, which would make the confusion matrix look better than it should.
y_preds = clf.predict(x_test)

confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab()

pd.crosstab(y_test,
           y_preds,
           rownames=["Actual Label"],
           colnames=["predicted Labels"])

In [None]:
24 + 5 + 3 + 29

In [None]:
len(x_test)

In [None]:
# Make our confusion matrix visual with Seaborn's heatmap()
import seaborn as sns

sns.set(font_scale=1.5)

conf_mat = confusion_matrix(y_test, y_preds)

sns.heatmap(conf_mat);

In [None]:
sklearn.__version__

In [None]:
### Creating a confusion matrix using Scikit-Learn
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(estimator=model, X=x, y=y)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true=y_test, y_pred=y_preds);

### Classification Report

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
# 10,000 samples with a single positive case...
disease_true = np.zeros(10000)
disease_true[0] = 1

# ...and a "model" that predicts 0 for every sample
disease_preds = np.zeros(10000)

# Class 1 is never predicted, so its precision is 0/0. zero_division=0 states explicitly
# that it should be reported as 0 instead of relying on the warn-and-default behaviour.
pd.DataFrame(classification_report(disease_true,
                                   disease_preds,
                                   output_dict=True,
                                   zero_division=0))

### 4.2.2 Regression Model Evaluation Metrics

 1. R^2 or coefficient of determination
 2. Mean Absolute Error (MAE)
 3. Mean Square Error (MSE)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=150)

model.fit(x_train, y_train)

model.score(x_test, y_test)

In [None]:
housing_df.head()

In [None]:
y_test

In [None]:
print(y_test.mean())

In [None]:
from sklearn.metrics import r2_score

y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_test_mean[:10]

In [None]:
r2_score(y_true=y_test,
        y_pred=y_test_mean)

In [None]:
r2_score(y_true=y_test,
        y_pred=y_test)

**Mean absolute error (MAE)**

MAE is the average of the absolute differences between predictions and actual values.
It gives you an idea of how wrong your model's predictions are.

In [None]:
from sklearn.metrics import mean_absolute_error

y_preds = model.predict(x_test)
mae = mean_absolute_error(y_test, y_preds)
print(mae)

In [None]:
df = pd.DataFrame(data={"actual values" : y_test,
                        "predicted values" : y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df.head(10)

In [None]:
y_test

In [None]:
# MAE by hand: take the absolute value of each difference *before* averaging.
# abs(mean(differences)) lets positive and negative errors cancel out and is not the MAE;
# the plain (signed) mean is printed second for contrast.
print(np.abs(df["differences"]).mean(), " and " , df["differences"].mean())

**Mean square error (MSE)**

MSE is the mean of the square of the errors between actual and predicted values.

In [None]:
from sklearn.metrics import mean_squared_error

y_preds = model.predict(x_test)
mse = mean_squared_error(y_test, y_preds)
print(mse)

In [None]:
df["squared differences"] = np.square(df["differences"])
df.head()

In [None]:
squared = np.square(df["differences"])
print(squared.mean())

In [None]:
df_large = df.copy()
df_large.iloc[0, df.columns.get_loc("squared differences")] = 16

In [None]:
df_large

In [None]:
print(df_large["squared differences"].mean())

In [None]:
df_large.iloc[:100, df.columns.get_loc("squared differences")] = 20

In [None]:
df_large

In [None]:
print(df_large["squared differences"].mean())

### 4.2.3 Finally using the `scoring` parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=100)

In [None]:
np.random.seed(42)

cv_acc = cross_val_score(clf, x, y, cv=5, scoring=None)
cv_acc

In [None]:
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
np.random.seed(42)

cv_acc = cross_val_score(clf, x, y, cv=5, scoring="accuracy")
cv_acc

In [None]:
print(f"The cross-validated accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
np.random.seed(42)
cv_precision = cross_val_score(clf, x, y, cv=5, scoring="precision")
cv_precision

In [None]:
print(f"The cross-validated Precision is: {np.mean(cv_precision)}")

In [None]:
np.random.seed(42)
cv_recall = cross_val_score(clf, x, y, cv=5, scoring="recall")
cv_recall

In [None]:
print(f"The cross-validated Recall is: {np.mean(cv_recall)}")

Let's see the `scoring` parameter being used for a regression model

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score

np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

model = RandomForestRegressor(n_estimators=100)

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, x, y, cv=3, scoring=None)
np.mean(cv_r2)

In [None]:
np.random.seed(42)
cv_mse = cross_val_score(model, x, y, cv=3, scoring="neg_mean_squared_error")
np.mean(cv_mse)

In [None]:
np.random.seed(42)
cv_mae = cross_val_score(model, x, y, cv=3, scoring="neg_mean_absolute_error")
np.mean(cv_mae)

### 4.3 Using Different evaluation metrics as Scikit-Learn functions

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestClassifier()

model.fit(x_train, y_train);

y_preds = model.predict(x_test)

# ".2f" = two decimal places (":2f" was a minimum width of 2 and printed six decimals)
print("Classifier metrics on the test set")
print(f"Accuracy: {accuracy_score(y_test, y_preds)*100:.2f}%")
print(f"Precision: {precision_score(y_test, y_preds)*100:.2f}%")
print(f"Recall: {recall_score(y_test, y_preds)*100:.2f}%")
print(f"F1: {f1_score(y_test, y_preds)*100:.2f}%")

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

np.random.seed(42)

x = housing_df.drop("target", axis=1)
y = housing_df["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=150)

model.fit(x_train, y_train)

# (A mid-cell model.score(...) call was removed: its result was neither stored nor shown,
# and r2_score below reports the same quantity.)
y_preds = model.predict(x_test)

print("Regression metrics on the test set")
# ".2f" = two decimal places (":2f" was a minimum width of 2 and printed six decimals)
print(f"R^2 Score: {r2_score(y_test, y_preds)*100:.2f}%")
# MAE and MSE are in the units of the target (and its square), not percentages,
# so they are printed as plain numbers
print(f"MAE: {mean_absolute_error(y_test, y_preds):.2f}")
print(f"MSE: {mean_squared_error(y_test, y_preds):.2f}")

## 5. Improve a model

First prediction = Baseline Predictions.
First model = Baseline Model

From a data perspective:
* Could we collect more data ? (Generally, the more data, the better)
* Could we improve our data?

From a model perspective:
* Is there a better model we could use?
* Could we improve the current model?

Hyperparameters vs Parameters
* Parameters = Patterns the model finds in the data
* Hyperparameters = Setting on a model you can adjust to (Potentially) improve its ability to find patterns

Three ways to adjust hyperparameters: 
* By Hand
* Randomly with RandomizedSearchCV
* Exhaustively with GridSearchCV

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

In [None]:
clf.get_params()

### 5.1 Tuning Hyperparameters by hand

 Let's make 3 sets: training, validation and test

In [None]:
clf.get_params()

We're going to try and adjust:

* `max_depth`
* `max_features`
* `min_samples_leaf`
* `min_samples_split`
* `n_estimators`

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def evaluate_preds(y_true, y_preds):
    """
    Compare true labels with predicted labels of a classification model.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_preds : array-like
        Predicted labels, aligned with `y_true`.

    Returns
    -------
    dict
        Keys "accuracy", "precision", "recall" and "f1", each stored as a fraction
        rounded to 2 decimal places.

    The four metrics are also printed as percentages.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    metric_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    # Every score is a fraction; scale each one by 100 so the "%" sign is
    # correct (previously only accuracy was scaled, so 0.85 printed as "0.85%")
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"F1: {f1 * 100:.2f}%")

    return metric_dict

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Shuffle all rows once so the positional slices below are random subsets
heart_disease_sh = heart_disease.sample(frac=1)

x = heart_disease_sh.drop("target", axis=1)
y = heart_disease_sh["target"]

# Slice boundaries: first 70% of rows -> train, next 15% -> validation, the rest -> test
n_rows = len(heart_disease_sh)
train_split = round(0.7 * n_rows)
valid_split = round(train_split + 0.15 * n_rows)

x_train, x_valid, x_test = x[:train_split], x[train_split:valid_split], x[valid_split:]
y_train, y_valid, y_test = y[:train_split], y[train_split:valid_split], y[valid_split:]

# Baseline: default hyperparameters, fitted on train and scored on the validation set
clf = RandomForestClassifier().fit(x_train, y_train)
y_preds = clf.predict(x_valid)

baseline_metrics = evaluate_preds(y_valid, y_preds)
baseline_metrics

In [None]:
np.random.seed(42)

# Second model with a hand-picked hyperparameter.
# NOTE(review): n_estimators=100 looks like the library default already (an earlier cell
# uses 100 while "keeping the default hyperparameters"), so this may simply reproduce the
# baseline - consider trying a different value.
clf_2 = RandomForestClassifier(n_estimators=100)

clf_2.fit(x_train, y_train)

y_preds_2 = clf_2.predict(x_valid)

clf_2_metrics = evaluate_preds(y_valid, y_preds_2)
# Show this model's metrics (the cell used to display baseline_metrics again)
clf_2_metrics

### 5.2 Hyperparameters tuning with RandomizedSearchCV

In [None]:
from sklearn.model_selection import RandomizedSearchCV

grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["sqrt", "log2", None],
        "min_samples_leaf": [2, 4, 6],
        "min_samples_split": [2, 4]}

np.random.seed(42)

x = heart_disease_sh.drop("target", axis=1)
y = heart_disease_sh["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

rs_clf = RandomizedSearchCV(estimator=clf,
                               param_distributions=grid,
                               n_iter=10,
                               cv=5,
                               verbose=2)

rs_clf.fit(x_train, y_train);

In [None]:
rs_clf.best_params_

In [None]:
rs_y_preds = rs_clf.predict(x_test)

rs_metrics = evaluate_preds(y_test, rs_y_preds)

### 5.3 Hyperparameter tuning using GridSearchCV

In [None]:
grid

In [None]:
6*5*3*3*2*5

In [None]:
grid_2 = {'n_estimators': [100, 200, 500],
         'max_depth': [None],
         'max_features': ['sqrt', 'log2', None],
         'min_samples_leaf': [6],
         'min_samples_split': [2, 4]}

In [None]:
3*1*3*1*2*5

In [None]:
from sklearn.model_selection import GridSearchCV

np.random.seed(42)

x = heart_disease_sh.drop("target", axis=1)
y = heart_disease_sh["target"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

gs_clf = GridSearchCV(estimator=clf,
                               param_grid=grid_2,
                               cv=5,
                               verbose=2)

gs_clf.fit(x_train, y_train);

In [None]:
gs_clf.best_params_

In [None]:
gs_y_preds = gs_clf.predict(x_test)

gs_metrics = evaluate_preds(y_test, gs_y_preds)

Let's compare our different models metrics

In [None]:
compare_metrics = pd.DataFrame({ "baseline": baseline_metrics,
                               "clf_2": clf_2_metrics,
                               "random search": rs_metrics,
                               "grid search": gs_metrics})

compare_metrics.plot.bar(figsize=(14, 10));

## 6. Saving and loading trained machine learning models

Two ways to save and load machine learning models:

1. With Python's `pickle` module
2. With the `joblib` module    

### Pickle

In [None]:
import pickle

# The context manager closes (and flushes) the file even if pickling fails
with open("gs_random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(gs_clf, model_file)

In [None]:
# Only unpickle files you created yourself: pickle.load can execute arbitrary code.
# The context manager closes the file handle once the model has been read.
with open("gs_random_forest_model_1.pkl", "rb") as model_file:
    loaded_model = pickle.load(model_file)

In [None]:
pickle_y_preds = loaded_model.predict(x_test)
evaluate_preds(y_test, pickle_y_preds)

### Joblib

In [None]:
from joblib import dump, load

dump(gs_clf, filename="gs_random_frest_model_2.joblib")

In [None]:
loaded_job_model = load(filename="gs_random_frest_model_2.joblib") 

In [None]:
joblib_y_preds = loaded_job_model.predict(x_test)
evaluate_preds(y_test, joblib_y_preds)

## 7. Putting it all Together!

In [None]:
data = pd.read_csv("Data/car-sales-extended-missing-data.csv")
data

In [None]:
data.dtypes

In [None]:
data.isna().sum()

Steps all in one cell:
1. Fill missing data
2. Convert data to numbers
3. Build a model on the data

In [None]:
# End-to-end regression pipeline: impute missing values, encode the categorical columns
# and fit a random forest, all inside a single scikit-learn Pipeline.

# Getting data ready
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modelling
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV

# Set up the random seed (before the split and the fit, which both draw random numbers)
import numpy as np
np.random.seed(42)

# Import data and drop rows with missing labels (rows without a "Price" cannot be used)
data = pd.read_csv("Data/car-sales-extended-missing-data.csv")
data.dropna(subset=["Price"], inplace=True)

# Define the feature groups and one transformer pipeline per group
# Categorical columns: fill gaps with the constant "missing", then one-hot encode
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

# Doors: fill gaps with the constant 4 (imputation only, no encoding step)
door_feature = ["Doors"]
doors_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])

# Odometer: fill gaps with the column mean
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Set up the preprocessing steps (fill missing values, then convert to numbers)
# The names "cat", "door" and "num" are part of the parameter paths used by the
# GridSearchCV cell that follows (e.g. "preprocessor__num__imputer__strategy")
preprocessor = ColumnTransformer(
    transformers=[
        ("cat", categorical_transformer, categorical_features),
        ("door", doors_transformer, door_feature),
        ("num", numeric_transformer, numeric_features)
    ]
)

# Create a combined preprocessing and modelling pipeline
# The step names "preprocessor" and "model" are likewise referenced by the later grid
model = Pipeline(steps=[("preprocessor", preprocessor),
                        ("model", RandomForestRegressor())])

# Split the data: features X, label y, 20% held out for testing
X = data.drop("Price", axis=1)
y = data["Price"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training rows and score on the held-out test rows
model.fit(X_train, y_train)
model.score(X_test, y_test)



It's also possible to use `GridSearchCV` or `RandomizedSearchCV` with our pipeline

In [None]:
# Use GridSearchCV with our Regression Pipeline
pipe_grid = {
    "preprocessor__num__imputer__strategy": ["mean", "median"],
    "model__n_estimators": [100, 1000],
    "model__max_depth": [None, 5],
    "model__max_features": ["sqrt", None],
    "model__min_samples_split": [2, 4]
}

gs_model = GridSearchCV(model, pipe_grid, cv=5, verbose=2)
gs_model.fit(X_train, y_train)

In [None]:
gs_model.score(X_test, y_test)