# Fundamentals of Machine Learning: 
> # Session 2

### Defining our imports

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import itertools

from sklearn import datasets, linear_model, svm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures

### Loading Dataset

In [None]:
# Fetch the bundled iris dataset (a dictionary-like object with extra metadata)
iris = datasets.load_iris()

# Feature measurements, and the numeric species codes we want to predict
features, targets = iris.data, iris.target

# Human-readable names for the feature columns and for the species codes
feature_labels, target_labels = iris.feature_names, iris.target_names

# Wrap the features in a DataFrame for the cells further down
df = pd.DataFrame(features, columns=feature_labels)

### Summarising the Data

In [None]:
# Read the sample and feature counts straight off the data 'shape'
shape = features.shape
n_samples, n_features = shape[0], shape[1]
n_targets = len(target_labels)

print("SUMMARY:")
print(f"> The input data has the 'shape' {shape}.")
print(f"> The input data is {n_features}-dimensional.")
print("\nThat is to say:")
print(f"> This dataset has {n_samples} iris samples.")
print(f"> Each sample has {n_features} features.")
print("> The features are:")
for feature_name in feature_labels:
    print(f"\t. {feature_name}")
print(f"> Those {n_features} features will belong to 1 of {n_targets} 'classes' or species of iris")
print("> Those classes are:")
# Each species name is printed next to the integer code that stands for it
for class_id, species_name in enumerate(target_labels):
    print(f"\t. {species_name} - {class_id}")

### Plotting Features

-----
**Remember:** The number of _features_ (or _independent variables_) within a dataset determines the _dimensionality_ of the input data. 

In the case of the Iris dataset, we have 4 dimensions (5 if you include the dependent variable 'species'), which is a little challenging to visualise. 

There is, however, a way around this. We can peek at _2-dimensional_ slices of this data - in other words, use a scatterplot matrix!

Let's begin.

-----

In [None]:
# Count the unique (unordered) feature pairs to plot: n * (n - 1) / 2.
# n * (n - 1) is always even, so integer division is exact and keeps the
# count an int (it is used to size a subplot grid).
n_pairs = n_features * (n_features - 1) // 2

# Set number of subplot rows
n_rows = 2

# Set number of subplot columns, rounding up so that every pair still gets a
# subplot when the pair count is not a multiple of n_rows
n_columns = -(-n_pairs // n_rows)

In [None]:
# Build the figure. 'axes' comes back as a grid of subplots, which we flatten
# so that each subplot can be reached with a single index.
fig, axes = plt.subplots(n_rows, n_columns, figsize=(16, 10))
axes = axes.flatten()

# Every unordered pair of feature column indices, e.g. (0, 1)
pairs = list(itertools.combinations(range(n_features), 2))

# Draw one scatter plot per feature pair, coloured by species
font_size = 14
for index, (i, j) in enumerate(pairs):
    ax = axes[index]

    # Column i goes on the x-axis and column j on the y-axis
    ax.scatter(features[:, i], features[:, j], c=targets, cmap='viridis')

    # Annotate the subplot with the names of the two features
    ax.set_xlabel(feature_labels[i], fontsize=font_size)
    ax.set_ylabel(feature_labels[j], fontsize=font_size)

### Remembering Linear Regression

-----
Last week we looked at a type of supervised machine learning where we train a model using continuous numeric data to predict continuous numeric data. This is called a **regression** problem.

> This could be something like, using an individual's _salary_ to predict their life expectancy (in minutes, let's say).

> Similarly, you could train a model using historic _share values_ to predict the future value of stock over time.

Last week we used individual iris features to train a number of _linear regression_ models to predict other iris features.

Let's take another look.

-----

In [None]:
# Helper that prepares a predictor/target pair and fits a linear model to it
def train_linear_regression(x, y):
    """
    Fit a linear regression model that predicts {y} from {x}.

    {x} and {y} are numpy arrays of predictor and target values. Each one is
    reshaped into a single column before fitting.

    Returns the fitted LinearRegression model.
    """
    # Reshape the data into a single column
    x_train = x.reshape(-1, 1)
    y_train = y.reshape(-1, 1)

    # Build the linear model and train it using predictor and target data
    model = linear_model.LinearRegression()
    model.fit(x_train, y_train)

    return model

In [None]:
# Every ordered (predictor, target) pairing of feature names, so each feature
# gets a turn at predicting each of the others
label_pairs = list(itertools.permutations(feature_labels, 2))

# Number of pairings to plot
n_pairs = len(label_pairs)

# Set number of subplot rows
n_rows = 4

# Set number of subplot columns, rounding up so that no pairing is left
# without a subplot when the count is not a multiple of n_rows
n_columns = -(-n_pairs // n_rows)

# Generate the figure and flatten its grid of subplots for single-index access
fig, axes = plt.subplots(n_rows, n_columns, figsize=(16, 16))
axes = axes.flatten()

# Train and plot one linear model per (predictor, target) pairing
for index, (predictor, target) in enumerate(label_pairs):

    # Convert iris data from Pandas Series to numpy arrays
    x_data = df[predictor].values
    y_data = df[target].values

    # Get a trained model for this feature pair
    model = train_linear_regression(x_data, y_data)

    # Get predictions from the linear model
    y_pred = model.predict(x_data.reshape(-1, 1))

    # Draw a subplot for this pairing
    ax = axes[index]
    ax.scatter(x_data, y_pred, marker='^', label='Prediction')
    ax.scatter(x_data, y_data, marker='x', label='Ground Truth')
    ax.set_xlabel(predictor, fontsize=font_size)
    ax.set_ylabel(target, fontsize=font_size)

# The markers mean the same thing in every subplot, so a single legend on the
# last subplot drawn is enough
ax.legend(loc='lower right')

----

Now, we rarely see accurate predictions for these regression models. 

Why?

Because the data isn't linear (or [linearly separable](https://en.wikipedia.org/wiki/Linear_separability) - more on that later). Most of these regression lines (the lines of best fit) **do not** fit the data properly!

 The data sits in these strange, tangled but also disconnected clouds of data points.

The further away the ground truth values are from the predicted ones, the higher the error our model incurs, and the less accurate it will be in deployment. 

In other words, the _function_ between the input feature and output feature is mostly too complex to represent linearly.

In fact, most serious prediction tasks in machine learning require a complex function to accurately capture the relationship between input and output data.

This is where _non-linear_ models come in.

----

## Dealing with Non-linear data

### Polynomial Regression
----

Polynomial regression is like drawing a curve instead of a straight line to fit your data points. This curve can bend and twist to better match the patterns in your data. 

Where:

 $y = mx + b$

describes a straight regression line

$y = ax^{2} + bx + c$

describes a polynomial regression curve.

Essentially, it's an extension of linear regression but allows for a more complex, curved relationship between the independent variable and the dependent variable.

This, in essence, is all of machine learning - finding the right function to fit a line!

---

#### Play around with the following **TODOs** to see how the polynomial regression line changes!

- How does the degree influence the regression line?
- Are any predictors more helpful than others?

In [None]:
# TODO: Change these index values [0...3] to set up feature indices
    # 0 - sepal length
    # 1 - sepal width
    # 2 - petal length
    # 3 - petal width

predictor = feature_labels[0]   # Change value to choose predictor feature
target = feature_labels[1]      # Change value to choose target feature

# ---------------------------------------------------------------------- #

# Pull the predictor and target columns out as numpy arrays
X = df[predictor].values
y = df[target].values

# Reshape X into a single column to fit sklearn's requirements
X = X.reshape(-1, 1)

# Split the data into training and testing sets (we'll write our own later)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=0)


# TODO: Change this degree to alter the regression curve
degree = 2
# ---------------------------------------------------------------------- #

# Transform the features into polynomial features.
# The transformer is fitted on the training set and then reused on the test set.
poly = PolynomialFeatures(degree)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Fit the Linear Regression model on the polynomial features
model = linear_model.LinearRegression()
model.fit(X_train_poly, y_train)

# Make predictions
y_train_pred = model.predict(X_train_poly)
y_test_pred = model.predict(X_test_poly)

# Evaluate the fitted curve on an evenly spaced, already sorted grid spanning
# the observed predictor values. The predictions above follow the shuffled
# train/test order, so they cannot be paired with X by position to draw a line.
X_curve = np.linspace(X.min(), X.max(), 200).reshape(-1, 1)
y_curve = model.predict(poly.transform(X_curve))

# ---------------------------------------------------------------------- #

# Plot the results
plt.figure(figsize=(18, 10))
plt.scatter(X, y, color='orange', label='Data Points', marker='x')
plt.scatter(X_train, y_train_pred, color='red', label='Training Fit', marker='^')
plt.scatter(X_test, y_test_pred, color='green', label='Testing Fit', marker='o')

# Draw the fitted polynomial as a smooth line
plt.plot(X_curve, y_curve, color='grey', label='Polynomial Fit', linestyle='--')

plt.xlabel(f'{predictor}', fontsize=font_size)
plt.ylabel(f'{target}', fontsize=font_size)
plt.title(f'Polynomial Regression (degree = {degree})', fontsize=font_size)
plt.legend()
plt.show()

## Classification with Support Vector Machines

-----
Something we have neglected to address is how we expect our model to behave post-training; that is how it will function "in the wild".

Ultimately the goal of machine learning is "generalisation" so that when we feed a trained model input data that was not used during the training process, it should still give an accurate prediction.

Practically, this needs to be simulated when we build our model. The easiest way to do this is by splitting our dataset in two.

We call these the **_training set_** and the **_testing set_**.

-----

### Training and Testing Split

----

Making up a training and testing set sufficiently _representative_ of the task we want to model requires some reordering and random sampling of our dataset.

Before we start shifting things around, we would like to keep track of which samples belong to which species, or 'class' of iris. Otherwise there's no way of properly validating our trained model.

So let's stitch some data together.

----


#### Joining features and labels together

In [None]:
# Add iris classes (species) to the dataframe as an extra column, placed
# directly after the feature columns.
# DataFrame.insert raises if the column already exists, so the guard keeps
# this cell safe to run more than once.
if "species" not in df.columns:
    df.insert(loc=n_features, column="species", value=targets)
df

----

Now that we have a complete dataframe containing features and targets (input and output data), we can start building our training and testing sets.

Ideally, we want to use most of our dataset to train the model - otherwise it may not learn the correct relationship between the iris features and the species!

We call this the [train-test split](https://builtin.com/data-science/train-test-split).

A reasonable approach is to use 70% of the dataset for training, and the remaining 30% for testing. 

This way we (should) get a good idea of what each species of iris looks like, and still reserve a decent amount of data to test the accuracy of our model.

Here's a function to do just that!

----

In [None]:
def get_training_test_sets(x, train_size=0.7):
    """
    Function takes a complete DataFrame {x}, splitting it into a training set
    and a testing set. The size of the training set (and by extension the 
    testing set) is determined by {train_size}. 

    Performs a 70/30 train/test split by default.

    The rows are shuffled with numpy's global random generator before the
    split, so results vary between calls unless the seed is fixed beforehand.
    The shuffle happens on a copy of the values: {x} itself is not modified.

    Returns a tuple (x_train, x_test) of numpy arrays that keep every column
    of {x}. Raises an AssertionError if {x} is not a DataFrame or if
    {train_size} lies outside [0, 1].

    """
    # Ensure the dataset is a DataFrame and the split fraction is valid.
    assert isinstance(x, pd.DataFrame), "X data must be a dataframe!"
    assert (0.0 <= train_size <= 1.0), "Training set size must be between 0 and 1!"

    print("Splitting dataset into training and test sets...\n")

    # Extract a private copy of the values from the DataFrame. The array from
    # `.values` may share memory with the DataFrame, in which case shuffling
    # it in place would reorder (or fail on) the caller's data.
    dataset = x.to_numpy(copy=True)

    # Determine the number of samples in the dataset.
    n_samples = dataset.shape[0]

    # Randomly shuffle data to break up the ordered samples
    print(f"First sample in dataset is {dataset[0]}")
    print("Shuffling dataset...")
    np.random.shuffle(dataset)
    print(f"First sample in dataset is now {dataset[0]}")

    # Get the index of the last training sample (based on the train_size)
    last_train_sample_idx = int(n_samples * train_size)

    # Get first {train_size} fraction of the dataset for training
    x_train = dataset[:last_train_sample_idx]

    # Get the remaining {1 - train_size} fraction of the dataset for testing
    x_test = dataset[last_train_sample_idx:]

    print("_"*30)

    # Return both datasets
    return x_train, x_test

----

Another thing to consider is whether each class is adequately represented in both the training set and the testing set. 

Say we have a training set made up of 2/3 of the complete dataset. This training set exclusively contains samples of the _setosa_ and _versicolor_ iris. 

Now say we go to test our model using a testing set exclusively made up of _virginica_ samples. It would have never seen these before!

What do you think might happen?

We call this [_inter-class imbalance_](https://developers.google.com/machine-learning/data-prep/construct/sampling-splitting/imbalanced-data), and it's a big problem in statistical disciplines.

Here's a function that checks the class distribution within a given dataset.

----

In [None]:
def show_class_distribution(dataset, name = 'Dataset', labels = None):
    """
    Function finds the unique target values across the last ('species') column
    of {dataset} and prints how many samples carry each of those values.

    {dataset} is a 2-D numpy array whose last column holds the integer class
    codes (0, 1 or 2 for iris). {name} is the title used in the printout.
    {labels} lists the class names, indexed by class code; it defaults to the
    notebook-level `target_labels`.

    Every class in {labels} is reported, with a count of 0 if it does not
    occur in {dataset}.

    """
    if labels is None:
        labels = target_labels

    # Count number of samples within each unique 'species' class.
    # np.unique only reports classes that are actually present, so the counts
    # are keyed by class code rather than matched to labels by position.
    class_ids, class_counts = np.unique(dataset[:, -1], return_counts= True)
    count_by_class = {int(class_id): count
                      for class_id, count in zip(class_ids, class_counts)}
    n_samples = dataset.shape[0]

    print(f"{name} has {n_samples} samples.")
    for class_id, species in enumerate(labels):
        print(f"> {count_by_class.get(class_id, 0)}/{n_samples} are Iris {species}")
    print('\n')

#### Choosing the training / testing split

- How does the train / test split impact predictions?
- How small can the training set be and still produce a reasonable model?

In [None]:
# TODO: Change this value (between 0.0 and 1.0) to see how it impacts performance!
training_set_size = 0.8

# Split the full dataframe into shuffled training and testing arrays
x_train, x_test = get_training_test_sets(df, training_set_size)

# Report the per-species sample counts for the full dataset and for each split
for subset, title in ((df.values, 'Dataset'), (x_train, 'Training set'), (x_test, 'Testing set')):
    show_class_distribution(subset, title)

### Function for Training the Model

----

Now we're ready to start training our model (the _classifier_). 

For this we're going to use _Support Vector Machines_. 

There's a nice [video](https://www.youtube.com/watch?v=efR1C6CvhmE) explaining this if you're interested. (Forgive Josh, he's just excitable.)

Without going into too much detail, the SVM projects our data into an even higher dimension. 

It does this to help draw a _hyper-plane_ that separates the groups of datapoints making up each class of iris (We'll visualise it later). 

Here's the train function we'll need.

----

In [None]:
def train(train_set, kernel='linear'):
    """
    Function takes a training set and uses it to fit an SVM classifier.
    The training set contains both a set of features (iris measurements)
    and a set of labels (their species). These are separated into two
    different sets, x_train and y_train.

    {train_set} is a 2-D numpy array: every column except the last holds a
    feature, and the last column holds the species label.

    The 'kernel' type changes how the SVM draws boundaries around samples.

    Returns the fitted model. Raises an AssertionError if {train_set} is not
    a numpy array.

    """
    # Check train set is in the correct numpy format
    assert isinstance(train_set, np.ndarray), "Training data should be a numpy array!"

    # Get every column but the last (i.e the features) from the training set
    x_train = train_set[:, :-1]

    # Get the last column (i.e the species 'label') from the training set
    y_train = train_set[:, -1]

    # Build the model
    model = svm.SVC(kernel=kernel)

    # Train the model to learn the relationship between 
    # features {x_train} and labels {y_train}
    model.fit(x_train, y_train)

    return model
    


#### Function for Testing the Model
----

Next the test function.

This one takes an already trained model and throws our testing set at it.

Once it's done, we can generate a [_confusion matrix_](https://en.wikipedia.org/wiki/Confusion_matrix) to see how well the model did.

---

In [None]:
def test(model, test_set):
    """
    Function takes a trained model and a testing set.
    The model uses the test set as 'unseen' input data,
    to validate the accuracy of its predictions.

    {test_set} is a 2-D numpy array: every column except the last holds a
    feature, and the last column holds the true species label.

    Returns the confusion matrix comparing the true labels with the model's
    predictions.

    """
    # Split the testing set into features and true labels
    x_test = test_set[:, :-1]
    y_test = test_set[:, -1]
    print(f'Test set has {x_test.shape[0]} samples.')

    # Get 'species' predictions from the model
    print('Predicting. . .')
    y_pred = model.predict(x_test)

    # Generate a confusion matrix given the test data and predicted data.
    # Passing the classes the model was trained on keeps one row/column per
    # class even when a class is missing from this particular test set.
    # If the model exposes no `classes_`, the library default is used.
    print('Generating confusion matrix. . .')
    c_matrix = confusion_matrix(y_test, y_pred, labels=getattr(model, 'classes_', None))

    return c_matrix

#### Function for Plotting Results

In [None]:
def plot_confusion_matrix(cm):
    """
    Plot the confusion matrix generated from the model predictions.

    {cm} is a square confusion matrix with one row and one column per entry
    of the notebook-level `target_labels`, which are used as tick labels.
    The figure is shown immediately; nothing is returned.

    """
    # Set a custom font size for the confusion matrix
    font_size = 14

    # Create a fresh figure of the right size. There is no plt.clf() here:
    # the plot gets its own figure, and clearing when no figure is open would
    # only make pyplot create an extra empty one.
    fig, ax = plt.subplots(figsize=(10, 7))

    # Display the confusion matrix, coloured in shades of red
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=target_labels)
    disp.plot(cmap=plt.cm.Reds, ax=ax)

    # Set the font size for all tick labels. This is done after plotting so
    # that it applies to the tick labels the display has just set up.
    ax.tick_params(axis='both', labelsize=font_size)

    # Set the fontsize for values within the matrix
    for text in disp.text_.ravel():
        text.set_fontsize(font_size)

    # Add title and axes labels 
    ax.set_title("Confusion Matrix for Iris Classifier", fontsize=font_size)
    ax.set_xlabel("Predicted Label", fontsize=font_size)
    ax.set_ylabel("True Label", fontsize=font_size)
    plt.show()


### Training, Testing and Evaluations
-----

We have the functions to train our own SVM model. The only thing left to do is call them. 

We'll need to feed in our training set, then compare the model's predictions on our test set against the true labels to generate the confusion matrix.

The confusion matrix is a grid with the same number of rows and columns.

Each row represents the ground truth - what the model should have guessed. Each column represents what the model actually predicted.

In [None]:
# TODO: Choose the kernel type to change how the model draws decision boundaries between our training samples
# (pick an index into the list below)
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
kernel =  kernels[0]

# Train the model using the training set
model = train(x_train, kernel)

# Generate model predictions using the testing set 
# and return the confusion matrix
conf_matrix = test(model, x_test)

# Draw the confusion matrix as a labelled, colour-coded grid
plot_confusion_matrix(conf_matrix)

#### Inspecting Performance Across Models

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

# Features are every column but the last; the last column is the species label
X = df.values[:,:-1]
y = df.values[:, -1]

# Hold back part of the data so each kernel is scored on samples it never saw
# during training. Stratifying on y keeps the species proportions the same in
# the training and testing parts.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

# Standardize features, learning the scaling from the training part only so
# that no information from the test samples leaks into training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define kernel types
kernels = ['linear', 'poly', 'rbf', 'sigmoid']

# Create a 2x2 grid for subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Iterate over each kernel type
for i, kernel in enumerate(kernels):
    # Train SVM model with current kernel type
    svm_model = svm.SVC(kernel=kernel, gamma='scale')
    svm_model.fit(X_train_scaled, y_train)

    # Plot the confusion matrix for the held-out samples in the current subplot.
    # normalize='true' turns each row into fractions of that true class.
    ConfusionMatrixDisplay.from_estimator(svm_model, X_test_scaled, y_test,
                                          display_labels=target_labels, ax=axes[i],
                                          cmap=plt.cm.Reds, normalize='true')
    axes[i].set_title(f'Confusion Matrix - {kernel.capitalize()} Kernel', fontsize=font_size)
    axes[i].set_xlabel('Predicted Class', fontsize=font_size)
    axes[i].set_ylabel('Ground Truth', fontsize=font_size)


# Adjust layout
plt.tight_layout()
plt.show()

### Visualising Hyper-Planes Across Models 

----

To simplify the visualisation, this example uses two features, instead of four, to solve the classification problem.

For each kernel type supported by the SVM, we use these two features to train the model and inspect how it draws a decision boundary to distinguish between classes of datapoints.

Play around with the **TODOs** to see how things change

- Do certain feature pairs make prediction harder?
- How do the contours change depending on these features?
- Are there any classes harder to distinguish from the others?

In [None]:
# TODO: Change the features to view different hyperplanes
    # 0 - sepal length
    # 1 - sepal width
    # 2 - petal length
    # 3 - petal width

x1 = feature_labels[0]      # TODO: Change X feature
x2 = feature_labels[3]      # TODO: Change Y feature

# Collect the two chosen feature columns into a two-column array
X = np.column_stack([df[x1].values, df[x2].values])

# ---------------------------------------------------------------------- #

# The species label is the last column of the dataframe
y = df.values[:, -1]

# Standardise the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Create a matrix of subplots
fig, axes = plt.subplots(2, 2, figsize=(16, 14))

# Flatten the axes for easier indexing
axes = axes.flatten()

# Create a meshgrid to plot the decision surface. It depends only on the data
# range (with a margin of 1 on each side), so it is built once and shared by
# every kernel.
x_min, x_max = X_scaled[:, 0].min() - 1, X_scaled[:, 0].max() + 1
y_min, y_max = X_scaled[:, 1].min() - 1, X_scaled[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, 0.1),
                     np.arange(y_min, y_max, 0.1))

# Axis labels follow the features chosen above. The unit suffix is dropped
# because the plotted values are standardised rather than in centimetres.
font_size = 18
x_axis_label = f"{x1.replace(' (cm)', '').capitalize()} (standardised)"
y_axis_label = f"{x2.replace(' (cm)', '').capitalize()} (standardised)"

# ---------------------------------------------------------------------- #

# Iterate over each kernel type
for i, kernel in enumerate(kernels):
    # Train SVM model with current kernel type
    svm_model = svm.SVC(kernel=kernel, gamma='scale')
    svm_model.fit(X_scaled, y)

    # Make predictions on meshgrid points and restore the grid shape
    Z = svm_model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # ---------------------------------------------------------------------- #
    # Plot decision surface in current subplot, with the samples drawn on top

    axes[i].contourf(xx, yy, Z, alpha=0.8, cmap=plt.cm.RdYlBu)
    axes[i].scatter(X_scaled[:, 0], X_scaled[:, 1], c=y, cmap=plt.cm.RdYlBu, edgecolors='k')
    axes[i].set_title(f'SVM Decision Surface with {kernel.capitalize()} Kernel', fontsize=font_size)
    axes[i].set_xlabel(x_axis_label, fontsize=font_size)
    axes[i].set_ylabel(y_axis_label, fontsize=font_size)

# Adjust layout
plt.tight_layout()
plt.show()
