<a href="https://colab.research.google.com/github/absolutemocha/sdaai/blob/main/C2329C_AY2022_T3_CWF_Lim_Zhao_Hong.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# C2329C Machine Learning Fundamentals

## Coursework Final
***
### Student Name: $<Lim Zhao Hong>$ 
### Student ID: $<20065320>$

***

## Feature descriptions from the `CleandDataV20210515.csv` file:

**index_col:** time step for the washing cycle

**avC:** average current 

**avP:** average power

**avR:** average resistance

**maxC:** maximum current 

**maxP:** maximum power

**sdC:** standard deviation for current

**sdP:** standard deviation for power

**stdCR:** standard deviation for resistance

**stdCP:** standard deviation for power

**AvRR:** average relative resistance to previous resistance reading

**mode:** positive class: Daily Wash | negative class: Not Daily Wash


In [None]:
# Import necessary Python libraries
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

## Data Preparation Process

In [None]:
from __future__ import print_function
import os
# Path components of the folder that holds the CSV; joined with the file name when the data is loaded
data_path = ['data']

In [None]:
# Build the CSV location from the folder components plus the file name,
# then read it into a dataframe
filepath = os.sep.join([*data_path, 'CleandDataV20210515.csv'])
data = pd.read_csv(filepath)

In [None]:
# First row, transposed so that each column is listed on its own line
data.iloc[:1].T

In [None]:
# Data type of every column
data.dtypes

In [None]:
# Remove extraneous columns.
# errors='ignore' skips names that are absent: none of these three appear in the
# feature description for this dataset, and a plain drop would raise KeyError.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
data = data.drop(columns=['state', 'area_code', 'phone_number'], errors='ignore')

In [None]:
# Column labels of the dataframe at this point
data.columns

Visualize the relationship between the variables.

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

%matplotlib inline

# Global seaborn settings: 'notebook' context, 'dark' palette, 'white' style
sns.set_context('notebook')
sns.set_palette('dark')
sns.set_style('white')

# Pairwise plots of the dataframe's columns; the trailing ';' hides the returned object's repr
sns.pairplot(data);


***

## K-Nearest Neighbors

In [None]:
# Encode the 'mode' label as 0/1 so that it is numeric for the steps that follow.
# (The feature scaling that K-nearest neighbors needs is done in the next cell.)

from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()

# For a two-class label fit_transform returns an (n, 1) array; flatten it so a
# plain 1-D column is stored.
# NOTE(review): LabelBinarizer orders classes by sorted label value, so the label
# that sorts last becomes 1 — confirm this matches the intended positive class.
data['mode'] = lb.fit_transform(data['mode']).ravel()

In [None]:
# Mute the sklearn warning
import warnings
warnings.filterwarnings('ignore', module='sklearn')

from sklearn.preprocessing import MinMaxScaler

# Min-max scale every column of the dataframe (the 'mode' label included).
# NOTE(review): the scaler is fitted on the full dataset before the train/test
# split in the next cell, so test-set ranges influence the training features —
# consider fitting on the training split only.
msc = MinMaxScaler()
scaled_values = msc.fit_transform(data)  # plain ndarray: the column names are lost here
data = pd.DataFrame(scaled_values, columns=data.columns)

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the 'mode' label is a feature
x_cols = [name for name in data.columns if name != 'mode']

# Separate the feature matrix from the label vector
X_data = data.loc[:, x_cols]
y_data = data.loc[:, 'mode']

# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=42)

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 3-nearest-neighbour classifier on the training split
# (fit() returns the estimator itself, so the calls can be chained)
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)

# Predicted labels for the held-out rows
y_test_pred = knn.predict(X_test)

In [None]:
# Function to calculate the fraction of values that were correctly predicted

def accuracy(real, predict):
    """Return the fraction of positions where `predict` equals `real`.

    Both inputs are converted to flat NumPy arrays and compared position by
    position, so pandas index labels are ignored (two Series with different
    indexes no longer raise) and plain lists or (n, 1) arrays are accepted.

    Parameters
    ----------
    real : array-like
        True labels.
    predict : array-like
        Predicted labels, same number of elements as `real`.

    Returns
    -------
    float
        Share of matching elements, between 0.0 and 1.0.

    Raises
    ------
    ValueError
        If the inputs differ in length or are empty.
    """
    real_arr = np.asarray(real).ravel()
    pred_arr = np.asarray(predict).ravel()

    # Guard against silent NumPy broadcasting of mismatched inputs
    if real_arr.shape != pred_arr.shape:
        raise ValueError(
            "real and predict must have the same number of elements "
            "(got %d and %d)" % (real_arr.size, pred_arr.size))
    if real_arr.size == 0:
        raise ValueError("accuracy is undefined for empty input")

    return float(np.mean(real_arr == pred_arr))

In [None]:
# Share of test labels that the KNN predictions match
print(accuracy(y_test, y_test_pred))

In [None]:
# Pandas boxplot of the dataframe's columns, with rows grouped by the 'mode' label

data.boxplot(by='mode');

In [None]:
# Pair plot coloured by the wash-mode label.
# seaborn only accepts the contexts 'paper', 'notebook', 'talk' and 'poster',
# and the label column of this dataset is 'mode'.
sns.set_context('notebook')
sns.pairplot(data, hue='mode');


***

## Logistic Regression 

In [None]:
from sklearn import pipeline, feature_selection, linear_model, preprocessing, metrics, model_selection

# Regression pipelines: standardise -> keep the top 50% of features by f_regression -> linear model
first_pipe = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("selection", feature_selection.SelectPercentile(feature_selection.f_regression, percentile=50)),
    ("regression", linear_model.LinearRegression()),
])

second_pipe = pipeline.Pipeline([
    ("scale", preprocessing.StandardScaler()),
    ("selection", feature_selection.SelectPercentile(feature_selection.f_regression, percentile=50)),
    ("regression", linear_model.Lasso(alpha=40)),
])

# Logistic-regression classifiers for the evaluation cells that follow, which
# iterate over `coeff_labels` and read `y_pred[label]`: default settings,
# cross-validated L1 penalty and cross-validated L2 penalty.
coeff_labels = ['lr', 'l1', 'l2']
coeff_models = [
    linear_model.LogisticRegression(max_iter=1000),
    linear_model.LogisticRegressionCV(Cs=10, cv=4, penalty='l1', solver='liblinear'),
    linear_model.LogisticRegressionCV(Cs=10, cv=4, penalty='l2', max_iter=1000),
]

# One column of test-set predictions per model, keyed by its label
y_pred = pd.DataFrame({
    lab: mod.fit(X_train, y_train).predict(X_test)
    for lab, mod in zip(coeff_labels, coeff_models)
})

first_pipe.get_params()

Calculate the following metrics for each model using the test dataset
---
* Accuracy
* Precision
* Recall
* F1 score

In [None]:
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score
from sklearn.preprocessing import label_binarize

metric_rows = list()   # one Series of scores per model
cm = dict()            # confusion matrix per model label

for lab in coeff_labels:

    # Weighted precision, recall and f-score; the fourth return value (support) is not needed
    precision, recall, fscore = score(y_test, y_pred[lab], average='weighted')[:3]

    # Stored under a local name so the accuracy() helper defined earlier is not overwritten
    acc = accuracy_score(y_test, y_pred[lab])

    # The label is binary, so ROC-AUC is computed directly from the labels;
    # binarizing against six classes leaves columns with a single class, which
    # roc_auc_score rejects.
    # NOTE(review): this scores hard 0/1 predictions; predicted probabilities
    # would give a more informative AUC.
    auc = roc_auc_score(y_test, y_pred[lab])

    # Last, the confusion matrix
    cm[lab] = confusion_matrix(y_test, y_pred[lab])

    metric_rows.append(pd.Series({'precision': precision, 'recall': recall,
                                  'fscore': fscore, 'accuracy': acc,
                                  'auc': auc},
                                 name=lab))

# Columns = models, rows = metrics
metrics = pd.concat(metric_rows, axis=1)

Generating Confusion Matrix
---
Display or plot the confusion matrix for each model.


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# 2x2 grid of axes; the last one is switched off and the others are paired with the model labels
fig, axList = plt.subplots(nrows=2, ncols=2, figsize=(12, 10))
axList = axList.ravel()
axList[-1].set_axis_off()

for ax, lab in zip(axList[:-1], coeff_labels):
    # Annotated heatmap of this model's confusion matrix, titled with its label
    sns.heatmap(cm[lab], annot=True, fmt='d', ax=ax)
    ax.set_title(lab)

plt.tight_layout()

Learning Curve
---
To tell whether a model is overfitting or underfitting, we plot a learning curve. A learning curve plots performance (either error or score) against some measure of complexity.



***

In [None]:
from sklearn import pipeline, feature_selection, linear_model, preprocessing, metrics, model_selection


def _make_pipe(estimator):
    """Return a pipeline: standard scaling -> top-50% f_regression selection -> `estimator`."""
    return pipeline.Pipeline([
        ("scale", preprocessing.StandardScaler()),
        ("selection", feature_selection.SelectPercentile(feature_selection.f_regression, percentile=50)),
        ("regression", estimator),
    ])


# Same preprocessing steps, different final regressor
first_pipe = _make_pipe(linear_model.LinearRegression())
second_pipe = _make_pipe(linear_model.Lasso(alpha=40))

first_pipe.get_params()

In [None]:
# 25 integer percentile values spread from 1 to 100
p_list = np.linspace(1, 100, 25, dtype='int')
train_score, test_score = [], []

for p in p_list:
    # Keep the top-p percent of features, then run 5-fold cross-validation
    first_pipe.set_params(selection__percentile=p)
    cv_result = model_selection.cross_validate(
        first_pipe, X_train, y_train,
        scoring="r2", cv=5, return_train_score=True)

    # Average the per-fold scores for this percentile
    train_score.append(cv_result['train_score'].mean())
    test_score.append(cv_result['test_score'].mean())

# Best mean validation score over all percentiles
print(max(test_score))

In [None]:
%pylab inline

plt.plot(p_list, train_score, label='train')
plt.plot(p_list, test_score, label='test')
plt.ylabel("$r^2$")
plt.xlabel("percent of features")
plt.legend();