<a href="https://colab.research.google.com/github/alouisbroad/Machine_Learning/blob/main/Deep_Learning_Assessement.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Deep Learning

In [None]:
'''
Deep Learning - Project

Alistair Broad 

In the following script I've used artificial neural networks to create a predictive model of 
whether breast cancer sufferers were likely to have a recurrence. 
'''

# Fixed dependencies - do not remove or change.
import pytest
import pandas as pd
import numpy as np
from google.colab import drive
# drive.mount('/content/gdrive/')
# Import your dependencies

# Dependencies 
import collections
import datetime
import keras
from sklearn.preprocessing import OneHotEncoder # Encode the variables with more than one category that aren't ordinal. 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import Sequential
from sklearn.metrics import confusion_matrix
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.model_selection import GridSearchCV



# Import data
def import_local_data(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_excel(file_path)


# Path the workbook is read from.
local_file_path = '/content/drive/My Drive/Colab Notebooks/Data/breast-cancer.xls'

# Dont change
raw_data = import_local_data(local_file_path)


# Exploring the Data

# Inspect the dataset - print the first 10 rows for a quick view of the data set.
print("Data looks like: \n {}".format(raw_data.head(10)), end ="\n \n")
# Find the number of rows. 
print("The data has {} rows.".format(len(raw_data.index)), end = "\n \n")
# Iterate over the columns and print, for each one, every distinct value together
# with the number of times it occurs.
for col in raw_data:  
    print("Values in {} were:".format(col), end = "\n") 
    print(collections.Counter(raw_data[col]), end = "\n \n")


# NOTE(review): the counts 201 and 286 are hard-coded here rather than derived from raw_data.
print("Of the 286 observations, {}% of them were 'no-recurrence-events'.".format(round((201/286)*100,1)), end = "\n \n")

# Explain your key findings
'''
From a quick look at our data-set, we can see that there are 286 samples, which is a fairly small dataset for this sort of thing. 
We can also see that 70.3% were "non-recurrence" events, which may be important when considering/evaluating our final model. 
Now, looking at the data: 
Age - Median range is "50-59" and there's a skew towards older age ranges, 
Menopause - even though there's a skew towards older individuals, more were premenopause, 
tumor-size - to note here, "10-14" and "5-9" have been converted in the source file to dates, 
inv-node - as with the above, "3-5", "6-8", "9-11" and "12-14" have also been converted to dates, 
node-caps - this has 8 missing data points and 78% answered "no",
breast - this is fairly close to 50-50, as we would expect, 
breast-quad - this has 1 missing value.

Also to note, is that the data contains a mix of data types (binary nominal, ordinal categorical etc.). 

We will need to find a suitable way to deal with missing values and encode the data so a solution can 
be found. 
'''

# Split your data so that you can test the effectiveness of your model
x = raw_data.iloc[:, :-1].values  # Independent variables: every column except the last.
y = raw_data.iloc[:, -1].values  # Dependent variable: the last column (the one left out of x).

# Encode the dependent variable as integer class labels.
y = LabelEncoder().fit_transform(y)


# Creating the Training set and Test set
# Hold out 25% of the rows to test the model; the remaining 75% are used for training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.25) 


class Module4_Model:
    """Pre-processing pipeline for the breast-cancer recurrence features.

    Features are addressed by column position:
    0 age, 1 menopause, 2 tumor-size, 3 inv-nodes, 4 node-caps, 5 deg-malig,
    6 breast, 7 breast-quad, 8 irradiat.

    ``preprocess_training_data`` learns the imputation values and the one-hot
    encoding and stores them on the instance; ``preprocess_test_data`` reuses
    them, so training and test matrices always share the same column layout.
    """

    # Marker used in the source data for a missing value.
    _MISSING_MARKER = '?'
    # Columns whose missing values are replaced by the column mode
    # ("node-caps" and "breast-quad").
    _IMPUTED_COLUMNS = (4, 7)
    # Nominal columns with more than two categories ("menopause", "breast-quad").
    _ONE_HOT_COLUMNS = (1, 7)
    # Value -> number encodings, keyed by column position. The datetime keys
    # are ranges such as "5-9" that were converted to dates in the source file.
    _VALUE_MAPPINGS = {
        # Ordinal categorical.
        0: {'20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4, '60-69': 5, '70-79': 6},
        2: {'0-4': 1, datetime.datetime(2019, 9, 5, 0, 0): 2, datetime.datetime(2014, 10, 1, 0, 0): 3,
            '15-19': 4, '20-24': 5, '25-29': 6, '30-34': 7, '35-39': 8, '40-44': 9, '45-49': 10,
            '50-54': 11},
        3: {'0-2': 1, datetime.datetime(2019, 5, 3, 0, 0): 2, datetime.datetime(2019, 8, 6, 0, 0): 3,
            datetime.datetime(2019, 11, 9, 0, 0): 4, datetime.datetime(2014, 12, 1, 0, 0): 5,
            '15-17': 6, '24-26': 7},
        # Binary nominal.
        4: {"no": 0, "yes": 1},
        6: {"left": 0, "right": 1},
        8: {"no": 0, "yes": 1},
    }

    def __init__(self):
        self.model = None
        # State learned by preprocess_training_data and reused on test data.
        self.modes = {}      # column position -> most frequent known value
        self.encoder = None  # fitted ColumnTransformer

    def preprocess_training_data(self, training_df):
        """Fit the pre-processing on the training features and return them encoded.

        The column modes (used for imputation) and the fitted one-hot encoder
        are stored on the instance for later use by ``preprocess_test_data``.

        Parameters:
            training_df: 2-D array-like of raw feature values, one column per
                feature in the order given in the class docstring.

        Returns:
            A 2-D float ndarray: the one-hot columns for "menopause" and
            "breast-quad" (first category of each dropped) followed by the
            remaining features in their original order.
        """
        # Work on a copy so the caller's array is never modified.
        data = np.array(training_df, dtype=object)
        self.modes = self._find_modes(data)
        frame = self._impute_and_map(data, self.modes)
        self.encoder = self._new_encoder()
        encoded = self.encoder.fit_transform(frame)
        return self._drop_reference_dummies(encoded, self.encoder)

    def preprocess_test_data(self, test_df):
        """Encode test features with the statistics learned from the training data.

        If no training data has been processed yet, the pre-processing is
        fitted on ``test_df`` itself (without storing anything) so that the
        method remains usable on its own.

        Parameters:
            test_df: 2-D array-like with the same column order as the training data.

        Returns:
            A 2-D float ndarray laid out like the output of
            ``preprocess_training_data``.
        """
        data = np.array(test_df, dtype=object)
        encoder = getattr(self, "encoder", None)
        if encoder is None:
            frame = self._impute_and_map(data, self._find_modes(data))
            encoder = self._new_encoder()
            encoded = encoder.fit_transform(frame)
        else:
            frame = self._impute_and_map(data, self.modes)
            encoded = encoder.transform(frame)
        return self._drop_reference_dummies(encoded, encoder)

    @classmethod
    def _find_modes(cls, data):
        """Return {column: most frequent non-missing value} for the imputed columns."""
        modes = {}
        for col in cls._IMPUTED_COLUMNS:
            known = [value for value in data[:, col] if value != cls._MISSING_MARKER]
            if known:
                modes[col] = collections.Counter(known).most_common(1)[0][0]
        return modes

    @classmethod
    def _impute_and_map(cls, data, modes):
        """Fill missing values with ``modes`` and apply the value encodings; returns a DataFrame."""
        for col, mode in modes.items():
            column = data[:, col]
            data[:, col] = np.where(column == cls._MISSING_MARKER, mode, column)
        frame = pd.DataFrame(data)  # A DataFrame makes the per-column mapping easier.
        for col, mapping in cls._VALUE_MAPPINGS.items():
            frame[col] = frame[col].map(mapping)
        return frame

    @classmethod
    def _new_encoder(cls):
        """Build the (unfitted) transformer that one-hot encodes the nominal columns."""
        return ColumnTransformer(
            # Categories unseen during fitting are encoded as all zeros instead of raising.
            [('encoder', OneHotEncoder(handle_unknown='ignore'), list(cls._ONE_HOT_COLUMNS))],
            remainder='passthrough',
            sparse_threshold=0,  # Always return a dense array.
        )

    @staticmethod
    def _drop_reference_dummies(encoded, encoder):
        """Convert to float and drop the first dummy column of each one-hot feature."""
        dense = np.array(encoded, dtype=float)
        # The "menopause" dummies come first, so the first "breast-quad" dummy
        # sits right after them.
        menopause_width = len(encoder.named_transformers_['encoder'].categories_[0])
        return np.delete(dense, [0, menopause_width], axis=1)
    

# Dont change
my_model = Module4_Model()

# Dont change
x_train_processed = my_model.preprocess_training_data(x_train)

#### Model 1 #### 

# Create a model

# Initialising the ANN
classifier = Sequential() # model class

# Adding the input layer and the first hidden layer
# add method used to add layers.
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 13)) 

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu')) 

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) # Sigmoid is used at the end to get probability at the end.
# for more dependent variables with more than two categories use  - change units to the number you have and the activation to a multiple sigmoid version "softmax".

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy']) 
# adam is a stocastic gradient decent algorithm
# for more dependent variables with more than two categories use "category_crossentropy"


# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)


# Train your model
# Fitting the ANN to the Training set
classifier.fit(x_train_processed, y_train, batch_size = 10, epochs = 100)

# use your model to make a prediction on unseen data
# Predicting the Test set results
y_pred = classifier.predict(x_test_processed)
y_pred = (y_pred > 0.5) # Threshold of 50% 

# Asssess the accuracy of your model and explain your key findings
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

# Accuracy
accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
print("Accuracy: {} \n \n".format(accuracy))
# Precision
precision = (cm[1][1])/(cm[1][1]+cm[0][1])
print("Precision: {} \n \n".format(precision))
# Recall 
recall = (cm[1][1])/(cm[1][1]+cm[1][0])
print("Recall: {} \n \n".format(recall))
# F1 Score
print("F1 Score: {} \n \n".format((precision*recall)/(recall+precision)))

'''
OUTPUT:
[[46  7]
 [15  4]]
Accuracy: 0.6944444444444444 
 

Precision: 0.36363636363636365 
 

Recall: 0.21052631578947367 
 

F1 Score: 0.13333333333333333 

Looking over these outputs, we can see that this is currently a pretty terrible model. Precision, recall and F1 are exceptionally low
and accuracy is below what would be achieved if the model always predicted "non-recurrence". Below I will try to improve the 
model by searching for the best parameters and adding dropout (which should help to prevent overfitting).
'''

In [None]:
'''
Parameter tuning with gridsearch 
Note: This section takes an exceptionally long time to run - hence, why I only used 200 epochs.

Here I've tested if having more epochs, batch size or a different optimiser will give us a 
better model. 
'''

# Tuning the ANN
def build_classifier(optimizer):
    """Return a compiled 13-6-6-1 network that is trained with ``optimizer``."""
    network = Sequential()
    stack = (
        Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13),
        Dense(units=6, kernel_initializer='uniform', activation='relu'),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    )
    for layer in stack:
        network.add(layer)
    network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return network
# Wrap the builder so scikit-learn can treat the network as an estimator.
classifier = KerasClassifier(build_fn = build_classifier)
# Candidate settings: 2 batch sizes x 2 epoch counts x 2 optimisers = 8 combinations.
parameters = {'batch_size': [25, 32],
              'epochs': [100, 200],
              'optimizer': ['adam', 'rmsprop']}
# Each combination is scored on accuracy with 10-fold cross-validation.
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10)
grid_search = grid_search.fit(x_train_processed , y_train)
# Best combination found and its cross-validated score.
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

print(best_accuracy)
print(best_parameters)

'''
OUTPUT: 
0.7146245059288537
{'batch_size': 32, 'epochs': 100, 'optimizer': 'adam'}

So here we have found that the number of epochs and optimiser we'd chosen were the best, but this 
suggests that increasing batch size will give a marginally better result - the accuracy is now
at least above the percentage of "non-recurrence".
'''

In [None]:
'''
Below I've experimented with adding in dropout in each hidden layer and tested the effects of adding in
another hidden layer. 
'''

# Adding dropout to prevent over fitting. 
def build_classifier():
    """Return a compiled 13-6-6-1 network with 10% dropout after each hidden layer."""
    network = Sequential()
    stack = (
        Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13),
        Dropout(rate=0.1),
        Dense(units=6, kernel_initializer='uniform', activation='relu'),
        Dropout(rate=0.1),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    )
    for layer in stack:
        network.add(layer)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network
# Estimate the dropout model's accuracy with 10-fold cross-validation on the training set.
classifier = KerasClassifier(build_fn = build_classifier, batch_size = 32, epochs = 100)
accuracies = cross_val_score(estimator = classifier, X = x_train_processed , y = y_train, cv = 10, n_jobs = -1)
mean = accuracies.mean()
# The name `variance` is kept for any later cell that reads it, but the value is the
# standard deviation of the fold accuracies, so it is reported as such.
variance = accuracies.std()

print("Adding in dropout the mean accuracy is now: {}".format(mean))
print("With standard deviation of: {} \n".format(variance))

# Adding in an additional hidden layer. 
def build_classifier():
    """Return a compiled 13-6-6-6-1 network with 10% dropout after each hidden layer."""
    network = Sequential()
    stack = (
        Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13),
        Dropout(rate=0.1),
        Dense(units=6, kernel_initializer='uniform', activation='relu'),
        Dropout(rate=0.1),
        Dense(units=6, kernel_initializer='uniform', activation='relu'),
        Dropout(rate=0.1),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    )
    for layer in stack:
        network.add(layer)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network
# Estimate the three-hidden-layer model's accuracy with 10-fold cross-validation.
classifier = KerasClassifier(build_fn = build_classifier, batch_size = 32, epochs = 100)
accuracies = cross_val_score(estimator = classifier, X = x_train_processed , y = y_train, cv = 10, n_jobs = -1)
mean = accuracies.mean()
# The name `variance` is kept for any later cell that reads it, but the value is the
# standard deviation of the fold accuracies, so it is reported as such.
variance = accuracies.std()

print("Adding in an additional hidden layer the mean accuracy is now: {}".format(mean))
print("With standard deviation of: {} \n".format(variance))

'''
OUTPUT:
Adding in dropout the mean accuracy is now: 0.7099567174911499
With variance of: 0.07820415336432304 

Adding in dropout the mean accuracy is now: 0.6958874583244323
With variance of: 0.07884998617267996 

From the output above we see that adding in dropout slightly decreases the mean accuracy, but this is 
likely worth it to prevent overfitting. 

Adding another hidden layer decreases the mean accuracy and increases the variance (very slightly) so, 
it's best not to add this new layer. 

So, we have finished tuning the model, though it's still not great, I have made a slight improvement.
To make the model better, the main point of action would be to obtain more data!

Below is the final and slightly improved model. 
'''

#### Final Model #### 

# Initialising the ANN
classifier = Sequential() # model class
# Adding the input layer and the first hidden layer
# add method used to add layers.
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 13))
# Adding dropout.
classifier.add(Dropout(rate = 0.1))
# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))
# Adding dropout.
classifier.add(Dropout(rate = 0.1))
# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid'))
# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy']) 

# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)

# Train your model
# Fitting the ANN to the Training set
classifier.fit(x_train_processed, y_train, batch_size = 32, epochs = 100)

# use your model to make a prediction on unseen data
# Predicting the Test set results
y_pred = classifier.predict(x_test_processed)
y_pred = (y_pred > 0.5) # Threshold of 50% 

# Assess the accuracy of your model and explain your key findings
# Making the Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)

print(cm)

# Accuracy
accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
print("Accuracy: {} \n \n".format(accuracy))
# Precision
precision = (cm[1][1])/(cm[1][1]+cm[0][1])
print("Precision: {} \n \n".format(precision))
# Recall 
recall = (cm[1][1])/(cm[1][1]+cm[1][0])
print("Recall: {} \n \n".format(recall))
# F1 Score: the harmonic mean of precision and recall, 2PR / (P + R).
print("F1 Score: {} \n \n".format((2*precision*recall)/(recall+precision)))

In [None]:
"""
Testing on a single (made-up) value: 
age: 30-39
menopause: ge40
tumorsize: 25-29
inv-nodes: 0-2
node-caps: yes
deg-malig: 3
breast: right
breast-quad: left_up
irradiat: no
"""
# This can be encoded as: 
# The column layout follows the output of preprocess_training_data: the one-hot
# columns for menopause and breast-quad first (one dummy dropped from each), then
# age, tumor-size, inv-nodes, node-caps, deg-malig, breast and irradiat.
single_prediction = np.array([[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 2.0, 6.0, 1.0, 1.0, 3.0, 1.0, 0.0]])

# Predicting (True when the predicted probability exceeds the 50% threshold)
print(classifier.predict(single_prediction) > 0.5) 
# This gave a single value of "True" so, this individual is predicted to have a recurrence event. 

In [None]:
# Pre-process the full feature set (not just the training split) for inspection.
aa = raw_data.iloc[:, :-1].values
a = pd.DataFrame(my_model.preprocess_training_data(aa))

print(a)
# Show the last processed row. Positional index 286 is out of bounds for the
# 286-row data set (valid positions are 0-285), so index from the end instead.
print(a.iloc[-1, :])

print(y)
#v = processed.iloc[287, :-1].values
#print(v)
#print(my_model.prediction_data([v]))

#print(my_model.prediction_data())

# Version 1

In [None]:
'''
Deep Learning - Project

Alistair Broad 

In the following script I've used artificial neural networks to create a predictive model of 
whether breast cancer sufferers were likely to have a recurrence. 
'''

# Fixed dependencies - do not remove or change.
import pytest
import pandas as pd
import numpy as np
from google.colab import drive
# drive.mount('/content/gdrive/')
# Import your dependencies

# Dependencies 
import collections
import datetime
import keras
from sklearn.preprocessing import OneHotEncoder # Encode the variables with more than one category that aren't ordinal. 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import Sequential
from sklearn.metrics import confusion_matrix
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.model_selection import GridSearchCV



# Import data
def import_local_data(file_path):
    """Read the Excel workbook at ``file_path`` and return it as a pandas DataFrame."""
    return pd.read_excel(file_path)


# Path the workbook is read from.
local_file_path = '/content/drive/My Drive/Colab Notebooks/Data/breast-cancer.xls'

# Dont change
raw_data = import_local_data(local_file_path)


# Exploring the Data

# Inspect the dataset - print the first 10 rows for a quick view of the data set.
print("Data looks like: \n {}".format(raw_data.head(10)), end ="\n \n")
# Find the number of rows. 
print("The data has {} rows.".format(len(raw_data.index)), end = "\n \n")
# Iterate over the columns and print, for each one, every distinct value together
# with the number of times it occurs.
for col in raw_data:  
    print("Values in {} were:".format(col), end = "\n") 
    print(collections.Counter(raw_data[col]), end = "\n \n")


# NOTE(review): the counts 201 and 286 are hard-coded here rather than derived from raw_data.
print("Of the 286 observations, {}% of them were 'no-recurrence-events'.".format(round((201/286)*100,1)), end = "\n \n")

# Explain your key findings
'''
From a quick look at our data-set, we can see that there are 286 samples, which is a fairly small dataset for this sort of thing. 
We can also see that 70.3% were "non-recurrence" events, which may be important when considering/evaluating our final model. 
Now, looking at the data: 
Age - Median range is "50-59" and there's a skew towards older age ranges, 
Menopause - even though there's a skew towards older individuals, more were premenopause, 
tumor-size - to note here, "10-14" and "5-9" have been converted in the source file to dates, 
inv-node - as with the above, "3-5", "6-8", "9-11" and "12-14" have also been converted to dates, 
node-caps - this has 8 missing data points and 78% answered "no",
breast - this is fairly close to 50-50, as we would expect, 
breast-quad - this has 1 missing value.

Also to note, is that the data contains a mix of data types (binary nominal, ordinal categorical etc.). 

We will need to find a suitable way to deal with missing values and encode the data so a solution can 
be found. 
'''

# Split your data so that you can test the effectiveness of your model
x = raw_data.iloc[:, :-1].values  # Independent variables: every column except the last.
y = raw_data.iloc[:, -1].values  # Dependent variable: the last column (the one left out of x).

# Encode the dependent variable as integer class labels.
y = LabelEncoder().fit_transform(y)


# Creating the Training set and Test set
# Hold out 25% of the rows to test the model; the remaining 75% are used for training.
# random_state is fixed so the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split( x, y, test_size = 0.25, random_state = 1) 


class Module4_Model:
    """Pre-processing pipeline for the breast-cancer recurrence features.

    Features are addressed by column position:
    0 age, 1 menopause, 2 tumor-size, 3 inv-nodes, 4 node-caps, 5 deg-malig,
    6 breast, 7 breast-quad, 8 irradiat.

    ``preprocess_training_data`` learns the imputation values and the one-hot
    encoding and stores them on the instance; ``preprocess_test_data`` reuses
    them, so training and test matrices always share the same column layout.
    """

    # Marker used in the source data for a missing value.
    _MISSING_MARKER = '?'
    # Columns whose missing values are replaced by the column mode
    # ("node-caps" and "breast-quad").
    _IMPUTED_COLUMNS = (4, 7)
    # Nominal columns with more than two categories ("menopause", "breast-quad").
    _ONE_HOT_COLUMNS = (1, 7)
    # Value -> number encodings, keyed by column position. The datetime keys
    # are ranges such as "5-9" that were converted to dates in the source file.
    _VALUE_MAPPINGS = {
        # Ordinal categorical.
        0: {'20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4, '60-69': 5, '70-79': 6},
        2: {'0-4': 1, datetime.datetime(2019, 9, 5, 0, 0): 2, datetime.datetime(2014, 10, 1, 0, 0): 3,
            '15-19': 4, '20-24': 5, '25-29': 6, '30-34': 7, '35-39': 8, '40-44': 9, '45-49': 10,
            '50-54': 11},
        3: {'0-2': 1, datetime.datetime(2019, 5, 3, 0, 0): 2, datetime.datetime(2019, 8, 6, 0, 0): 3,
            datetime.datetime(2019, 11, 9, 0, 0): 4, datetime.datetime(2014, 12, 1, 0, 0): 5,
            '15-17': 6, '24-26': 7},
        # Binary nominal.
        4: {"no": 0, "yes": 1},
        6: {"left": 0, "right": 1},
        8: {"no": 0, "yes": 1},
    }

    def __init__(self):
        self.model = None
        # State learned by preprocess_training_data and reused on test data.
        self.modes = {}      # column position -> most frequent known value
        self.encoder = None  # fitted ColumnTransformer

    def preprocess_training_data(self, training_df):
        """Fit the pre-processing on the training features and return them encoded.

        The column modes (used for imputation) and the fitted one-hot encoder
        are stored on the instance for later use by ``preprocess_test_data``.

        Parameters:
            training_df: 2-D array-like of raw feature values, one column per
                feature in the order given in the class docstring.

        Returns:
            A 2-D float ndarray: the one-hot columns for "menopause" and
            "breast-quad" (first category of each dropped) followed by the
            remaining features in their original order.
        """
        # Work on a copy so the caller's array is never modified.
        data = np.array(training_df, dtype=object)
        self.modes = self._find_modes(data)
        frame = self._impute_and_map(data, self.modes)
        self.encoder = self._new_encoder()
        encoded = self.encoder.fit_transform(frame)
        return self._drop_reference_dummies(encoded, self.encoder)

    def preprocess_test_data(self, test_df):
        """Encode test features with the statistics learned from the training data.

        If no training data has been processed yet, the pre-processing is
        fitted on ``test_df`` itself (without storing anything) so that the
        method remains usable on its own.

        Parameters:
            test_df: 2-D array-like with the same column order as the training data.

        Returns:
            A 2-D float ndarray laid out like the output of
            ``preprocess_training_data``.
        """
        data = np.array(test_df, dtype=object)
        encoder = getattr(self, "encoder", None)
        if encoder is None:
            frame = self._impute_and_map(data, self._find_modes(data))
            encoder = self._new_encoder()
            encoded = encoder.fit_transform(frame)
        else:
            frame = self._impute_and_map(data, self.modes)
            encoded = encoder.transform(frame)
        return self._drop_reference_dummies(encoded, encoder)

    @classmethod
    def _find_modes(cls, data):
        """Return {column: most frequent non-missing value} for the imputed columns."""
        modes = {}
        for col in cls._IMPUTED_COLUMNS:
            known = [value for value in data[:, col] if value != cls._MISSING_MARKER]
            if known:
                modes[col] = collections.Counter(known).most_common(1)[0][0]
        return modes

    @classmethod
    def _impute_and_map(cls, data, modes):
        """Fill missing values with ``modes`` and apply the value encodings; returns a DataFrame."""
        for col, mode in modes.items():
            column = data[:, col]
            data[:, col] = np.where(column == cls._MISSING_MARKER, mode, column)
        frame = pd.DataFrame(data)  # A DataFrame makes the per-column mapping easier.
        for col, mapping in cls._VALUE_MAPPINGS.items():
            frame[col] = frame[col].map(mapping)
        return frame

    @classmethod
    def _new_encoder(cls):
        """Build the (unfitted) transformer that one-hot encodes the nominal columns."""
        return ColumnTransformer(
            # Categories unseen during fitting are encoded as all zeros instead of raising.
            [('encoder', OneHotEncoder(handle_unknown='ignore'), list(cls._ONE_HOT_COLUMNS))],
            remainder='passthrough',
            sparse_threshold=0,  # Always return a dense array.
        )

    @staticmethod
    def _drop_reference_dummies(encoded, encoder):
        """Convert to float and drop the first dummy column of each one-hot feature."""
        dense = np.array(encoded, dtype=float)
        # The "menopause" dummies come first, so the first "breast-quad" dummy
        # sits right after them.
        menopause_width = len(encoder.named_transformers_['encoder'].categories_[0])
        return np.delete(dense, [0, menopause_width], axis=1)
    

# Dont change
my_model = Module4_Model()

# Dont change
x_train_processed = my_model.preprocess_training_data(x_train)

#### Model 1 #### 

# Create a model

# Initialising the ANN
classifier = Sequential() # model class

# Adding the input layer and the first hidden layer
# add method used to add layers.
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = 13)) 

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu')) 

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) # Sigmoid is used so the output can be read as a probability.
# For a dependent variable with more than two categories, change units to the number of categories and the activation to "softmax".

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy']) 
# adam is a stochastic gradient descent algorithm
# For a dependent variable with more than two categories use "categorical_crossentropy"


# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)


# Train your model
# Fitting the ANN to the Training set
classifier.fit(x_train_processed, y_train, batch_size = 10, epochs = 100)

# use your model to make a prediction on unseen data
# Predicting the Test set results
y_pred = classifier.predict(x_test_processed)
y_pred = (y_pred > 0.5) # Threshold of 50% 

# Assess the accuracy of your model and explain your key findings
# Making the Confusion Matrix (rows: true class, columns: predicted class)
cm = confusion_matrix(y_test, y_pred)

print(cm)

# Accuracy
accuracy = (cm[0][0]+cm[1][1])/(cm[0][0]+cm[0][1]+cm[1][0]+cm[1][1])
print("Accuracy: {} \n \n".format(accuracy))
# Precision
precision = (cm[1][1])/(cm[1][1]+cm[0][1])
print("Precision: {} \n \n".format(precision))
# Recall 
recall = (cm[1][1])/(cm[1][1]+cm[1][0])
print("Recall: {} \n \n".format(recall))
# F1 Score: the harmonic mean of precision and recall, 2PR / (P + R).
print("F1 Score: {} \n \n".format((2*precision*recall)/(recall+precision)))

'''
OUTPUT:
[[46  7]
 [15  4]]
Accuracy: 0.6944444444444444 
 

Precision: 0.36363636363636365 
 

Recall: 0.21052631578947367 
 

F1 Score: 0.13333333333333333 

Looking over these outputs, we can see that this is currently a pretty terrible model. Precision, recall and F1 are exceptionally low
and accuracy is below what would be achieved if the model always predicted "non-recurrence". Below I will try to improve the 
model by searching for the best parameters and adding dropout (which should help to prevent overfitting).

'''


'''
Below I've experimented with adding in dropout in each hidden layer and tested the effects of adding in
another hidden layer. 
'''

# Adding dropout to prevent over fitting. 
def build_classifier():
    """Build and compile the two-hidden-layer network with dropout.

    Returns a compiled Sequential model: 13 inputs, two ReLU hidden layers of
    6 units (each followed by dropout at rate 0.1) and a single sigmoid output
    unit, compiled with the 'adam' optimizer, binary cross-entropy loss and
    the accuracy metric.
    """
    hidden = dict(units=6, kernel_initializer='uniform', activation='relu')
    model = Sequential([
        Dense(input_dim=13, **hidden),
        Dropout(rate=0.1),
        Dense(**hidden),
        Dropout(rate=0.1),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# 10-fold cross-validation of the network returned by build_classifier,
# run on the training set only.
classifier = KerasClassifier(build_fn=build_classifier, batch_size=32, epochs=100)
accuracies = cross_val_score(estimator=classifier, X=x_train_processed, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
# NOTE: despite its historical name, `variance` holds the standard deviation
# of the fold accuracies (it comes from .std()); the printed label says so.
variance = accuracies.std()

print("Adding in dropout the mean accuracy is now: {}".format(mean))
print("With standard deviation of: {} \n".format(variance))

# Adding in an additional hidden layer. 
def build_classifier():
    """Build and compile the three-hidden-layer network with dropout.

    Returns a compiled Sequential model: 13 inputs, three ReLU hidden layers
    of 6 units (each followed by dropout at rate 0.1) and a single sigmoid
    output unit, compiled with the 'adam' optimizer, binary cross-entropy
    loss and the accuracy metric.
    """
    hidden = dict(units=6, kernel_initializer='uniform', activation='relu')
    model = Sequential([
        Dense(input_dim=13, **hidden),
        Dropout(rate=0.1),
        Dense(**hidden),
        Dropout(rate=0.1),
        Dense(**hidden),
        Dropout(rate=0.1),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# 10-fold cross-validation of the network returned by build_classifier
# (the variant with an additional hidden layer), run on the training set only.
classifier = KerasClassifier(build_fn=build_classifier, batch_size=32, epochs=100)
accuracies = cross_val_score(estimator=classifier, X=x_train_processed, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
# NOTE: despite its historical name, `variance` holds the standard deviation
# of the fold accuracies (it comes from .std()); the printed label says so.
variance = accuracies.std()

# The label previously repeated "Adding in dropout", which made the two
# experiments indistinguishable in the output.
print("Adding in an additional hidden layer the mean accuracy is now: {}".format(mean))
print("With standard deviation of: {} \n".format(variance))

'''
OUTPUT:
Adding in dropout the mean accuracy is now: 0.7099567174911499
With variance of: 0.07820415336432304 

Adding in dropout the mean accuracy is now: 0.6958874583244323
With variance of: 0.07884998617267996 

From the output above we see that adding in dropout slightly decreases the mean accuracy, but this is 
likely worth it to prevent overfitting. 

Adding another hidden layer decreases the mean accuracy and increases the spread of the accuracies (very slightly), so 
it's best not to add this new layer. 

So, we have finished tuning the model. Though it's still not great, I have made a slight improvement.
To make the model better, the main point of action would be to obtain more data!

Below is the final and slightly improved model. 
'''

#### Final Model #### 

# Initialising the ANN
classifier = Sequential()  # model class
# Adding the input layer and the first hidden layer
# (the add method appends layers in order).
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13))
# Adding dropout.
classifier.add(Dropout(rate=0.1))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding dropout.
classifier.add(Dropout(rate=0.1))
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)

# Train your model
# Fitting the ANN to the Training set
classifier.fit(x_train_processed, y_train, batch_size=32, epochs=100)

# use your model to make a prediction on unseen data
# Predicting the Test set results
y_pred = classifier.predict(x_test_processed)
y_pred = (y_pred > 0.5)  # Threshold of 50%

# Assess the accuracy of your model and explain your key findings
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]].
tn, fp = cm[0][0], cm[0][1]
fn, tp = cm[1][0], cm[1][1]

# Accuracy
accuracy = (tn + tp) / (tn + fp + fn + tp)
print("Accuracy: {} \n \n".format(accuracy))
# Precision (reported as 0.0 when the model predicts no positives at all)
precision = tp / (tp + fp) if (tp + fp) else 0.0
print("Precision: {} \n \n".format(precision))
# Recall (reported as 0.0 when the test set holds no positives)
recall = tp / (tp + fn) if (tp + fn) else 0.0
print("Recall: {} \n \n".format(recall))
# F1 Score: harmonic mean of precision and recall, i.e. 2PR / (P + R).
# The factor 2 was previously missing, which halved the reported score.
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
print("F1 Score: {} \n \n".format(f1))

"""
Testing on a single (made-up) value: 
age: 30-39
menopause: ge40
tumorsize: 25-29
inv-nodes: 0-2
node-caps: yes
deg-malig: 3
breast: right
breast-quad: left_up
irradiat: no
"""
# The made-up patient described above, as one row of 13 encoded features.
encoded_patient = [
    0.0, 0.0,            # menopause dummy columns (presumably both 0 for "ge40" - confirm against the encoder)
    0.0, 1.0, 0.0, 0.0,  # breast-quad dummy columns (presumably the set bit is "left_up" - confirm against the encoder)
    2.0,                 # age: 30-39
    6.0,                 # tumor-size: 25-29
    1.0,                 # inv-nodes: 0-2
    1.0,                 # node-caps: yes
    3.0,                 # deg-malig: 3
    1.0,                 # breast: right
    0.0,                 # irradiat: no
]
single_prediction = np.array([encoded_patient])

# Predict, applying the same 50% threshold as for the test set.
is_recurrence = classifier.predict(single_prediction) > 0.5
print(is_recurrence)
# This gave a single value of "True", so this individual is predicted to have a recurrence event.

Data looks like: 
      age menopause  ... irradiat                 Class
0  40-49   premeno  ...       no     recurrence-events
1  50-59      ge40  ...       no  no-recurrence-events
2  50-59      ge40  ...       no     recurrence-events
3  40-49   premeno  ...      yes  no-recurrence-events
4  40-49   premeno  ...       no     recurrence-events
5  50-59   premeno  ...      yes  no-recurrence-events
6  50-59      ge40  ...       no  no-recurrence-events
7  40-49   premeno  ...       no  no-recurrence-events
8  40-49   premeno  ...       no  no-recurrence-events
9  40-49      ge40  ...      yes  no-recurrence-events

[10 rows x 10 columns]
 
The data has 287 rows.
 
Values in age were:
Counter({'50-59': 96, '40-49': 90, '60-69': 57, '30-39': 37, '70-79': 6, '20-29': 1})
 
Values in menopause were:
Counter({'premeno': 150, 'ge40': 130, 'lt40': 7})
 
Values in tumor-size were:
Counter({'30-34': 60, '25-29': 55, '20-24': 50, '15-19': 30, datetime.datetime(2014, 10, 1, 0, 0): 28, '40-44': 



Adding in dropout the mean accuracy is now: 0.7017316162586212
With variance of: 0.05825527233621434 

Adding in dropout the mean accuracy is now: 0.7017316162586212
With variance of: 0.05825527233621434 

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/1

# Assessment Final 

In [None]:
'''
Deep Learning - Project

Alistair Broad 

In the following script I've used artificial neural networks to create a predictive model of 
whether breast cancer sufferers were likely to have a recurrence. 
'''

# Fixed dependencies - do not remove or change.
import pytest
import pandas as pd
import numpy as np
from google.colab import drive
# drive.mount('/content/gdrive/')
# Import your dependencies

# Dependencies 
import collections
import datetime
import keras
from sklearn.preprocessing import OneHotEncoder # Encode the variables with more than one category that aren't ordinal. 
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.python.keras.layers import Dense
from tensorflow.python.keras import Sequential
from sklearn.metrics import confusion_matrix
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from sklearn.model_selection import GridSearchCV



# Import data
def import_local_data(file_path):
    """Read the Excel file at ``file_path`` and return it as a pandas DataFrame.

    The file is parsed with ``pd.read_excel`` using its default settings.
    """
    return pd.read_excel(file_path)


local_file_path = '/content/drive/My Drive/Colab Notebooks/Data/breast-cancer.xls'

# Dont change
raw_data = import_local_data(local_file_path)


# Exploring the Data

# Inspect the dataset - gives a simple view of the data set.
print("Data looks like: \n {}".format(raw_data.head(10)), end="\n \n")
# Find the number of rows.
n_rows = len(raw_data.index)
print("The data has {} rows.".format(n_rows), end="\n \n")
# Iterate over the columns and inspect the unique values that occur and how many times they occur.
for col in raw_data:
    print("Values in {} were:".format(col), end="\n")
    print(collections.Counter(raw_data[col]), end="\n \n")

# Share of the majority class, computed from the loaded data rather than
# hard-coded, so the message cannot drift from the file actually read
# (the previous literal 201/286 disagreed with the printed row count).
# The class label is the last column of the sheet.
no_recurrence_count = collections.Counter(raw_data.iloc[:, -1])['no-recurrence-events']
no_recurrence_pct = round((no_recurrence_count / n_rows) * 100, 1) if n_rows else 0.0
print("Of the {} observations, {}% of them were 'no-recurrence-events'.".format(n_rows, no_recurrence_pct), end="\n \n")

# Explain your key findings
'''
From a quick look at our data-set, we can see that there are 286 samples, which is a fairly small dataset for this sort of thing. 
We can also see that 70.3% were "no-recurrence" events, which may be important when considering/evaluating our final model. 
Now, looking at the data: 
Age - the median range is "50-59" and there's a skew towards older age ranges, 
Menopause - even though there's a skew towards older individuals, more were premenopausal, 
tumor-size - to note here, "10-14" and "5-9" have been converted in the source file to dates, 
inv-nodes - as with the above, "3-5", "6-8", "9-11" and "12-14" have also been converted to dates, 
node-caps - this has 8 missing data points and 78% answered "no",
breast - this is fairly close to 50-50, as we would expect, 
breast-quad - this has 1 missing value.

Also to note, is that the data contains a mix of data types (binary nominal, ordinal categorical etc.). 

We will need to find a suitable way to deal with missing values and encode the data so a solution can 
be found. 
'''

# Split your data so that you can test the effectiveness of your model
# Independent variables: every column except the last one.
x = raw_data.iloc[:, :-1].values
# Dependent variable: column 9, label-encoded to integers in one step.
y = LabelEncoder().fit_transform(raw_data.iloc[:, 9].values)


# Creating the Training set and Test set
# 25% of the rows are held back for testing; the model learns from the other 75%.
# random_state is fixed so the split is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=1,
)


class Module4_Model:
    """Encode the raw breast-cancer feature matrix into a numeric array.

    The imputation values and the one-hot encoder are learned from the
    training data and stored on the instance, so that test data is encoded
    with the same fill values and the same column layout as the training
    data (previously both were re-learned from the test data itself).
    """

    # Marker used in the raw data for a missing value.
    _MISSING = '?'
    # Positional columns that may contain the missing marker:
    # "node-caps" (4) and "breast-quad" (7).
    _IMPUTED_COLUMNS = (4, 7)
    # Nominal columns with more than two categories, one-hot encoded:
    # "menopause" (1) and "breast-quad" (7).
    _ONE_HOT_COLUMNS = [1, 7]

    # Column position -> {raw value: numeric code}.
    # Some ranges were converted to dates in the source spreadsheet, so both
    # the date form and the original text form map to the same code.
    _VALUE_MAPPINGS = {
        # age (ordinal)
        0: {'20-29': 1, '30-39': 2, '40-49': 3, '50-59': 4, '60-69': 5, '70-79': 6},
        # tumor-size (ordinal)
        2: {
            '0-4': 1,
            datetime.datetime(2019, 9, 5, 0, 0): 2, '5-9': 2,
            datetime.datetime(2014, 10, 1, 0, 0): 3, '10-14': 3,
            '15-19': 4, '20-24': 5, '25-29': 6, '30-34': 7, '35-39': 8,
            '40-44': 9, '45-49': 10, '50-54': 11,
        },
        # inv-nodes (ordinal)
        3: {
            '0-2': 1,
            datetime.datetime(2019, 5, 3, 0, 0): 2, '3-5': 2,
            datetime.datetime(2019, 8, 6, 0, 0): 3, '6-8': 3,
            datetime.datetime(2019, 11, 9, 0, 0): 4, '9-11': 4,
            datetime.datetime(2014, 12, 1, 0, 0): 5, '12-14': 5,
            '15-17': 6, '24-26': 7,
        },
        # node-caps (binary)
        4: {"no": 0, "yes": 1},
        # breast (binary)
        6: {"left": 0, "right": 1},
        # irradiat (binary)
        8: {"no": 0, "yes": 1},
    }

    def __init__(self):
        self.model = None
        # Learned by preprocess_training_data and reused for the test data.
        self._fill_values = None
        self._column_transformer = None

    def preprocess_training_data(self, training_df):
        """Encode the training features and remember how they were encoded.

        Parameters: training_df - 2-D array of raw feature values (one row
        per sample, columns in the order of the source sheet).
        Returns: a float numpy array of encoded features. The input array is
        not modified.
        """
        return self._preprocess(training_df, fit=True)

    def preprocess_test_data(self, test_df):
        """Encode test features with the state learned from the training data.

        If no training data has been processed yet, the fill values and the
        encoder are learned from ``test_df`` itself, which is what this
        method always did before.
        Returns: a float numpy array of encoded features. The input array is
        not modified.
        """
        return self._preprocess(test_df, fit=self._column_transformer is None)

    @classmethod
    def _most_common_value(cls, column):
        """Return the mode of ``column``, ignoring the missing-value marker."""
        counts = collections.Counter(value for value in column if value != cls._MISSING)
        if not counts:
            # Nothing but missing values: there is nothing to impute with.
            return cls._MISSING
        return counts.most_common(1)[0][0]

    def _preprocess(self, data, fit):
        """Impute, map and one-hot encode ``data``; learn the state when ``fit``."""
        # Work on a copy so the caller's array is left untouched.
        features = np.array(data, dtype=object)

        # Replacing missing data: categorical gaps are filled with the mode.
        if fit:
            self._fill_values = {
                col: self._most_common_value(features[:, col])
                for col in self._IMPUTED_COLUMNS
            }
        for col, fill_value in self._fill_values.items():
            column = features[:, col]
            features[:, col] = np.where(column == self._MISSING, fill_value, column)

        # A DataFrame makes the per-column value mapping straightforward.
        frame = pd.DataFrame(features)
        for col, mapping in self._VALUE_MAPPINGS.items():
            frame[col] = frame[col].map(mapping)

        # One-hot encode the nominal columns; the remaining columns are
        # passed through and appended after the dummy columns.
        if fit:
            self._column_transformer = ColumnTransformer(
                # Categories unseen during fitting encode as all zeros
                # instead of raising.
                [('encoder', OneHotEncoder(handle_unknown='ignore'), self._ONE_HOT_COLUMNS)],
                remainder='passthrough',
                sparse_threshold=0,  # always return a dense array
            )
            encoded = self._column_transformer.fit_transform(frame)
        else:
            encoded = self._column_transformer.transform(frame)
        # The builtin float replaces the np.float alias removed from NumPy.
        encoded = np.array(encoded, dtype=float)

        # Drop the first dummy column of each one-hot encoded feature
        # (columns 0 and 3 when "menopause" has three categories).
        encoder = self._column_transformer.named_transformers_['encoder']
        category_counts = [len(categories) for categories in encoder.categories_]
        first_dummy_columns = np.cumsum([0] + category_counts[:-1])
        return np.delete(encoded, first_dummy_columns, axis=1)
    

# Dont change
# Create the preprocessing helper whose methods encode the feature matrices.
my_model = Module4_Model()

# Dont change
# Encode the training features into the numeric array the networks are fitted on.
x_train_processed = my_model.preprocess_training_data(x_train)

#### Model 1 #### 

# Create a model

# Initialising the ANN
classifier = Sequential()  # model class

# Adding the input layer and the first hidden layer
# (the add method appends layers in order).
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13))

# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))

# Adding the output layer
# Sigmoid is used at the end to get a probability out of the network.
# For a dependent variable with more than two categories, change units to the
# number of categories and the activation to "softmax".
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# Compiling the ANN
# 'adam' is a stochastic gradient descent algorithm.
# For a dependent variable with more than two categories use
# "categorical_crossentropy" as the loss instead.
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])


# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)


# Train your model
# Fitting the ANN to the Training set
classifier.fit(x_train_processed, y_train, batch_size=10, epochs=100)

# use your model to make a prediction on unseen data
# Predicting the Test set results
y_pred = classifier.predict(x_test_processed)
y_pred = (y_pred > 0.5)  # Threshold of 50%

# Assess the accuracy of your model and explain your key findings
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]].
tn, fp = cm[0][0], cm[0][1]
fn, tp = cm[1][0], cm[1][1]

# Accuracy
accuracy = (tn + tp) / (tn + fp + fn + tp)
print("Accuracy: {} \n \n".format(accuracy))
# Precision (reported as 0.0 when the model predicts no positives at all)
precision = tp / (tp + fp) if (tp + fp) else 0.0
print("Precision: {} \n \n".format(precision))
# Recall (reported as 0.0 when the test set holds no positives)
recall = tp / (tp + fn) if (tp + fn) else 0.0
print("Recall: {} \n \n".format(recall))
# F1 Score: harmonic mean of precision and recall, i.e. 2PR / (P + R).
# The factor 2 was previously missing, which halved the reported score.
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
print("F1 Score: {} \n \n".format(f1))

'''
OUTPUT:
[[46  7]
 [15  4]]
Accuracy: 0.6944444444444444 
 

Precision: 0.36363636363636365 
 

Recall: 0.21052631578947367 
 

F1 Score: 0.13333333333333333 

Looking over these outputs, we can see that this is currently a pretty terrible model. Precision, recall and F1 are exceptionally low
and accuracy is below what would be achieved if the model always predicted "no-recurrence". Below I will try to improve the
model by searching for the best parameters and adding dropout (which should help to prevent overfitting).

'''

'''
Parameter tuning with gridsearch 
Note: This section takes an exceptionally long time to run - hence why I only tested up to 200 epochs.

Here I've tested if having more epochs, batch size or a different optimiser will give us a 
better model. 
'''

# Tuning the ANN
def build_classifier(optimizer):
    """Build and compile the two-hidden-layer network for a given optimizer.

    ``optimizer`` is passed straight to ``compile``. Returns a compiled
    Sequential model: 13 inputs, two ReLU hidden layers of 6 units and a
    single sigmoid output unit, using binary cross-entropy loss and the
    accuracy metric.
    """
    hidden = dict(units=6, kernel_initializer='uniform', activation='relu')
    model = Sequential([
        Dense(input_dim=13, **hidden),
        Dense(**hidden),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ])
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Wrap the builder so scikit-learn can treat the network as an estimator.
classifier = KerasClassifier(build_fn=build_classifier)
# Candidate settings; every combination is tried with 10-fold cross-validation.
parameters = dict(
    batch_size=[25, 32],
    epochs=[100, 200],
    optimizer=['adam', 'rmsprop'],
)
grid_search = GridSearchCV(
    estimator=classifier,
    param_grid=parameters,
    scoring='accuracy',
    cv=10,
).fit(x_train_processed, y_train)
best_parameters, best_accuracy = grid_search.best_params_, grid_search.best_score_

print(best_accuracy)
print(best_parameters)

'''
OUTPUT: 
0.7146245059288537
{'batch_size': 32, 'epochs': 100, 'optimizer': 'adam'}

So here we have found that the number of epochs and the optimiser we'd chosen were the best, but this 
suggests that increasing the batch size will give a marginally better result - the accuracy is now
at least above the percentage of "no-recurrence".
'''


'''
Below I've experimented with adding in dropout in each hidden layer and tested the effects of adding in
another hidden layer. 
'''

# Adding dropout to prevent over fitting. 
def build_classifier():
    """Build and compile the two-hidden-layer network with dropout.

    Returns a compiled Sequential model: 13 inputs, two ReLU hidden layers of
    6 units (each followed by dropout at rate 0.1) and a single sigmoid output
    unit, compiled with the 'adam' optimizer, binary cross-entropy loss and
    the accuracy metric.
    """
    hidden = dict(units=6, kernel_initializer='uniform', activation='relu')
    model = Sequential([
        Dense(input_dim=13, **hidden),
        Dropout(rate=0.1),
        Dense(**hidden),
        Dropout(rate=0.1),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# 10-fold cross-validation of the network returned by build_classifier,
# run on the training set only.
classifier = KerasClassifier(build_fn=build_classifier, batch_size=32, epochs=100)
accuracies = cross_val_score(estimator=classifier, X=x_train_processed, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
# NOTE: despite its historical name, `variance` holds the standard deviation
# of the fold accuracies (it comes from .std()); the printed label says so.
variance = accuracies.std()

print("Adding in dropout the mean accuracy is now: {}".format(mean))
print("With standard deviation of: {} \n".format(variance))

# Adding in an additional hidden layer. 
def build_classifier():
    """Build and compile the three-hidden-layer network with dropout.

    Returns a compiled Sequential model: 13 inputs, three ReLU hidden layers
    of 6 units (each followed by dropout at rate 0.1) and a single sigmoid
    output unit, compiled with the 'adam' optimizer, binary cross-entropy
    loss and the accuracy metric.
    """
    hidden = dict(units=6, kernel_initializer='uniform', activation='relu')
    model = Sequential([
        Dense(input_dim=13, **hidden),
        Dropout(rate=0.1),
        Dense(**hidden),
        Dropout(rate=0.1),
        Dense(**hidden),
        Dropout(rate=0.1),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ])
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# 10-fold cross-validation of the network returned by build_classifier
# (the variant with an additional hidden layer), run on the training set only.
classifier = KerasClassifier(build_fn=build_classifier, batch_size=32, epochs=100)
accuracies = cross_val_score(estimator=classifier, X=x_train_processed, y=y_train, cv=10, n_jobs=-1)
mean = accuracies.mean()
# NOTE: despite its historical name, `variance` holds the standard deviation
# of the fold accuracies (it comes from .std()); the printed label says so.
variance = accuracies.std()

# The label previously repeated "Adding in dropout", which made the two
# experiments indistinguishable in the output.
print("Adding in an additional hidden layer the mean accuracy is now: {}".format(mean))
print("With standard deviation of: {} \n".format(variance))

'''
OUTPUT:
Adding in dropout the mean accuracy is now: 0.7099567174911499
With variance of: 0.07820415336432304 

Adding in dropout the mean accuracy is now: 0.6958874583244323
With variance of: 0.07884998617267996 

From the output above we see that adding in dropout slightly decreases the mean accuracy, but this is 
likely worth it to prevent overfitting. 

Adding another hidden layer decreases the mean accuracy and increases the spread of the accuracies (very slightly), so 
it's best not to add this new layer. 

So, we have finished tuning the model. Though it's still not great, I have made a slight improvement.
To make the model better, the main point of action would be to obtain more data!

Below is the final and slightly improved model. 
'''

#### Final Model #### 

# Initialising the ANN
classifier = Sequential()  # model class
# Adding the input layer and the first hidden layer
# (the add method appends layers in order).
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13))
# Adding dropout.
classifier.add(Dropout(rate=0.1))
# Adding the second hidden layer
classifier.add(Dense(units=6, kernel_initializer='uniform', activation='relu'))
# Adding dropout.
classifier.add(Dropout(rate=0.1))
# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compiling the ANN
classifier.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Dont change
x_test_processed = my_model.preprocess_test_data(x_test)

# Train your model
# Fitting the ANN to the Training set
classifier.fit(x_train_processed, y_train, batch_size=32, epochs=100)

# use your model to make a prediction on unseen data
# Predicting the Test set results
y_pred = classifier.predict(x_test_processed)
y_pred = (y_pred > 0.5)  # Threshold of 50%

# Assess the accuracy of your model and explain your key findings
# Making the Confusion Matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)

# scikit-learn lays the binary confusion matrix out as [[TN, FP], [FN, TP]].
tn, fp = cm[0][0], cm[0][1]
fn, tp = cm[1][0], cm[1][1]

# Accuracy
accuracy = (tn + tp) / (tn + fp + fn + tp)
print("Accuracy: {} \n \n".format(accuracy))
# Precision (reported as 0.0 when the model predicts no positives at all)
precision = tp / (tp + fp) if (tp + fp) else 0.0
print("Precision: {} \n \n".format(precision))
# Recall (reported as 0.0 when the test set holds no positives)
recall = tp / (tp + fn) if (tp + fn) else 0.0
print("Recall: {} \n \n".format(recall))
# F1 Score: harmonic mean of precision and recall, i.e. 2PR / (P + R).
# The factor 2 was previously missing, which halved the reported score.
f1 = (2 * precision * recall) / (precision + recall) if (precision + recall) else 0.0
print("F1 Score: {} \n \n".format(f1))

"""
Testing on a single (made-up) value: 
age: 30-39
menopause: ge40
tumorsize: 25-29
inv-nodes: 0-2
node-caps: yes
deg-malig: 3
breast: right
breast-quad: left_up
irradiat: no
"""
# The made-up patient described above, as one row of 13 encoded features.
encoded_patient = [
    0.0, 0.0,            # menopause dummy columns (presumably both 0 for "ge40" - confirm against the encoder)
    0.0, 1.0, 0.0, 0.0,  # breast-quad dummy columns (presumably the set bit is "left_up" - confirm against the encoder)
    2.0,                 # age: 30-39
    6.0,                 # tumor-size: 25-29
    1.0,                 # inv-nodes: 0-2
    1.0,                 # node-caps: yes
    3.0,                 # deg-malig: 3
    1.0,                 # breast: right
    0.0,                 # irradiat: no
]
single_prediction = np.array([encoded_patient])

# Predict, applying the same 50% threshold as for the test set.
is_recurrence = classifier.predict(single_prediction) > 0.5
print(is_recurrence)
# This gave a single value of "True", so this individual is predicted to have a recurrence event.