<a href="https://colab.research.google.com/github/alouisbroad/Machine_Learning/blob/main/Code_For_Cash_Machine_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project

In [None]:
'''
Machine Learning - Project

Alistair Broad 

Note: I've tried to structure this as if it were a piece of work, where I was investigating the data
and trying to find the model with the best fit (using the knowledge from the course).

Instructions:
I've used Google Colab to produce this, so the only part that 
might have to be changed is the directory of the data (the path passed to pd.read_excel below).
'''
# Importing standard use libraries
import numpy as np
import pandas as pd
import datetime

# Load the spreadsheet. The path assumes the file sits in Google Drive next to the
# notebook, so it may need changing when run elsewhere.
dataset = pd.read_excel('/content/drive/My Drive/Colab Notebooks/Data/breast-cancer.xls')
y = dataset.iloc[:, 9].values    # Dependent variable (column index 9).
X = dataset.iloc[:, :-1].values  # Independent variables: every column except the last.

# Inspection snippets, left disabled:
#print(dataset.head(10)) # Inspect independent variables X.
#print(Counter(dataset))
#for col in dataset:
#  print(dataset[col].unique()) # Two columns have missing values (as "?").

# Missing data: for categorical columns we could either drop the row or substitute the
# mode. Here every "?" is replaced by the most frequent entry of its own column.
from collections import Counter
for column_index in (4, 7):  # 4 is "node-caps", 7 is "breast-quad".
    # most_common(1) returns [(value, count)]; keep only the value (the mode).
    replace_missing = Counter(X[:, column_index]).most_common(1)[0][0]
    X[:, column_index] = np.where(X[:, column_index] == '?', replace_missing, X[:, column_index])

X = pd.DataFrame(X)  # A DataFrame makes the per-column .map() encoding below easier.

# Encoding the Independent Variables
# The non-numeric independent variables are replaced by numbers. Ordinal variables are
# mapped onto an increasing integer scale. Binary nominal variables are mapped straight to
# 0/1, since one-hot encoding them and then dropping a dummy column would be redundant.

# Ordinal categorical.
# NOTE(review): several range labels appear below as datetime keys - presumably because
# Excel auto-converted labels such as "5-9" or "10-14" into dates inside the .xls file.
# Confirm against the raw data before changing these keys.
age_mapping = {'20-29':1, '30-39':2, '40-49':3, '50-59':4, '60-69':5, '70-79':6}
X[0] = X[0].map(age_mapping)
tumorsize_mapping = { '0-4':1, datetime.datetime(2019, 9, 5, 0, 0):2, datetime.datetime(2014, 10, 1, 0, 0):3, '15-19':4, '20-24':5, '25-29':6, '30-34':7, '35-39':8, '40-44':9, '45-49':10, '50-54':11}
X[2] = X[2].map(tumorsize_mapping)
inv_nodes_mapping = {'0-2':1, datetime.datetime(2019, 5, 3, 0, 0):2, datetime.datetime(2019, 8, 6, 0, 0):3, datetime.datetime(2019, 11, 9, 0, 0):4, datetime.datetime(2014, 12, 1, 0, 0):5, '15-17':6, '24-26':7}
X[3] = X[3].map(inv_nodes_mapping)

# Binary nominal.
node_caps_mapping = {"no":0, "yes":1}
X[4] = X[4].map(node_caps_mapping)
breast_mapping = {"left":0, "right":1}
X[6] = X[6].map(breast_mapping)
irradiat_mapping = {"no":0, "yes":1}
X[8] = X[8].map(irradiat_mapping)

# One-hot encode the nominal variables with more than two categories (columns 1 and 7).
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
transf = ColumnTransformer([('encoder', OneHotEncoder(), [1,7])], remainder='passthrough')
# The `np.float` alias was removed in NumPy 1.24; the builtin `float` gives the same
# float64 dtype on every NumPy version.
X = np.array(transf.fit_transform(X), dtype=float)
X = X[:, 1:] # Remove one dummy variable from menopause (the first encoded column).
# Rotate the first three columns so that the column at index 2 moves to the front, then
# drop it. Assumes menopause produced 3 dummies, so index 2 is now the first
# "breast-quad" dummy - TODO confirm against the data.
t = X[:, [2,0,1]]
X[:, [0,1,2]] = t
X = X[:, 1:] # Removes 1 dummy variable for the "breast-quad" variable.

# Hold out 20% of the rows for testing; the models learn from the remaining 80%.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Feature scaling (needed because an SVM is used). The scaler is fitted on the training
# rows only and then applied unchanged to the test rows.
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler().fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

'''
From here each classifier is tested, printing an example confusion matrix together with the
mean and standard deviation of its cross-validated accuracy to help judge the best model.
The classifiers tested are: 
  - Kernel SVM,
  - XGBoost,
  - Logistic Regression,
  - KNN,
  - Naive Bayes,
  - Decision Tree, 
  - Random Forest.
Then the best classifier is chosen. 
'''
#####  Kernel SVM ##########
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Baseline RBF-kernel SVM, fitted on the training split.
classifier = SVC(kernel='rbf')
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Kernel SVM confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of kernel SVM is", accuracies.mean())
print("The mean sd of kernel SVM is", accuracies.std())

# Grid search over kernel, C and gamma to pick the best hyper-parameters.
# Author's note: gamma default is 1/n (n features), so the gamma grid is denser around there.
c_values = [1, 10, 100, 1000]
gamma_values = [0.001, 0.01, 0.1, 0.11, 0.12, 0.13, 0.14, 0.150, 0.16,
                0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
]
# 10-fold cross-validation per candidate, scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

print("The best accuracy given in the grid search was: ", best_accuracy)
print("Where the best parameters are: ", best_parameters)

##### XGBoost ##############
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Default-parameter XGBoost classifier, fitted on the training split.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("XGBoost confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of XGBoost is", accuracies.mean())
print("The mean sd of XGBoost is", accuracies.std())

#### Logistic Regression #####
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Default-parameter logistic regression; what it learns from the training split is then
# applied to the test split.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Confusion matrix: compares the true test labels with the predicted ones.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Logistic Regression confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Logistic Regression is", accuracies.mean())
print("The mean sd of Logistic Regression is", accuracies.std())

#### knn #########
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 5 nearest neighbours; the Minkowski metric with p = 2 is the Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("knn confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of knn is", accuracies.mean())
print("The mean sd of knn is", accuracies.std())

#### Naive Bayes #####
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Gaussian naive Bayes with default parameters, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Naive Bayesn confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Naive Bayes is", accuracies.mean())
print("The mean sd of Naive Bayes is", accuracies.std())

#### Decision Tree ####
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Single decision tree that uses entropy as its split criterion.
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Decision Tree confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Decision Tree is", accuracies.mean())
print("The mean sd of Decision Tree is", accuracies.std())

#### Random Tree ####
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest of 5 trees, each split chosen by entropy (labelled "Random Tree" in the
# printed output).
classifier = RandomForestClassifier(n_estimators=5, criterion='entropy')
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Random Tree confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Random Tree is", accuracies.mean())
print("The mean sd of Random Tree is", accuracies.std())

#### Final Model#### 
'''
After searching and testing classifiers, I have found that SVM seems to be the best fit for
this problem - with consistently the best results for both accuracy and standard deviation.
I then used grid search to further optimise this result. Below is the code for the 
final and best classifier, with parameters taken from a grid search run.

This is likely the best because SVMs work quite well with smaller data sets. If more data was 
available, we might expect another classifier to work better (XGBoost is a particularly good
ensemble method). In the example confusion matrix given by kernel SVM, we can see few/no type
1 errors occur, while many type 2 errors do. If more data was available, we might seek a
solution with fewer type 2 errors. 
'''
#####  Best model found: Kernel SVM ##########
# Fitting Kernel SVM to the Training set.
# NOTE: C and gamma are hard-coded from an earlier grid-search run. Because the train/test
# split is random, a fresh grid search may report slightly different best parameters.
from sklearn.svm import SVC
classifier = SVC(C = 1, kernel = 'rbf', gamma = 0.12)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print("An example a confusion matrix for this model is given by:")
print(cm)

# Applying k-Fold Cross Validation - For evaluating model.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 15) # Vector of accuracies.  

# These figures belong to the final kernel SVM; the labels previously said "Random Tree"
# (copied from the random forest section), which misattributed the results.
print("The mean accuracy of the final kernel SVM is", accuracies.mean())  # Mean of accuracies.
print("The mean sd of the final kernel SVM is", accuracies.std())   # Standard deviation of accuracies. 

# This model is then ready to use to predict those with further potential risk of recurrence. 

########## new - a prediction ####################
# Would need to alter this!!!!!
# Predicting a single new observation
# NOTE(review): everything below is a placeholder. The text and the commented-out call
# describe a bank-customer example with 11 input values, not this breast-cancer data, and
# `sc` is not defined in the visible code (the scaler fitted earlier is named `scaling`).
# The string below is a bare expression with no effect on the models.
"""Predict if the customer with the following informations will leave the bank:
Geography: France
Credit Score: 600
Gender: Male
Age: 40
Tenure: 3
Balance: 60000
Number of Products: 2
Has Credit Card: Yes
Is Active Member: Yes
Estimated Salary: 50000"""
#new_prediction = classifier.predict(sc.transform(np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])))
#new_prediction = (new_prediction > 0.5)


Kernel SVM confusion matrix is given by:
[[34  4]
 [12  8]]
The mean accuracy of kernel SVM is 0.7019444444444444
The mean sd of kernel SVM is 0.05732999559621237
The best accuracy given in the grid search was:  0.7191699604743083
Where the best parameters are:  {'C': 1, 'gamma': 0.14, 'kernel': 'rbf'}
XGBoost confusion matrix is given by:
[[30  8]
 [13  7]]
The mean accuracy of XGBoost is 0.6888888888888887
The mean sd of XGBoost is 0.07635605354901631
Logistic Regression confusion matrix is given by:
[[34  4]
 [15  5]]
The mean accuracy of Logistic Regression is 0.7108333333333332
The mean sd of Logistic Regression is 0.044586578282740694
knn confusion matrix is given by:
[[33  5]
 [12  8]]
The mean accuracy of knn is 0.6891666666666666
The mean sd of knn is 0.10287307893890593
Naive Bayesn confusion matrix is given by:
[[30  8]
 [ 9 11]]
The mean accuracy of Naive Bayes is 0.6749999999999999
The mean sd of Naive Bayes is 0.16467560704474588
Decision Tree confusion matrix is given by

## Machine Learning

In [None]:
'''
Machine Learning - Project

Alistair Broad 

Note: I've tried to structure this as if it were a piece of work, where I was investigating the data
and trying to find the model with the best fit (using the knowledge from the course).

Instructions:
I've used Google Colab to produce this, so the only part that 
might have to be changed is the directory of the data (the path passed to pd.read_excel below).
'''
# Importing standard use libraries
import numpy as np
import pandas as pd
import datetime

# Load the spreadsheet. The path assumes the file sits in Google Drive next to the
# notebook, so it may need changing when run elsewhere.
dataset = pd.read_excel('/content/drive/My Drive/Colab Notebooks/Data/breast-cancer.xls')
y = dataset.iloc[:, 9].values    # Dependent variable (column index 9).
X = dataset.iloc[:, :-1].values  # Independent variables: every column except the last.

# Inspection snippets, left disabled:
#print(dataset.head(10)) # Inspect independent variables X.
#print(Counter(dataset))
#for col in dataset:
#  print(dataset[col].unique()) # Two columns have missing values (as "?").

# Missing data: for categorical columns we could either drop the row or substitute the
# mode. Here every "?" is replaced by the most frequent entry of its own column.
from collections import Counter
for column_index in (4, 7):  # 4 is "node-caps", 7 is "breast-quad".
    # most_common(1) returns [(value, count)]; keep only the value (the mode).
    replace_missing = Counter(X[:, column_index]).most_common(1)[0][0]
    X[:, column_index] = np.where(X[:, column_index] == '?', replace_missing, X[:, column_index])

X = pd.DataFrame(X)  # A DataFrame makes the per-column .map() encoding below easier.

# Encoding the Independent Variables
# The non-numeric independent variables are replaced by numbers. Ordinal variables are
# mapped onto an increasing integer scale. Binary nominal variables are mapped straight to
# 0/1, since one-hot encoding them and then dropping a dummy column would be redundant.

# Ordinal categorical.
# NOTE(review): several range labels appear below as datetime keys - presumably because
# Excel auto-converted labels such as "5-9" or "10-14" into dates inside the .xls file.
# Confirm against the raw data before changing these keys.
age_mapping = {'20-29':1, '30-39':2, '40-49':3, '50-59':4, '60-69':5, '70-79':6}
X[0] = X[0].map(age_mapping)
tumorsize_mapping = { '0-4':1, datetime.datetime(2019, 9, 5, 0, 0):2, datetime.datetime(2014, 10, 1, 0, 0):3, '15-19':4, '20-24':5, '25-29':6, '30-34':7, '35-39':8, '40-44':9, '45-49':10, '50-54':11}
X[2] = X[2].map(tumorsize_mapping)
inv_nodes_mapping = {'0-2':1, datetime.datetime(2019, 5, 3, 0, 0):2, datetime.datetime(2019, 8, 6, 0, 0):3, datetime.datetime(2019, 11, 9, 0, 0):4, datetime.datetime(2014, 12, 1, 0, 0):5, '15-17':6, '24-26':7}
X[3] = X[3].map(inv_nodes_mapping)

# Binary nominal.
node_caps_mapping = {"no":0, "yes":1}
X[4] = X[4].map(node_caps_mapping)
breast_mapping = {"left":0, "right":1}
X[6] = X[6].map(breast_mapping)
irradiat_mapping = {"no":0, "yes":1}
X[8] = X[8].map(irradiat_mapping)

# One-hot encode the nominal variables with more than two categories (columns 1 and 7).
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
transf = ColumnTransformer([('encoder', OneHotEncoder(), [1,7])], remainder='passthrough')
# The `np.float` alias was removed in NumPy 1.24; the builtin `float` gives the same
# float64 dtype on every NumPy version.
X = np.array(transf.fit_transform(X), dtype=float)
X = X[:, 1:] # Remove one dummy variable from menopause (the first encoded column).
# Rotate the first three columns so that the column at index 2 moves to the front, then
# drop it. Assumes menopause produced 3 dummies, so index 2 is now the first
# "breast-quad" dummy - TODO confirm against the data.
t = X[:, [2,0,1]]
X[:, [0,1,2]] = t
X = X[:, 1:] # Removes 1 dummy variable for the "breast-quad" variable.

# Hold out 20% of the rows for testing; the models learn from the remaining 80%.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Feature scaling (needed because an SVM is used). The scaler is fitted on the training
# rows only and then applied unchanged to the test rows.
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler().fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)

'''
From here each classifier is tested, printing an example confusion matrix together with the
mean and standard deviation of its cross-validated accuracy to help judge the best model.
The classifiers tested are: 
  - Kernel SVM,
  - XGBoost,
  - Logistic Regression,
  - KNN,
  - Naive Bayes,
  - Decision Tree, 
  - Random Forest.
Then the best classifier is chosen. 
'''
#####  Kernel SVM ##########
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV

# Baseline RBF-kernel SVM, fitted on the training split.
classifier = SVC(kernel='rbf')
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Kernel SVM confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of kernel SVM is", accuracies.mean())
print("The mean sd of kernel SVM is", accuracies.std())

# Grid search over kernel, C and gamma to pick the best hyper-parameters.
# Author's note: gamma default is 1/n (n features), so the gamma grid is denser around there.
c_values = [1, 10, 100, 1000]
gamma_values = [0.001, 0.01, 0.1, 0.11, 0.12, 0.13, 0.14, 0.150, 0.16,
                0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
parameters = [
    {'C': c_values, 'kernel': ['linear']},
    {'C': c_values, 'kernel': ['rbf'], 'gamma': gamma_values},
]
# 10-fold cross-validation per candidate, scored by accuracy, using all CPU cores.
grid_search = GridSearchCV(classifier, parameters, scoring='accuracy', cv=10, n_jobs=-1)
grid_search = grid_search.fit(X_train, y_train)
best_accuracy, best_parameters = grid_search.best_score_, grid_search.best_params_

print("The best accuracy given in the grid search was: ", best_accuracy)
print("Where the best parameters are: ", best_parameters)

##### XGBoost ##############
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

# Default-parameter XGBoost classifier, fitted on the training split.
classifier = XGBClassifier()
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("XGBoost confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of XGBoost is", accuracies.mean())
print("The mean sd of XGBoost is", accuracies.std())

#### Logistic Regression #####
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Default-parameter logistic regression; what it learns from the training split is then
# applied to the test split.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Confusion matrix: compares the true test labels with the predicted ones.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Logistic Regression confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Logistic Regression is", accuracies.mean())
print("The mean sd of Logistic Regression is", accuracies.std())

#### knn #########
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# 5 nearest neighbours; the Minkowski metric with p = 2 is the Euclidean distance.
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("knn confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of knn is", accuracies.mean())
print("The mean sd of knn is", accuracies.std())

#### Naive Bayes #####
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score

# Gaussian naive Bayes with default parameters, fitted on the training split.
classifier = GaussianNB()
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Naive Bayesn confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Naive Bayes is", accuracies.mean())
print("The mean sd of Naive Bayes is", accuracies.std())

#### Decision Tree ####
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

# Single decision tree that uses entropy as its split criterion.
classifier = DecisionTreeClassifier(criterion='entropy')
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Decision Tree confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Decision Tree is", accuracies.mean())
print("The mean sd of Decision Tree is", accuracies.std())

#### Random Tree ####
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Random forest of 5 trees, each split chosen by entropy (labelled "Random Tree" in the
# printed output).
classifier = RandomForestClassifier(n_estimators=5, criterion='entropy')
classifier.fit(X_train, y_train)

# Confusion matrix of true vs. predicted labels on the held-out test split.
y_pred = classifier.predict(X_test)
cm = confusion_matrix(y_test, y_pred)
print("Random Tree confusion matrix is given by:", cm, sep="\n")

# 15-fold cross-validation on the training split: one accuracy per fold.
accuracies = cross_val_score(classifier, X_train, y_train, cv=15)
print("The mean accuracy of Random Tree is", accuracies.mean())
print("The mean sd of Random Tree is", accuracies.std())

########## new - a prediction ####################
# Would need to alter this!!!!!
# Predicting a single new observation
# NOTE(review): everything below is a placeholder. The text and the commented-out call
# describe a bank-customer example with 11 input values, not this breast-cancer data, and
# `sc` is not defined in the visible code (the scaler fitted earlier is named `scaling`).
# The bare string is the last expression of this cell, so the notebook echoes it as the
# cell's output value.
"""Predict if the customer with the following informations will leave the bank:
Geography: France
Credit Score: 600
Gender: Male
Age: 40
Tenure: 3
Balance: 60000
Number of Products: 2
Has Credit Card: Yes
Is Active Member: Yes
Estimated Salary: 50000"""
#new_prediction = classifier.predict(sc.transform(np.array([[0.0, 0, 600, 1, 40, 3, 60000, 2, 1, 1, 50000]])))
#new_prediction = (new_prediction > 0.5)

Kernel SVM confusion matrix is given by:
[[36  2]
 [16  4]]
The mean accuracy of kernel SVM is 0.7327777777777776
The mean sd of kernel SVM is 0.05964529929839455
The best accuracy given in the grid search was:  0.7454545454545454
Where the best parameters are:  {'C': 1, 'gamma': 0.2, 'kernel': 'rbf'}
XGBoost confusion matrix is given by:
[[30  8]
 [14  6]]
The mean accuracy of XGBoost is 0.7061111111111111
The mean sd of XGBoost is 0.07202215879869676
Logistic Regression confusion matrix is given by:
[[35  3]
 [15  5]]
The mean accuracy of Logistic Regression is 0.7108333333333333
The mean sd of Logistic Regression is 0.08855579952566422
knn confusion matrix is given by:
[[34  4]
 [14  6]]
The mean accuracy of knn is 0.7058333333333332
The mean sd of knn is 0.04882413641116756
Naive Bayesn confusion matrix is given by:
[[26 12]
 [10 10]]
The mean accuracy of Naive Bayes is 0.6583333333333333
The mean sd of Naive Bayes is 0.14115102235234114
Decision Tree confusion matrix is given by:


'Predict if the customer with the following informations will leave the bank:\nGeography: France\nCredit Score: 600\nGender: Male\nAge: 40\nTenure: 3\nBalance: 60000\nNumber of Products: 2\nHas Credit Card: Yes\nIs Active Member: Yes\nEstimated Salary: 50000'

In [None]:

#### Final Model#### 
'''
After searching and testing classifiers, I have found that SVM seems to be the best fit for
this problem - with consistently the best results for both accuracy and standard deviation.
I then used grid search to further optimise this result. Below is the code for the 
final and best classifier, with parameters taken from a grid search run.

This is likely the best because SVMs work quite well with smaller data sets. If more data was 
available, we might expect another classifier to work better (XGBoost is a particularly good
ensemble method). In the example confusion matrix given by kernel SVM, we can see few/no type
1 errors occur, while many type 2 errors do. If more data was available, we might seek a
solution with fewer type 2 errors. 
'''
#####  Best model found: Kernel SVM ##########
# Fitting Kernel SVM to the Training set.
# NOTE: C and gamma are hard-coded from an earlier grid-search run. Because the train/test
# split is random, a fresh grid search may report slightly different best parameters.
from sklearn.svm import SVC
classifier = SVC(C = 1, kernel = 'rbf', gamma = 0.12)
classifier.fit(X_train, y_train)

# Predicting the Test set results
y_pred = classifier.predict(X_test)

# Making the Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print("An example a confusion matrix for this model is given by:")
print(cm)

# Applying k-Fold Cross Validation - For evaluating model.
from sklearn.model_selection import cross_val_score
accuracies = cross_val_score(estimator = classifier, X = X_train, y = y_train, cv = 15) # Vector of accuracies.  

# These figures belong to the final kernel SVM; the labels previously said "Random Tree"
# (copied from the random forest section), which misattributed the results.
print("The mean accuracy of the final kernel SVM is", accuracies.mean())  # Mean of accuracies.
print("The mean sd of the final kernel SVM is", accuracies.std())   # Standard deviation of accuracies. 

# This model is then ready to use to predict those with further potential risk of recurrence. 

An example a confusion matrix for this model is given by:
[[38  2]
 [16  2]]
The mean accuracy of Random Tree is 0.7144444444444442
The mean sd of Random Tree is 0.06829380066943519


## Deep Learning

In [None]:
# Deep Learning

# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Load the spreadsheet from Google Drive (adjust the path when running elsewhere).
dataset = pd.read_excel('/content/drive/My Drive/Colab Notebooks/Data/breast-cancer.xls')
y = dataset.iloc[:, 9].values    # Dependent variable (column index 9).
X = dataset.iloc[:, :-1].values  # Independent variables: every column except the last.

# Inspection snippets, left disabled:
#print(dataset.head(10)) # Inspect independent variables X.
#print(Counter(dataset))
#for col in dataset:
#  print(dataset[col].unique()) # Two columns have missing values (as "?").

# Missing data: for categorical columns we could either drop the row or substitute the
# mode. Here every "?" is replaced by the most frequent entry of its own column.
from collections import Counter
for column_index in (4, 7):  # 4 is "node-caps", 7 is "breast-quad".
    # most_common(1) returns [(value, count)]; keep only the value (the mode).
    replace_missing = Counter(X[:, column_index]).most_common(1)[0][0]
    X[:, column_index] = np.where(X[:, column_index] == '?', replace_missing, X[:, column_index])

X = pd.DataFrame(X)  # A DataFrame makes the per-column .map() encoding below easier.

# Encoding the Independent Variables
# ordinal categorical 
age_mapping = {'20-29':1, '30-39':2, '40-49':3, '50-59':4, '60-69':5, '70-79':6}
X[0] = X[0].map(age_mapping)
tumorsize_mapping = { '0-4':1, '5-9':2, '10-14':3, '15-19':4, '20-24':5, '25-29':6, '30-34':7, '35-39':8, '40-44':9, '45-49':10, '50-54':11}
X[2] = X[2].map(tumorsize_mapping)
inv_nodes_mapping = {'0-2':1, '3-5':2, '6-8':3, '9-11':4, '12-14':5, '15-17':6, '24-26':7}
X[3] = X[3].map(inv_nodes_mapping)

# Nominal 
node_caps_mapping = {"no":0, "yes":1}
X[4] = X[4].map(node_caps_mapping)
breast_mapping = {"left":0, "right":1}
X[6] = X[6].map(breast_mapping)
irradiat_mapping = {"no":0, "yes":1}
X[8] = X[8].map(irradiat_mapping)

from sklearn.preprocessing import OneHotEncoder # Encode the variables with more than one category that aren't ordinal. 
from sklearn.compose import ColumnTransformer
# One-hot encode the two multi-category nominal columns, 1 ("menopause") and 7 ("breast-quad").
# The encoded dummies are placed first in the output, followed by the passthrough columns.
ct = ColumnTransformer([('encoder', OneHotEncoder(), [1, 7])], remainder='passthrough')
# Use the builtin float: the np.float alias was deprecated in NumPy 1.20 and removed in 1.24.
X = np.array(ct.fit_transform(X), dtype=float)
# Drop one dummy per encoded variable: column 0 is the first "menopause" dummy and
# column 3 the first "breast-quad" dummy. This yields the same columns, in the same
# order, as the previous slice / reorder / slice sequence.
# NOTE(review): index 3 assumes "menopause" encodes to exactly three dummies, as the
# previous index shuffle also did - confirm against the data.
X = np.delete(X, [0, 3], axis=1)

# Encoding the dependent variable
from sklearn.preprocessing import LabelEncoder
# Learn the class labels, replace each one with its integer code, and show the result.
label_encoder = LabelEncoder()
y = label_encoder.fit(y).transform(y)

print(y)

# Creating the Training set and Test set
from sklearn.model_selection import train_test_split
# Hold out 30% of the rows for testing. No random_state is passed, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

# Standardise the features: fit the scaler on the training rows only,
# then apply that same fitted scaler to both the training and the test rows.
from sklearn.preprocessing import StandardScaler
scaling = StandardScaler().fit(X_train)
X_train = scaling.transform(X_train)
X_test = scaling.transform(X_test)



[1 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0
 0 0 0 1 0 0 1 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 1 0 0 0 1 0 1 0 0 1
 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 1 1 0 0 0 1 0 1 1 1 1 0 0 0 1 1 1 0 0 0
 0 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 1 0 0 0 1 0
 0 0 1 0 0 1 0 0 1 1 0 0 0 0 0 0 1 1 1 0 0 0 1 1 0 1 0 0 0 1 1 0 0 1 0 1 0
 0 0 1 0 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 1 0 1 1 0 1 1 0 0 0 0 1 0 1 0 0 0
 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 1 0
 1 0 0 0 1 0 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [None]:

# Importing the Keras libraries and packages
import keras
# Import from the public tensorflow.keras namespace; tensorflow.python.* is a private
# module path and is not part of TensorFlow's supported API.
from tensorflow.keras.layers import Dense
from tensorflow.keras import Sequential


# Initialising the ANN
classifier = Sequential() # model class

# Adding the input layer and the first hidden layer.
# The add method is used to add layers. The input width is read from the training
# matrix so it always matches the number of encoded feature columns.
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu', input_dim = X_train.shape[1]))

# Adding the second hidden layer
classifier.add(Dense(units = 6, kernel_initializer = 'uniform', activation = 'relu'))

# Adding the output layer
classifier.add(Dense(units = 1, kernel_initializer = 'uniform', activation = 'sigmoid')) # Sigmoid is used on the output to get a probability.
# For a dependent variable with more than two categories, set units to the number of
# categories and change the activation to "softmax".

# Compiling the ANN
classifier.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
# adam is a stochastic gradient descent algorithm.
# For a dependent variable with more than two categories, use "categorical_crossentropy".

# Sanity-check the dtypes and labels that will be fed to the network.
print(X_train.dtype)
print(y_train.dtype)
print(y_train)

float64
int64
[0 0 1 0 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 1 0 1 0 0 0 0 0 0 1 0 0
 0 0 0 0 0 1 0 1 1 1 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 1 0 1
 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 0 0 0 1 0 0 0 1 0 0 0 0 1 1 0 0 0 1 0 0 0
 0 1 0 0 0 1 1 0 1 0 1 1 1 0 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 1 1
 0 0 0 1 0 0 0 1 0 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 1 1 0 0 0 1 0 0 0 0 0 1
 0 0 1 0 1 0 1 1 0 0 1 0 0 1 0]


In [None]:
# Train the network on the training set.
classifier.fit(X_train, y_train, epochs=100, batch_size=10)

# Part 3 - Making the predictions and evaluating the model

# Score the test set, then turn each network output into a boolean with a 0.5 cut-off.
predicted_probabilities = classifier.predict(X_test)
y_pred = predicted_probabilities > 0.5

# Tabulate the thresholded predictions against the true test labels.
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

print(cm)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [None]:
# Part 4 - Evaluating, Improving and Tuning the ANN

# Evaluating the ANN
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score
from keras.models import Sequential
from keras.layers import Dense
def build_classifier():
    """Build and compile the binary classifier network.

    The network takes 13 inputs, has two hidden ReLU layers of 6 units each and a
    single sigmoid output unit. It is compiled with the adam optimizer,
    binary cross-entropy loss and accuracy as the reported metric.

    Returns:
        The compiled Sequential model.
    """
    # Settings shared by both hidden layers.
    hidden = dict(units=6, kernel_initializer='uniform', activation='relu')

    model = Sequential()
    model.add(Dense(input_dim=13, **hidden))
    model.add(Dense(**hidden))
    model.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Wrap the model builder as a scikit-learn estimator and score it with 10-fold
# cross-validation on the training set (n_jobs=-1 requests all available processors).
classifier = KerasClassifier(build_fn=build_classifier, epochs=100, batch_size=10)
accuracies = cross_val_score(classifier, X_train, y_train, cv=10, n_jobs=-1)
mean = np.mean(accuracies)
variance = np.std(accuracies)  # NOTE: despite the name, this holds the standard deviation.

In [None]:
# Report the cross-validation summary values, one per line.
print(mean, variance, sep='\n')

0.6750000059604645
0.08732123949825665


In [None]:
'''
Parameter tuning with gridsearch 
'''
# Improving the ANN
# Dropout Regularization to reduce overfitting if needed

# Tuning the ANN
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV
from keras.models import Sequential
from keras.layers import Dense
def build_classifier(optimizer):
    """Build and compile the binary classifier network with a chosen optimizer.

    Args:
        optimizer: Optimizer passed straight through to ``compile``.

    Returns:
        The compiled Sequential model: 13 inputs, two hidden ReLU layers of
        6 units each, and a single sigmoid output unit.
    """
    layers = [
        Dense(units=6, kernel_initializer='uniform', activation='relu', input_dim=13),
        Dense(units=6, kernel_initializer='uniform', activation='relu'),
        Dense(units=1, kernel_initializer='uniform', activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return network
# Wrap the model builder as a scikit-learn estimator so GridSearchCV can drive it.
classifier = KerasClassifier(build_fn = build_classifier)
# Candidate hyper-parameters; 'optimizer' matches the build_classifier argument of that name.
# (A stray "unit" token after the closing brace made this statement a SyntaxError; it is removed.)
parameters = {'batch_size': [25, 32],
              'epochs': [100, 500],
              'optimizer': ['adam', 'rmsprop']}
# Search every combination, scoring each by accuracy with 10-fold cross-validation.
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10)
grid_search = grid_search.fit(X_train, y_train)
# Keep the best combination found and its cross-validated score.
best_parameters = grid_search.best_params_
best_accuracy = grid_search.best_score_

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Epoch 102/500
Epoch 103/500
Epoch 104/500
Epoch 105/500
Epoch 106/500
Epoch 107/500
Epoch 108/500
Epoch 109/500
Epoch 110/500
Epoch 111/500
Epoch 112/500
Epoch 113/500
Epoch 114/500
Epoch 115/500
Epoch 116/500
Epoch 117/500
Epoch 118/500
Epoch 119/500
Epoch 120/500
Epoch 121/500
Epoch 122/500
Epoch 123/500
Epoch 124/500
Epoch 125/500
Epoch 126/500
Epoch 127/500
Epoch 128/500
Epoch 129/500
Epoch 130/500
Epoch 131/500
Epoch 132/500
Epoch 133/500
Epoch 134/500
Epoch 135/500
Epoch 136/500
Epoch 137/500
Epoch 138/500
Epoch 139/500
Epoch 140/500
Epoch 141/500
Epoch 142/500
Epoch 143/500
Epoch 144/500
Epoch 145/500
Epoch 146/500
Epoch 147/500
Epoch 148/500
Epoch 149/500
Epoch 150/500
Epoch 151/500
Epoch 152/500
Epoch 153/500
Epoch 154/500
Epoch 155/500
Epoch 156/500
Epoch 157/500
Epoch 158/500
Epoch 159/500
Epoch 160/500
Epoch 161/500
Epoch 162/500
Epoch 163/500
Epoch 164/500
Epoch 165/500
Epoch 166/500
Epoch 167/500
Epoch 168/5