In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input mount and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(root, file_name))

# Any results you write to the current directory are saved as output.

In [None]:
#import the necessary libraries you are going to use
import warnings
warnings.filterwarnings('ignore')

# -----> Put your code here below:


import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Now loading the datasets

In [None]:
# Load the training and test sets from the Kaggle input directory.
train_path = "/kaggle/input/amp-data-set/AMP_TrainSet.csv"
test_path = "/kaggle/input/amp-data-set/Test.csv"
Train = pd.read_csv(train_path)
Test = pd.read_csv(test_path)

# At this point we are carrying out exploratory data analysis (EDA), an approach to analyzing data sets that summarizes their main characteristics, often with visuals. Its purpose is to suggest hypotheses about the causes of observed phenomena by assessing the assumptions on which statistical inferences will be based. It provides the basis for further data collection. In order to draw reliable conclusions from massive amounts of data, we must carefully and methodically look through the data, which is the reason for EDA

# Checking dimensions of the datasets

## We do this to identify how many features we are dealing with at this stage of EDA in order to prepare better for the next steps

In [None]:
# Show (rows, columns) for the training set and the test set side by side.
(Train.shape, Test.shape)

# The Train dataset has 3038 rows and 12 columns while the test dataset has 758 rows and 11 columns  

# Checking out the datasets
## The purpose of this command is to check whether we loaded the dataset properly or to be sure that it is the correct dataset

In [None]:
# Preview the first ten rows of the training set.
Train.iloc[:10]

In [None]:
# Preview the first ten rows of the test set.
Test.iloc[:10]

# The command showed us what data we are dealing with in the datasets

# We need to determine the data type we are working with in the dataset. This is important because most machine learning algorithms work with numerical data. The purpose of this is to identify whether we have numerical data(integers or floats) or categorical data that needs further coding to be manipulated in the algorithm

In [None]:
# Column data types of both datasets.
(Train.dtypes, Test.dtypes)

# from the above code we found that in both datasets, the only data types are floats and integers

In [None]:
# Count missing values per column in each dataset.
train_missing = Train.isna().sum()
test_missing = Test.isna().sum()
(train_missing, test_missing)



# From the code above we found that there is no missing data in any of the datasets

# Descriptive statistics

## These quantitatively describe features of a dataset that we aim to summarize, organize and clean. They are used to describe the data before feeding it into the machine learning model using features and sample sets.

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the training set.
train_summary = Train.describe()
train_summary

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) of the test set.
test_summary = Test.describe()
test_summary

# The above code showed us the descriptive statistics of the datasets which will be useful going forward.

## It is important that the CLASS variable is balanced because algorithms tend to favor the class with the largest proportion of observations, which may lead to misleading accuracies, especially if some classes are rare.

In [None]:
# Bar chart of the number of rows per CLASS value, to check class balance.
class_counts = Train.groupby('CLASS').size()
class_counts.plot.bar()

# From the plot we can see that distribution in the variable CLASS is balanced

# Determining correlation between variables

## Correlation explains the extent of the relationship between the features of the data. This is important in case one feature has a relationship with another and could provide valuable information for making sense of another during the prediction stage

In [None]:
# Pairwise Pearson correlation between all columns of the training set.
pearson_corr = Train.corr(method='pearson')
pearson_corr

In [None]:
# Heatmap of the Pearson correlation matrix on a 6x6 inch figure.
plt.figure(figsize=(6, 6))
corr_matrix = Train.corr(method='pearson')
sns.heatmap(corr_matrix)

In [None]:
# Pearson correlation of every column with the CLASS column.
Train.corr(method='pearson').loc[:, 'CLASS']

# The plot below is of histograms of the dataset and this shows us the distribution of each variable in the dataset

## Histograms summarize and display the distribution of the variables in the dataset, identify skewness of data and if need be identifies which features should be modified before inputting in the model

In [None]:
# One histogram per column of the training set.
# DataFrame.hist() always opens its own figure, so the size has to be passed
# to hist() itself; a preceding plt.figure(figsize=...) only produces an
# extra empty figure and leaves the histograms at the default size.
Train.hist(figsize=(24, 24))
plt.show()

# The plots below show presence or absence of outliers in each variable

## Box plots provide a standardized way of displaying distribution of data in terms of min,max,upper quartile,lower quartile and median. Outliers are numerically distant from the rest of the data. They may contain valuable information or not but they tend to skew data away from a normal distribution

In [None]:
# One box plot per column, arranged in a 6x2 grid with independent axes.
Train.plot.box(subplots=True, layout=(6, 2), sharex=False, sharey=False)
plt.show()

# The plot below shows us the distribution of each variable compared with one another

## The scatter plots illustrate a relationship between each variable with each other which can be positive or negative

In [None]:
# Scatter-plot matrix of every pair of columns in the training set.
sns.pairplot(data=Train)

# Distribution plot

## A univariate plot to know about the distribution of data when analyzing effect on dependent variable with respective to a single feature

In [None]:
# Distribution of the CLASS column on a single large facet.
# FacetGrid's `size` argument was renamed to `height` in seaborn 0.9; newer
# releases no longer accept `size`.
class_grid = sns.FacetGrid(Train, height=11)
class_grid.map(sns.distplot, 'CLASS').add_legend()

# Violin plot

## This is a combination of a box plot in the middle and distribution plots on both sides of the data, which gives us the details of the distribution. This plot is used to visualize the distribution of the data and its probability density. A violin plot contains all data points and is an excellent tool to visualize samples.

In [None]:
# Violin plot of NT_EFC195 for each CLASS value.
# violinplot has no `size` parameter; the stray keyword is dropped because
# newer seaborn releases forward unknown keywords to matplotlib, where it can
# raise instead of being ignored.
sns.violinplot(x='CLASS', y='NT_EFC195', data=Train)

# Classification

## From the statistical descriptions we saw that some variables had negative values and thus i decided to work the data for both rescaled data and the same data to see if these values will affect our final outcome

# Rescaling the data

## As the data scales vary in the dataset, rescaling is useful for optimization algorithms

In [None]:
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

# Split the training frame into inputs (columns 0-10) and target (column 11).
array = Train.values
X, Y = array[:, 0:11], array[:, 11]

# Min-max scale every input column into the range [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
rescaledX = scaler.fit(X).transform(X)

# Show the first five rescaled rows.
print(rescaledX[0:5, :])
np.random.seed(42)  # seed NumPy's global RNG for the cells that follow

# Feature selection

## This is the process of reducing the number of input variables when developing a predictive model. It is important as it reduces computational cost of modelling and often improves the purpose of the model by only selecting the features that are useful for the model that contribute most to the prediction variable

## For this i chose the Recursive Feature Elimination method(RFE) which uses model accuracy to identify which attributes  contribute the most to predicting the target attribute. This method fits a model and removes the weakest feature or features until the specified number of features is reached. This helped eliminate the weakest features for prediction.

In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Re-derive inputs (columns 0-10) and target (column 11) from the raw frame.
array_1 = Train.values
X = array_1[:, 0:11]
Y = array_1[:, 11]

# Recursive feature elimination around a logistic-regression estimator,
# keeping eight features. n_features_to_select is passed by keyword because
# recent scikit-learn releases reject it as a positional argument.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=8)
fit = rfe.fit(rescaledX, Y)
print("Num Features: ",  fit.n_features_)
print("Selected Features:",  fit.support_)
print("Feature Ranking: ",  fit.ranking_)
# NOTE(review): the modelling cells further down slice columns 0-7 by
# position. That equals the RFE selection only if the indices printed here are
# exactly 0..7 - compare before trusting the downstream scores.
print("Selected column indices:", np.flatnonzero(fit.support_))
np.random.seed(42)  # seed NumPy's global RNG for the cells that follow

# Evaluating the performance of classification models

## The purpose is to know how well an algorithm performs on unseen data and thus be able to predict more accurately on your required data. This prevents overfitting which would occur if the algorithm is trained on the same data that it will test leaving you with perfect scores that are unrealistic.

# Split into train and test data
## This method separates your training data into a training and testing data set from which we can use a model and determine the accuracy of prediction. This method enables the use of the training set to build and train your model and test the accuracy of the model on the same dataset without using the actual data you want to predict on. So as to see how well it performs on any data with the highest accuracy. 

# KNeighborsClassifier model
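## This classifier implements the k-nearest neighbors vote: a sample is assigned the class most common among its k closest training samples (k = 5 by default). With the default settings used here every neighbour's vote counts equally; setting weights='distance' would make nearer neighbours contribute more than distant ones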

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Hold-out evaluation: keep 33% of the rescaled rows (feature columns 0-7)
# for testing, fit k-nearest neighbours on the rest, report test accuracy.
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = KNeighborsClassifier().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", result * 100.0)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Logistic Regression
## Describes data and explains the relationship between one variable and the other

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hold-out evaluation: keep 33% of the rescaled rows (feature columns 0-7)
# for testing, fit logistic regression on the rest, report test accuracy.
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = LogisticRegression().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", result * 100.0)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# GaussianNB
## Based on applying Bayes’ theorem with the assumption of conditional independence between every pair of features given the value of the class variable. This model works well in many real-world scenarios.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Hold-out evaluation: keep 33% of the rescaled rows (feature columns 0-7)
# for testing, fit Gaussian naive Bayes on the rest, report test accuracy.
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", result * 100.0)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Linear Discriminant Analysis
## This is a dimensionality reduction technique that reduces the dimensions while retaining as much information as possible

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Hold-out evaluation: keep 33% of the rescaled rows (feature columns 0-7)
# for testing, fit linear discriminant analysis on the rest, report accuracy.
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = LinearDiscriminantAnalysis().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", result * 100.0)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Quadratic Discrimination Analysis

## A variation of LDA that is useful if there is prior knowledge that individual classes exhibit distinct covariance. QDA is less strict and allows for differing covariance for different classes. QDA is flexible and hence can lead to an improved prediction performance

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis

# Hold-out evaluation: keep 33% of the rescaled rows (feature columns 0-7)
# for testing, fit quadratic discriminant analysis on the rest, report accuracy.
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = QuadraticDiscriminantAnalysis().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", result * 100.0)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Stochastic Gradient Descent

##  A classification method used to find values of the parameters of a function minimizing the cost as much as possible. Stochastic implies that the process is linked with random probability, where a few samples are selected at random rather than the whole data set. This method is considered because it is computationally fast, as it only works on one sample at a time. It also converges faster for larger datasets, as it updates the parameters more frequently

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

# Hold-out evaluation of a linear classifier trained by stochastic gradient
# descent on the rescaled rows (feature columns 0-7).
# SGDClassifier is created without random_state, so it draws from NumPy's
# global RNG; seeding before the fit makes this cell give the same score when
# it is re-run on its own, not only when the previous cell ran just before it.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = SGDClassifier()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  (result*100.0))
np.random.seed(42)  # re-seed for the cells that follow

# Decision Tree classifier
## It is a predictive modeling approach that uses a decision tree to go from observations about an item to conclusions about the item's target value

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Hold-out evaluation of a decision tree on the rescaled rows (feature
# columns 0-7).
# DecisionTreeClassifier is created without random_state, so it draws from
# NumPy's global RNG; seeding before the fit makes this cell give the same
# score when it is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  (result*100.0))
np.random.seed(42)  # re-seed for the cells that follow

# Random Forest Classifier

## This is a model that grows multiple trees and classifies objects based on the votes of all the trees. It reduces the problem of overfitting (high variance) that a single decision tree suffers from

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

# Hold-out evaluation of a random forest on the rescaled rows (feature
# columns 0-7).
# RandomForestClassifier is created without random_state, so it draws from
# NumPy's global RNG; seeding before the fit makes this cell give the same
# score when it is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = RandomForestClassifier()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  (result*100.0))
np.random.seed(42)  # re-seed for the cells that follow

# Support Vector Machine
## supervised machine learning model that uses classification algorithms for two-group classification problems

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Hold-out evaluation: keep 33% of the rescaled rows (feature columns 0-7)
# for testing, fit a support vector classifier on the rest, report accuracy.
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = SVC().fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", result * 100.0)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# MLPC Classifier

## This is a multi-layer perceptron, which uses a supervised learning method called backpropagation for training and can distinguish data that is not linearly separable

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier

# Hold-out evaluation of a multi-layer perceptron on the rescaled rows
# (feature columns 0-7).
# MLPClassifier is created without random_state, so its weight initialisation
# draws from NumPy's global RNG; seeding before the fit makes this cell give
# the same score when it is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = MLPClassifier()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ",  (result*100.0))
np.random.seed(42)  # re-seed for the cells that follow

# Confusion matrix
## This is a table that is often used to describe the performance of a classification model on a set of test data for which the true values are known. It allows the visualization of the performance of an algorithm. The ideal matrix has the false positives and false negatives as 0, indicating perfect performance. 

## I added the code to identify the Matthew's Correlation Coefficient (MCC) which is a measure of the quality of the classification and a more accurate representation of which model actually performs better.

## KNeighborsClassifier

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for k-nearest
# neighbours on the 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = KNeighborsClassifier().fit(X_train, Y_train)
predicted = model.predict(X_test)

matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

##  Logistic Regression

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for logistic
# regression on the 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = LogisticRegression().fit(X_train, Y_train)
predicted = model.predict(X_test)

matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

## GaussianNB

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for Gaussian naive
# Bayes on the 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = GaussianNB().fit(X_train, Y_train)
predicted = model.predict(X_test)

matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

## Linear Discriminant Analysis

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for linear
# discriminant analysis on the 33% hold-out split (rescaled columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = LinearDiscriminantAnalysis().fit(X_train, Y_train)
predicted = model.predict(X_test)

matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Quadratic Discriminant Analysis

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for quadratic
# discriminant analysis on the 33% hold-out split (rescaled columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = QuadraticDiscriminantAnalysis().fit(X_train, Y_train)
predicted = model.predict(X_test)

matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Stochastic Gradient Descent

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for the SGD-trained
# linear classifier on the 33% hold-out split (rescaled columns 0-7).
# SGDClassifier has no random_state here and therefore uses NumPy's global
# RNG; seeding before the fit keeps the matrix reproducible when this cell is
# re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = SGDClassifier()
model.fit(X_train, Y_train)

predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed for the cells that follow

## Decision Tree Classifier

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for a decision tree
# on the 33% hold-out split (rescaled columns 0-7).
# DecisionTreeClassifier has no random_state here and therefore uses NumPy's
# global RNG; seeding before the fit keeps the matrix reproducible when this
# cell is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = DecisionTreeClassifier()
model.fit(X_train, Y_train)

predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed for the cells that follow

# Random Forest Classifier

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef

# Confusion matrix and Matthews correlation coefficient for a random forest
# on the 33% hold-out split (rescaled columns 0-7).
# RandomForestClassifier has no random_state here and therefore uses NumPy's
# global RNG; seeding before the fit keeps the matrix reproducible when this
# cell is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = RandomForestClassifier()
model.fit(X_train, Y_train)

predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed for the cells that follow

## Support Vector Machine

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef


# Confusion matrix and Matthews correlation coefficient for a support vector
# classifier on the 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = SVC().fit(X_train, Y_train)
predicted = model.predict(X_test)

matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef


# Confusion matrix and Matthews correlation coefficient for a multi-layer
# perceptron on the 33% hold-out split (rescaled columns 0-7).
# MLPClassifier has no random_state here and therefore uses NumPy's global
# RNG; seeding before the fit keeps the matrix reproducible when this cell is
# re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)

model = MLPClassifier()
model.fit(X_train, Y_train)

predicted = model.predict(X_test)
matrix = confusion_matrix(Y_test, predicted)
MCC = matthews_corrcoef(Y_test, predicted)
print(matrix)
print(MCC)
np.random.seed(42)  # re-seed for the cells that follow

# Classification Report

## This is a convenient report that provides precision, recall, f1- score and support for each class. This helps to provide a quick idea of the accuracy of the model 


## KNeighborsClassifier

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for k-nearest neighbours on the 33%
# hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = KNeighborsClassifier().fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Logistic Regression

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for logistic regression on the 33%
# hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = LogisticRegression().fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# GaussianNB

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for Gaussian naive Bayes on the 33%
# hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = GaussianNB().fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Linear Discriminant Analysis

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for linear discriminant analysis on the
# 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = LinearDiscriminantAnalysis().fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Quadratic Discriminant Analysis

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for quadratic discriminant analysis on
# the 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = QuadraticDiscriminantAnalysis().fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

# Stochastic Gradient Descent

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for the SGD-trained linear classifier on
# the 33% hold-out split (rescaled feature columns 0-7).
# SGDClassifier has no random_state here and therefore uses NumPy's global
# RNG; seeding before the fit keeps the report reproducible when this cell is
# re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = SGDClassifier()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed for the cells that follow

# Decision Tree Classifier

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for a decision tree on the 33% hold-out
# split (rescaled feature columns 0-7).
# DecisionTreeClassifier has no random_state here and therefore uses NumPy's
# global RNG; seeding before the fit keeps the report reproducible when this
# cell is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = DecisionTreeClassifier()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed for the cells that follow

# Random Forest Classifier

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for a random forest on the 33% hold-out
# split (rescaled feature columns 0-7).
# RandomForestClassifier has no random_state here and therefore uses NumPy's
# global RNG; seeding before the fit keeps the report reproducible when this
# cell is re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = RandomForestClassifier()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed for the cells that follow

# Support Vector Machine

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for a support vector classifier on the
# 33% hold-out split (rescaled feature columns 0-7).
test_size, seed = 0.33, 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = SVC().fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed NumPy's global RNG for the cells that follow

In [None]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 for a multi-layer perceptron on the 33%
# hold-out split (rescaled feature columns 0-7).
# MLPClassifier has no random_state here and therefore uses NumPy's global
# RNG; seeding before the fit keeps the report reproducible when this cell is
# re-run on its own.
np.random.seed(42)

test_size = 0.33
seed = 7
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_train, X_test, Y_train, Y_test = train_test_split(
    rescaledX[:, feature_cols], Y, test_size=test_size, random_state=seed
)
model = MLPClassifier()
model.fit(X_train, Y_train)
predicted = model.predict(X_test)
report = classification_report(Y_test, predicted)
print(report)
np.random.seed(42)  # re-seed for the cells that follow

# For downstream work,we have to rescale the test dataset so that the predictions can fit on the test data and be accurate since the training data set was rescaled

In [None]:
array2 = Test.values
# First ten columns of the test frame are the inputs that get rescaled.
X1 = array2[:, 0:10]
# NOTE(review): column 10 is kept as Y1 exactly as before, but the test frame
# is one column narrower than Train, so this is presumably an 11th feature
# rather than a label - confirm against the CSV header.
Y1 = array2[:, 10]

# Scale the test inputs with the minimum/maximum of the *training* columns.
# Fitting a separate scaler on the test set would map the same raw value to a
# different number than the one the models saw during training.
# This relies on Test's columns being in the same order as Train's, which the
# prediction cells already assume.
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(Train.values[:, 0:10])
rescaledX1 = scaler.transform(X1)

# Show the first five rescaled rows with three decimals.
set_printoptions(precision=3)
print(rescaledX1[0:5, :])
np.random.seed(42)  # seed NumPy's global RNG for the cells that follow

# The following code compares the algorithms consistently outputting accuracy scores and MCC.
## This code uses K-fold cross validation method to evaluate the model. This method bases on randomly partitioning data into k equal sized subsamples with a single subsample retained as the validation data for testing the model, and the remaining are used as training data. The process is then repeated k times, with each of the k subsamples used exactly once as the validation data. The k results can then be averaged to produce a single estimation. This method tests the model's ability to predict new data that was not used in estimating it, in order to flag problems like overfitting. 
### I also added MCC to the code for better interpretation

In [None]:
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import matthews_corrcoef

# Helpers this cell needs in addition to the imports above.
from sklearn.metrics import make_scorer
from sklearn.model_selection import cross_validate

# Several estimators below draw from NumPy's global RNG; seed it before any
# fitting so the comparison is reproducible when this cell is re-run alone.
np.random.seed(42)

array = Train.values
X = array[:, 0:11]
Y = array[:, 11]

# Rescaled inputs, restricted to the eight feature columns used throughout.
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
X_eval = rescaledX[:, feature_cols]

# Candidate models, each with default hyper-parameters.
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('RFC', RandomForestClassifier()),
    ('SGD', SGDClassifier()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('MLPC', MLPClassifier()),
]

results = []
names = []
# Accuracy and MCC are both measured on the held-out fold of every split.
# Scoring MCC on the rows a model was fitted on rewards memorisation (trees
# reach 1.0 that way) and says nothing about unseen data.
scoring = {'accuracy': 'accuracy', 'mcc': make_scorer(matthews_corrcoef)}

# One splitter shared by all models so they are compared on identical folds.
# random_state only takes effect together with shuffle=True; recent
# scikit-learn releases raise a ValueError if it is set without shuffling.
kfold = KFold(n_splits=30, shuffle=True, random_state=14)

for name, model in models:
    cv_scores = cross_validate(model, X_eval, Y, cv=kfold, scoring=scoring)
    cv_results = cv_scores['test_accuracy']
    results.append(cv_results)
    names.append(name)
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)
    # Mean out-of-fold MCC across the 30 folds.
    MCC = cv_scores['test_mcc'].mean()
    print(MCC)

# Box plot of the per-fold accuracies, one box per model.
fig, ax = pyplot.subplots()
fig.suptitle('Algorithm Comparison')
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()
np.random.seed(42)  # re-seed for the cells that follow

# From the above evaluations;
## From the Split test and train data, KNeighborsClassifier and RandomForestClassifier had the highest accuracy and MCC. While from the Kfold cross validation data, GaussianNB and Support Vector Machine had the highest accuracy while Decision Tree Classifier and RandomForestClassifier had the highest MCC both equating to 1.0.

## In order to determine the most ideal model for prediction, i run the prediction algorithms using each of the 4 mentioned models and submitted the csv files in order to compare the scores

# KNeighborsClassifier

In [None]:
# Fit k-nearest neighbours on all rescaled training rows (feature columns
# 0-7) and predict the class of every test row.
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
model = KNeighborsClassifier()
model.fit(rescaledX[:, feature_cols], Y)
prediction = model.predict(rescaledX1[:, feature_cols])  # predict once, reuse below

# Submission frame: an "Index" index and a boolean CLASS column.
test_pred = pd.DataFrame({"CLASS": prediction})
test_pred.index.name = "Index"
test_pred['CLASS'] = test_pred['CLASS'].map({0.0: False, 1.0: True})

# NOTE: every prediction cell writes this same file name, so the file on disk
# always holds the output of whichever prediction cell ran last.
test_pred.to_csv("test_pred.csv")
print(test_pred['CLASS'].unique())
# Count each label straight from the prediction array: unlike indexing
# groupby sizes by position, this still works when only one class is predicted.
print(int((prediction == 0).sum()))
print(int((prediction == 1).sum()))
test_pred

# RandomForestClassifier

In [None]:
# Fit a random forest on all rescaled training rows (feature columns 0-7)
# and predict the class of every test row.
# The forest has no random_state and draws from NumPy's global RNG; seed it
# first so the submission is reproducible when this cell is re-run on its own.
np.random.seed(42)
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
model = RandomForestClassifier()
model.fit(rescaledX[:, feature_cols], Y)
prediction = model.predict(rescaledX1[:, feature_cols])  # predict once, reuse below

# Submission frame: an "Index" index and a boolean CLASS column.
test_pred = pd.DataFrame({"CLASS": prediction})
test_pred.index.name = "Index"
test_pred['CLASS'] = test_pred['CLASS'].map({0.0: False, 1.0: True})

# NOTE: every prediction cell writes this same file name, so the file on disk
# always holds the output of whichever prediction cell ran last.
test_pred.to_csv("test_pred.csv")
print(test_pred['CLASS'].unique())
# Count each label straight from the prediction array: unlike indexing
# groupby sizes by position, this still works when only one class is predicted.
print(int((prediction == 0).sum()))
print(int((prediction == 1).sum()))
test_pred

# GaussianNB

In [None]:
# Fit Gaussian naive Bayes on all rescaled training rows (feature columns
# 0-7) and predict the class of every test row.
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
model = GaussianNB()
model.fit(rescaledX[:, feature_cols], Y)
prediction = model.predict(rescaledX1[:, feature_cols])  # predict once, reuse below

# Submission frame: an "Index" index and a boolean CLASS column.
test_pred = pd.DataFrame({"CLASS": prediction})
test_pred.index.name = "Index"
test_pred['CLASS'] = test_pred['CLASS'].map({0.0: False, 1.0: True})

# NOTE: every prediction cell writes this same file name, so the file on disk
# always holds the output of whichever prediction cell ran last.
test_pred.to_csv("test_pred.csv")
print(test_pred['CLASS'].unique())
# Count each label straight from the prediction array: unlike indexing
# groupby sizes by position, this still works when only one class is predicted.
print(int((prediction == 0).sum()))
print(int((prediction == 1).sum()))
test_pred

# Support Vector Machine

In [None]:
# Fit a support vector classifier on all rescaled training rows (feature
# columns 0-7) and predict the class of every test row.
feature_cols = (0, 1, 2, 3, 4, 5, 6, 7)
model = SVC()
model.fit(rescaledX[:, feature_cols], Y)
prediction = model.predict(rescaledX1[:, feature_cols])  # predict once, reuse below

# Submission frame: an "Index" index and a boolean CLASS column.
test_pred = pd.DataFrame({"CLASS": prediction})
test_pred.index.name = "Index"
test_pred['CLASS'] = test_pred['CLASS'].map({0.0: False, 1.0: True})

# NOTE: every prediction cell writes this same file name, so the file on disk
# always holds the output of whichever prediction cell ran last.
test_pred.to_csv("test_pred.csv")
print(test_pred['CLASS'].unique())
# Count each label straight from the prediction array: unlike indexing
# groupby sizes by position, this still works when only one class is predicted.
print(int((prediction == 0).sum()))
print(int((prediction == 1).sum()))
test_pred

# I discovered that the best submission score came from the GaussianNB that had the highest accuracy score in the Kfold cross validation and a moderate MCC compared to the other models. The KNeighborsClassifier and Random Forest Classifier models performed more poorly than the GaussianNB even though they had higher accuracy and MCC from split train and test evaluation

# From this i concluded that the kfold cross validation method of evaluating machine learning algorithm performance is much more powerful and accurate than splitting into test and train data sets and therefore chose GaussianNB as my prediction model

# I decided to also test whether transforming the dataset by rescaling actually affected the model's functionality, by re-evaluating using k-fold cross validation to see whether there would be any change in the accuracy or MCC of the model chosen to be the most optimal

In [None]:
from pandas import read_csv
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.linear_model import SGDClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import matthews_corrcoef

# Re-run the model comparison on the raw (non-rescaled) training data.
array = Train.values
X = array[:, 0:11]   # columns 0-10: candidate feature columns
Y = array[:, 11]     # column 11: used as the target throughout

# Only columns 0-7 are used for the comparison below.
selected_cols = list(range(8))
X_selected = X[:, selected_cols]

# Seed NumPy's global generator BEFORE any estimator runs. Estimators left at
# random_state=None draw from it, so seeding after the loop (as this cell used
# to do) could not make the reported numbers repeatable.
np.random.seed(42)

# prepare models and add them to a list
models = [
    ('LR', LogisticRegression()),
    ('LDA', LinearDiscriminantAnalysis()),
    ('KNN', KNeighborsClassifier()),
    ('CART', DecisionTreeClassifier()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('RFC', RandomForestClassifier()),
    ('SDG', SGDClassifier()),
    ('QDA', QuadraticDiscriminantAnalysis()),
    ('MLPC', MLPClassifier()),
]

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# 30 consecutive, unshuffled folds. random_state is deliberately not passed:
# it has no effect without shuffle=True, and recent scikit-learn releases
# reject that combination with a ValueError.
kfold = KFold(n_splits=30)

for name, model in models:
    cv_results = cross_val_score(model, X_selected, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # (model name, mean fold accuracy, standard deviation of fold accuracy)
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)
    # NOTE(review): the MCC below is measured on the very rows the model was
    # fitted on, so it is an in-sample figure and not comparable with the
    # cross-validated accuracy printed above.
    model.fit(X_selected, Y)
    predicted = model.predict(X_selected)
    MCC = matthews_corrcoef(Y, predicted)
    print(MCC)

# boxplot algorithm comparison: one box of per-fold accuracies per model
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

# It was discovered that evaluating on non-rescaled data did not change the accuracy or MCC of GaussianNB, or of most of the other models. But it did change for the following:
## - KNeighborsClassifier; where the accuracy and MCC both reduced
## - DecisionTreeClassifier; where the accuracy reduced
## - RandomForestClassifier; where accuracy reduced
## - Support Vector Machine; where the accuracy and MCC both reduced
## - Stochastic Gradient Descent; where the accuracy and MCC both reduced
## - MLPClassifier; where the MCC increased

# This prompted me to make a prediction using also the non-rescaled data to see if there would be a change in the score submission.

In [None]:
# Fit Gaussian Naive Bayes on columns 0-7 of the non-rescaled training matrix X
# and predict a class for every row of X1.
# X1 is defined in an earlier cell — presumably the non-rescaled test features;
# confirm against that cell.
selected_cols = list(range(8))  # same columns as the literal (0, 1, ..., 7) selection
model = GaussianNB()
model.fit(X[:, selected_cols], Y)
# Predict once; a second predict call whose result was discarded has been removed.
prediction = model.predict(X1[:, selected_cols])

# Submission frame: a single CLASS column, row index labelled "Index".
test_pred = pd.DataFrame(prediction, columns=["CLASS"])
test_pred.index.name = "Index"
# Numeric labels 0.0 / 1.0 become booleans; any other value would map to NaN.
test_pred['CLASS'] = test_pred['CLASS'].map({0.0: False, 1.0: True})

# NOTE: every prediction cell writes to this same file name, so the file on
# disk always holds the output of whichever cell ran last.
test_pred.to_csv("test_pred.csv")
print(test_pred['CLASS'].unique())
# Count each class by comparison instead of positional lookup into the grouped
# sizes, so a prediction containing only one class prints 0 for the other
# class rather than failing or reporting the wrong count.
print(int(test_pred['CLASS'].eq(False).sum()))
print(int(test_pred['CLASS'].eq(True).sum()))
test_pred

# From this prediction, the submission score was found to be slightly higher than the rescaled data.
## It was also discovered that the higher the number of features selected the higher the accuracy of the models and the higher the score after submission.
## The final testing was to see whether the score would improve if all features were considered and the prediction was done on all the data

In [None]:
# Fit Gaussian Naive Bayes on columns 0-9 of the non-rescaled training matrix X
# and predict a class for every row of X1, passed without column selection.
# NOTE(review): X1 is defined in an earlier cell and must have the same ten
# columns the model is fitted on — confirm against that cell.
model = GaussianNB()
model.fit(X[:, 0:10], Y)
# Predict once; a second predict call whose result was discarded has been removed.
prediction = model.predict(X1)

# Submission frame: a single CLASS column, row index labelled "Index".
test_pred = pd.DataFrame(prediction, columns=["CLASS"])
test_pred.index.name = "Index"
# Numeric labels 0.0 / 1.0 become booleans; any other value would map to NaN.
test_pred['CLASS'] = test_pred['CLASS'].map({0.0: False, 1.0: True})

# NOTE: every prediction cell writes to this same file name, so the file on
# disk always holds the output of whichever cell ran last.
test_pred.to_csv("test_pred.csv")
print(test_pred['CLASS'].unique())
# Count each class by comparison instead of positional lookup into the grouped
# sizes, so a prediction containing only one class prints 0 for the other
# class rather than failing or reporting the wrong count.
print(int(test_pred['CLASS'].eq(False).sum()))
print(int(test_pred['CLASS'].eq(True).sum()))
test_pred

# It was discovered that the highest score was obtained from the set which was predicted without features selected.

# In conclusion, k-fold cross validation is a better tool for evaluating the performance of a machine learning algorithm than splitting the data into test and training sets. It was also noted that accuracy or MCC alone are not sufficient measures of model functionality, as we saw that the models with the highest accuracy on the split test and train data did not perform best upon submission. The best model for this data set is the GaussianNB model, which coincides with previous claims that it works best for real world data. An MCC of 1.0 indicates a perfect agreement between actuals and predictions. Thus we would expect that KNeighborsClassifier and RandomForestClassifier would perform the best at prediction. However, we see that they did not perform as expected, which can be put down to a high rate of false positives and false negatives being generated.
# Therefore we need to consider several factors such as precision and F1-score before we conclude which model is best.

# Literature:
## -wikipedia
## -class notes
## -https://scikit-learn.org/stable/supervised_learning.html#supervised-learning
## -https://medium.com/datadriveninvestor/classification-algorithms-in-machine-learning-85c0ab65ff4
## -https://www.geeksforgeeks.org/ml-stochastic-gradient-descent-sgd/
## -https://data-flair.training/blogs/machine-learning-classification-algorithms/
## -https://www.datascienceblog.net/post/machine-learning/linear-discriminant-analysis/
## -https://towardsdatascience.com/data-visualization-for-machine-learning-and-data-science-a45178970be7
## -https://seaborn.pydata.org/generated/seaborn.distplot.html
## -https://bmcgenomics.biomedcentral.com/articles/10.1186/s12864-019-6413-7