In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: 
# https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # used in linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) 
# will list all files under the input directory

import os

# Walk the Kaggle input tree and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    full_paths = [os.path.join(dirname, filename) for filename in filenames]
    for full_path in full_paths:
        print(full_path)

# Any results you write to the current directory are saved as output.

# Understanding Machine Learning pipeline

## by Tumuhimbise Peninah
## This involves;
### 1). import necessary libraries and the data. 2). look at your raw data and review the dimensions. 3). review the data types of attributes in your data. 4). summarize your data using descriptive statistics. 5). understand the relationships in your data using correlations. 6). review the skew of the distributions of each attribute and visualise your data. 7). prepare your data for machine learning. 8). feature selection for training your ML models. 9). evaluate the performance of ML algorithms. 10). machine learning algorithm performance metrics. 11). create model based on training dataset. 12). use model on test dataset


In [None]:
# import the necessary libraries
import numpy as np # used in linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #interactive plots and plot generation
import seaborn as sns #provides interface for data visualization

# Silence every warning for the rest of the notebook.
# NOTE(review): an unconditional 'ignore' filter also hides deprecation and
# convergence warnings from the libraries used below — consider narrowing it
# while developing.
import warnings
warnings.filterwarnings('ignore')


## Import datasets; training dataset;AMP_TrainSet.csv and test dataset; Test.csv

In [None]:
# Read the training and test CSV files into two DataFrames in one step.
AMP_TrainSet, Test = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/AMP_TrainSet.csv", "../input/Test.csv")
)


## Using the head() command, we can see how our data looks like

In [None]:
# Preview the training data; head() returns the first five rows by default.
AMP_TrainSet.head()


In [None]:
# Preview the test data; head() returns the first five rows by default.
Test.head()

## Dimensions of our dataset

## This will tell us how many rows and columns our datasets contain. The first figure represents the rows and the second figure represents the columns.

In [None]:
# .shape is a (number of rows, number of columns) tuple for the training DataFrame
AMP_TrainSet.shape

In [None]:
# .columns holds the column labels (names) of the training DataFrame
AMP_TrainSet.columns

## Data type for attributes of the dataset
## Each sample in a dataset has an arbitrary number of attributes. They are stored as vectors of the same length as the number of samples in a collection, and are accessible via the attribute. Attribute data can be stored as one of five different field types in a dataset: character, integer, floating, date, and BLOB.
## Data types are the classification or categorization of data items. Data types represent a kind of value which determines what operations can be performed on that data.

In [None]:
# .dtypes lists the data type pandas assigned to each column
AMP_TrainSet.dtypes

## Descriptive Statistics
## When we have a set of observations, it is useful to summarize features of our data into a single statement called a descriptive statistic. As their name suggests, descriptive statistics describe a particular quality of the data they summarize. These statistics fall into two general categories: the measures of central tendency such as mean, median and mode and the measures of spread such as standard deviation, range, interquartile range and variance.

In [None]:
# describe() returns summary statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the training DataFrame
AMP_TrainSet.describe()

## Class Distribution
## To know how balanced the class values are, we need to classify the data. You can quickly get an idea of the distribution of the class attributes in Pandas.

In [None]:
# Count the rows in each CLASS group and draw the counts as a bar chart.
class_counts = AMP_TrainSet.groupby('CLASS').size()
class_counts.plot.bar()

## Correlations Between Attributes
## Correlation is any statistical association, though it commonly refers to the degree to which a pair of variables are linearly related. Correlations are useful because they can indicate a predictive relationship that can be exploited.
## A correlation of -1 or 1 shows a full negative or positive correlation respectively. Whereas a value of 0 shows no correlation at all. Some machine learning algorithms like linear and logistic regression can suffer poor performance if there are highly correlated attributes in your dataset.

In [None]:
# Pairwise correlation matrix; corr() uses the Pearson method by default.
AMP_TrainSet.corr()

In [None]:
# Draw the Pearson correlation matrix as a heat map on a 6x6 inch figure.
# seaborn was already imported as sns at the top of the notebook.
pearson_corr = AMP_TrainSet.corr(method='pearson')
fig, ax = plt.subplots(figsize=(6,6))
sns.heatmap(pearson_corr, ax=ax)

In [None]:
# CLASS is the variable the models will predict, so pull out its column of
# the Pearson correlation matrix to see how each attribute relates to it.
pearson_matrix = AMP_TrainSet.corr(method='pearson')
pearson_matrix['CLASS']

## The following have correlation coefficients closest to 1 and therefore are better picks to use to build the algorithms
### AS_MeanAmphiMoment    0.693552
### FULL_Charge           0.534602

## Skew of the distributions of each attribute
## Data is said to be skewed when the curve of its statistical distribution appears distorted either to the left or to the right. In a normal distribution, the graph appears symmetric, meaning that there are about as many data values on the left side of the median as on the right side. Skew refers to a distribution that is assumed Gaussian (normal or bell curve) that is shifted or squashed in one direction or another.
## You can calculate the skew of each attribute using the skew() function. If the skewness value lies above +1 or below -1, the data is highly skewed. If it lies between +0.5 and +1 or between -1 and -0.5, it is moderately skewed. If it lies between -0.5 and +0.5, the data is approximately symmetric, and a value of 0 means the data is perfectly symmetric.
## When data is positively skewed, the data can be transformed using some common transformations like square root, cube root and logarithm
## When data is negatively skewed, transformations that can be used to reduce the skewness are the square, the cube, or a logarithm applied after reflecting the data

In [None]:
# skew() gives one skewness value per attribute; draw them as a bar chart.
attribute_skew = AMP_TrainSet.skew()
attribute_skew.plot.bar()

## Visualise your data
## Visualisation of the data can provide some really important information about your dataset. Plots are used for visualisation; univariate plots that can be used to understand each attribute of the data such as histograms, density plots, box and whisker plots and multivariate plots that show interactions between multiple variables in the data such as correlation matrix plot and scatter plot matrix.

## Univariate plots;

## 1. Histograms
## A histogram is one of the most frequently used data visualization techniques in machine learning. It represents the distribution of a continuous variable over a given interval or period of time. Histograms plot the data by dividing it into intervals called ‘bins’. It is used to inspect the underlying frequency distribution (eg. Normal distribution), outliers, skewness, etc.

In [None]:
# DataFrame.hist() draws on a figure of its own, so a separate
# plt.figure(figsize=...) call only leaves an empty figure behind and the
# requested size is never applied. Pass the size to hist() directly.
AMP_TrainSet.hist(figsize=(15, 15))
plt.show()

## 2. Density plots
## These are similar to histograms except that we have a smooth curve instead of the bars. Density plots are a way to estimate the probability density function of a continuous random variable. They are used when you need to know the distribution of the variable.

In [None]:
# One density plot per attribute, arranged on a 3x4 grid with independent x-axes.
AMP_TrainSet.plot(kind='density', subplots=True, layout=(3,4), sharex=False)
# plt.show must be *called*; the bare name `plt.show` only echoes the function
# object and never renders the figure.
plt.show()

## 3. Box and Whisker Plots
## Boxplots summarize the distribution of each attribute, drawing a line for the median (middle value) and a box around the 25th and 75th percentiles (the middle 50% of the data). The whiskers give an idea of the spread of the data and dots outside of the whiskers show candidate outlier values (values that are 1.5 times greater than the size of spread of the middle 50% of the data).

In [None]:
# One box-and-whisker plot per attribute on a 3x4 grid; axes are not shared,
# so each attribute keeps its own scale.
AMP_TrainSet.plot.box(subplots=True, layout=(3,4), sharex=False, sharey=False)
# render the figure
plt.show()

## Multivariate Plots

## 1. Correlation Matrix Plot
## Correlogram is a graph of correlation matrix. It is very useful to highlight the most correlated variables in a data table. In this plot, correlation coefficients is colored according to the value. Correlation matrix can be also reordered according to the degree of association between variables.



In [None]:
# let's remind ourselves of the columns in our data
AMP_TrainSet.columns

In [None]:
# corr() computes the pairwise correlation of the columns;
# keep the result in `correlations` so it can be plotted and labelled.
correlations = AMP_TrainSet.corr()

# now let's plot the correlation matrix
fig = plt.figure()
ax = fig.add_subplot(111)
cax = ax.matshow(correlations, vmin=-1, vmax=1)
fig.colorbar(cax)

# One tick per row/column of the matrix. The tick count was previously
# hard-coded to 9, which does not match the number of columns, so labels were
# attached to the wrong cells (recent matplotlib raises on the mismatch).
ticks = np.arange(len(correlations.columns))
ax.set_xticks(ticks)
ax.set_yticks(ticks)

# Label from the matrix itself so labels always line up with what is drawn.
ax.set_xticklabels(correlations.columns, rotation=90)
ax.set_yticklabels(correlations.columns)
plt.show()

## 2. Scatter Plot Matrix
## A scatter plot shows the relationship between two variables as dots in two dimensions, one axis for each attribute. You can create a scatter plot for each pair of attributes in your data.

## Scatter plots are useful for spotting structured relationships between variables, like whether you could summarize the relationship between two variables with a line. Attributes with structured relationships may also be correlated and good candidates for removal from your dataset.

In [None]:
# seaborn's pairplot draws the scatter plot matrix:
# one panel for every pair of columns in the training data.
sns.pairplot(data=AMP_TrainSet)

## Prepare Your Data For Machine Learning

## Data pre-processing
## Data preprocessing is an integral step in Machine Learning as the quality of data and the useful information that can be derived from it directly affects the ability of our model to learn; therefore, it is extremely important that we preprocess our data before feeding it into our model.
## The steps involved in data preprocessing include; splitting the dataset into the input and output variables for machine learning, apply a pre-processing transform to the input variables and summarize the data to show the change.

## 1. Rescaling your data
## This involves rescaling the attributes so all values are in the range between 0 and 1. This is useful for optimisation algorithms; you can rescale your data in scikit-learn using the MinMaxScaler class.

In [None]:
# set_printoptions controls how NumPy arrays are displayed;
# MinMaxScaler rescales each feature to a given range.
from numpy import set_printoptions
from sklearn.preprocessing import MinMaxScaler

# Pull the values out of the training DataFrame as a NumPy array.
AMP_array = AMP_TrainSet.values

# Columns 0-10 are the inputs (T); column 11 is the output (P).
T, P = AMP_array[:, :11], AMP_array[:, 11]

# Fit the scaler on the inputs, then map every input column onto [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))
scaler.fit(T)
rescaledT = scaler.transform(T)

# Show the first five rescaled rows with three decimals.
set_printoptions(precision=3)
print(rescaledT[:5])

## 2. Standardize Data
## In Standardization we transform our values such that the mean of the values is 0 and the standard deviation is 1.
## Consider a data frame with 2 numerical values Age and Salary . They are not on the same scale as Age is in years and Salary is in dollars and since Salary will always be greater than Age, the model will give more weight to salary which is not the ideal scenario as age is also an integral factor. In order to avoid this issue we perform standardization. So we just calculate the mean and standard deviation of the values and then for each data point we just subtract the mean and divide it by standard deviation.

In [None]:
# StandardScaler removes the mean of each feature and scales it to unit variance.
from sklearn.preprocessing import StandardScaler

AMP_array2 = AMP_TrainSet.values

# Columns 0-10 are the inputs (T); column 11 is the output (P).
T, P = AMP_array2[:, :11], AMP_array2[:, 11]

# Fit and apply the scaler in one call.
scaler = StandardScaler()
standardizedT = scaler.fit_transform(T)

# Show the first five standardized rows with three decimals.
set_printoptions(precision=3)
print(standardizedT[:5])

## 3. Normalize Data
## Normalizing in scikit-learn refers to rescaling each observation (row) to have a length of 1 (called a unit norm or a vector with the length of 1 in linear algebra).

In [None]:
from sklearn.preprocessing import Normalizer

AMP_array3 = AMP_TrainSet.values

# Columns 0-10 are the inputs (T); column 11 is the output (P).
T, P = AMP_array3[:, :11], AMP_array3[:, 11]

# Rescale every row of the inputs to unit norm.
scaler = Normalizer()
normalizedT = scaler.fit_transform(T)

# Show the first five normalized rows with three decimals,
# then the container type we are working with.
set_printoptions(precision=3)
print(normalizedT[:5])
print(type(normalizedT))

## 4. Binarize Data
## You can transform your data using a binary threshold. All values above the threshold are marked 1 and all equal to or below are marked as 0.

In [None]:
from sklearn.preprocessing import Binarizer

AMP_array4 = AMP_TrainSet.values

# Columns 0-10 are the inputs (T); column 11 is the output (P).
T, P = AMP_array4[:, :11], AMP_array4[:, 11]

# Values above the 0.0 threshold become 1; everything else becomes 0.
binarizer = Binarizer(threshold=0.0)
binarizer.fit(T)
binaryT = binarizer.transform(T)

# Show the first five binarized rows.
set_printoptions(precision=3)
print(binaryT[:5])

## Feature Selection

## Feature selection is a process where you automatically or manually select those features in your data which contribute most to the prediction variable or output in which you are interested. Having irrelevant features in your data can decrease the accuracy of many models, especially linear algorithms like linear and logistic regression. Feature selection can reduce overfitting, improve accuracy and also reduce training time for the model.

## Feature selection techniques that are easy to use include; univariate selection, feature importance and correlation matrix with heatmap

## 1. Univariate Selection
## Statistical tests can be used to select those features that have the strongest relationship with the output variable. The scikit-learn library provides the SelectKBest class that can be used with a suite of different statistical tests to select a specific number of features.
## Chi-Squared test (chi2) is a statistical hypothesis test that assumes (the null hypothesis) that the observed frequencies for a categorical variable match the expected frequencies for the categorical variable.

In [None]:
# SelectKBest keeps the k best-scoring features; chi2 is the scoring function.
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array_ = AMP_TrainSet.values
T, P = AMP_array_[:, :11], AMP_array_[:, 11]

# chi2 requires non-negative inputs, so the selector is fitted on rescaledT
# (the MinMaxScaler output from the rescaling cell) rather than on T.
test = SelectKBest(score_func=chi2, k=4)
fit = test.fit(rescaledT, P)
features = fit.transform(rescaledT)

# Per-feature chi2 scores, followed by the first five rows of the
# four selected features.
set_printoptions(precision=3)
print(fit.scores_)
print(features[:5])

## Recursive Feature Elimination (RFE)
## Recursive means doing or saying the same thing several times in order to produce a particular result or effect and a feature is an individual measurable property or characteristic of a phenomenon being observed attribute in your dataset

## Recursive Feature Elimination works by recursively removing attributes and building a model on those attributes that remain. It uses the model accuracy to identify which attributes or combination of attributes contribute the most to predicting the target attribute (output).

In [None]:
# importing required libraries
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# separate array into input (columns 0-10) and output (column 11) components
AMP_array_1 = AMP_TrainSet.values
T = AMP_array_1[:,0:11]
P = AMP_array_1[:,11]

# estimator whose coefficients RFE uses to rank the features
model = LogisticRegression()

# Decide how many features to keep. n_features_to_select must be passed by
# keyword: current scikit-learn releases reject it as a positional argument.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(T, P)
print("Num Features: ",  fit.n_features_)
print("Selected Features:",  fit.support_)
print("Feature Ranking: ",  fit.ranking_)

## Principal Component Analysis
## This is a technique used to reduce the dimension of the feature space by feature extraction. For example, if we have 10 variables, in feature extraction, we create new independent variables by combining the old ten variables. By creating new variables it might seem as if more dimensions are introduced, but we select only a few variables from the newly created variables in the order of importance. Then the number of those selected variables is less than what we started with and that’s how we reduce the dimensionality.

In [None]:
from sklearn.decomposition import PCA

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array = AMP_TrainSet.values
T, P = AMP_array[:, :11], AMP_array[:, 11]

# Fit a three-component PCA on the inputs; `fit` is the fitted PCA object.
pca = PCA(n_components=3)
pca.fit(T)
fit = pca

# Share of variance explained by each component, then the component vectors.
print("Explained Variance: " , fit.explained_variance_ratio_)
print(fit.components_)

## Feature Importance
## Knowing feature importance indicated by machine learning models can benefit you in multiple ways, for example: by getting a better understanding of the model’s logic you can not only verify it being correct but also work on improving the model by focusing only on the important variables, variable selection (you can remove a certain number of variables that are not that significant) and have similar or better performance in much shorter training time

In [None]:

# Random Forest method is used for determining feature importance
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# separate array into input (columns 0-10) and output (column 11) components
AMP_array = AMP_TrainSet.values
T = AMP_array[:,0:11]
P = AMP_array[:,11]
model = RandomForestRegressor(n_estimators = 100,
                           n_jobs = 1,
                           oob_score = True,
                           bootstrap = True,
                           random_state = 42)
model.fit(T, P)

# The validation score used to be model.score(T, P) again, i.e. the training
# score reported under a second label. Estimate it on held-out folds instead.
# cross_val_score fits clones of the estimator, so `model` above stays fitted
# on the full data and its feature importances are unaffected.
validation_folds = KFold(n_splits=5, shuffle=True, random_state=42)
validation_r2 = cross_val_score(model, T, P, cv=validation_folds, scoring='r2').mean()

print('R^2 Training Score: {:.2f} \nOOB Score: {:.2f} \nR^2 Validation Score: {:.2f}'.format(model.score(T, P), 
                                                                                             model.oob_score_,
                                                                                             validation_r2))
# display the feature importance values
print(model.feature_importances_)

In [None]:
# Label each importance with its feature name (the first 11 columns) and
# draw them as a horizontal bar chart, largest first.
importances = pd.Series(model.feature_importances_, index=AMP_TrainSet.columns[:11])
importances.nlargest(11).plot.barh()

## We can clearly see that AS_MeanAmphiMoment is most important of all the features

## Evaluate Machine Learning Algorithms
## We must evaluate our machine learning algorithms on data that is not used to train the algorithm. Once we estimate the performance of our algorithm, we can then re-train the final algorithm on the entire training dataset and get it ready for operational use.
## Next we shall look at four different techniques that we can use to split up our training dataset and create useful estimates of performance for our machine learning algorithms: Train and Test Sets, K-fold Cross Validation, Leave One Out Cross Validation and Repeated Random Test-Train Splits.
## You need some kind of assurance that your model has got most of the patterns from the data correct, and its not picking up too much on the noise, or in other words its low on bias and variance.
## 1. Split into Train and Test Sets
## The simplest method that we can use to evaluate the performance of a machine learning algorithm is to use different training and testing datasets.
## We can take our original dataset, split it into two parts. Train the algorithm on the first part, make predictions on the second part and evaluate the predictions against the expected results.The size of the split can depend on the size and specifics of your dataset, although it is common to use 67% of the data for training and the remaining 33% for testing

In [None]:
# Evaluate LogisticRegression with a single train/test split.
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array = AMP_TrainSet.values
T, P = AMP_array[:, :11], AMP_array[:, 11]

# Hold out 33% of the rows for testing; the seed makes the split repeatable.
test_size, seed = 0.33, 7
T_train, T_test, P_train, P_test = model_selection.train_test_split(
    T, P, test_size=test_size, random_state=seed)

# Fit on the training part and score (mean accuracy) on the held-out part.
model = LogisticRegression().fit(T_train, P_train)
result = model.score(T_test, P_test)
print("Accuracy: %.3f%%" % (result*100.0))

## K-fold Cross Validation
## Cross validation is an approach that you can use to estimate the performance of a machine learning algorithm with less variance than a single train-test set split.

## It works by splitting the dataset into k-parts (e.g. k=5 or k=10). Each split of the data is called a fold. The algorithm is trained on k-1 folds with one held back and tested on the held back fold. This is repeated so that each fold of the dataset is given a chance to be the held back test set. After running cross validation you end up with k different performance scores that you can summarize using a mean and a standard deviation. The result is a more reliable estimate of the performance of the algorithm on new data given your test data. It is more accurate because the algorithm is trained and evaluated multiple times on different data.
## For modest sized datasets in the thousands or tens of thousands of records, k values of 3, 5 and 10 are common.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# separate array into input (columns 0-10) and output (column 11) components
AMP_array = AMP_TrainSet.values
T = AMP_array[:,0:11]
P = AMP_array[:,11]

num_instances = len(T)  # not used by this cell
seed = 7
# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given without shuffling. Shuffle so the
# seed does what it was supplied for and the folds do not depend on row order.
kfold = model_selection.KFold(n_splits=15, shuffle=True, random_state=seed)

model = LogisticRegression()
results = model_selection.cross_val_score(model, T, P, cv=kfold)
# mean and standard deviation of the per-fold accuracy, as percentages
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

## Leave One Out Cross Validation
## This approach leaves p data points out of training data, i.e. if there are n data points in the original sample then, n-p samples are used to train the model and p points are used as the validation set. This is repeated for all combinations in which original sample can be separated this way, and then the error is averaged for all trials, to give overall effectiveness. This method is exhaustive in the sense that it needs to train and validate the model for all possible combinations, and for moderately large p, it can become computationally infeasible.

## A particular case of this method is when p = 1. This is known as Leave one out cross validation. This method is generally preferred over the previous one because it does not suffer from the intensive computation, as number of possible combinations is equal to number of data points in original sample or n.

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array = AMP_TrainSet.values
T, P = AMP_array[:, :11], AMP_array[:, 11]

num_folds = 10  # not used by this cell

# Leave-one-out: every row is the test set exactly once.
loocv = LeaveOneOut()
model = LogisticRegression()
results = cross_val_score(model, T, P, cv=loocv)

# Mean accuracy over all leave-one-out rounds, as a percentage.
print("Accuracy: %.3f%%" % (results.mean()*100.0))

## Repeated Random Test-Train Splits
## This technique is a hybrid of traditional train-test splitting and the k-fold cross-validation method. In this technique, we create random splits of the data in the training-test set manner and then repeat the process of splitting and evaluating the algorithm multiple times, just like the cross-validation method.

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array = AMP_TrainSet.values
T, P = AMP_array[:, :11], AMP_array[:, 11]

# Ten independent random 67/33 splits, made repeatable by the seed.
kfold = ShuffleSplit(n_splits=10, test_size=0.33, random_state=7)

model = LogisticRegression()
results = cross_val_score(model, T, P, cv=kfold)

# Mean and standard deviation of the accuracy over the splits, as percentages.
mean_pct, std_pct = results.mean()*100.0, results.std()*100.0
print("Accuracy: %.3f%% (%.3f%%)" % (mean_pct, std_pct))

## Machine Learning Algorithm Performance Metrics;
## Classification Accuracy
## Classification Accuracy is what we usually mean, when we use the term accuracy. It is the ratio of number of correct predictions to the total number of input samples. It works well only if there are equal number of samples belonging to each class. For example, consider that there are 98% samples of class A and 2% samples of class B in our training set. Then our model can easily get 98% training accuracy by simply predicting every training sample belonging to class A.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# separate array into input (columns 0-10) and output (column 11) components
AMP_array = AMP_TrainSet.values
T = AMP_array[:,0:11]
P = AMP_array[:,11]

# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given without shuffling.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

model = LogisticRegression()

scoring = 'accuracy'

results = cross_val_score(model, T, P, cv=kfold, scoring=scoring)
# The scores are fractions in [0, 1]; scale by 100 so the numbers match the
# percent signs in the message (as the other accuracy cells do).
print("Accuracy: %.3f%% (%.3f%%)" % (results.mean()*100.0, results.std()*100.0))

## Area Under ROC Curve
## Area Under Curve(AUC) is one of the most widely used metrics for evaluation. It is used for binary classification problem. AUC of a classifier is equal to the probability that the classifier will rank a randomly chosen positive example higher than a randomly chosen negative example. Before defining AUC, let us understand two basic terms:
## True Positive Rate (Sensitivity) : True Positive Rate is defined as TP/ (FN+TP). True Positive Rate corresponds to the proportion of positive data points that are correctly considered as positive, with respect to all positive data points. False Positive Rate (1 - Specificity) : False Positive Rate is defined as FP / (FP+TN). False Positive Rate corresponds to the proportion of negative data points that are mistakenly considered as positive, with respect to all negative data points.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

# separate array into input (columns 0-10) and output (column 11) components
AMP_array = AMP_TrainSet.values
T = AMP_array[:,0:11]
P = AMP_array[:,11]

# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given without shuffling. Shuffling also
# keeps both classes likely to appear in each of the two folds, which
# ROC AUC needs in order to be defined.
kfold = KFold(n_splits=2, shuffle=True, random_state=7)
model = LogisticRegression()
scoring = 'roc_auc'
results = cross_val_score(model, T, P, cv=kfold, scoring=scoring)
# (mean, standard deviation) of the per-fold AUC
print("AUC:", (results.mean(), results.std()))

## Confusion Matrix
## The confusion matrix is a handy presentation of the accuracy of a model with two or more classes. The table presents predictions on the x-axis and actual outcomes on the y-axis. The cells of the table are the number of predictions made by a machine learning algorithm. For example, a machine learning algorithm can predict 0 or 1 and each prediction may actually have been a 0 or 1. Predictions for 0 that were actually 0 appear in the cell for prediction = 0 and actual = 0, whereas predictions for 0 that were actually 1 appear in the cell for prediction = 0 and actual = 1. 

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array = AMP_TrainSet.values
T, P = AMP_array[:, :11], AMP_array[:, 11]

# Hold out 33% of the rows for testing; the seed makes the split repeatable.
test_size, seed = 0.33, 7
T_train, T_test, P_train, P_test = train_test_split(
    T, P, test_size=test_size, random_state=seed)

# Fit on the training part and predict the held-out part.
model = LogisticRegression().fit(T_train, P_train)
predicted = model.predict(T_test)

# Rows are the actual classes, columns are the predicted classes.
matrix = confusion_matrix(P_test, predicted)
print(matrix)


## Classification Report

## The scikit-learn library provides a convenience report when working on classification problems to give you a quick idea of the accuracy of a model using a number of measures. The classification_report() function displays the precision, recall, F1-score and support for each class. The example below demonstrates the report on the binary classification problem.
## F1 Score is the Harmonic Mean between precision and recall. The range for F1 Score is [0, 1]. It tells you how precise your classifier is (how many instances it classifies correctly), as well as how robust it is (it does not miss a significant number of instances). High precision but lower recall gives you an extremely accurate classifier, but one that misses a large number of instances that are difficult to classify. The greater the F1 Score, the better is the performance of our model.
## Precision: It is the number of correct positive results divided by the number of positive results predicted by the classifier.
## Recall: It is the number of correct positive results divided by the number of all relevant samples (all samples that should have been identified as positive).



In [None]:
from sklearn.metrics import classification_report

# Columns 0-10 are the inputs (T); column 11 is the output (P).
AMP_array = AMP_TrainSet.values
T, P = AMP_array[:, :11], AMP_array[:, 11]

# Same 67/33 split and seed as the confusion-matrix cell.
test_size, seed = 0.33, 7
T_train, T_test, P_train, P_test = train_test_split(
    T, P, test_size=test_size, random_state=seed)

# Fit on the training part and predict the held-out part.
model = LogisticRegression().fit(T_train, P_train)
predicted = model.predict(T_test)

# Precision, recall, F1-score and support for each class.
report = classification_report(P_test, predicted)
print(report)

## Regression Metrics
## Mean Absolute Error
## Mean Absolute Error is the average of the absolute difference between the Original Values and the Predicted Values. It gives us the measure of how far the predictions were from the actual output. However, it doesn't give us any idea of the direction of the error, i.e. whether we are under predicting the data or over predicting the data.

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# separate array into input (columns 0-10) and output (column 11) components
AMP_array = AMP_TrainSet.values
T = AMP_array[:,0:11]
P = AMP_array[:,11]

# random_state only has an effect when shuffle=True; scikit-learn >= 0.24
# raises a ValueError if a seed is given without shuffling.
kfold = KFold(n_splits=15, shuffle=True, random_state=7)
model = LinearRegression()
scoring = 'neg_mean_absolute_error'
results = cross_val_score(model, T, P, cv=kfold, scoring=scoring)
# The scorer returns the *negated* error (so that higher is better);
# flip the sign of the mean so the value printed as "MAE" is the actual error.
print("MAE:",(-results.mean(), results.std()))

## Mean Squared Error
## Mean Squared Error (MSE) is quite similar to Mean Absolute Error, the only difference being that MSE takes the average of the square of the difference between the original values and the predicted values. The advantage of MSE is that it is easier to compute the gradient, whereas Mean Absolute Error requires complicated linear programming tools to compute the gradient. As we take the square of the error, the effect of larger errors becomes more pronounced than that of smaller errors, hence the model can now focus more on the larger errors.



In [None]:
# Cross Validation Regression MSE

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Cross-validated mean squared error of a plain linear regression.
AMP_array = AMP_TrainSet.values
T = AMP_array[:, 0:11]  # columns 0-10 are used as inputs
P = AMP_array[:, 11]    # column 11 is used as the target
# NOTE(review): num_folds is not read by this cell (the splitter below uses
# 15 folds); it is kept in case another cell relies on it.
num_folds = 10
# KFold only consults random_state when shuffle=True. Passing a seed to an
# unshuffled splitter has no effect, and scikit-learn 0.24+ rejects that
# combination with a ValueError. Dropping the seed keeps exactly the same
# (sequential) folds.
kfold = KFold(n_splits=15)
model = LinearRegression()
# scikit-learn scorers follow "greater is better", so the MSE comes back
# negated: values are <= 0 and closer to 0 is better.
scoring = 'neg_mean_squared_error'
results = cross_val_score(model, T, P, cv=kfold, scoring=scoring)
print("MSE:",(results.mean(), results.std()))

## R2 Metric

## R-squared is a statistical measure of how close the data are to the fitted regression line. It is also known as the coefficient of determination, or the coefficient of multiple determination for multiple regression. The definition of R-squared is fairly straight-forward; it is the percentage of the response variable variation that is explained by a linear model. Or:

## R-squared = Explained variation / Total variation

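## For a least-squares fit evaluated on its own training data, R-squared is between 0 and 1: 0 indicates that the model explains none of the variability of the response data around its mean. 1 indicates that the model explains all the variability of the response data around its mean. When R-squared is computed on held-out data, as in the cross-validation below, it can also be negative, which means the model predicts worse than always predicting the mean. In general, the higher the R-squared, the better the model fits your data.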

In [None]:
# Cross Validation Regression R^2

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression

# Cross-validated R^2 of a plain linear regression.
AMP_array = AMP_TrainSet.values
T = AMP_array[:, 0:11]  # columns 0-10 are used as inputs
P = AMP_array[:, 11]    # column 11 is used as the target

# KFold only consults random_state when shuffle=True. Passing a seed to an
# unshuffled splitter has no effect, and scikit-learn 0.24+ rejects that
# combination with a ValueError. Dropping the seed keeps exactly the same
# (sequential) folds.
kfold = KFold(n_splits=15)
model = LinearRegression()
# R^2 on held-out folds has an upper bound of 1 and may be negative.
scoring = 'r2'
results = cross_val_score(model, T, P, cv=kfold, scoring=scoring)
print("R^2:",(results.mean(), results.std()))

## Spot-Check Classification Machine Learning Algorithms
## Spot-checking is a way of discovering which algorithms perform well on your machine learning problem. You cannot know which algorithms are best suited to your problem beforehand. You must trial a number of methods and focus attention on those that prove themselves the most promising.

## You cannot know which algorithm will work best on your dataset beforehand. You must use trial and error to discover a short list of algorithms that do well on your problem, which you can then double down on and tune further. I call this process spot-checking.
### The question is not:

   * What algorithm should I use on my dataset?

## Instead it is:

   * What algorithms should I spot check on my dataset?

## Algorithms Overview

## We are going to take a look at 10 classification algorithms that you can spot check on your dataset.
## Linear Machine Learning Algorithms:
   * Lasso
   * Bayesian ridge regression
   * Logistic Regression
   * Linear Discriminant Analysis  

## Nonlinear Machine Learning Algorithms:
   * Naive Bayes
   * Nearest centroid
   * Stochastic gradient descent
   * Classification and Regression Trees
   * Support Vector Machines
   

## Ensemble machine learning algorithm:
   * Bagging  
   * Random Forest
   * Boosting 


In [None]:
# boosting algorithm requires an experimental feature before importing for the classifier to work
# importing classifiers from sklearn
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# linear algorithms
from sklearn import linear_model
from sklearn.linear_model import Lasso
from sklearn.linear_model import BayesianRidge
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

#non linear algorithms

from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import SGDClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC

from xgboost import XGBClassifier

#ensemble algorithms
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import HistGradientBoostingClassifier


# Spot-check several classifiers on identical cross-validation folds and
# compare their accuracy, first as text and then as a boxplot.
#
# Lasso was tried and dropped: scoring it with 'accuracy' fails with
# "Classification metrics can't handle a mix of binary and continuous
# targets". BayesianRidge is likewise left out of the comparison.
models = [
    # linear
    ('LR', LogisticRegression()),
    ('LA', LinearDiscriminantAnalysis()),
    # non-linear
    ('DTC', DecisionTreeClassifier()),
    ('NC', NearestCentroid()),
    ('NB', GaussianNB()),
    ('SVM', SVC()),
    ('SGD', SGDClassifier()),
    # boosting / ensembles
    ('XGB', XGBClassifier()),
    ('BAG', BaggingClassifier()),
    ('RF', RandomForestClassifier()),
    ('HGB', HistGradientBoostingClassifier()),
]

# evaluate each model in turn using accuracy
results = []
names = []
scoring = 'accuracy'

# separate array into input and output components
AMP_array = AMP_TrainSet.values
T = AMP_array[:, 0:11]  # columns 0-10 are used as inputs
P = AMP_array[:, 11]    # column 11 is used as the target

# One splitter shared by every model so they are all scored on the same folds.
# KFold only consults random_state when shuffle=True. Passing a seed to an
# unshuffled splitter has no effect, and scikit-learn 0.24+ rejects that
# combination with a ValueError. Dropping the seed keeps the same folds.
kfold = KFold(n_splits=10)

for name, model in models:
    cv_results = cross_val_score(model, T, P, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison: one box per model, labelled in list order
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()



## From the results, two models have an accuracy closest to 1: Gaussian Naive Bayes with 88.08% and the Nearest Centroid classifier with 86.76%. Now we are going to apply these two models to our test dataset.


## Prediction metrics

## Confusion matrix

In [None]:
## Trial with naive bayes
from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

# Fit Gaussian Naive Bayes on the complete training table and predict the
# rows of the separate test table.
array_1 = AMP_TrainSet.values
X, Y = array_1[:, :11], array_1[:, 11]
test_size, seed = 0.33, 7

array_2 = Test.values
Test_set = array_2[:, :11]

# A hold-out split is still produced here, but the model below is trained on
# the full X, Y rather than on X_train, Y_train.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# fit() returns the estimator itself, so NBmodel is the fitted classifier
NBmodel = GaussianNB().fit(X, Y)

predicted = NBmodel.predict(Test_set)
print(predicted)


In [None]:
## Trial with nearest centroid classifier
from sklearn.metrics import confusion_matrix
from sklearn.neighbors import NearestCentroid
from sklearn.model_selection import train_test_split

# Fit a nearest-centroid classifier on the complete training table and
# predict the rows of the separate test table.
array_2 = AMP_TrainSet.values
X = array_2[:, 0:11]  # columns 0-10 are used as inputs
Y = array_2[:, 11]    # column 11 is used as the target
test_size = 0.33
seed = 7

array_3 = Test.values
Test_set = array_3[:, 0:11]

# A hold-out split is still produced here, but the model below is trained on
# the full X, Y rather than on X_train, Y_train.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=test_size,
                                                    random_state=seed)

# our model is nearest centroid
NCmodel = NearestCentroid()
NCmodel.fit(X, Y)

predicted_2 = NCmodel.predict(Test_set)
# Show this model's own predictions; the cell used to print `predicted`,
# which holds the Naive Bayes output.
print(predicted_2)


## Generating .csv for Naive bayes

In [None]:
# Write the Naive Bayes predictions to Naive_Bayes.csv as a single boolean
# CLASS column with an index labelled "Index".
# `predicted` must still hold the Naive Bayes test-set output here; the name
# is reused by another cell.
df1 = pd.DataFrame(predicted, columns=['CLASS'])
df1.index.name = "Index"

# 0.0 / 1.0 labels become False / True; any other value maps to NaN
df1['CLASS'] = df1['CLASS'].map({0.0: False, 1.0: True})

df1.to_csv('Naive_Bayes.csv')
# sanity check: list the distinct values that were written
print(df1['CLASS'].unique())

## Generating a .csv for Nearest centroid classifier

In [None]:
# Write the nearest-centroid predictions to NC.csv as a single boolean CLASS
# column with an index labelled "Index".
df2 = pd.DataFrame(predicted_2, columns=['CLASS'])
df2.index.name = "Index"

# 0.0 / 1.0 labels become False / True; any other value maps to NaN
df2['CLASS'] = df2['CLASS'].map({0.0: False, 1.0: True})

df2.to_csv('NC.csv')
# sanity check: list the distinct values that were written
print(df2['CLASS'].unique())

## Matthews correlation coefficient (MCC)

In [None]:
from sklearn.metrics import matthews_corrcoef

# Matthews correlation coefficient of Gaussian Naive Bayes on a hold-out
# split of the training table.
array = AMP_TrainSet.values
X, Y = array[:, :11], array[:, 11]

test_size, seed = 0.33, 7
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed
)

# fit() returns the estimator itself, so `model` is the fitted classifier
model = GaussianNB().fit(X_train, Y_train)
predicted = model.predict(X_test)

# despite its name, `matrix` receives the value returned by matthews_corrcoef
matrix = matthews_corrcoef(Y_test, predicted)
print(matrix)
