In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

# Walk the Kaggle input directory tree and print the full path of every file.
for root_dir, _subdirs, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(root_dir, file_name))
        

# Any results you write to the current directory are saved as output.

# References
#### https://machinelearningmastery.com/evaluate-performance-machine-learning-algorithms-python-using-resampling/
#### https://www.dataquest.io/blog/top-10-machine-learning-algorithms-for-beginners/
#### https://monkeylearn.com/blog/introduction-to-support-vector-machines-svm/
#### https://towardsdatascience.com/understanding-random-forest-58381e0602d2

In [None]:
import warnings

# Install a blanket "ignore" filter so library warnings do not clutter the
# notebook output. Note that this also hides deprecation warnings.
warnings.simplefilter('ignore')

In [None]:
# Read the test CSV into a DataFrame and display it.
TEST_CSV_PATH = "../input/ace-class-assignment/Test.csv"
test = pd.read_csv(TEST_CSV_PATH)
test

In [None]:
# (number of rows, number of columns) of the test DataFrame.
test.shape

In [None]:
# Read the training CSV into a DataFrame and preview its first five rows.
TRAIN_CSV_PATH = "../input/ace-class-assignment/AMP_TrainSet.csv"
data = pd.read_csv(TRAIN_CSV_PATH)
data.head()

## Analyze data by describing

#### This step helped me know which features are in my dataset, are they categorical or numerical.
#### How many rows and columns does the dataset have
#### The data types for the various features
#### Checked whether the dataset has null or missing values

In [None]:
# Dimensions of the training DataFrame as (number of rows, number of columns).
data.shape

In [None]:
# Column labels of the training DataFrame.
data.columns

In [None]:
# Data type pandas assigned to each column.
data.dtypes

In [None]:
# Generate descriptive statistics that summarize the central tendency,
# dispersion and shape of the dataset's distribution, excluding NaN values.
data.describe()

In [None]:
# Number of null values in each column.
data.isnull().sum()
# If every count above is zero, no missing-value handling is needed.

#### needed to know how balanced the class values are

In [None]:

# Bar chart of the number of rows per CLASS value (class balance check).
class_counts = data.groupby('CLASS').size()
class_counts.plot.bar()

#### It's a good idea to review all the pairwise correlations of the attributes in the dataset, because some machine learning algorithms, such as linear and logistic regression, can perform poorly when the dataset contains highly correlated attributes

In [None]:
# Pairwise Pearson correlation between the columns of the training DataFrame.
data.corr(method='pearson')

####  heat map to show the correlation of the data; plots that show the interactions between multiple variables in the dataset
#### Correlation gives an indication of how related the changes are between two variables. If two variables change in the same direction they are positively correlated. If they change in opposite directions together (one goes up, one goes down), then they are negatively correlated. 

In [None]:
# Draw the pairwise Pearson correlation matrix as a heat map on an 8x8 figure.
pearson_corr = data.corr(method='pearson')
plt.figure(figsize=(8, 8))
sns.heatmap(pearson_corr)


#### I also checked the correlation of each attribute with the class, since I am trying to build an ML algorithm for that class

In [None]:

# Pearson correlation of every column with the CLASS column.
data.corr(method='pearson')['CLASS']

#### Most of my variables are positively skewed

In [None]:
 data.skew().plot(kind='bar')

## understanding data with visualization
#### Data can be visualised in many ways, namely with univariate plots and multivariate plots
#### I used the histogram as the univariate plot (shown below) and the correlation matrix plot as the multivariate plot (shown above)

## Histogram
#### This helps to understand each attribute of my dataset independently

## Data pre-processing

In [None]:
# One histogram per column. DataFrame.hist opens its own figure, so the size
# is passed to it directly; a separate plt.figure() call would only leave an
# empty figure behind.
data.hist(figsize=(18, 18))
# Let matplotlib space the subplots instead of hand-tuned out-of-range margins.
plt.tight_layout()
plt.show()

## Standardize data
#### Standardization is a useful technique to transform attributes with a Gaussian distribution and differing means and standard deviations to a standard Gaussian distribution with a mean of 0 and a standard deviation of 1

In [None]:
from sklearn.preprocessing import StandardScaler

# Split the raw values into inputs (columns 0-10) and output (column 11).
array = data.values
X, Y = array[:, 0:11], array[:, 11]

# Fit the scaler on the inputs, then transform those same inputs.
scaler = StandardScaler()
scaler.fit(X)
rescaledX = scaler.transform(X)

# Show the first five transformed rows.
print(rescaledX[0:5, :])

In [None]:
# Standardize the test set with `scaler`, the StandardScaler already fitted on
# the training inputs in the previous cell. Fitting a second scaler on the
# test set would put train and test on different scales, so a model trained
# on `rescaledX` would see shifted inputs at prediction time.
array = test.values
rescaledt = scaler.transform(array)

# Show the first five transformed rows.
print(rescaledt[0:5, :])

##  Feature selection



####  it's the process of selecting a subset of relevant features for use in model construction

### Chose Recursive Feature Elimination
#### This is an automatic feature selection technique
#### Used logistic regression as the estimator: it is a good baseline because it is fast to train and predict and scales well.


In [None]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Inputs are columns 0-10, output is column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]

# Recursive feature elimination around a logistic regression, keeping 8
# features. The count is passed by keyword because newer scikit-learn
# releases no longer accept it positionally.
model = LogisticRegression()
rfe = RFE(model, n_features_to_select=8)
fit = rfe.fit(X, Y)

print("Num Features:", fit.n_features_)
print("Selected Features:", fit.support_)
print("Feature Ranking:", fit.ranking_)

In [None]:
# Index the columns of X with fit.support_ (the RFE selection result) to
# display only the selected features.
X[:,fit.support_]

In [None]:
# Remove three named columns from the training DataFrame and display the result.
train_columns_to_drop = ['FULL_AcidicMolPerc', 'FULL_DAYM780201', 'AS_DAYM780201']
drop = data.drop(columns=train_columns_to_drop)
drop

In [None]:
# Remove the same three named columns from the test DataFrame and display it.
test_columns_to_drop = ['FULL_AcidicMolPerc', 'FULL_DAYM780201', 'AS_DAYM780201']
drop_test = test.drop(columns=test_columns_to_drop)
drop_test

#### Decided to use all the features first


# Evaluate the Performance of Machine Learning Algorithms with Resampling


#### The best way to evaluate the performance of an algorithm would be to make predictions for new data to which you already know the answers.

## Split into Train and Test Sets

#### This algorithm evaluation technique is very fast. It is ideal for large datasets where there is strong evidence that both splits of the data are representative of the underlying problem. Because of the speed, it is useful to use this approach when the algorithm you are investigating is slow to train.



In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import matthews_corrcoef

# Inputs are columns 0-10, output is column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]
test_size = 0.30
seed = 7

# Hold out 30% of the rows and report accuracy on that held-out part.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=test_size, random_state=seed)
model = LogisticRegression()
model.fit(X_train, Y_train)
result = model.score(X_test, Y_test)
print("Accuracy: ", (result*100.0))

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
# Arguments are given in scikit-learn's (y_true, y_pred) order.
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report = pd.DataFrame(output)
report.columns = ['CLASS']
report.index.name = "Index"
report['CLASS'] = report['CLASS'].map({0.0: False, 1.0: True})
report.to_csv("report.csv")

print(report['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report['CLASS'].eq(False).sum())
print('True: ', report['CLASS'].eq(True).sum())


## K-fold Cross Validation

#### It is more accurate because the algorithm is trained and evaluated multiple times on different data.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef

num_folds = 10  # number of folds to use
seed = 7  # reproducibility

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise); it also mixes the rows before
# they are cut into folds.
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)

# Prints the (mean, standard deviation) of the fold accuracies in percent.
print("Accuracy:", (results.mean()*100.0, results.std()*100.0))

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_kf = pd.DataFrame(output)
report_kf.columns = ['CLASS']
report_kf.index.name = "Index"
report_kf['CLASS'] = report_kf['CLASS'].map({0.0: False, 1.0: True})
report_kf.to_csv("report_kf.csv")

print(report_kf['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_kf['CLASS'].eq(False).sum())
print('True: ', report_kf['CLASS'].eq(True).sum())



## Leave One Out Cross Validation

In [None]:
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef

# Leave-one-out: each row is used once as the single-row evaluation set.
loocv = LeaveOneOut()
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=loocv)
print("Accuracy:", (results.mean()*100.0, results.std()*100.0))

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_l = pd.DataFrame(output)
report_l.columns = ['CLASS']
report_l.index.name = "Index"
report_l['CLASS'] = report_l['CLASS'].map({0.0: False, 1.0: True})
report_l.to_csv("report_l.csv")

print(report_l['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_l['CLASS'].eq(False).sum())
print('True: ', report_l['CLASS'].eq(True).sum())


## Repeated Random Test-Train Splits

#### Creates a random split of the data like the train/test split , but repeats the process of splitting and evaluation of the algorithm multiple times, like cross validation. Repeated random splits can be useful intermediates when trying to balance variance in the estimated performance, model training speed and dataset size
#### Here I preferred Repeated Random Test-Train Splits because, when you look at the dataset, the zeros are on one side and the ones on the other side of the 'class' column. So I would prefer to first shuffle the data and then split it, to reduce the bias

In [None]:
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.metrics import matthews_corrcoef

n_splits = 10
test_size = 0.30
seed = 7

# Ten independent random 70/30 splits. The splitter keeps the name `kfold`
# used by the other cells even though it is a ShuffleSplit.
kfold = ShuffleSplit(n_splits=n_splits, test_size=test_size, random_state=seed)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print("Accuracy: ", (results.mean()*100.0, results.std()*100.0))

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_rrt = pd.DataFrame(output)
report_rrt.columns = ['CLASS']
report_rrt.index.name = "Index"
report_rrt['CLASS'] = report_rrt['CLASS'].map({0.0: False, 1.0: True})
report_rrt.to_csv("report_rrt.csv")

print(report_rrt['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_rrt['CLASS'].eq(False).sum())
print('True: ', report_rrt['CLASS'].eq(True).sum())



# Machine Learning Algorithm Performance Metrics

## Algorithms Overview
### linear machine learning algorithms:

    Logistic Regression.
    Linear Discriminant Analysis.
### Nonlinear machine learning algorithms

    k-Nearest Neighbors.
    Naive Bayes.
    Classification and Regression Trees.
    Support Vector Machines.


## Linear Machine Learning Algorithms

### Logistic Regression

#### Logistic regression is best suited for binary classification: data sets where y = 0 or 1

### Using standardized data

In [None]:
# Logistic regression on standardized data
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef

num_folds = 10
# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)

# Scaler and classifier are chained so that cross-validation, the final fit,
# the in-sample MCC and the test predictions all see standardized inputs.
# Previously cross-validation and MCC used the raw X while the model was
# fitted on the scaled array.
model = make_pipeline(StandardScaler(), LogisticRegression())
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Fit scaler and classifier on every labelled row; the test set is then
# scaled with the training statistics before prediction.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_scaled = pd.DataFrame(output)
report_scaled.columns = ['CLASS']
report_scaled.index.name = "Index"
report_scaled['CLASS'] = report_scaled['CLASS'].map({0.0: False, 1.0: True})
report_scaled.to_csv("report_scaled.csv")

print(report_scaled['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_scaled['CLASS'].eq(False).sum())
print('True: ', report_scaled['CLASS'].eq(True).sum())


In [None]:
# Logistic Regression Classification
from sklearn.metrics import matthews_corrcoef

num_folds = 10
# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
model = LogisticRegression()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
# NOTE(review): the file name says "XGB" but the model is a logistic
# regression; the name is kept unchanged in case something consumes it.
my_report = pd.DataFrame(output)
my_report.columns = ['CLASS']
my_report.index.name = "Index"
my_report['CLASS'] = my_report['CLASS'].map({0.0: False, 1.0: True})
my_report.to_csv("report_XGB.csv")

print(my_report['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', my_report['CLASS'].eq(False).sum())
print('True: ', my_report['CLASS'].eq(True).sum())

## Linear Discriminant Analysis



In [None]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.metrics import matthews_corrcoef

num_folds = 10
# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
model = LinearDiscriminantAnalysis()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
lda_report = pd.DataFrame(output)
lda_report.columns = ['CLASS']
lda_report.index.name = "Index"
lda_report['CLASS'] = lda_report['CLASS'].map({0.0: False, 1.0: True})
lda_report.to_csv("ldareport.csv")

print(lda_report['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', lda_report['CLASS'].eq(False).sum())
print('True: ', lda_report['CLASS'].eq(True).sum())



## Nonlinear Machine Learning Algorithms

### k-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import matthews_corrcoef

num_folds = 10
# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=num_folds, shuffle=True, random_state=7)
model = KNeighborsClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_k = pd.DataFrame(output)
report_k.columns = ['CLASS']
report_k.index.name = "Index"
report_k['CLASS'] = report_k['CLASS'].map({0.0: False, 1.0: True})
report_k.to_csv("report_k.csv")

print(report_k['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_k['CLASS'].eq(False).sum())
print('True: ', report_k['CLASS'].eq(True).sum())

### Naive Bayes

### Tried using Standardised data on Naive Bayes

### When I predicted with Naive Bayes, the standardised data gave me a score of 0.98235, the data after feature selection gave 0.90, and the unstandardised data gave a score of 0.9959

In [None]:
# Naive Bayes on standardised data
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import matthews_corrcoef

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# Scaler and classifier are chained so that cross-validation, the final fit,
# the in-sample MCC and the test predictions all see standardized inputs.
# Previously cross-validation and MCC used the raw X while the model was
# fitted on the scaled array.
model = make_pipeline(StandardScaler(), GaussianNB())
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Fit scaler and classifier on every labelled row; the test set is then
# scaled with the training statistics before prediction.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_rebayes = pd.DataFrame(output)
report_rebayes.columns = ['CLASS']
report_rebayes.index.name = "Index"
report_rebayes['CLASS'] = report_rebayes['CLASS'].map({0.0: False, 1.0: True})
report_rebayes.to_csv("report_rebayes.csv")

print(report_rebayes['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_rebayes['CLASS'].eq(False).sum())
print('True: ', report_rebayes['CLASS'].eq(True).sum())

## Naive Bayes on selected features

In [None]:
# Naive Bayes on selected features
from sklearn.metrics import matthews_corrcoef

# Inputs are columns 0-10, output is column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]

# Keep only the columns selected by the RFE result `fit`, in both the
# training inputs and the test set.
selectedX = X[:, fit.support_]
array2 = test.values
selectedT = array2[:, fit.support_]

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = GaussianNB()
results = cross_val_score(model, selectedX, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(selectedX, Y)
output = model.predict(selectedT)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(selectedX))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_sel = pd.DataFrame(output)
report_sel.columns = ['CLASS']
report_sel.index.name = "Index"
report_sel['CLASS'] = report_sel['CLASS'].map({0.0: False, 1.0: True})
report_sel.to_csv("report_sel.csv")

print(report_sel['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_sel['CLASS'].eq(False).sum())
print('True: ', report_sel['CLASS'].eq(True).sum())


In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import matthews_corrcoef

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = GaussianNB()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_bayes = pd.DataFrame(output)
report_bayes.columns = ['CLASS']
report_bayes.index.name = "Index"
report_bayes['CLASS'] = report_bayes['CLASS'].map({0.0: False, 1.0: True})
report_bayes.to_csv("report_bayes.csv")

print(report_bayes['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_bayes['CLASS'].eq(False).sum())
print('True: ', report_bayes['CLASS'].eq(True).sum())

### Classification and Regression Trees

#### used for classification or regression predictive modeling problems

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = DecisionTreeClassifier()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_tree = pd.DataFrame(output)
report_tree.columns = ['CLASS']
report_tree.index.name = "Index"
report_tree['CLASS'] = report_tree['CLASS'].map({0.0: False, 1.0: True})
report_tree.to_csv("report_tree.csv")

print(report_tree['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_tree['CLASS'].eq(False).sum())
print('True: ', report_tree['CLASS'].eq(True).sum())

### Support Vector Machines 

#### A support vector machine (SVM) is a supervised machine learning model that uses classification algorithms for two-group classification problems

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import matthews_corrcoef

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = SVC()
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_svm = pd.DataFrame(output)
report_svm.columns = ['CLASS']
report_svm.index.name = "Index"
report_svm['CLASS'] = report_svm['CLASS'].map({0.0: False, 1.0: True})
report_svm.to_csv("report_svm.csv")

print(report_svm['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_svm['CLASS'].eq(False).sum())
print('True: ', report_svm['CLASS'].eq(True).sum())




# Combine Models Into Ensemble Predictions

The three most popular methods for combining the predictions from different models are:
   
   Bagging
   Boosting
   Voting

> # Boosting Algorithms

####  These seek to improve the prediction power by training a sequence of weak models, each compensating the weaknesses of its predecessors.


## AdaBoost

#### This is specifically designed for classification problems

In [None]:
# AdaBoost Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import matthews_corrcoef

# Re-derive the arrays from `data` rather than relying on whatever `array`
# was left holding by an earlier cell. Inputs: columns 0-10; output: column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]

num_trees = 39
seed = 10

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

model = AdaBoostClassifier(n_estimators=num_trees, random_state=seed)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_ada = pd.DataFrame(output)
report_ada.columns = ['CLASS']
report_ada.index.name = "Index"
report_ada['CLASS'] = report_ada['CLASS'].map({0.0: False, 1.0: True})
report_ada.to_csv("report_ada.csv")

print(report_ada['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_ada['CLASS'].eq(False).sum())
print('True: ', report_ada['CLASS'].eq(True).sum())



## Bagging Algorithms

#### Bagging is used with decision trees where it significantly raises the stability of models in the reduction of variance and improving accuracy, which eliminates the challenge of overfitting.

## Bagged Decision Trees

In [None]:
# Bagged Decision Trees for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import matthews_corrcoef

# Re-derive the arrays from `data` rather than relying on whatever `array`
# was left holding by an earlier cell. Inputs: columns 0-10; output: column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]
seed = 7  # fixed seed for reproducibility

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=seed)

# Base learner that is bagged.
cart = DecisionTreeClassifier()

# Number of bagged trees.
num_trees = 250

# The base learner is passed positionally because its keyword was renamed
# from `base_estimator` to `estimator` in newer scikit-learn releases.
model = BaggingClassifier(cart, n_estimators=num_trees, random_state=seed)

results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_bag = pd.DataFrame(output)
report_bag.columns = ['CLASS']
report_bag.index.name = "Index"
report_bag['CLASS'] = report_bag['CLASS'].map({0.0: False, 1.0: True})
report_bag.to_csv("report_bag.csv")

print(report_bag['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_bag['CLASS'].eq(False).sum())
print('True: ', report_bag['CLASS'].eq(True).sum())



## Random Forest

In [None]:
# Random Forest Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import matthews_corrcoef

# Re-derive the arrays from `data` rather than relying on whatever `array`
# was left holding by an earlier cell. Inputs: columns 0-10; output: column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]

num_trees = 1000
max_features = 3

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
model = RandomForestClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())

# Refit on every labelled row, then predict the test set.
model.fit(X, Y)
output = model.predict(test.values)

# MCC on the rows the model was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, model.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_rf = pd.DataFrame(output)
report_rf.columns = ['CLASS']
report_rf.index.name = "Index"
report_rf['CLASS'] = report_rf['CLASS'].map({0.0: False, 1.0: True})
report_rf.to_csv("report_rf.csv")

print(report_rf['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_rf['CLASS'].eq(False).sum())
print('True: ', report_rf['CLASS'].eq(True).sum())


## Extra Trees

In [None]:
from sklearn.ensemble import ExtraTreesClassifier

# Re-derive the arrays from `data` rather than relying on whatever `array`
# was left holding by an earlier cell. Inputs: columns 0-10; output: column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]

num_trees = 100
max_features = 7

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

model = ExtraTreesClassifier(n_estimators=num_trees, max_features=max_features)
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())


## Voting Ensemble

In [None]:
# Voting Ensemble for Classification
from pandas import read_csv
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import matthews_corrcoef

# Re-derive the arrays from `data` rather than relying on whatever `array`
# was left holding by an earlier cell. Inputs: columns 0-10; output: column 11.
array = data.values
X = array[:, 0:11]
Y = array[:, 11]

# shuffle=True is required for random_state to have any effect (current
# scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# Create the sub-models that vote.
estimators = []
model1 = LogisticRegression()
estimators.append(('logistic', model1))

model2 = DecisionTreeClassifier()
estimators.append(('cart', model2))

model3 = SVC()
estimators.append(('svm', model3))

model4 = XGBClassifier()
estimators.append(('xgb', model4))

model5 = RandomForestClassifier()
estimators.append(('rfc', model5))

# Create and cross-validate the ensemble model.
ensemble = VotingClassifier(estimators)
results = cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())

# Fit and predict with the ensemble itself. Previously this used `model`,
# which was still the classifier left over from an earlier cell, so the
# "voting" report did not come from the voting classifier.
ensemble.fit(X, Y)
output = ensemble.predict(test.values)

# MCC on the rows the ensemble was just fitted on (an in-sample figure).
mcc = matthews_corrcoef(Y, ensemble.predict(X))
print('MCC:', mcc)

# Save the predictions as a one-column True/False CSV.
report_v = pd.DataFrame(output)
report_v.columns = ['CLASS']
report_v.index.name = "Index"
report_v['CLASS'] = report_v['CLASS'].map({0.0: False, 1.0: True})
report_v.to_csv("report_v.csv")

print(report_v['CLASS'].unique())
# Count by comparison: integer lookups on the boolean-labelled groupby result
# are fragile and break when only one class is predicted.
print('False: ', report_v['CLASS'].eq(False).sum())
print('True: ', report_v['CLASS'].eq(True).sum())




## comparing the algorithms

In [None]:

# prepare models and add them to a list
from matplotlib import pyplot

models = []
models.append(('LR', LogisticRegression(solver='liblinear', multi_class='ovr')))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC(gamma='auto')))
models.append(('ETC', ExtraTreesClassifier()))
models.append(('RFC', RandomForestClassifier()))

# evaluate each model in turn
results = []
names = []
scoring = 'accuracy'

# One seeded, shuffled splitter is shared by every model so they are all
# scored on the same folds. shuffle=True is required for random_state to have
# any effect (current scikit-learn raises ValueError otherwise).
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

for name, model in models:
    cv_results = cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    # Printed as a (name, mean accuracy, standard deviation) tuple.
    msg = (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison: one box of fold accuracies per model
fig = pyplot.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
pyplot.boxplot(results)
ax.set_xticklabels(names)
pyplot.show()

# '''''''''''''''''''''''''''''''END''''''''''''''''''''''''''''''