Submit a file containing your report, including (at least) the following items:
1. Problem description
2. Algorithms implemented
3. Experimental results, including accuracies or mean squared errors and running times for each algorithm and parameter choice. You may want to use tables and or plots to illustrate this.
4. Discussion of results
5. Conclusions
6. Appendix: Source code. This may be included as a separate file. Make sure it's well-documented.

# Heart Disease Part 1

## Preprocessing-Nominal

In [None]:
#Library
import time
import math 
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import distutils
import pandas as pd
import sklearn.metrics as sm
from google.colab import drive
from sklearn.preprocessing import LabelEncoder

from keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.activations import *
from tensorflow.keras.optimizers import *
from keras.utils.vis_utils import plot_model
from keras.regularizers import l2

from sklearn.neural_network import MLPClassifier
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error,mean_absolute_error

from keras.callbacks import ReduceLROnPlateau

In [None]:
# Mount google drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
!ls  '/content/drive/MyDrive/ML_Project'

Final_Project_Report_Abel_Yamel.gdoc
heart2.csv
heart.csv
heart_failure_clinical_records_dataset.csv


In [None]:
#from google.colab import files
#uploaded = files.upload() 

df = pd.read_csv('/content/drive/MyDrive/ML_Project/heart.csv')

In [None]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1,1,140,289,0,1,172,0,0.0,2,0
1,49,0,2,160,180,0,1,156,0,1.0,1,1
2,37,1,1,130,283,0,2,98,0,0.0,2,0
3,48,0,0,138,214,0,1,108,1,1.5,1,1
4,54,1,2,150,195,0,1,122,0,0.0,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1,3,110,264,0,1,132,0,1.2,1,1
914,68,1,0,144,193,1,1,141,0,3.4,1,1
915,57,1,0,130,131,0,1,115,1,1.2,1,1
916,57,0,1,130,236,0,0,174,0,0.0,1,1


In [None]:
# sex = [M : Male, F:  Female]
#chest pain type = [TA: Typical Angina, ATA: Atypical Angina, 
#                   NAP: Non-Anginal Pain, ASY: Asymptomatic]
#RestingECG = [Normal: Normal, 
#              ST: having ST-T wave abnormality (T wave inversions and/or ST elevation or depression of > 0.05 mV), 
#              LVH: showing probable or definite left ventricular hypertrophy by Estes' criteria]
#ExerciseAngina = exercise-induced angina [Y: Yes, N: No]
#ST_Slope =  [Up: upsloping, 
#             Flat: flat,
#              Down: downsloping]

#Nominal LabelEncoder
#Ordinal OrdinalEncoder

In [None]:
for i in ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']:
  df[i] = LabelEncoder().fit_transform(df[i])


In [None]:
data = df.to_numpy()
#take all rows(x) and only attributes/columns(y) from 0-11 to predict heart attacks

x_original = data[:,:10] #this will be the original x array
x_products = data[:,:10] #this one will be x array with products of attributes

#y gets the heart attack disease values
y = data[:,11]

#Products of Attributes
x_cols = x_products.shape[1]
for i in range(x_cols):                            
    x_products = np.hstack((x_products ,(x_products[:,i:x_cols]*x_products[:,i].reshape(-1,1))))  #end of class
print(x_original.shape)
print(x_products.shape)

#grabs the names in the columns
feature_names = df.columns

#split
x_train, x_test, y_train, y_test = train_test_split(x_original, y, test_size=0.2, random_state=42)
x_train_products, x_test_products, y_train_products, y_test_products = train_test_split(x_products, y, test_size=0.2, random_state=42)


(918, 10)
(918, 65)


In [None]:
#Normalizated dataset
scalerMM = MinMaxScaler()
scalerMM.fit(x_train)
x_trainScaleMM = scalerMM.transform(x_train)
x_testScaleMM = scalerMM.transform(x_test)

#standardized dataset
scalerSS = StandardScaler()
scalerSS.fit(x_train)
x_trainScaleSS = scalerSS.transform(x_train)
x_testScaleSS = scalerSS.transform(x_test)

#Normalizated dataset for products of attributes
scalerMM = MinMaxScaler()
scalerMM.fit(x_train_products)
x_train_products_ScaleMM = scalerMM.transform(x_train_products)
x_test_products_ScaleMM = scalerMM.transform(x_test_products)

#standardized dataset for products of attributes
scalerSS = StandardScaler()
scalerSS.fit(x_train_products)
x_train_products_ScaleSS = scalerSS.transform(x_train_products)
x_test_products_ScaleSS = scalerSS.transform(x_test_products)

dataset_name = ["Original dataset", "Normalized Original Dataset", "Standardized Original Dataset", 
                "Products of Attributes Dataset", "Normalized Products of Attributes Dataset", "Standarized Products of Attributes Dataset"]
dataset_xtr = [x_train, x_trainScaleMM, x_trainScaleSS,  x_train_products, x_train_products_ScaleMM, x_train_products_ScaleSS]
dataset_xte = [x_test, x_testScaleMM, x_testScaleSS,  x_test_products, x_test_products_ScaleMM, x_test_products_ScaleSS]

Important Determining/influential feature/attribute

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

AccList = []
for i in range(0,x_train.shape[1]):
  model = DecisionTreeClassifier().fit(x_train[:,i:i+1], y_train)
  pred = model.predict(x_test[:,i:i+1])
  AccList.append(accuracy_score(y_test, pred))

bestAcc = np.argmax(AccList)
print('Best Accuracy is', AccList[bestAcc])
print('Best Feature is ', bestAcc)

Best Accuracy is 0.8043478260869565
Best Feature is  2


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

AccList = []
for i in range(0,x_train_products.shape[1]):
  model = DecisionTreeClassifier().fit(x_train_products[:,i:i+1], y_train)
  pred = model.predict(x_test_products[:,i:i+1])
  AccList.append(accuracy_score(y_test, pred))

bestAcc = np.argmax(AccList)
print('Best Accuracy is', AccList[bestAcc])
print('Best Feature is ', bestAcc)

Best Accuracy is 0.8043478260869565
Best Feature is  2


###Regressor

In [None]:
#Sklearn Regressors

def evaluate_model(model, x_train, y_train, x_test, y_test):
  m = model().fit(x_train, y_train)
  pred = m.predict(x_test)
  return mean_squared_error(y_test, pred)

# MultinomialNB removed due to Maximum iterations still hasnt converged]
models = [KNeighborsRegressor, DecisionTreeRegressor,  RandomForestRegressor, BaggingRegressor,  LinearRegression ] # These are functions!
model_names = ['K-Nearest Neighbors regressor', 'Decision Tree Regressor' , 'Random Forest Regressor', 'Bagging Regressor', 'Linear Regression']

#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model(models[i], dataset_xtr[index], y_train, dataset_xte[index], y_test))
    done = time.time() - start
    print('MSE = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmin(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Lowest MSE acheived  = {:6.4f}'.format(acc_list[best]))
  print('\n')



Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.2204
Elapsed time = 0.0044 secs
Evaluating Decision Tree Regressor
MSE = 0.2935
Elapsed time = 0.0034 secs
Evaluating Random Forest Regressor
MSE = 0.1534
Elapsed time = 0.2516 secs
Evaluating Bagging Regressor
MSE = 0.1676
Elapsed time = 0.0320 secs
Evaluating Linear Regression
MSE = 0.1527
Elapsed time = 0.0035 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is Linear Regression
Lowest MSE acheived  = 0.1527


Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.1504
Elapsed time = 0.0038 secs
Evaluating Decision Tree Regressor
MSE = 0.2826
Elapsed time = 0.0029 secs
Evaluating Random Forest Regressor
MSE = 0.1500
Elapsed time = 0.2629 secs
Evaluating Bagging Regressor
MSE = 0.1582
Elapsed time = 0.0331 secs
Evaluating Li

###Classifer

In [None]:
# Sklearn Classifer


def evaluate_model(model, x_train, y_train, x_test, y_test):
  m = model()
  m.fit(x_train,y_train)
  return m.score(x_test,y_test)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model(models[i], dataset_xtr[index], y_train, dataset_xte[index], y_test))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best classification model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.7011
Elapsed time = 0.0096 secs
Evaluating GaussianNB
Accuracy = 0.8207
Elapsed time = 0.0015 secs
Evaluating BernoulliNB
Accuracy = 0.8261
Elapsed time = 0.0052 secs
Evaluating Decision Tree classifer
Accuracy = 0.7174
Elapsed time = 0.0033 secs
Evaluating Random Forest classifer
Accuracy = 0.7880
Elapsed time = 0.2020 secs

By utilizing this dataset variant:  Original dataset
The best classification model that performed under this dataset is BernoulliNB
Highest Accuracy acheived  = 0.8261
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.7609
Elapsed time = 0.0141 secs
Evaluating GaussianNB
Accuracy = 0.8207
Elapsed time = 0.0034 secs
Evaluating BernoulliNB
Accuracy = 0.8315
Elapsed time = 0.0029 secs
Evaluating Decision Tree classifer
Accuracy = 0.7174
Elapsed time = 0.0040 s

In [None]:
def cross_validate_model(model, x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = model().fit(x_train, y_train)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print(' using cross validation')

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(cross_validate_model(models[i], dataset_xtr[index], y_train, dataset_xte[index], y_test,10))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))
  print('\n')

Training all the models with this dataset variant:  Original dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 69.0643
Elapsed time = 0.0270 secs
Evaluating GaussianNB
Accuracy = 76.6082
Elapsed time = 0.0151 secs
Evaluating BernoulliNB
Accuracy = 83.2164
Elapsed time = 0.0183 secs
Evaluating Decision Tree classifer
Accuracy = 73.3626
Elapsed time = 0.0188 secs
Evaluating Random Forest classifer
Accuracy = 82.0760
Elapsed time = 1.7161 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is BernoulliNB
Highest Accuracy acheived  = 83.2164


Training all the models with this dataset variant:  Normalized Original Dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 79.8538
Elapsed time = 0.0296 secs
Evaluating GaussianNB
Accuracy = 76.6082
Elapsed time = 0.0164 secs
Evaluating BernoulliNB
Accuracy = 82.6023
Elapsed time = 0.0183 secs
Evaluating Decision Tre

###Logistic Regression

In [None]:
#Logistic Regression

def evaluate_Logistic_Reg(x_train, y_train, x_test, y_test):
  model = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1) 
  model.fit(x_train,y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_Logistic_Reg(dataset_xtr[index], y_train, dataset_xte[index], y_test))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 


Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.7935
Elapsed time = 0.6203 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.7772
Elapsed time = 0.2836 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.7989
Elapsed time = 0.2830 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.7935
Elapsed time = 0.3158 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.8207
Elapsed time = 0.0435 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.8315
Elapsed time = 0.0580 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.8315


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [None]:
#cross validation
def cross_validate_LR(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1).fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)


acc_list = []
print('Using cross validation')
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_LR(dataset_xtr[index], y_train, dataset_xte[index], y_test, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Using cross validation
Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 77.6608
Elapsed time = 0.3850 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset
Accuracy = 76.6082
Elapsed time = 0.1545 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 74.9415
Elapsed time = 0.1092 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 77.1637
Elapsed time = 0.4331 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 78.2456
Elapsed time = 0.2802 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 78.8304
Elapsed time = 0.3412 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 78.8304


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


###SVC

In [None]:
#Sklearn SVC 

def evaluate_svc(x_train, y_train, x_test, y_test):
  model = SVC()
  model.fit(x_train, y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_svc(dataset_xtr[index], y_train, dataset_xte[index], y_test))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.6848
Elapsed time = 0.0339 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.7717
Elapsed time = 0.0221 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.7880
Elapsed time = 0.0228 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.6848
Elapsed time = 0.0436 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.7717
Elapsed time = 0.0300 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.7880
Elapsed time = 0.0307 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.7880


In [None]:
#cross validation
def cross_validate_svc(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = SVC().fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_svc(dataset_xtr[index], y_train, dataset_xte[index], y_test, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training SVC model with this dataset variant:  Original dataset
Accuracy = 72.8363
Elapsed time = 0.0652 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 79.8538
Elapsed time = 0.0495 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 81.5205
Elapsed time = 0.0505 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 73.3041
Elapsed time = 0.0821 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 82.0760
Elapsed time = 0.0633 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 80.9942
Elapsed time = 0.0675 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 82.0760


In [None]:
#Aprox which kernel and parameters to use before search for max of SVC 

def SVCApprox(x_tr, y_tr, x_te, y_te):


  C = [.1, .5, 1.0, 3.0]
  Gamma = [.1, .5, 1.0, 3.0, 'scale', 'auto']
  COEF = [.1, .25, .5, .75, 1]
  AccuracyList = []
  Top4Acc = []
  #PrecisionList = []
  #RecallList = []
  #'poly',
  for kernel in ['linear', 'rbf', 'sigmoid']:
    print(f'SVC Model with kernel = {kernel}')
    KernelBestAcc = []
    for c in C:

      for G in Gamma:

        for CE in COEF:


          print(f"SVC paramters set to C={c}, Gamma ={G}, Coef={CE}")
          model = SVC(kernel=kernel, C = c, gamma = G, cache_size=10000, coef0 = CE)
              
          model.fit(x_tr, y_tr)
          pred = model.predict(x_te)
          AccuracyList.append(sm.accuracy_score(y_te, pred))
          KernelBestAcc.append(sm.accuracy_score(y_te, pred))
          print(f'Accuracy {AccuracyList[-1]:.4}')

    bestKernel = np.argmax(KernelBestAcc)
    Top4Acc.append(KernelBestAcc[bestKernel])
    print()
    print(f'SVC Model with kernel = {kernel} had a MAX_Accuracy = {KernelBestAcc[bestKernel]:.4}')
    print()

  bestA = np.argmax(AccuracyList)
  print(f'Total Size = {len(AccuracyList)}, Half size= {(len(AccuracyList)/2)}, Quarter size= {(len(AccuracyList)/4)}')
  print('Index of the Max Accuracy is in =', bestA)
  print(f'MAX_Accuracy = {AccuracyList[bestA]:.4}')


  #precision = sm.precision_score(y_test, pred)
  #recall = sm.recall_score(y_test, pred)
  #f1 = sm.f1_score(y_test, pred)
  #print(f'Accuracy {AccuracyList[-1]:.4}, Precision {PrecisionList[-1]:.4}, Recall {RecallList[-1]:.4}')

  return Top4Acc, AccuracyList[bestA]

def print3(T4):
  kernel = ['linear', 'rbf', 'sigmoid']
  for top in range(0,3):
    print(f'Kernel {kernel[top]} had a max accuracy of {T4[top]:.3}')

In [None]:
#Finding approx best to be able to go to max

#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training svc with this dataset variant: ',dataset_name[index])
  print()
  Top3Acc, Max_Acc = SVCApprox(x_train, y_train, x_test, y_test)
  print(f'Max Accuracy with this dataset variant:{dataset_name[index]}  is {Max_Acc:.3}')
  print3(Top3Acc)


Training svc with this dataset variant:  Original dataset

SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.7989
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.798

In [None]:
#Aprox which kernel and parameters to use before search for max of SVC 
#With products of attributes 

def SVCApprox(x_tr, y_tr, x_te, y_te):


  C = [.1, .5, 1.0, 3.0]
  Gamma = [.1, .5, 1.0, 3.0, 'scale', 'auto']
  COEF = [.1, .25, .5, .75, 1]
  AccuracyList = []
  Top4Acc = []
  #PrecisionList = []
  #RecallList = []
  #'poly',
  for kernel in ['linear', 'rbf', 'sigmoid']:
    print(f'SVC Model with kernel = {kernel}')
    KernelBestAcc = []
    for c in C:

      for G in Gamma:

        for CE in COEF:


          print(f"SVC paramters set to C={c}, Gamma ={G}, Coef={CE}")
          model = SVC(kernel=kernel, C = c, gamma = G, cache_size=10000, coef0 = CE)
              
          model.fit(x_tr, y_tr)
          pred = model.predict(x_te)
          AccuracyList.append(sm.accuracy_score(y_te, pred))
          KernelBestAcc.append(sm.accuracy_score(y_te, pred))
          print(f'Accuracy {AccuracyList[-1]:.4}')

    bestKernel = np.argmax(KernelBestAcc)
    Top4Acc.append(KernelBestAcc[bestKernel])
    print()
    print(f'SVC Model with kernel = {kernel} had a MAX_Accuracy = {KernelBestAcc[bestKernel]:.4}')
    print()

  bestA = np.argmax(AccuracyList)
  print(f'Total Size = {len(AccuracyList)}, Half size= {(len(AccuracyList)/2)}, Quarter size= {(len(AccuracyList)/4)}')
  print('Index of the Max Accuracy is in =', bestA)
  print(f'MAX_Accuracy = {AccuracyList[bestA]:.4}')


  #precision = sm.precision_score(y_test, pred)
  #recall = sm.recall_score(y_test, pred)
  #f1 = sm.f1_score(y_test, pred)
  #print(f'Accuracy {AccuracyList[-1]:.4}, Precision {PrecisionList[-1]:.4}, Recall {RecallList[-1]:.4}')



  return Top4Acc, AccuracyList[bestA]

Approximately a 77% correct classification accuracy with a
logistic-regression-derived discriminant function

David W. Aha & Dennis Kibler
-- Instance-based prediction of heart-disease presence with the
Cleveland database
-- NTgrowth: 77.0% accuracy
-- C4: 74.8% accuracy

John Gennari
-- Gennari, J.~H., Langley, P, \& Fisher, D. (1989). Models of
incremental concept formation. {\it Artificial Intelligence, 40},
11--61.
-- Results:
-- The CLASSIT conceptual clustering system achieved a 78.9% accuracy
on the Cleveland database.

### Dense Network

In [None]:
#reshape
x_train = np.float32(x_train/255).reshape(x_train.shape[0],-1)
x_test = np.float32(x_test/255).reshape(x_test.shape[0],-1)


#one hot
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

#reshape
x_train_products = np.float32(x_train_products/255).reshape(x_train_products.shape[0],-1)
x_test_products = np.float32(x_test_products/255).reshape(x_test_products.shape[0],-1)


#one hot
y_train_products = tf.keras.utils.to_categorical(y_train_products, num_classes=2)
y_test_products = tf.keras.utils.to_categorical(y_test_products, num_classes=2)


rop = ReduceLROnPlateau(monitor='val_loss',factor=0.5, patience=2, verbose=1,min_lr=0.0005)

cross validation implemented

In [None]:
from sklearn.model_selection import KFold

In [None]:
#number of folds to test out
#original data
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train, x_test), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )


  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 500)               5500      
                                                                 
 dense_1 (Dense)             (None, 500)               250500    
                                                                 
 dense_2 (Dense)             (None, 500)               250500    
                                                                 
 dense_3 (Dense)             (None, 500)               250500    
                                                                 
 dense_4 (Dense)             (None, 2)                 1002      
                                                                 
Total params: 758,002
Trainable params: 758,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for fol

In [None]:
#number of folds to test out
#products
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products, x_test_products), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))
  #return model

#dense_network = dense_model()
#dense_network.summary() 
  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
      #validation_data=(inputs, targets),
      #callbacks=[rop],
  )

  #all_history.append(history)

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1




print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

#print('Final accuracy on training set: {:.4f}'.format(history.history['accuracy'][-1]))
#print('Final accuracy on test set: {:.4f}'.format(history.history['val_accuracy'][-1]))

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_50 (Dense)            (None, 500)               33000     
                                                                 
 dense_51 (Dense)            (None, 500)               250500    
                                                                 
 dense_52 (Dense)            (None, 500)               250500    
                                                                 
 dense_53 (Dense)            (None, 500)               250500    
                                                                 
 dense_54 (Dense)            (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

In [None]:
#Normalizated dataset
scalerMM = MinMaxScaler()
scalerMM.fit(x_train)
x_trainScaleMM = scalerMM.transform(x_train)
x_testScaleMM = scalerMM.transform(x_test)

#standardized dataset
scalerSS = StandardScaler()
scalerSS.fit(x_train)
x_trainScaleSS = scalerSS.transform(x_train)
x_testScaleSS = scalerSS.transform(x_test)

#Normalizated dataset with products of attributes
scalerMMp = MinMaxScaler()
scalerMMp.fit(x_train_products)
x_train_products_ScaleMM = scalerMMp.transform(x_train_products)
x_test_products_ScaleMM = scalerMMp.transform(x_test_products)

#standardized dataset with products of attributes
scalerSSp = StandardScaler()
scalerSSp.fit(x_train_products)
x_train_products_ScaleSS = scalerSSp.transform(x_train_products)
x_test_products_ScaleSS = scalerSSp.transform(x_test_products)



In [None]:
#number of folds to test out
#NORMALIZED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleMM, x_testScaleMM), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleMM.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))
  #return model

#dense_network = dense_model()
#dense_network.summary() 
  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
      #validation_data=(inputs, targets),
      #callbacks=[rop],
  )

  #all_history.append(history)

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1




print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

#print('Final accuracy on training set: {:.4f}'.format(history.history['accuracy'][-1]))
#print('Final accuracy on test set: {:.4f}'.format(history.history['val_accuracy'][-1]))

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_100 (Dense)           (None, 500)               5500      
                                                                 
 dense_101 (Dense)           (None, 500)               250500    
                                                                 
 dense_102 (Dense)           (None, 500)               250500    
                                                                 
 dense_103 (Dense)           (None, 500)               250500    
                                                                 
 dense_104 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 758,002
Trainable params: 758,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

In [None]:
#number of folds to test out
#NORMALIZED DATA Products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleMM, x_test_products_ScaleMM), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleMM.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))
  #return model

#dense_network = dense_model()
#dense_network.summary() 
  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
      #validation_data=(inputs, targets),
      #callbacks=[rop],
  )

  #all_history.append(history)

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1




print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

#print('Final accuracy on training set: {:.4f}'.format(history.history['accuracy'][-1]))
#print('Final accuracy on test set: {:.4f}'.format(history.history['val_accuracy'][-1]))

Model: "sequential_30"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_150 (Dense)           (None, 500)               33000     
                                                                 
 dense_151 (Dense)           (None, 500)               250500    
                                                                 
 dense_152 (Dense)           (None, 500)               250500    
                                                                 
 dense_153 (Dense)           (None, 500)               250500    
                                                                 
 dense_154 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

In [None]:
#number of folds to test out
#STANDARDIED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleSS, x_testScaleSS), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleSS.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))
  #return model

#dense_network = dense_model()
#dense_network.summary() 
  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
      #validation_data=(inputs, targets),
      #callbacks=[rop],
  )

  #all_history.append(history)

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1




print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

#print('Final accuracy on training set: {:.4f}'.format(history.history['accuracy'][-1]))
#print('Final accuracy on test set: {:.4f}'.format(history.history['val_accuracy'][-1]))

Model: "sequential_40"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_200 (Dense)           (None, 500)               5500      
                                                                 
 dense_201 (Dense)           (None, 500)               250500    
                                                                 
 dense_202 (Dense)           (None, 500)               250500    
                                                                 
 dense_203 (Dense)           (None, 500)               250500    
                                                                 
 dense_204 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 758,002
Trainable params: 758,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

In [None]:
#number of folds to test out
#STANDARDIED DATA products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleSS, x_test_products_ScaleSS), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleSS.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))
  #return model

#dense_network = dense_model()
#dense_network.summary() 
  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
      #validation_data=(inputs, targets),
      #callbacks=[rop],
  )

  #all_history.append(history)

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1




print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

#print('Final accuracy on training set: {:.4f}'.format(history.history['accuracy'][-1]))
#print('Final accuracy on test set: {:.4f}'.format(history.history['val_accuracy'][-1]))

Model: "sequential_60"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_300 (Dense)           (None, 500)               33000     
                                                                 
 dense_301 (Dense)           (None, 500)               250500    
                                                                 
 dense_302 (Dense)           (None, 500)               250500    
                                                                 
 dense_303 (Dense)           (None, 500)               250500    
                                                                 
 dense_304 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

## Heart Disease Part 1 Model Evaluation and Findings with Nominal


##Processing-Ordoinal


In [None]:
#Library
from numpy import mean
from numpy import std
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import accuracy_score
import time
import math 
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
import os
import distutils
import pandas as pd
import sklearn.metrics as sm
from google.colab import drive

from keras.models import Model
from tensorflow.keras.layers import *
from tensorflow.keras.activations import *
from tensorflow.keras.optimizers import *
from keras.utils.vis_utils import plot_model
from keras.regularizers import l2

from sklearn.neural_network import MLPClassifier
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.naive_bayes import GaussianNB, MultinomialNB, ComplementNB, BernoulliNB
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor

from sklearn.metrics import accuracy_score, confusion_matrix,mean_squared_error,mean_absolute_error

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

from keras.callbacks import ReduceLROnPlateau

In [None]:
# Mount google drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls  '/content/drive/MyDrive/ML_Project'

Final_Project_Report_Abel_Yamel.gdoc
heart2.csv
heart.csv
heart_failure_clinical_records_dataset.csv


In [None]:
#from google.colab import files
#uploaded = files.upload() 

df = pd.read_csv('/content/drive/MyDrive/ML_Project/heart.csv')

In [None]:

# In the 'temp' column, I want 'cold' to be 0, 'cool' to be 1, 'warm' to be 2, and 'hot' to be 3
# In the 'place' column, I want 'first' to be 0, 'second' to be 1, and 'third' to be 2
Sex_categories = ['F','M']
ChestPain_categories = ['ASY','NAP', 'ATA', 'TA']
RestingECG_categories = ['Normal', 'ST', 'LVH']
ExerciseA_categories = ['N', 'Y']
STslope_categories = ['Down', 'Flat', 'Up']

# Now, when you instantiate the encoder, both of these lists go in one big categories list:
encoder = OrdinalEncoder(categories=[Sex_categories, ChestPain_categories, RestingECG_categories, ExerciseA_categories, STslope_categories])
df2 = encoder.fit_transform(df[['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']])
titles = ['Sex', 'ChestPainType', 'RestingECG', 'ExerciseAngina', 'ST_Slope']
for i in range(len(titles)):
  df[titles[i]] = df2[:,i]

In [None]:
df

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,1.0,2.0,140,289,0,0.0,172,0.0,0.0,2.0,0
1,49,0.0,1.0,160,180,0,0.0,156,0.0,1.0,1.0,1
2,37,1.0,2.0,130,283,0,1.0,98,0.0,0.0,2.0,0
3,48,0.0,0.0,138,214,0,0.0,108,1.0,1.5,1.0,1
4,54,1.0,1.0,150,195,0,0.0,122,0.0,0.0,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
913,45,1.0,3.0,110,264,0,0.0,132,0.0,1.2,1.0,1
914,68,1.0,0.0,144,193,1,0.0,141,0.0,3.4,1.0,1
915,57,1.0,0.0,130,131,0,0.0,115,1.0,1.2,1.0,1
916,57,0.0,2.0,130,236,0,2.0,174,0.0,0.0,1.0,1


In [None]:
data = df.to_numpy()
#take all rows(x) and only attributes/columns(y) from 0-11 to predict heart attacks

x_original = data[:,:10] #this will be the original x array
x_products = data[:,:10] #this one will be x array with products of attributes

#y gets the heart attack disease values
y = data[:,11]

#Products of Attributes
x_cols = x_products.shape[1]
for i in range(x_cols):                            
    x_products = np.hstack((x_products ,(x_products[:,i:x_cols]*x_products[:,i].reshape(-1,1))))  #end of class
print(x_original.shape)
print(x_products.shape)

#grabs the names in the columns
feature_names = df.columns


x_train, x_test, y_train, y_test = train_test_split(x_original, y, test_size=0.2, random_state=42)
x_train_products, x_test_products, y_train_products, y_test_products = train_test_split(x_products, y, test_size=0.2, random_state=42)


(918, 10)
(918, 65)


In [None]:
#Normalizated dataset
scalerMM = MinMaxScaler()
scalerMM.fit(x_train)
x_trainScaleMM = scalerMM.transform(x_train)
x_testScaleMM = scalerMM.transform(x_test)

#standardized dataset
scalerSS = StandardScaler()
scalerSS.fit(x_train)
x_trainScaleSS = scalerSS.transform(x_train)
x_testScaleSS = scalerSS.transform(x_test)

#Normalizated dataset for products of attributes
scalerMM = MinMaxScaler()
scalerMM.fit(x_train_products)
x_train_products_ScaleMM = scalerMM.transform(x_train_products)
x_test_products_ScaleMM = scalerMM.transform(x_test_products)

#standardized dataset for products of attributes
scalerSS = StandardScaler()
scalerSS.fit(x_train_products)
x_train_products_ScaleSS = scalerSS.transform(x_train_products)
x_test_products_ScaleSS = scalerSS.transform(x_test_products)

dataset_name = ["Original dataset", "Normalized Original Dataset", "Standardized Original Dataset", 
                "Products of Attributes Dataset", "Normalized Products of Attributes Dataset", "Standarized Products of Attributes Dataset"]
dataset_xtr = [x_train, x_trainScaleMM, x_trainScaleSS,  x_train_products, x_train_products_ScaleMM, x_train_products_ScaleSS]
dataset_xte = [x_test, x_testScaleMM, x_testScaleSS,  x_test_products, x_test_products_ScaleMM, x_test_products_ScaleSS]

Important feature in dataset

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

AccList = []
for i in range(0,x_train.shape[1]):
  model = DecisionTreeClassifier().fit(x_train[:,i:i+1], y_train)
  pred = model.predict(x_test[:,i:i+1])
  AccList.append(accuracy_score(y_test, pred))

bestAcc = np.argmax(AccList)
print('Best Accuracy is', AccList[bestAcc])
print('Best Feature is ', bestAcc)

Best Accuracy is 0.8043478260869565
Best Feature is  2


In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

AccList = []
for i in range(0,x_train_products.shape[1]):
  model = DecisionTreeClassifier().fit(x_train_products[:,i:i+1], y_train)
  pred = model.predict(x_test_products[:,i:i+1])
  AccList.append(accuracy_score(y_test, pred))

bestAcc = np.argmax(AccList)
print('Best Accuracy is', AccList[bestAcc])
print('Best Feature is ', bestAcc)

Best Accuracy is 0.8043478260869565
Best Feature is  2


feature 2 is chestpain type

### Regressor

In [None]:
#Sklearn Regressors

def evaluate_model(model, x_train, y_train, x_test, y_test):
  m = model().fit(x_train, y_train)
  pred = m.predict(x_test)
  return mean_squared_error(y_test, pred)

# MultinomialNB removed due to Maximum iterations still hasnt converged]
models = [KNeighborsRegressor, DecisionTreeRegressor,  RandomForestRegressor, BaggingRegressor,  LinearRegression ] # These are functions!
model_names = ['K-Nearest Neighbors regressor', 'Decision Tree Regressor' , 'Random Forest Regressor', 'Bagging Regressor', 'Linear Regression']

#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model(models[i], dataset_xtr[index], y_train, dataset_xte[index], y_test))
    done = time.time() - start
    print('MSE = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmin(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Lowest MSE acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.2193
Elapsed time = 0.0044 secs
Evaluating Decision Tree Regressor
MSE = 0.2989
Elapsed time = 0.0038 secs
Evaluating Random Forest Regressor
MSE = 0.1420
Elapsed time = 0.2625 secs
Evaluating Bagging Regressor
MSE = 0.1568
Elapsed time = 0.0319 secs
Evaluating Linear Regression
MSE = 0.1462
Elapsed time = 0.0026 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is Random Forest Regressor
Lowest MSE acheived  = 0.1420
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.1548
Elapsed time = 0.0048 secs
Evaluating Decision Tree Regressor
MSE = 0.2663
Elapsed time = 0.0034 secs
Evaluating Random Forest Regressor
MSE = 0.1514
Elapsed time = 0.2621 secs
Evaluating Bagging Regressor
MSE = 0.1691
Elapsed time = 0.0306 secs
Evaluatin

### Classifier


In [None]:
# Sklearn Classifer


def evaluate_model(model, x_train, y_train, x_test, y_test):
  m = model()
  m.fit(x_train,y_train)
  return m.score(x_test,y_test)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model(models[i], dataset_xtr[index], y_train, dataset_xte[index], y_test))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best classification model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.7065
Elapsed time = 0.0085 secs
Evaluating GaussianNB
Accuracy = 0.8098
Elapsed time = 0.0014 secs
Evaluating BernoulliNB
Accuracy = 0.8207
Elapsed time = 0.0019 secs
Evaluating Decision Tree classifer
Accuracy = 0.7011
Elapsed time = 0.0036 secs
Evaluating Random Forest classifer
Accuracy = 0.7880
Elapsed time = 0.2085 secs

By utilizing this dataset variant:  Original dataset
The best classification model that performed under this dataset is BernoulliNB
Highest Accuracy acheived  = 0.8207
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.7989
Elapsed time = 0.0106 secs
Evaluating GaussianNB
Accuracy = 0.8098
Elapsed time = 0.0045 secs
Evaluating BernoulliNB
Accuracy = 0.7935
Elapsed time = 0.0038 secs
Evaluating Decision Tree classifer
Accuracy = 0.7120
Elapsed time = 0.0046 s

In [None]:
def cross_validate_model(model, x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = model().fit(x_train, y_train)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print(' using cross validation')

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(cross_validate_model(models[i], dataset_xtr[index], y_train, dataset_xte[index], y_test,10))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))
  print('\n')

Training all the models with this dataset variant:  Original dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.6854
Elapsed time = 0.0293 secs
Evaluating GaussianNB
Accuracy = 0.7772
Elapsed time = 0.0166 secs
Evaluating BernoulliNB
Accuracy = 0.8272
Elapsed time = 0.0195 secs
Evaluating Decision Tree classifer
Accuracy = 0.7491
Elapsed time = 0.0205 secs
Evaluating Random Forest classifer
Accuracy = 0.8152
Elapsed time = 1.7040 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is BernoulliNB
Highest Accuracy acheived  = 0.8272


Training all the models with this dataset variant:  Normalized Original Dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.7667
Elapsed time = 0.0323 secs
Evaluating GaussianNB
Accuracy = 0.7772
Elapsed time = 0.0169 secs
Evaluating BernoulliNB
Accuracy = 0.8374
Elapsed time = 0.0211 secs
Evaluating Decision Tree classif

### Logistic Regression

In [None]:
#Logistic Regression

def evaluate_Logistic_Reg(x_train, y_train, x_test, y_test):
  model = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1) 
  model.fit(x_train,y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_Logistic_Reg(dataset_xtr[index], y_train, dataset_xte[index], y_test))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.8043
Elapsed time = 0.6659 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.7935
Elapsed time = 0.2930 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.8098
Elapsed time = 0.2873 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.7989
Elapsed time = 0.3380 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.7989
Elapsed time = 0.0426 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.8261
Elapsed time = 0.0556 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.8261


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.1s finished


In [None]:
#cross validation
def cross_validate_LR(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1).fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)


acc_list = []
print('Using cross validation')
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_LR(dataset_xtr[index], y_train, dataset_xte[index], y_test, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Using cross validation
Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 0.7614
Elapsed time = 0.3208 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.7719
Elapsed time = 0.1308 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.7617
Elapsed time = 0.1136 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 0.7772
Elapsed time = 0.3904 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.7886
Elapsed time = 0.2395 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.7988
Elapsed time = 0.3413 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.7988


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


### SVC

In [None]:
#Sklearn SVC 

def evaluate_svc(x_train, y_train, x_test, y_test):
  model = SVC()
  model.fit(x_train, y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_svc(dataset_xtr[index], y_train, dataset_xte[index], y_test))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.6848
Elapsed time = 0.0308 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.7717
Elapsed time = 0.0220 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.7772
Elapsed time = 0.0237 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.6848
Elapsed time = 0.0450 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.7880
Elapsed time = 0.0315 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.7609
Elapsed time = 0.0313 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.7880


In [None]:
#cross validation
def cross_validate_svc(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = SVC().fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_svc(dataset_xtr[index], y_train, dataset_xte[index], y_test, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.7284
Elapsed time = 0.0675 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.7678
Elapsed time = 0.0510 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.7673
Elapsed time = 0.0517 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.7330
Elapsed time = 0.0839 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.7731
Elapsed time = 0.0634 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.7778
Elapsed time = 0.0733 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.7778


In [None]:
#Original

def print3(T4):
  kernel = ['linear', 'rbf', 'sigmoid']
  for top in range(0,3):
    print(f'Kernel {kernel[top]} had a max accuracy of {T4[top]:.3}')

#Finding approx best to be able to go to max
print('No augumentation was done on the original  dataset')
Top3Acc, Max_Acc = SVCApprox(x_train, y_train, x_test, y_test)
print(f'Max Accuracy with no data augmentation = {Max_Acc:.3}')
print3(Top3Acc)

No augumentation was done on the original  dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.788
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.788
SVC paramters set to

##FINISH RUNNING THIS BELOW (jUST THE ONE with no augumentation)

In [None]:
#Original

print('Applied MixMax/Normalization onto the Original dataset')

Top3Acc_MM, Max_N_Acc = SVCApprox(x_trainScaleMM, y_train,x_testScaleMM, y_test)
print(f'Max Accuracy with Normalization on data = {Max_N_Acc:.3}')
print3(Top3Acc_MM)


Applied MixMax/Normalization onto the Original dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.7554
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.7554
SV

Approximately a 77% correct classification accuracy with a logistic-regression-derived discriminant function

David W. Aha & Dennis Kibler -- Instance-based prediction of heart-disease presence with the Cleveland database -- NTgrowth: 77.0% accuracy -- C4: 74.8% accuracy

John Gennari -- Gennari, J.~H., Langley, P, & Fisher, D. (1989). Models of incremental concept formation. {\it Artificial Intelligence, 40}, 11--61. -- Results: -- The CLASSIT conceptual clustering system achieved a 78.9% accuracy on the Cleveland database.

In [None]:
#Aprox which kernel and parameters to use before search for max of SVC

def SVCApprox(x_tr, y_tr, x_te, y_te):

  C = [.1, .5, 1.0, 3.0]
  Gamma = [.1, .5, 1.0, 3.0, 'scale', 'auto']
  COEF = [.1, .25, .5, .75, 1]
  AccuracyList = []
  Top4Acc = []
  #PrecisionList = []
  #RecallList = []
  #'poly',
  for kernel in ['linear', 'rbf', 'sigmoid']:
    print(f'SVC Model with kernel = {kernel}')
    KernelBestAcc = []
    for c in C:

      for G in Gamma:

        for CE in COEF:


          print(f"SVC paramters set to C={c}, Gamma ={G}, Coef={CE}")
          model = SVC(kernel=kernel, C = c, gamma = G, cache_size=10000, coef0 = CE)
              
          model.fit(x_tr, y_tr)
          pred = model.predict(x_te)
          AccuracyList.append(sm.accuracy_score(y_te, pred))
          KernelBestAcc.append(sm.accuracy_score(y_te, pred))
          print(f'Accuracy {AccuracyList[-1]:.4}')

    bestKernel = np.argmax(KernelBestAcc)
    Top4Acc.append(KernelBestAcc[bestKernel])
    print()
    print(f'SVC Model with kernel = {kernel} had a MAX_Accuracy = {KernelBestAcc[bestKernel]:.4}')
    print()

  bestA = np.argmax(AccuracyList)
  print(f'Total Size = {len(AccuracyList)}, Half size= {(len(AccuracyList)/2)}, Quarter size= {(len(AccuracyList)/4)}')
  print('Index of the Max Accuracy is in =', bestA)
  print(f'MAX_Accuracy = {AccuracyList[bestA]:.4}')


  #precision = sm.precision_score(y_test, pred)
  #recall = sm.recall_score(y_test, pred)
  #f1 = sm.f1_score(y_test, pred)
  #print(f'Accuracy {AccuracyList[-1]:.4}, Precision {PrecisionList[-1]:.4}, Recall {RecallList[-1]:.4}')

  return Top4Acc, AccuracyList[bestA]

In [None]:
def SVCmax(x_tr, y_tr, x_te, y_te,):
  AccList = []


  C = [.1, .5, 1.0, 5.0]
  Gamma = [.1, .5, 1.0, 5.0]
  AccuracyList = []
  #PrecisionList = []
  #RecallList = []

  for kernel in ['linear', 'rbf', 'poly', 'sigmoid']:
    print(f"SVC Model with ={kernel}")

    for c in C:

      for G in Gamma:
        model = SVC(C = c, kernel=kernel, degree = d, gamma = G, cache_size=1000, coef0 = 0.5)
      
  model = SVC(C = c, kernel=kernel, gamma = G, cache_size=10000, coef0 = (0.1*Ce)
  model.fit(x_train, y_train)
  pred = model.predict(x_test)
      
      print(f"SVC paramters set to C={c}, Gamma ={G}")
      print(f'Accuracy {AccuracyList[-1]:.4}, Precision {PrecisionList[-1]:.4}, Recall {RecallList[-1]:.4}')


      print(f'Best SVC combo for Accuracy is kernal={Am}, C={Ac}, Gamma={Ag}')
print('Accuracy = {:6.4f}'.format(AccuracyList[bestA]))
  accuracy = sm.accuracy_score(y_test, pred)
  #precision = sm.precision_score(y_test, pred)
  #recall = sm.recall_score(y_test, pred)
  #f1 = sm.f1_score(y_test, pred)


  return max

SyntaxError: ignored

### Dense Network

In [None]:
#reshape
x_train = np.float32(x_train/255).reshape(x_train.shape[0],-1)
x_test = np.float32(x_test/255).reshape(x_test.shape[0],-1)


#one hot
y_train = tf.keras.utils.to_categorical(y_train, num_classes=2)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=2)

#reshape
x_train_products = np.float32(x_train_products/255).reshape(x_train_products.shape[0],-1)
x_test_products = np.float32(x_test_products/255).reshape(x_test_products.shape[0],-1)

#one hot
y_train_products = tf.keras.utils.to_categorical(y_train_products, num_classes=2)
y_test_products = tf.keras.utils.to_categorical(y_test_products, num_classes=2)

rop = ReduceLROnPlateau(monitor='val_loss',factor=0.5, patience=2, verbose=1,min_lr=0.0005)

In [None]:
#number of folds to test out
#original data
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train, x_test), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )


  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_80"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_400 (Dense)           (None, 500)               5500      
                                                                 
 dense_401 (Dense)           (None, 500)               250500    
                                                                 
 dense_402 (Dense)           (None, 500)               250500    
                                                                 
 dense_403 (Dense)           (None, 500)               250500    
                                                                 
 dense_404 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 758,002
Trainable params: 758,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

In [None]:
#products
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products, x_test_products), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1


print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

#print('Final accuracy on training set: {:.4f}'.format(history.history['accuracy'][-1]))
#print('Final accuracy on test set: {:.4f}'.format(history.history['val_accuracy'][-1]))

Model: "sequential_90"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_450 (Dense)           (None, 500)               33000     
                                                                 
 dense_451 (Dense)           (None, 500)               250500    
                                                                 
 dense_452 (Dense)           (None, 500)               250500    
                                                                 
 dense_453 (Dense)           (None, 500)               250500    
                                                                 
 dense_454 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for 

In [None]:
#Normalizated dataset
scalerMM = MinMaxScaler()
scalerMM.fit(x_train)
x_trainScaleMM = scalerMM.transform(x_train)
x_testScaleMM = scalerMM.transform(x_test)

#standardized dataset
scalerSS = StandardScaler()
scalerSS.fit(x_train)
x_trainScaleSS = scalerSS.transform(x_train)
x_testScaleSS = scalerSS.transform(x_test)

#Normalizated dataset with products of attr
scalerMM = MinMaxScaler()
scalerMM.fit(x_train_products)
x_train_products_ScaleMM = scalerMM.transform(x_train_products)
x_test_products_ScaleMM = scalerMM.transform(x_test_products)

#standardized dataset with products of attr
scalerSS = StandardScaler()
scalerSS.fit(x_train_products)
x_train_products_ScaleSS = scalerSS.transform(x_train_products)
x_test_products_ScaleSS = scalerSS.transform(x_test_products)

In [None]:
#regular Normalized
#NORMALIZED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleMM, x_testScaleMM), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleMM.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_100"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_500 (Dense)           (None, 500)               5500      
                                                                 
 dense_501 (Dense)           (None, 500)               250500    
                                                                 
 dense_502 (Dense)           (None, 500)               250500    
                                                                 
 dense_503 (Dense)           (None, 500)               250500    
                                                                 
 dense_504 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 758,002
Trainable params: 758,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#Normalized products
#NORMALIZED DATA Products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleMM, x_test_products_ScaleMM), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleMM.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_110"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_550 (Dense)           (None, 500)               33000     
                                                                 
 dense_551 (Dense)           (None, 500)               250500    
                                                                 
 dense_552 (Dense)           (None, 500)               250500    
                                                                 
 dense_553 (Dense)           (None, 500)               250500    
                                                                 
 dense_554 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#regular standard
#STANDARDIED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleSS, x_testScaleSS), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleSS.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_120"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_600 (Dense)           (None, 500)               5500      
                                                                 
 dense_601 (Dense)           (None, 500)               250500    
                                                                 
 dense_602 (Dense)           (None, 500)               250500    
                                                                 
 dense_603 (Dense)           (None, 500)               250500    
                                                                 
 dense_604 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 758,002
Trainable params: 758,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#standerdized products
#number of folds to test out
#STANDARDIED DATA products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleSS, x_test_products_ScaleSS), axis=0)
targets = np.concatenate((y_train, y_test), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleSS.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_130"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_650 (Dense)           (None, 500)               33000     
                                                                 
 dense_651 (Dense)           (None, 500)               250500    
                                                                 
 dense_652 (Dense)           (None, 500)               250500    
                                                                 
 dense_653 (Dense)           (None, 500)               250500    
                                                                 
 dense_654 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

## Heart Disease Part 1 Model Evaluation and Findings with Oridinal



#Heart Disease Dataset PART 2

## Preprocessing

In [None]:
#heart disease uci
df_d = pd.read_csv('/content/drive/MyDrive/ML_Project/heart2.csv')

In [None]:
df_d

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [None]:
# F:0  M:1
# excercise No:0   Yes:1
#thal Normal:3   Fixed Defect: 6   7:reversible defect

#don't need to use the label encode because there's already values there instead of labels

In [None]:
data2 = df_d.to_numpy()

x2_original = data2[:,:12] #take all rows(x) and only attributes/columns(y) from 0-13 to predict heart disease
x2_products = data2[:,:12] #take all rows(x) and only attributes/columns(y) from 0-13 to predict heart disease


#y gets the heart attack disease values
y2 = data2[:,13]
x2_products = data2[:,:12] #take all rows(x) and only attributes/columns(y) from 0-13 to predict heart disease

#Products of Attributes
x2_cols = x2_products.shape[1]
for i in range(x2_cols):                            
    x2_products = np.hstack((x2_products ,(x2_products[:,i:x2_cols]*x2_products[:,i].reshape(-1,1))))  #end of class
print(x2_original.shape)
print(x2_products.shape)


#grabs the names in the columns
feature_names = df_d.columns


x_train2, x_test2, y_train2, y_test2 = train_test_split(x2_original, y2, test_size=0.2, random_state=42)
x_train2_products, x_test2_products, y_train2_products, y_test2_products = train_test_split(x2_products, y2, test_size=0.2, random_state=42)

print(x_train2)
print(x_train2_products)

(303, 12)
(303, 90)
[[42.   1.   1.  ...  0.   2.   0. ]
 [58.   1.   0.  ...  0.8  2.   0. ]
 [46.   1.   2.  ...  3.6  1.   0. ]
 ...
 [69.   1.   3.  ...  0.1  1.   1. ]
 [46.   1.   0.  ...  0.8  2.   0. ]
 [63.   0.   1.  ...  0.   2.   2. ]]
[[42.  1.  1. ...  4.  0.  0.]
 [58.  1.  0. ...  4.  0.  0.]
 [46.  1.  2. ...  1.  0.  0.]
 ...
 [69.  1.  3. ...  1.  1.  1.]
 [46.  1.  0. ...  4.  0.  0.]
 [63.  0.  1. ...  4.  4.  4.]]


In [None]:
#Normalizated dataset
scalerMM2 = MinMaxScaler()
scalerMM2.fit(x_train2)
x_trainScaleMM2 = scalerMM2.transform(x_train2)
x_testScaleMM2 = scalerMM2.transform(x_test2)

#standardized dataset
scalerSS2 = StandardScaler()
scalerSS2.fit(x_train2)
x_trainScaleSS2 = scalerSS2.transform(x_train2)
x_testScaleSS2 = scalerSS2.transform(x_test2)

#Normalizated dataset with products of attributes
scalerMM2 = MinMaxScaler()
scalerMM2.fit(x_train2_products)
x_train_products_ScaleMM2 = scalerMM2.transform(x_train2_products)
x_test_products_ScaleMM2 = scalerMM2.transform(x_test2_products)

#standardized dataset with  products of attributes
scalerSS2 = StandardScaler()
scalerSS2.fit(x_train2_products)
x_train_products_ScaleSS2 = scalerSS2.transform(x_train2_products)
x_test_products_ScaleSS2 = scalerSS2.transform(x_test2_products)

dataset_name = ["Original dataset", "Normalized Original Dataset", "Standardized Original Dataset", 
                "Products of Attributes Dataset", "Normalized Products of Attributes Dataset", "Standarized Products of Attributes Dataset"]
dataset_xtr = [x_train2, x_trainScaleMM2, x_trainScaleSS2,  x_train2_products, x_train_products_ScaleMM2, x_train_products_ScaleSS2]
dataset_xte = [x_test2, x_testScaleMM2, x_testScaleSS2,  x_test2_products, x_test_products_ScaleMM2, x_test_products_ScaleSS2]

## Model Evaluation

### Regressor

In [None]:
#Sklearn Regressors

def evaluate_model2(model, x_train, y_train, x_test, y_test):
  m = model().fit(x_train2, y_train2)
  pred = m.predict(x_test2)
  return mean_squared_error(y_test2, pred)

# MultinomialNB removed due to Maximum iterations still hasnt converged]
models = [KNeighborsRegressor, DecisionTreeRegressor,  RandomForestRegressor, BaggingRegressor,  LinearRegression ] # These are functions!
model_names = ['K-Nearest Neighbors regressor', 'Decision Tree Regressor' , 'Random Forest Regressor', 'Bagging Regressor', 'Linear Regression']

#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model2(models[i], dataset_xtr[index], y_train2, dataset_xte[index], y_test2))
    done = time.time() - start
    print('MSE = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmin(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Lowest MSE acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.2052
Elapsed time = 0.0056 secs
Evaluating Decision Tree Regressor
MSE = 0.2459
Elapsed time = 0.0024 secs
Evaluating Random Forest Regressor
MSE = 0.1129
Elapsed time = 0.1620 secs
Evaluating Bagging Regressor
MSE = 0.1292
Elapsed time = 0.0245 secs
Evaluating Linear Regression
MSE = 0.1198
Elapsed time = 0.0025 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is Random Forest Regressor
Lowest MSE acheived  = 0.1129
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.2052
Elapsed time = 0.0032 secs
Evaluating Decision Tree Regressor
MSE = 0.2295
Elapsed time = 0.0015 secs
Evaluating Random Forest Regressor
MSE = 0.1218
Elapsed time = 0.1708 secs
Evaluating Bagging Regressor
MSE = 0.1307
Elapsed time = 0.0217 secs
Evaluatin

In [None]:
#Regressor with products of attributes

def evaluate_model2(model, x_train2, y_train2, x_test2, y_test2):
  m = model()
  m.fit(x_train2,y_train2)
  pred = m.predict(x_test2)
  #changd from MSE to MAE because values are continous
  return mean_absolute_error(pred,y_test2)


models2 = [KNeighborsRegressor, DecisionTreeRegressor,  RandomForestRegressor, BaggingRegressor] # These are functions!
model_names2 = ['K-Nearest Neighbors regressor', 'Decision Tree Regressor' , 'Random Forest Regressor', 'Bagging Regressor ']


acc_list2 = []
for i in range(len(models2)):
  print('Evaluating',model_names2[i])
  start = time.time()
  acc_list2.append(evaluate_model2(models[i], x_train2_products, y_train2_products, x_test2_products, y_test2_products))
  done = time.time() - start
  print('MAE = {:6.4f}'.format(acc_list2[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))

best = np.argmin(acc_list2)
print('The best model is',model_names2[best])
print('MAE = {:6.4f}'.format(acc_list2[best]))

Evaluating K-Nearest Neighbors regressor
MAE = 0.3738
Elapsed time = 0.0089 secs
Evaluating Decision Tree Regressor
MAE = 0.2295
Elapsed time = 0.0081 secs
Evaluating Random Forest Regressor
MAE = 0.2451
Elapsed time = 0.4320 secs
Evaluating Bagging Regressor 
MAE = 0.2639
Elapsed time = 0.0458 secs
The best model is Decision Tree Regressor
MAE = 0.2295


### Classifier

In [None]:
# Sklearn Classifer


def evaluate_model(model, x_train, y_train, x_test, y_test):
  m = model().fit(x_train,y_train)
  return m.score(x_test,y_test)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model(models[i], dataset_xtr[index], y_train2, dataset_xte[index], y_test2))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best classification model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.6885
Elapsed time = 0.0081 secs
Evaluating GaussianNB
Accuracy = 0.7869
Elapsed time = 0.0026 secs
Evaluating BernoulliNB
Accuracy = 0.8689
Elapsed time = 0.0032 secs
Evaluating Decision Tree classifer
Accuracy = 0.7541
Elapsed time = 0.0029 secs
Evaluating Random Forest classifer
Accuracy = 0.8689
Elapsed time = 0.1773 secs

By utilizing this dataset variant:  Original dataset
The best classification model that performed under this dataset is BernoulliNB
Highest Accuracy acheived  = 0.8689
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.8197
Elapsed time = 0.0096 secs
Evaluating GaussianNB
Accuracy = 0.7869
Elapsed time = 0.0038 secs
Evaluating BernoulliNB
Accuracy = 0.8689
Elapsed time = 0.0040 secs
Evaluating Decision Tree classifer
Accuracy = 0.7705
Elapsed time = 0.0023 s

In [None]:
# Classifers with products

def evaluate_model3(model, x_train2, y_train2, x_test2, y_test2):
  m = model()
  m.fit(x_train2,y_train2)
  pred = m.predict(x_test2)
  return accuracy_score(pred,y_test2)


models3 = [KNeighborsClassifier, MultinomialNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names3 = ['K-Nearest Neighbors Classifier',  'MultinomialNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


acc_list3 = []
for i in range(len(models3)):
  print('Evaluating',model_names3[i])
  start = time.time()
  acc_list3.append(evaluate_model3(models3[i], x_train2_products, y_train2_products, x_test2_products, y_test2_products))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list3[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))

best = np.argmax(acc_list3)
print('The best model is',model_names3[best])
print('Accuracy = {:6.4f}'.format(acc_list3[best]))

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.6557
Elapsed time = 0.0148 secs
Evaluating MultinomialNB
Accuracy = 0.8525
Elapsed time = 0.0020 secs
Evaluating BernoulliNB
Accuracy = 0.8361
Elapsed time = 0.0026 secs
Evaluating Decision Tree classifer
Accuracy = 0.7541
Elapsed time = 0.0088 secs
Evaluating Random Forest classifer
Accuracy = 0.8689
Elapsed time = 0.2193 secs
The best model is Random Forest classifer
Accuracy = 0.8689


In [None]:
def cross_validate_model(model, x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = model().fit(x_train, y_train)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print(' using cross validation')

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(cross_validate_model(models[i], dataset_xtr[index], y_train2, dataset_xte[index], y_test2, 10))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))
  print('\n')

Training all the models with this dataset variant:  Original dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.6071
Elapsed time = 0.0224 secs
Evaluating GaussianNB
Accuracy = 0.8357
Elapsed time = 0.0149 secs
Evaluating BernoulliNB
Accuracy = 0.8381
Elapsed time = 0.0200 secs
Evaluating Decision Tree classifer
Accuracy = 0.7214
Elapsed time = 0.0161 secs
Evaluating Random Forest classifer
Accuracy = 0.8024
Elapsed time = 1.5935 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is BernoulliNB
Highest Accuracy acheived  = 0.8381


Training all the models with this dataset variant:  Normalized Original Dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.8024
Elapsed time = 0.0244 secs
Evaluating GaussianNB
Accuracy = 0.8357
Elapsed time = 0.0167 secs
Evaluating BernoulliNB
Accuracy = 0.8381
Elapsed time = 0.0193 secs
Evaluating Decision Tree classif

In [None]:
#Linear regression with products of attributes
model = LinearRegression().fit(x_train2_products,y_train2_products)

pred = model.predict(x_test2_products)
print('Mean squared error = {:5.2f}'.format(sm.mean_squared_error(pred,y_test2_products)))
print('Mean absolute error =  {:5.2f}'.format(sm.mean_absolute_error(pred,y_test2_products)))
print('Accuracy for test set=  {:5.2f}'.format(model.score(x_test2_products,y_test2_products)))

pred1 = model.predict(x_train2_products)
print('Mean squared error = {:5.2f}'.format(sm.mean_squared_error(pred1,y_train2_products)))
print('Mean absolute error =  {:5.2f}'.format(sm.mean_absolute_error(pred1,y_train2_products)))
print('Accuracy train set =  {:5.2f}'.format(model.score(x_train2_products,y_train2_products)))

Mean squared error =  0.20
Mean absolute error =   0.35
Accuracy for test set=   0.21
Mean squared error =  0.08
Mean absolute error =   0.22
Accuracy train set =   0.68


### Logistic Regression

In [None]:
#Logistic Regression

def evaluate_Logistic_Reg(x_train, y_train, x_test, y_test):
  model = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1) 
  model.fit(x_train,y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_Logistic_Reg(dataset_xtr[index], y_train2, dataset_xte[index], y_test2))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 


Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.6s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.8525
Elapsed time = 0.6524 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.8361
Elapsed time = 0.2838 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.


Accuracy = 0.8361
Elapsed time = 0.2857 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.8525
Elapsed time = 0.3213 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.8689
Elapsed time = 0.0215 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.8361
Elapsed time = 0.0262 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.8689


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [None]:
#cross validation
def cross_validate_LR(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1).fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)


acc_list = []
print('Using cross validation')
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_LR(dataset_xtr[index], y_train2, dataset_xte[index], y_test2, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Using cross validation
Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 0.8190
Elapsed time = 0.3084 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.8690
Elapsed time = 0.1320 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.8190
Elapsed time = 0.1295 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.7548
Elapsed time = 0.3545 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.8524
Elapsed time = 0.1880 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 0.8690
Elapsed time = 0.2218 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.8690


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


### SVC

In [None]:
#Sklearn SVC 

def evaluate_svc(x_train, y_train, x_test, y_test):
  model = SVC()
  model.fit(x_train, y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_svc(dataset_xtr[index], y_train2, dataset_xte[index], y_test2))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 


Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.7049
Elapsed time = 0.0087 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.8525
Elapsed time = 0.0060 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.8361
Elapsed time = 0.0053 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.7049
Elapsed time = 0.0087 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.8525
Elapsed time = 0.0068 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.8197
Elapsed time = 0.0087 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.8525


In [None]:
#cross validation
def cross_validate_svc(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = SVC().fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_svc(dataset_xtr[index], y_train2, dataset_xte[index], y_test2, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.6524
Elapsed time = 0.0244 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.8857
Elapsed time = 0.0227 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.8024
Elapsed time = 0.0224 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.6524
Elapsed time = 0.0298 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.8690
Elapsed time = 0.0274 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.8524
Elapsed time = 0.0275 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.8857


##STOPPED HERE WITH THE UPDATING WITH PRODUCTS

In [None]:
#Aprox which kernel and parameters to use before search for max of SVC

def SVCApprox2(x_tr, y_tr, x_te, y_te):


  C = [.1, .5, 1.0, 3.0]
  Gamma = [.1, .5, 1.0, 3.0, 'scale', 'auto']
  COEF = [.1, .25, .5, .75, 1]
  AccuracyList = []
  Top4Acc = []
  #PrecisionList = []
  #RecallList = []
  #'poly',
  for kernel in ['linear', 'rbf', 'sigmoid']:
    print(f'SVC Model with kernel = {kernel}')
    KernelBestAcc = []
    for c in C:

      for G in Gamma:

        for CE in COEF:


          print(f"SVC paramters set to C={c}, Gamma ={G}, Coef={CE}")
          model = SVC(kernel=kernel, C = c, gamma = G, cache_size=10000, coef0 = CE)
              
          model.fit(x_tr, y_tr)
          pred = model.predict(x_te)
          AccuracyList.append(sm.accuracy_score(y_te, pred))
          KernelBestAcc.append(sm.accuracy_score(y_te, pred))
          print(f'Accuracy {AccuracyList[-1]:.4}')

    bestKernel = np.argmax(KernelBestAcc)
    Top4Acc.append(KernelBestAcc[bestKernel])
    print()
    print(f'SVC Model with kernel = {kernel} had a MAX_Accuracy = {KernelBestAcc[bestKernel]:.4}')
    print()

  bestA = np.argmax(AccuracyList)
  print(f'Total Size = {len(AccuracyList)}, Half size= {(len(AccuracyList)/2)}, Quarter size= {(len(AccuracyList)/4)}')
  print('Index of the Max Accuracy is in =', bestA)
  print(f'MAX_Accuracy = {AccuracyList[bestA]:.4}')

  return Top4Acc, AccuracyList[bestA]

In [None]:
def print5(T4):
  kernel = ['linear', 'rbf', 'sigmoid']
  for top in range(0,3):
    print(f'Kernel {kernel[top]} had a max accuracy of {T4[top]:.3}')

#Finding approx best to be able to go to max

print('No augumentation was done on the dataset')
Top4Acc2, Max_Acc2 = SVCApprox2(x_train2, y_train2, x_test2, y_test2)
print(f'Max Accuracy with no data augmentation = {Max_Acc2:.3}')
print5(Top4Acc2)

No augumentation was done on the dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.8852
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.8852
SVC paramters se

In [None]:
print('Applied MixMax/Normalization onto the dataset')
x_trainScaleMM3, x_testScaleMM3 = NormalizeData2(x_train2, x_test2)
Top4Acc_MM3, Max_N_Acc3 = SVCApprox2(x_trainScaleMM3, y_train2,x_testScaleMM3, y_test2)
print(f'Max Accuracy with Normalization on data = {Max_N_Acc3:.3}')
print5(Top4Acc_MM3)

Applied MixMax/Normalization onto the dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.8525
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.8525
SVC paramte

In [None]:
print('Applied standarization onto the dataset')
x_trainScaleSS3, x_testScaleSS3 = StandardizeData2(x_train2, x_test2)
Top4Acc_SS3, Max_S_Acc3 = SVCApprox2(x_trainScaleSS3, y_train2,x_testScaleSS3, y_test2)
print(f'Max Accuracy with Standardization on data = {Max_S_Acc3:.3}')
print5(Top4Acc_SS3)

Applied standarization onto the dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.8689
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.8689
SVC paramters set

### Dense Network

In [None]:
#attempting to fix overfitting on data
data2 = df_d.to_numpy()

x2 = data2[:,:12] #take all rows(x) and only attributes/columns(y) from 0-13 to predict heart disease


#y gets the heart attack disease values
y2 = data2[:,13]

#grabs the names in the columns
feature_names = df_d.columns

In [None]:
#data for dense network


#add more data
x2_cols = x2.shape[1]
for i in range(x2_cols):
    x2 = np.hstack((x2 ,(x2[:,i:x2_cols]*x2[:,i].reshape(-1,1))))

print(x2.shape)
print(x_train2)

x_train2, x_test2, y_train2, y_test2 = train_test_split(x2, y2, test_size=0.2, random_state=42)

#reshape
x_train2 = np.float32(x_train2/255).reshape(x_train2.shape[0],-1)
x_test2 = np.float32(x_test2/255).reshape(x_test2.shape[0],-1)


#one hot
y_train2 = tf.keras.utils.to_categorical(y_train2, num_classes=2)
y_test2 = tf.keras.utils.to_categorical(y_test2, num_classes=2)

rop = ReduceLROnPlateau(monitor='val_loss',factor=0.5, patience=2, verbose=1,min_lr=0.0005)

(303, 90)
[[0.16470589 0.00392157 0.00392157 ... 0.         0.         0.        ]
 [0.22745098 0.00392157 0.         ... 0.         0.         0.        ]
 [0.18039216 0.00392157 0.00784314 ... 0.         0.         0.        ]
 ...
 [0.27058825 0.00392157 0.01176471 ... 0.00392157 0.00392157 0.00392157]
 [0.18039216 0.00392157 0.         ... 0.         0.         0.        ]
 [0.24705882 0.         0.00392157 ... 0.0627451  0.0627451  0.0627451 ]]


In [None]:
x_train2.shape[1]


12

In [None]:
#original data
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train2, x_test2), axis=0)
targets = np.concatenate((y_train2, y_test2), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train2.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )


  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_140"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_700 (Dense)           (None, 500)               6500      
                                                                 
 dense_701 (Dense)           (None, 500)               250500    
                                                                 
 dense_702 (Dense)           (None, 500)               250500    
                                                                 
 dense_703 (Dense)           (None, 500)               250500    
                                                                 
 dense_704 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 759,002
Trainable params: 759,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#products
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train2_products, x_test2_products), axis=0)
targets = np.concatenate((y_train2, y_test2), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train2_products.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1


print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_150"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_750 (Dense)           (None, 500)               45500     
                                                                 
 dense_751 (Dense)           (None, 500)               250500    
                                                                 
 dense_752 (Dense)           (None, 500)               250500    
                                                                 
 dense_753 (Dense)           (None, 500)               250500    
                                                                 
 dense_754 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 798,002
Trainable params: 798,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#Normalizated dataset
scalerMM4 = MinMaxScaler()
scalerMM4.fit(x_train2)
x_trainScaleMM = scalerMM4.transform(x_train2)
x_testScaleMM = scalerMM4.transform(x_test2)

#standardized dataset
scalerSS4 = StandardScaler()
scalerSS4.fit(x_train2)
x_trainScaleSS = scalerSS4.transform(x_train2)
x_testScaleSS = scalerSS4.transform(x_test2)

#Normalizated dataset with products of attr
scalerMM4 = MinMaxScaler()
scalerMM4.fit(x_train2_products)
x_train_products_ScaleMM = scalerMM4.transform(x_train2_products)
x_test_products_ScaleMM = scalerMM4.transform(x_test2_products)

#standardized dataset with products of attr
scalerSS4 = StandardScaler()
scalerSS4.fit(x_train2_products)
x_train_products_ScaleSS = scalerSS4.transform(x_train2_products)
x_test_products_ScaleSS = scalerSS4.transform(x_test2_products)

In [None]:
#regular Normalized
#NORMALIZED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleMM, x_testScaleMM), axis=0)
targets = np.concatenate((y_train2, y_test2), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleMM.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_160"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_800 (Dense)           (None, 500)               6500      
                                                                 
 dense_801 (Dense)           (None, 500)               250500    
                                                                 
 dense_802 (Dense)           (None, 500)               250500    
                                                                 
 dense_803 (Dense)           (None, 500)               250500    
                                                                 
 dense_804 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 759,002
Trainable params: 759,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#NORMALIZED DATA Products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleMM, x_test_products_ScaleMM), axis=0)
targets = np.concatenate((y_train2, y_test2), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleMM.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_170"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_850 (Dense)           (None, 500)               45500     
                                                                 
 dense_851 (Dense)           (None, 500)               250500    
                                                                 
 dense_852 (Dense)           (None, 500)               250500    
                                                                 
 dense_853 (Dense)           (None, 500)               250500    
                                                                 
 dense_854 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 798,002
Trainable params: 798,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#regular standard
#STANDARDIED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleSS, x_testScaleSS), axis=0)
targets = np.concatenate((y_train2, y_test2), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleSS.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_180"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_900 (Dense)           (None, 500)               6500      
                                                                 
 dense_901 (Dense)           (None, 500)               250500    
                                                                 
 dense_902 (Dense)           (None, 500)               250500    
                                                                 
 dense_903 (Dense)           (None, 500)               250500    
                                                                 
 dense_904 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 759,002
Trainable params: 759,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#standerdized products
#STANDARDIED DATA products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleSS, x_test_products_ScaleSS), axis=0)
targets = np.concatenate((y_train2, y_test2), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleSS.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_190"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_950 (Dense)           (None, 500)               45500     
                                                                 
 dense_951 (Dense)           (None, 500)               250500    
                                                                 
 dense_952 (Dense)           (None, 500)               250500    
                                                                 
 dense_953 (Dense)           (None, 500)               250500    
                                                                 
 dense_954 (Dense)           (None, 2)                 1002      
                                                                 
Total params: 798,002
Trainable params: 798,002
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

## Heart Disease Part 2 Model Evaluation and Findings


# Heart Failure Part 3

## Preprocessing

In [None]:
# Mount google drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!ls  '/content/drive/MyDrive/ML_Project'

heart2.csv  heart.csv  heart_failure_clinical_records_dataset.csv


In [None]:
#from google.colab import files
#uploaded = files.upload() 

#heart failure predictions
df = pd.read_csv('/content/drive/MyDrive/ML_Project/heart.csv')

#heart disease uci
df_d = pd.read_csv('/content/drive/MyDrive/ML_Project/heart2.csv')

#heart failure 3
#heart disease uci
df_f = pd.read_csv('/content/drive/MyDrive/ML_Project/heart_failure_clinical_records_dataset.csv')

In [None]:
df_f

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
0,75.0,0,582,0,20,1,265000.00,1.9,130,1,0,4,1
1,55.0,0,7861,0,38,0,263358.03,1.1,136,1,0,6,1
2,65.0,0,146,0,20,0,162000.00,1.3,129,1,1,7,1
3,50.0,1,111,0,20,0,210000.00,1.9,137,1,0,7,1
4,65.0,1,160,1,20,0,327000.00,2.7,116,0,0,8,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
294,62.0,0,61,1,38,1,155000.00,1.1,143,1,1,270,0
295,55.0,0,1820,0,38,0,270000.00,1.2,139,0,0,271,0
296,45.0,0,2060,1,60,0,742000.00,0.8,138,0,0,278,0
297,45.0,0,2413,0,38,0,140000.00,1.4,140,1,1,280,0


In [None]:
df_f.shape

(299, 13)

In [None]:
data3 = df_f.to_numpy()

x3_original = data3[:,:11] #take all rows(x) and only attributes/columns(y) from 0-12 to predict heart failure
x3_products = data3[:,:11] #take all rows(x) and only attributes/columns(y) from 0-13 to predict heart disease

#Products of Attributes
x3_cols = x3_products.shape[1]
for i in range(x3_cols):                            
    x3_products = np.hstack((x3_products ,(x3_products[:,i:x3_cols]*x3_products[:,i].reshape(-1,1))))  #end of class
print(x3_original.shape)
print(x3_products.shape)

#y gets the heart death values
y3 = data3[:,12]

#grabs the names in the columns
feature_names3 = df_f.columns


x_train3, x_test3, y_train3, y_test3 = train_test_split(x3_original, y3, test_size=0.2, random_state=42)
x_train3_products, x_test3_products, y_train3_products, y_test3_products = train_test_split(x3_products, y3, test_size=0.2, random_state=42)


(299, 11)
(299, 77)


In [None]:
#Normalizated dataset
scalerMM3 = MinMaxScaler()
scalerMM3.fit(x_train3)
x_trainScaleMM3 = scalerMM3.transform(x_train3)
x_testScaleMM3 = scalerMM3.transform(x_test3)

#standardized dataset
scalerSS3 = StandardScaler()
scalerSS3.fit(x_train3)
x_trainScaleSS3 = scalerSS3.transform(x_train3)
x_testScaleSS3 = scalerSS3.transform(x_test3)

#Normalizated dataset with products of attributes
scalerMM3 = MinMaxScaler()
scalerMM3.fit(x_train3_products)
x_train_products_ScaleMM3 = scalerMM3.transform(x_train3_products)
x_test_products_ScaleMM3 = scalerMM3.transform(x_test3_products)

#standardized dataset with  products of attributes
scalerSS3 = StandardScaler()
scalerSS3.fit(x_train3_products)
x_train_products_ScaleSS3 = scalerSS3.transform(x_train3_products)
x_test_products_ScaleSS3 = scalerSS3.transform(x_test3_products)

dataset_name = ["Original dataset", "Normalized Original Dataset", "Standardized Original Dataset", 
                "Products of Attributes Dataset", "Normalized Products of Attributes Dataset", "Standarized Products of Attributes Dataset"]
dataset_xtr = [x_train3, x_trainScaleMM3, x_trainScaleSS3,  x_train3_products, x_train_products_ScaleMM3, x_train_products_ScaleSS3]
dataset_xte = [x_test3, x_testScaleMM3, x_testScaleSS3,  x_test3_products, x_test_products_ScaleMM3, x_test_products_ScaleSS3]

### Regressor

In [None]:
#Sklearn Regressors

def evaluate_model3(model, x_train, y_train, x_test, y_test):
  m = model().fit(x_train, y_train)
  pred = m.predict(x_test)
  return mean_squared_error(y_test, pred)

# MultinomialNB removed due to Maximum iterations still hasnt converged]
models = [KNeighborsRegressor, DecisionTreeRegressor,  RandomForestRegressor, BaggingRegressor,  LinearRegression ] # These are functions!
model_names = ['K-Nearest Neighbors regressor', 'Decision Tree Regressor' , 'Random Forest Regressor', 'Bagging Regressor', 'Linear Regression']

#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model3(models[i], dataset_xtr[index], y_train3, dataset_xte[index], y_test3))
    done = time.time() - start
    print('MSE = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmin(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Lowest MSE acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.3127
Elapsed time = 0.0039 secs
Evaluating Decision Tree Regressor
MSE = 0.3167
Elapsed time = 0.0044 secs
Evaluating Random Forest Regressor
MSE = 0.2022
Elapsed time = 0.1770 secs
Evaluating Bagging Regressor
MSE = 0.1940
Elapsed time = 0.0247 secs
Evaluating Linear Regression
MSE = 0.2273
Elapsed time = 0.0020 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is Bagging Regressor
Lowest MSE acheived  = 0.1940
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors regressor
MSE = 0.2693
Elapsed time = 0.0054 secs
Evaluating Decision Tree Regressor
MSE = 0.4333
Elapsed time = 0.0023 secs
Evaluating Random Forest Regressor
MSE = 0.1961
Elapsed time = 0.1777 secs
Evaluating Bagging Regressor
MSE = 0.2200
Elapsed time = 0.0244 secs
Evaluating Line

### Classifier

In [None]:
# Sklearn Classifer


def evaluate_model(model, x_train, y_train, x_test, y_test):
  m = model()
  m.fit(x_train,y_train)
  return m.score(x_test,y_test)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print()

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(evaluate_model(models[i], dataset_xtr[index], y_train3, dataset_xte[index], y_test3))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best classification model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))

Training all the models with this dataset variant:  Original dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.5500
Elapsed time = 0.0039 secs
Evaluating GaussianNB
Accuracy = 0.6167
Elapsed time = 0.0020 secs
Evaluating BernoulliNB
Accuracy = 0.5833
Elapsed time = 0.0035 secs
Evaluating Decision Tree classifer
Accuracy = 0.7333
Elapsed time = 0.0021 secs
Evaluating Random Forest classifer
Accuracy = 0.6667
Elapsed time = 0.1621 secs

By utilizing this dataset variant:  Original dataset
The best classification model that performed under this dataset is Decision Tree classifer
Highest Accuracy acheived  = 0.7333
Training all the models with this dataset variant:  Normalized Original Dataset

Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.5833
Elapsed time = 0.0086 secs
Evaluating GaussianNB
Accuracy = 0.6333
Elapsed time = 0.0015 secs
Evaluating BernoulliNB
Accuracy = 0.5833
Elapsed time = 0.0032 secs
Evaluating Decision Tree classifer
Accuracy = 0.6167
Elapsed tim

In [None]:
def cross_validate_model(model, x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = model().fit(x_train, y_train)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

# MultinomialNB,  'ComplementNB', removed due to  Negative values in data passed to MultinomialNB (input X)
models = [KNeighborsClassifier, GaussianNB, BernoulliNB,  DecisionTreeClassifier, RandomForestClassifier] # These are functions!
model_names = ['K-Nearest Neighbors Classifier',  'GaussianNB', 'BernoulliNB', 'Decision Tree classifer',  'Random Forest classifer']


#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):
  acc_list = []
  print('Training all the models with this dataset variant: ',dataset_name[index])
  print(' using cross validation')

  for i in range(len(models)):
    print('Evaluating',model_names[i])
    start = time.time()
    acc_list.append(cross_validate_model(models[i], dataset_xtr[index], y_train3, dataset_xte[index], y_test3, 10))
    done = time.time() - start
    print('Accuracy = {:6.4f}'.format(acc_list[-1]))
    print('Elapsed time = {:.4f} secs'.format(done))
  print()

  best = np.argmax(acc_list)
  print('By utilizing this dataset variant: ', dataset_name[index])
  print('The best regression model that performed under this dataset is',model_names[best])
  print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best]))
  print('\n')

Training all the models with this dataset variant:  Original dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.5667
Elapsed time = 0.0264 secs
Evaluating GaussianNB
Accuracy = 0.6167
Elapsed time = 0.0163 secs
Evaluating BernoulliNB
Accuracy = 0.5667
Elapsed time = 0.0196 secs
Evaluating Decision Tree classifer
Accuracy = 0.6667
Elapsed time = 0.0160 secs
Evaluating Random Forest classifer
Accuracy = 0.6167
Elapsed time = 1.5850 secs

By utilizing this dataset variant:  Original dataset
The best regression model that performed under this dataset is Decision Tree classifer
Highest Accuracy acheived  = 0.6667


Training all the models with this dataset variant:  Normalized Original Dataset
 using cross validation
Evaluating K-Nearest Neighbors Classifier
Accuracy = 0.5833
Elapsed time = 0.0270 secs
Evaluating GaussianNB
Accuracy = 0.5833
Elapsed time = 0.0144 secs
Evaluating BernoulliNB
Accuracy = 0.5667
Elapsed time = 0.0184 secs
Evaluating Decision 

### Logistic Regression

In [None]:
#logistic regression
model = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1)   #max_iter=100,verbose=1,n_jobs=-1
model.fit(x_train3,y_train3)
pred = model.predict(x_test3)

accuracy = sm.accuracy_score(y_test3, pred)
precision = sm.precision_score(y_test3, pred)
recall = sm.recall_score(y_test3, pred)
f1 = sm.f1_score(y_test3, pred)
print('Logistic Regression with default parameters')
print(f'Accuracy {accuracy:.4}')
print(f'Precision {precision:.4}')
print(f'Recall {recall:.4}')
print(f'F1 {f1:.4}')

Logistic Regression with default parameters
Accuracy 0.6833
Precision 0.875
Recall 0.28
F1 0.4242


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished


In [None]:
#cross validation
def cross_validate_LR(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = LogisticRegression(max_iter=100,verbose=1,n_jobs=-1).fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)


acc_list = []
print('Using cross validation')
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training Linear Regression model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_LR(dataset_xtr[index], y_train3, dataset_xte[index], y_test3, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on Linear Regression: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Using cross validation
Training Linear Regression model with this dataset variant:  Original dataset


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1

Accuracy = 0.6333
Elapsed time = 0.2841 secs

Training Linear Regression model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.6500
Elapsed time = 0.1331 secs

Training Linear Regression model with this dataset variant:  Standardized Original Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.6500
Elapsed time = 0.1233 secs

Training Linear Regression model with this dataset variant:  Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.6000
Elapsed time = 0.2459 secs

Training Linear Regression model with this dataset variant:  Normalized Products of Attributes Dataset


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

Accuracy = 0.6333
Elapsed time = 0.2001 secs

Training Linear Regression model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.6167
Elapsed time = 0.2455 secs

By utilizing the dataset variant on Linear Regression:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.6500


[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBacken

### SVC

In [None]:
#Sklearn SVC 

def evaluate_svc(x_train, y_train, x_test, y_test):
  model = SVC()
  model.fit(x_train, y_train)
  pred = model.predict(x_test)
  return sm.accuracy_score(y_test, pred)

acc_list = []
#go through all 6 different types of dataset variations
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(evaluate_svc(dataset_xtr[index], y_train3, dataset_xte[index], y_test3))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.5833
Elapsed time = 0.0078 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.6000
Elapsed time = 0.0049 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.7000
Elapsed time = 0.0042 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.5833
Elapsed time = 0.0084 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.6000
Elapsed time = 0.0081 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.6833
Elapsed time = 0.0081 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.7000


In [None]:
#cross validation
def cross_validate_svc(x_train, y_train, x_test, y_test, num):
  #create the model
  cvm = SVC().fit(x_train, y_train)
  pred = cvm.predict(x_test)
  scores = cross_val_score(cvm, x_test, y_test, cv=num)
  return np.mean(scores)

acc_list = []
#go through all 6 different types of dataset variations
print('using cross validation')
for index in range(len(dataset_name)):

  print('Training SVC model with this dataset variant: ',dataset_name[index])

  start = time.time()
  acc_list.append(cross_validate_svc(dataset_xtr[index], y_train3, dataset_xte[index], y_test3, 10))
  done = time.time() - start
  print('Accuracy = {:6.4f}'.format(acc_list[-1]))
  print('Elapsed time = {:.4f} secs'.format(done))
  print()

best = np.argmax(acc_list)
print('By utilizing the dataset variant on SVC: ', dataset_name[index])
print('Highest Accuracy acheived  = {:6.4f}'.format(acc_list[best])) 

using cross validation
Training SVC model with this dataset variant:  Original dataset
Accuracy = 0.5833
Elapsed time = 0.0242 secs

Training SVC model with this dataset variant:  Normalized Original Dataset
Accuracy = 0.6500
Elapsed time = 0.0211 secs

Training SVC model with this dataset variant:  Standardized Original Dataset
Accuracy = 0.6333
Elapsed time = 0.0208 secs

Training SVC model with this dataset variant:  Products of Attributes Dataset
Accuracy = 0.5667
Elapsed time = 0.0269 secs

Training SVC model with this dataset variant:  Normalized Products of Attributes Dataset
Accuracy = 0.6333
Elapsed time = 0.0289 secs

Training SVC model with this dataset variant:  Standarized Products of Attributes Dataset
Accuracy = 0.6500
Elapsed time = 0.0274 secs

By utilizing the dataset variant on SVC:  Standarized Products of Attributes Dataset
Highest Accuracy acheived  = 0.6500


In [None]:
#Aprox which kernel and parameters to use before search for max of SVC

def SVCApprox3(x_tr, y_tr, x_te, y_te):


  C = [.1, .5, 1.0, 3.0]
  Gamma = [.1, .5, 1.0, 3.0, 'scale', 'auto']
  COEF = [.1, .25, .5, .75, 1]
  AccuracyList = []
  Top4Acc = []
  #PrecisionList = []
  #RecallList = []
  #'poly',
  for kernel in ['linear', 'rbf', 'sigmoid']:
    print(f'SVC Model with kernel = {kernel}')
    KernelBestAcc = []
    for c in C:

      for G in Gamma:

        for CE in COEF:


          print(f"SVC paramters set to C={c}, Gamma ={G}, Coef={CE}")
          model = SVC(kernel=kernel, C = c, gamma = G, cache_size=10000, coef0 = CE)
              
          model.fit(x_tr, y_tr)
          pred = model.predict(x_te)
          AccuracyList.append(sm.accuracy_score(y_te, pred))
          KernelBestAcc.append(sm.accuracy_score(y_te, pred))
          print(f'Accuracy {AccuracyList[-1]:.4}')

    bestKernel = np.argmax(KernelBestAcc)
    Top4Acc.append(KernelBestAcc[bestKernel])
    print()
    print(f'SVC Model with kernel = {kernel} had a MAX_Accuracy = {KernelBestAcc[bestKernel]:.4}')
    print()

  bestA = np.argmax(AccuracyList)
  print(f'Total Size = {len(AccuracyList)}, Half size= {(len(AccuracyList)/2)}, Quarter size= {(len(AccuracyList)/4)}')
  print('Index of the Max Accuracy is in =', bestA)
  print(f'MAX_Accuracy = {AccuracyList[bestA]:.4}')

  return Top4Acc, AccuracyList[bestA]

In [None]:
def print6(T4):
  kernel = ['linear', 'rbf', 'sigmoid']
  for top in range(0,3):
    print(f'Kernel {kernel[top]} had a max accuracy of {T4[top]:.3}')


#Finding approx best to be able to go to max

print('No augumentation was done on the dataset')
Top4Acc6, Max_Acc6 = SVCApprox3(x_train3, y_train3, x_test3, y_test3)
print(f'Max Accuracy with no data augmentation = {Max_Acc6:.3}')
print6(Top4Acc6)

No augumentation was done on the dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.55
SVC paramters set to C=0.1, Gamma =1.0, Coef

In [None]:
print('Applied MixMax/Normalization onto the dataset')
#x_trainScaleMM7, x_testScaleMM7 = NormalizeData(x_train3, x_test3)
Top4Acc_MM7, Max_N_Acc7 = SVCApprox3(x_trainScaleMM3, y_train3, x_testScaleMM3, y_test3)
print(f'Max Accuracy with Normalization on data = {Max_N_Acc7:.3}')
print6(Top4Acc_MM7)

Applied MixMax/Normalization onto the dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.5833
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.5833
SVC paramte

In [None]:
print('Applied standarization onto the dataset')
#x_trainScaleSS7, x_testScaleSS7 = StandardizeData3(x_train3, x_test3)
Top4Acc_SS7, Max_S_Acc7 = SVCApprox3(x_trainScaleSS3, y_train3, x_testScaleSS3, y_test3)
print(f'Max Accuracy with Standardization on data = {Max_S_Acc7:.3}')
print6(Top4Acc_SS7)

Applied standarization onto the dataset
SVC Model with kernel = linear
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.1
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.25
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.5
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.1, Coef=0.75
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.1, Coef=1
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.1
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.25
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.5
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.5, Coef=0.75
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =0.5, Coef=1
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.1
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.25
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.5
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =1.0, Coef=0.75
Accuracy 0.65
SVC paramters set to C=0.1, Gamma =1.0, Coef=

### Dense Network

In [None]:
# attempting to fix overfitting of dense network


In [None]:
#data for dense network

#reshape
x_train3 = np.float32(x_train3/255).reshape(x_train3.shape[0],-1)
x_test3 = np.float32(x_test3/255).reshape(x_test3.shape[0],-1)

#one hot
y_train3 = tf.keras.utils.to_categorical(y_train3, num_classes=2)
y_test3 = tf.keras.utils.to_categorical(y_test3, num_classes=2)


#reshape
x_train3_products = np.float32(x_train_products/255).reshape(x_train_products.shape[0],-1)
x_test3_products = np.float32(x_test_products/255).reshape(x_test_products.shape[0],-1)

#one hot
y_train3_products = tf.keras.utils.to_categorical(y_train_products, num_classes=2)
y_test3_products = tf.keras.utils.to_categorical(y_test_products, num_classes=2)


rop = ReduceLROnPlateau(monitor='val_loss',factor=0.5, patience=2, verbose=1,min_lr=0.0005)

In [None]:
x_train3.shape[1]

11

In [None]:
#original data
num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train3, x_test3), axis=0)
targets = np.concatenate((y_train3, y_test3), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train3.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )


  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_234"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1170 (Dense)          (None, 500)               6000      
                                                                 
 dense_1171 (Dense)          (None, 500)               250500    
                                                                 
 dense_1172 (Dense)          (None, 500)               250500    
                                                                 
 dense_1173 (Dense)          (None, 500)               250500    
                                                                 
 dense_1174 (Dense)          (None, 2)                 1002      
                                                                 
Total params: 758,502
Trainable params: 758,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#Normalizated dataset
scalerMM3 = MinMaxScaler()
scalerMM3.fit(x_train3)
x_trainScaleMM3 = scalerMM3.transform(x_train3)
x_testScaleMM3 = scalerMM3.transform(x_test3)

#standardized dataset
scalerSS3 = StandardScaler()
scalerSS3.fit(x_train3)
x_trainScaleSS3 = scalerSS3.transform(x_train3)
x_testScaleSS3 = scalerSS3.transform(x_test3)

#Normalizated dataset with products of attributes
scalerMM3 = MinMaxScaler()
scalerMM3.fit(x_train3_products)
x_train_products_ScaleMM3 = scalerMM3.transform(x_train3_products)
x_test_products_ScaleMM3 = scalerMM3.transform(x_test3_products)

#standardized dataset with  products of attributes
scalerSS3 = StandardScaler()
scalerSS3.fit(x_train3_products)
x_train_products_ScaleSS3 = scalerSS3.transform(x_train3_products)
x_test_products_ScaleSS3 = scalerSS3.transform(x_test3_products)

In [None]:
#regular Normalized
#NORMALIZED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleMM3, x_testScaleMM3), axis=0)
targets = np.concatenate((y_train3, y_test3), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleMM3.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_244"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1220 (Dense)          (None, 500)               6000      
                                                                 
 dense_1221 (Dense)          (None, 500)               250500    
                                                                 
 dense_1222 (Dense)          (None, 500)               250500    
                                                                 
 dense_1223 (Dense)          (None, 500)               250500    
                                                                 
 dense_1224 (Dense)          (None, 2)                 1002      
                                                                 
Total params: 758,502
Trainable params: 758,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#regular Normalized
#NORMALIZED DATA products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleMM3, x_test_products_ScaleMM3), axis=0)
targets = np.concatenate((y_train3_products, y_test3_products), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleMM3.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_254"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1270 (Dense)          (None, 500)               33000     
                                                                 
 dense_1271 (Dense)          (None, 500)               250500    
                                                                 
 dense_1272 (Dense)          (None, 500)               250500    
                                                                 
 dense_1273 (Dense)          (None, 500)               250500    
                                                                 
 dense_1274 (Dense)          (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#regular standard
#STANDARDIZED DATA

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_trainScaleSS3, x_testScaleSS3), axis=0)
targets = np.concatenate((y_train3, y_test3), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_trainScaleSS3.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_274"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1370 (Dense)          (None, 500)               6000      
                                                                 
 dense_1371 (Dense)          (None, 500)               250500    
                                                                 
 dense_1372 (Dense)          (None, 500)               250500    
                                                                 
 dense_1373 (Dense)          (None, 500)               250500    
                                                                 
 dense_1374 (Dense)          (None, 2)                 1002      
                                                                 
Total params: 758,502
Trainable params: 758,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for

In [None]:
#standerdized products
#STANDARDIED DATA products

num_folds = 10

# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []


# Merge inputs and targets
inputs = np.concatenate((x_train_products_ScaleSS3, x_test_products_ScaleSS3), axis=0)
targets = np.concatenate((y_train3_products, y_test3_products), axis=0)

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1

input = x_train_products_ScaleSS3.shape[1]
all_history = []

for train, test in kfold.split(inputs, targets):
#def dense_model(inputs = input,hidden_1=500, hidden_2=500, hidden_3=500, hidden_4=500):
  model = tf.keras.models.Sequential()
  model.add(Dense(500, input_shape=(input,), activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(500, activation='relu'))
  model.add(Dense(2, activation='sigmoid'))

  model.summary()
  model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.00005),loss="categorical_crossentropy", metrics=["accuracy"])

  # Generate a print
  print('---------------------------------')
  print(f'Training for fold {fold_no} ...')

  history = model.fit(
     inputs[train], targets[train],
      epochs = 150, 
      batch_size=256, 
      verbose = 1,
  )

  # Generate generalization metrics
  scores = model.evaluate(inputs[test], targets[test], verbose=0)
  print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
  acc_per_fold.append(scores[1] * 100)
  loss_per_fold.append(scores[0])

  # Increase fold number
  fold_no = fold_no + 1

print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
  print('------------------------------------------------------------------------')
  print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)}  (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')

Model: "sequential_264"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense_1320 (Dense)          (None, 500)               33000     
                                                                 
 dense_1321 (Dense)          (None, 500)               250500    
                                                                 
 dense_1322 (Dense)          (None, 500)               250500    
                                                                 
 dense_1323 (Dense)          (None, 500)               250500    
                                                                 
 dense_1324 (Dense)          (None, 2)                 1002      
                                                                 
Total params: 785,502
Trainable params: 785,502
Non-trainable params: 0
_________________________________________________________________
---------------------------------
Training for