In [None]:
# --- Python Module Imports ---
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import sklearn as sk
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score
from sklearn.pipeline import Pipeline
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
import re

In [None]:
"""
Description: Extracts the string as a column name after the closing parenthesis
and before the opening parenthesis if 1 exists

Arguments:
  currentLine(string): Line to extract string

Returns:
  newLine(string): Name of column extracted from line of text
"""
def extractColumnName(currentLine):
  """Return the column name embedded in a numbered attribute line.

  The name is the text after the first ")" and before the next "(" that
  follows it (or the end of the line when there is no such "("), with
  surrounding whitespace removed.

  Arguments:
    currentLine(string): Line to extract the name from, e.g. "1) Alcohol"

  Returns:
    newLine(string): Name of column extracted from line of text
  """
  # find() returns -1 when ")" is absent, so the name then starts at index 0
  nameStart = currentLine.find(")") + 1
  # Only look for "(" after the ")": an earlier "(" (as in "(1) Name")
  # would otherwise produce an empty slice
  nameEnd = currentLine.find("(", nameStart)
  if nameEnd == -1:
    nameEnd = len(currentLine)
  return currentLine[nameStart:nameEnd].strip()

In [None]:
# Parser state used while reading the column names from 'wine.names'
reachedLine = False        # becomes True once the marker line before the attribute list is seen
pattern = r"^\d+\)"        # matches a numbered entry at the start of a line, e.g. "1)"
columnNames = ["Class"]    # 'Class' comes first; parsed attribute names are appended after it

In [None]:
# Generate column names for dataset
# Reset the parser state first so that re-running this cell does not append
# the attribute names a second time (pandas rejects duplicate column names).
reachedLine = False
del columnNames[1:]  # keep only the leading 'Class' entry

with open('wine.names', 'r') as columnsFile:
  for line in columnsFile:
    # The attribute list starts on the line after this marker
    if "riclea@anchem.unige.it )" in line:
      reachedLine = True
      continue
    # Skip everything before the marker and lines too short to hold an entry
    if not reachedLine or len(line) <= 3:
      continue
    newLine = line.strip()
    # Only numbered entries such as "1) Alcohol" name a column
    if re.match(pattern, newLine):
      columnNames.append(extractColumnName(newLine))

In [None]:
# Show every row when a DataFrame is displayed
pd.set_option('display.max_rows', None)
# Load the wine data file, labelling its columns with the parsed column names
dataSet = pd.read_csv('wine.data', sep=',', names=columnNames)

In [None]:
### Pipelined Preprocessing of Dataset ###

# Mean imputation: missing values are replaced by the mean of their class
def imputer(dataSet):
  """Fill missing feature values with the mean of the row's 'Class' group.

  Arguments:
    dataSet(DataFrame): Data containing a 'Class' column

  Returns:
    DataFrame: Copy of dataSet with missing values filled where a
      class-wise mean is available
  """
  # One mean per (class, column), broadcast back onto the original rows
  classMeans = dataSet.groupby('Class').transform('mean')
  return dataSet.fillna(classMeans)

# Do Min Max Scaling for All Columns Except for the Class Column
def minMaxScaling(dataSet):
  """Min-max scale every column except 'Class'.

  Uses scikit-learn's MinMaxScaler with its default settings, fitted on the
  data passed in.

  Arguments:
    dataSet(DataFrame): Data containing a 'Class' column plus numeric features

  Returns:
    scaledDataSet(DataFrame): 'Class' (cast to int) as the first column,
      followed by the scaled feature columns, on the same index as dataSet

  Raises:
    Whatever astype(int) raises when 'Class' holds values that cannot be
    converted to integers (for example missing values).
  """
  scaler = MinMaxScaler()
  dataSetColumns = dataSet.drop(columns=['Class'])
  scaledDataColumns = scaler.fit_transform(dataSetColumns)
  # Keep the input index: with a fresh RangeIndex, re-attaching 'Class' would
  # align on labels and misplace or blank out values for a non-default index
  scaledDataSet = pd.DataFrame(
    scaledDataColumns,
    columns=dataSetColumns.columns,
    index=dataSetColumns.index,
  )
  # astype returns a new Series, so the cast has to be applied to what is inserted
  scaledDataSet.insert(0, 'Class', dataSet['Class'].astype(int))
  return scaledDataSet

# Preprocessing pipeline: class-wise mean imputation, then min-max scaling
# NOTE(review): this runs on the full dataset before the train/test split
# further down, so statistics of the future test rows influence the training
# features. Confirm whether that is acceptable for this analysis.
pipeline = Pipeline(steps=[
    ('mean_impute', FunctionTransformer(imputer)),
    ('min_max_scaler', FunctionTransformer(minMaxScaling)),
])

# Replace the raw frame with its preprocessed version, keeping the column order
preprocessedData = pipeline.fit_transform(dataSet)
dataSet = pd.DataFrame(preprocessedData, columns=columnNames)

In [None]:
# Separate the target column ('Class') from the feature columns
targetVar = dataSet['Class']
features = dataSet.drop(columns=['Class'])

# Hold out 10% of the rows as test data, stratified by class, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
  features,
  targetVar,
  test_size=0.1,
  random_state=42,
  stratify=targetVar,
)

In [None]:
# Multi-layer perceptron (MLP) network for the Wine Dataset classification
# problem, built with Keras.

# MLP Model Function
def createMLP(numLayers, numNeuronsPerLayer, learnRate, numClasses=3):
  """Build and compile a fully connected softmax classifier.

  Arguments:
    numLayers(int): Number of hidden Dense layers
    numNeuronsPerLayer(int): Units in each hidden layer (ReLU activation)
    learnRate(float): Learning rate handed to the Adam optimizer
    numClasses(int): Units in the softmax output layer; defaults to 3,
      the value that used to be hard-coded

  Returns:
    mlpModel(Sequential): Compiled model. The loss is categorical
      cross-entropy, so targets must be one-hot encoded.
  """
  mlpModel = Sequential()
  # Hidden layers all share the same width and activation
  for _ in range(numLayers):
    mlpModel.add(Dense(numNeuronsPerLayer, activation='relu'))
  # Output layer: one unit per class
  mlpModel.add(Dense(numClasses, activation='softmax'))
  mlpModel.compile(
    optimizer=Adam(learning_rate=learnRate),
    loss='categorical_crossentropy',
    metrics=['accuracy'],
  )
  return mlpModel

# Hyperparameter grid for the MLP: hidden-layer count, neurons per hidden
# layer, and learning rate
paramGrid = dict(
  layers=[1, 2, 3],
  neurons_per_layer=[32, 64, 128],
  learning_rate=[0.001, 0.01, 0.1],
)

# Stratified 10-fold splitter, shuffled with a fixed seed
stratKFold = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# One-hot encoder for the target variable
encoder = OneHotEncoder(sparse_output=False)

# Search bookkeeping: score container, best mean F1 seen so far, and the
# (layers, neurons, learning rate) tuple that achieved it
mean_f1Scores = {}
test = 0  # NOTE(review): not read anywhere in the visible cells
highestF1Score = 0
best_params = ()

# Test MLP for each unique set of hyperparameters using stratified k-fold
# cross-validation on the training data, keeping the set with the best mean F1.

# Fit the encoder once on all training labels so every fold is encoded with
# the same class-to-column mapping; refitting it on a single fold would shift
# the columns whenever a class is missing from that fold.
y_train_labels = y_train.to_frame(name='Class')
encoder.fit(y_train_labels)

for layers in paramGrid['layers']:
  for neuronsPerLayer in paramGrid['neurons_per_layer']:
    for learnRate in paramGrid['learning_rate']:
      # F1 score of each fold for the current hyperparameter set
      f1Scores = []

      # Remembered in case this set achieves the highest mean F1 score
      parameters = (layers, neuronsPerLayer, learnRate)

      for trainIdxes, validateIdxes in stratKFold.split(X_train, y_train):

        # Feature rows of the training and validation folds
        X_train_fold = X_train.iloc[trainIdxes]
        X_validate_fold = X_train.iloc[validateIdxes]

        # One-hot encoded targets of the training and validation folds
        y_train_encoded = encoder.transform(y_train_labels.iloc[trainIdxes])
        y_validate_encoded = encoder.transform(y_train_labels.iloc[validateIdxes])

        # Build a fresh model for every fold. Reusing one model across folds
        # would carry weights over, so each validation fold would already
        # have been used for training in an earlier fold.
        mlpModel = createMLP(layers, neuronsPerLayer, learnRate)

        # NOTE(review): fit() runs with the Keras default number of epochs;
        # if longer training is intended, change it here and in the final
        # training cell together so both use the same procedure.
        mlpModel.fit(X_train_fold, y_train_encoded, verbose=0)

        # Predicted class probabilities for the validation features
        y_predict = mlpModel.predict(X_validate_fold, verbose=0)

        # Compare predicted and true class indices for this fold
        f1Score = f1_score(
          np.argmax(y_validate_encoded, axis=1),
          np.argmax(y_predict, axis=1),
          average='micro',
        )
        f1Scores.append(f1Score)

      # Mean F1 score of the hyperparameter set, recorded for later inspection
      meanF1Score = np.mean(f1Scores)
      mean_f1Scores[parameters] = meanF1Score

      # Always accept the first set so best_params is never left empty,
      # then only replace it on a strictly better mean F1 score
      if not best_params or meanF1Score > highestF1Score:
        highestF1Score = meanF1Score
        best_params = parameters

In [None]:
# Report the winning hyperparameter set and the mean F1 score it reached
print(
  f"The best parameters are {best_params[0]}"
  f" hidden layers with {best_params[1]}"
  f" neurons per layer at a learning rate of {best_params[2]}"
)
print(f"The highest Mean F1 score is {highestF1Score}")

In [None]:
### Train Network with All Training Data using the Best Parameters and Evaluate on the Test Data ###

# Train Network with All the Training Data
bestMLPModel = createMLP(best_params[0], best_params[1], best_params[2])
X_train_all = X_train
y_train_all = pd.DataFrame(y_train, columns=['Class'])

# Fit the encoder on the training labels only ...
y_train_all_encoded = encoder.fit_transform(y_train_all[['Class']])
y_test = pd.DataFrame(y_test, columns=['Class'])
# ... and reuse that mapping for the test labels. Refitting on the test set
# would shift the one-hot columns if a class were missing from it.
y_test_encoded = encoder.transform(y_test[['Class']])

# NOTE(review): fit() runs with the Keras default number of epochs; keep this
# in step with the training call used during the hyperparameter search.
bestMLPModel.fit(X_train_all, y_train_all_encoded)
y_predict_test = bestMLPModel.predict(X_test)

# Get F1 score of MLP model with the test data
f1ScoreTest = f1_score(
  np.argmax(y_test_encoded, axis=1),
  np.argmax(y_predict_test, axis=1),
  average='micro',
)
print("The Mean F1 score of the Best MLP Model on this testing dataset is "
  + str(f1ScoreTest))