# Tabular data

The data is given in a tabular form, where the order does not matter. So a very different approach is needed to solve this problem than the time series where a curve should be fitted to the data.<br /><br />
The idea is to predict wine quality based on features from the [Kaggle](https://www.kaggle.com/datasets/yasserh/wine-quality-dataset) dataset.
<br />
We take QSVM making it multiclass classification and not QNN because the data is tabular and not time series.
<br /><br />
The problem is that its unordered (order doesnt matter). So a QNN would not be the best approach (we can't just approximate a target function). We could use a QSVM with kernel to classify the data. We can use the QSVM to classify the data into the 7 classes of wine quality. We could also still try to use QNN either for classification or directly for regression. But the QSVM is the most straightforward approach.

In [2]:
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import clear_output
from qiskit import QuantumCircuit
from qiskit.circuit import Parameter
from qiskit.circuit.library import RealAmplitudes, ZZFeatureMap
from qiskit_algorithms.optimizers import COBYLA, L_BFGS_B, ADAM
from qiskit_algorithms.utils import algorithm_globals

from qiskit_machine_learning.algorithms.classifiers import NeuralNetworkClassifier, VQC
from qiskit_machine_learning.algorithms.regressors import NeuralNetworkRegressor, VQR
from qiskit_machine_learning.neural_networks import SamplerQNN, EstimatorQNN
from qiskit_machine_learning.circuit.library import QNNCircuit
from qiskit.circuit.library import TwoLocal
import pandas as pd
from scipy.signal import argrelextrema
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from qiskit.providers.basic_provider import BasicSimulator
from qiskit.circuit import QuantumCircuit, Parameter, ParameterVector
from qiskit.circuit.library import PauliFeatureMap, ZFeatureMap, ZZFeatureMap
from qiskit.circuit.library import TwoLocal, NLocal, RealAmplitudes, EfficientSU2
from qiskit.circuit.library import HGate, RXGate, RYGate, RZGate, CXGate, CRXGate, CRZGate
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

algorithm_globals.random_seed = 42

## 1. Preprocessing

In [3]:
data = pd.read_csv('./data/WineQT.csv')

# explore data
# print(data.head())
# print(data.info())

data = data.drop('Id', axis=1)
# remove half of the data to make it easier model and faster to train
data = data.iloc[::2, :]
X = data.drop('quality', axis=1)
y = data['quality']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# range of quality
quality_min = y.min()
quality_max = y.max()
print (quality_min, quality_max)
LABELS = range(quality_min, quality_max+1)

# show for each label one example
num_labels = len(LABELS)
for i in range(num_labels):
    print(f'Label: {LABELS[i]}')
    print(X_train_scaled[y_train == LABELS[i]][0].reshape(1, -1))

# could also make some outlier detection and removal etc. here...

3 8
Label: 3
[[-5.67134037e-01  2.50534308e+00 -1.11586538e+00 -3.37124121e-01
  -5.04837386e-01  4.55823071e-01  1.13402933e-01  2.25138489e-01
   1.47266097e-04 -6.39637085e-01 -7.43429991e-01]]
Label: 4
[[-0.21704815  1.98316607 -1.21630865 -0.57710188 -0.56466382 -1.12341327
  -1.13944539  0.30396398  0.3366503  -1.95906902 -1.28632627]]
Label: 5
[[-0.74217698  1.54343807 -1.36697356 -0.01715377 -0.36524237 -0.82730645
  -0.77275807  0.146313    1.14425758 -0.57966291 -0.8339127 ]]
Label: 6
[[ 2.1168578  -0.73765094  2.09831925  0.22282399  0.17319557  0.15971626
  -0.31439893  1.46007123 -1.41316547 -0.27979201  0.25187985]]
Label: 7
[[ 1.29999072 -0.07805894  1.84721107 -0.09714636  0.41250132 -0.92600873
  -0.92554445  0.35651431 -1.48046608  0.02007888  1.15670698]]
Label: 8
[[-0.62548169 -0.79261694  0.18989712 -0.41711671 -0.60454812 -0.03768829
  -0.49774259 -0.99928418 -0.53825759  0.61982067  0.70429341]]


## 2. Model

In [4]:
repetition = 1

# This was the first attempt. Seems to be too complex for the data
# So the next cells tries a different approach


labels_train_3 = np.where(y_train==3, 1, 0)
labels_test_3 = np.where(y_test==3, 1, 0)
feature_map_3 = ZZFeatureMap(feature_dimension=X_train_scaled.shape[1], reps=repetition, entanglement='linear')
kernel_3 = FidelityQuantumKernel(feature_map=feature_map_3)
svc_3 = SVC(kernel='precomputed', probability=True)
matrix_train_3 = kernel_3.evaluate(x_vec=X_train)
svc_3.fit(matrix_train_3, labels_train_3)
matrix_val_3 = kernel_3.evaluate(x_vec=y_train, y_vec=X_train)
score_3 = svc_3.score(matrix_val_3, labels_test_3)
print(f'Accuracy of discriminating between label 3 and others: {score_3*100}%')

labels_train_4 = np.where(y_train==4, 1, 0)
labels_test_4 = np.where(y_test==4, 1, 0)
feature_map_4 = ZZFeatureMap(feature_dimension=X_train_scaled.shape[1], reps=repetition, entanglement='linear')
kernel_4 = FidelityQuantumKernel(feature_map=feature_map_4)
svc_4 = SVC(kernel='precomputed', probability=True)
matrix_train_4 = kernel_4.evaluate(x_vec=X_train)
svc_4.fit(matrix_train_4, labels_train_4)
matrix_val_4 = kernel_4.evaluate(x_vec=y_train, y_vec=X_train)
score_4 = svc_4.score(matrix_val_4, labels_test_4)
print(f'Accuracy of discriminating between label 4 and others: {score_4*100}%')

labels_train_5 = np.where(y_train==5, 1, 0)
labels_test_5 = np.where(y_test==5, 1, 0)
feature_map_5 = ZZFeatureMap(feature_dimension=X_train_scaled.shape[1], reps=repetition, entanglement='linear')
kernel_5 = FidelityQuantumKernel(feature_map=feature_map_5)
svc_5 = SVC(kernel='precomputed', probability=True)
matrix_train_5 = kernel_5.evaluate(x_vec=X_train)
svc_5.fit(matrix_train_5, labels_train_5)
matrix_val_5 = kernel_5.evaluate(x_vec=y_train, y_vec=X_train)
score_5 = svc_5.score(matrix_val_5, labels_test_5)
print(f'Accuracy of discriminating between label 5 and others: {score_5*100}%')

labels_train_6 = np.where(y_train==6, 1, 0)
labels_test_6 = np.where(y_test==6, 1, 0)
feature_map_6 = ZZFeatureMap(feature_dimension=X_train_scaled.shape[1], reps=repetition, entanglement='linear')
kernel_6 = FidelityQuantumKernel(feature_map=feature_map_6)
svc_6 = SVC(kernel='precomputed', probability=True)
matrix_train_6 = kernel_6.evaluate(x_vec=X_train)
svc_6.fit(matrix_train_6, labels_train_6)
matrix_val_6 = kernel_6.evaluate(x_vec=y_train, y_vec=X_train)
score_6 = svc_6.score(matrix_val_6, labels_test_6)
print(f'Accuracy of discriminating between label 6 and others: {score_6*100}%')

labels_train_7 = np.where(y_train==7, 1, 0)
labels_test_7 = np.where(y_test==7, 1, 0)
feature_map_7 = ZZFeatureMap(feature_dimension=X_train_scaled.shape[1], reps=repetition, entanglement='linear')
kernel_7 = FidelityQuantumKernel(feature_map=feature_map_7)
svc_7 = SVC(kernel='precomputed', probability=True)
matrix_train_7 = kernel_7.evaluate(x_vec=X_train)
svc_7.fit(matrix_train_7, labels_train_7)
matrix_val_7 = kernel_7.evaluate(x_vec=y_train, y_vec=X_train)
score_7 = svc_7.score(matrix_val_7, labels_test_7)
print(f'Accuracy of discriminating between label 7 and others: {score_7*100}%')

labels_train_8 = np.where(y_train==8, 1, 0)
labels_test_8 = np.where(y_test==8, 1, 0)
feature_map_8 = ZZFeatureMap(feature_dimension=X_train_scaled.shape[1], reps=repetition, entanglement='linear')
kernel_8 = FidelityQuantumKernel(feature_map=feature_map_8)
svc_8 = SVC(kernel='precomputed', probability=True)
matrix_train_8 = kernel_8.evaluate(x_vec=X_train)
svc_8.fit(matrix_train_8, labels_train_8)
matrix_val_8 = kernel_8.evaluate(x_vec=y_train, y_vec=X_train)
score_8 = svc_8.score(matrix_val_8, labels_test_8)
print(f'Accuracy of discriminating between label 8 and others: {score_8*100}%')


matrix_test_3 = kernel_3.evaluate(x_vec=X_test, y_vec=X_train)
pred_3 = svc_3.predict_proba(matrix_test_3)[:, 1]
print(f'Probability of label 3: {np.round(pred_3, 2)}')

matrix_test_4 = kernel_4.evaluate(x_vec=X_test, y_vec=X_train)
pred_4 = svc_4.predict_proba(matrix_test_4)[:, 1]
print(f'Probability of label 4: {np.round(pred_4, 2)}')

matrix_test_5 = kernel_5.evaluate(x_vec=X_test, y_vec=X_train)
pred_5 = svc_5.predict_proba(matrix_test_5)[:, 1]
print(f'Probability of label 5: {np.round(pred_5, 2)}')

matrix_test_6 = kernel_6.evaluate(x_vec=X_test, y_vec=X_train)
pred_6 = svc_6.predict_proba(matrix_test_6)[:, 1]
print(f'Probability of label 6: {np.round(pred_6, 2)}')

matrix_test_7 = kernel_7.evaluate(x_vec=X_test, y_vec=X_train)
pred_7 = svc_7.predict_proba(matrix_test_7)[:, 1]
print(f'Probability of label 7: {np.round(pred_7, 2)}')

matrix_test_8 = kernel_8.evaluate(x_vec=X_test, y_vec=X_train)
pred_8 = svc_8.predict_proba(matrix_test_8)[:, 1]
print(f'Probability of label 8: {np.round(pred_8, 2)}')


pred_test = np.argmax([pred_3, pred_4, pred_5, pred_6, pred_7, pred_8], axis=0) + 3


KeyboardInterrupt: 

In [4]:
import numpy as np
from qiskit_machine_learning.kernels import FidelityQuantumKernel
from qiskit.circuit.library import ZZFeatureMap
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from joblib import Parallel, delayed

# Parameters
repetition = 1


In [None]:
repetitions = [1, 3, 5, 7, 9, 11, 13, 15]
labels_to_classify = [3, 4, 5, 6, 7, 8]
num_features = 8 # reduce the number of features to make it easier to train (easier here means faster to execute :))
# note: I tried with 2, 5 and 8 features. All of them gave similar results. 

pca = PCA(n_components=num_features)
X_train_reduced = pca.fit_transform(X_train_scaled)
X_test_reduced = pca.transform(X_test)

def train_and_score(label):
    print(f"Training SVC for label {label}...")

    labels_train = np.where(y_train == label, 1, 0)
    labels_test = np.where(y_test == label, 1, 0)
    
    svc = SVC(kernel='precomputed', probability=True)
    svc.fit(matrix_train, labels_train)
    
    matrix_val = kernel.evaluate(x_vec=X_test_reduced, y_vec=X_train_reduced)
    score = svc.score(matrix_val, labels_test)
    
    probs = svc.predict_proba(matrix_test)[:, 1]
    return label, score, svc, probs

total_result = {}

for repetition in repetitions:
    print(f"(i) Training now with {repetition} repetition(s)...")

    feature_map = ZZFeatureMap(feature_dimension=num_features, reps=repetition, entanglement='linear')
    kernel = FidelityQuantumKernel(feature_map=feature_map)

    print("Computing training kernel matrix...")
    matrix_train = kernel.evaluate(x_vec=X_train_reduced)
    print("Training kernel matrix computed.")

    print("Computing test kernel matrix...")
    matrix_test = kernel.evaluate(x_vec=X_test_reduced, y_vec=X_train_reduced)
    print("Test kernel matrix computed.")

    results = Parallel(n_jobs=-1)(delayed(train_and_score)(label) for label in labels_to_classify)

    svc_models = {label: svc for label, _, svc, _ in results}
    scores = {label: score for label, score, _, _ in results}
    probabilities = np.array([probs for _, _, _, probs in results])

    for label, score, _, _ in results:
        print(f"Accuracy of discriminating between label {label} and others: {score * 100}%")

    total_acc = np.mean(list(scores.values())) * 100   
    total_result[repetition] = total_acc 

    print(f"Total accuracy: {total_acc}%")

print(f"\n\n{total_result}")


(i) Training now with 1 repetition(s)...
Computing training kernel matrix...




Training kernel matrix computed.
Computing test kernel matrix...
Test kernel matrix computed.
Accuracy of discriminating between label 3 and others: 99.1304347826087%
Accuracy of discriminating between label 4 and others: 94.78260869565217%
Accuracy of discriminating between label 5 and others: 57.391304347826086%
Accuracy of discriminating between label 6 and others: 60.86956521739131%
Accuracy of discriminating between label 7 and others: 86.95652173913044%
Accuracy of discriminating between label 8 and others: 98.26086956521739%
Total accuracy: 82.89855072463767%
(i) Training now with 3 repetition(s)...
Computing training kernel matrix...
Training kernel matrix computed.
Computing test kernel matrix...
Test kernel matrix computed.
