In [None]:
from qiskit.circuit.library import ZZFeatureMap
from qiskit.utils import algorithm_globals
import wandb
from qiskit.circuit.library import RealAmplitudes
from qiskit.algorithms.optimizers import COBYLA, SPSA
from qiskit_ibm_runtime import Session, Sampler, Options
from qiskit_machine_learning.algorithms.classifiers import NeuralNetworkClassifier  # noqa: E501
from qiskit_machine_learning.neural_networks import SamplerQNN
from qiskit import QuantumCircuit
from qiskit.providers.fake_provider import FakeJakarta
from qiskit_aer.noise import NoiseModel
from sklearn.model_selection import train_test_split
import os
import urllib
import json
import numpy as np
import os
import pandas as pd
import zipfile
import shutil
from qiskit import QuantumCircuit, transpile
from qiskit_aer import AerSimulator

from sklearn import preprocessing
from sklearn.decomposition import KernelPCA
from qiskit_ibm_runtime import QiskitRuntimeService

from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score
)

In [None]:
def get_performance_metrics(test_labels, preds):
    """Score predictions against the true labels with four classification metrics.

    Args:
        test_labels: Ground-truth labels.
        preds: Predicted labels, aligned with ``test_labels``.

    Returns:
        dict: Keys ``'accuracy'``, ``'precision'``, ``'recall'`` and
        ``'f1_score'`` (in that order), each holding the value returned by the
        matching scorer imported at the top of the file, called with its
        default settings.
    """
    # (result key, scoring function) pairs, evaluated in this order
    scorers = (
        ('accuracy', accuracy_score),
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1_score', f1_score),
    )
    return {key: score(test_labels, preds) for key, score in scorers}

In [None]:
def prepare_data(
        output_dir,
        output_filename,
        downsample=True,
        num_features=5,
        scale_features=True,
        pca_components=3,
        random_state=None):
    """Build a processed feature/label CSV and a JSON metadata file from the raw data.

    The raw CSV is expected at ``data/raw/amino_acid_data.csv``. If it is
    missing, a zip archive is downloaded from a hard-coded URL, the CSV is
    extracted from it and the archive and extraction directory are removed.
    NOTE(review): the URL carries an ``Expires`` query parameter, so the
    download presumably stops working at some point; placing the CSV at the
    path above by hand skips the download entirely.

    Processing steps, in order: drop the ``SID`` column, recode the ``SEX``,
    ``ASA``, ``Allo``, ``Hcys`` and ``Class`` columns to integers, optionally
    downsample class 0 to the size of class 1, keep the first ``num_features``
    columns as features and the last column as label, optionally standardize,
    optionally apply an RBF kernel PCA.

    Args:
        output_dir: Directory for the outputs (created if missing).
        output_filename: File stem; ``.csv`` and ``.json`` are appended.
        downsample: If true, sample as many class-0 rows as there are class-1
            rows and shuffle the combined frame.
        num_features: Number of leading columns used as features.
        scale_features: If true, standardize features with ``StandardScaler``.
        pca_components: Number of kernel-PCA components, or ``None`` to skip
            the dimensionality reduction.
        random_state: Forwarded to ``DataFrame.sample`` for the downsampling
            and shuffling steps. The default ``None`` keeps them unseeded.

    Raises:
        ValueError: If ``num_features`` exceeds the number of non-label columns.
    """
    # A bare ``import urllib`` does not guarantee that the ``urllib.request``
    # submodule is loaded, so import it explicitly where it is needed.
    import urllib.request

    # set paths and filenames
    base_data_dir = "data"
    remote_file = "https://oup.silverchair-cdn.com/oup/backfile/Content_public/Journal/clinchem/66/9/10.1093_clinchem_hvaa134/1/hvaa134_supplementary_data.zip?Expires=1688683795&Signature=OFBeHiIqyKjgH8P3rv13K4P~8dbtJkimb2OEhxg0IRN5AUzLlGMdUQMaXP6d48b38m0Q1RXtyS7NmKOwQLF-f-nJtTjKs4DFqDUqWLCJh4SsSEbly1llz-7w6EkLddTtCSgV09nsrAs68Yz8u~vRW0PCwYuoGQFFg-Ob3e94xOULqq9Qf7Ut3N08Vmg1X6DgMqgQiVlWVieKPr50FcowFH987KxN7jqj~a0LvvAbyH6cZiiRupN517uJ4Qac-yScHNZN4~BaEBESvBryez-3GdKUwxl76TlRrOszxxIo6OUf5aHqj4T0IzYBAq08gVajya4yi-RGFM1uHRh3LGxt1g__&Key-Pair-Id=APKAIE5G5CRDK6RD3PGA"

    raw_dir = os.path.join(base_data_dir, "raw")
    raw_data_file = os.path.join(raw_dir, "amino_acid_data.csv")
    output_file = os.path.join(output_dir, output_filename + ".csv")
    output_metadata_file = os.path.join(output_dir, output_filename + ".json")

    os.makedirs(raw_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    # fetch and unpack the raw data only when it is not cached locally
    if not os.path.exists(raw_data_file):
        archive_path = os.path.join(raw_dir, "data.zip")
        extracted_dir = os.path.join(raw_dir, "hvaa134-suppl_data")

        urllib.request.urlretrieve(remote_file, archive_path)
        try:
            with zipfile.ZipFile(archive_path, 'r') as zip_ref:
                zip_ref.extractall(raw_dir)
        finally:
            # remove the archive even when extraction fails
            os.remove(archive_path)

        shutil.move(os.path.join(extracted_dir, "clc317479-file001.csv"), raw_data_file)
        shutil.rmtree(extracted_dir)

    # read in data
    df = pd.read_csv(raw_data_file)

    # drop subject ID
    df = df.drop(['SID'], axis=1)

    # text value -> numerical code, per column; values without a code are kept
    categorical_codes = {
        'SEX': {'F': 0, 'M': 1, 'U': 2},
        'ASA': {'N': 0, 'Y': 1},
        'Allo': {'N': 0, 'Y': 1},
        'Hcys': {'N': 0, 'Y': 1},
        'Class': {'No.significant.abnormality.detected.': 0, 'X.Abnormal': 1},
    }
    for column, codes in categorical_codes.items():
        # Rebuilding the column (rather than assigning ints into the text
        # column in place) lets pandas infer a numeric dtype for the result.
        df[column] = df[column].map(
            lambda value, codes=codes: codes.get(value, value)
        )

    # downsample majority class
    if downsample:
        positives = df[df["Class"] == 1]
        negatives = df[df["Class"] == 0].sample(
            n=len(positives), random_state=random_state
        )
        df = (
            pd.concat([positives, negatives])
            .sample(frac=1, random_state=random_state)
            .reset_index(drop=True)
        )

    # fraction (0..1) of rows belonging to the positive class
    percent_positive_class = float((df["Class"] == 1).mean())

    # check num_features (the last column is the label, so it is excluded)
    max_features = len(df.columns) - 1
    if num_features > max_features:
        raise ValueError(f"ERROR: For this dataset, the number of features must be {max_features} or less. Currently set to {num_features}")

    # save data as X and labels as Y
    X = df.iloc[:, :num_features].to_numpy()
    Y = df.iloc[:, -1].to_numpy().astype("int")

    # scale features
    if scale_features:
        scaler = preprocessing.StandardScaler().fit(X)
        X = scaler.transform(X)

    # reduce dimensionality of data
    if pca_components is not None:
        kernel_pca_rbf = KernelPCA(n_components=pca_components, kernel="rbf")
        kernel_pca_rbf.fit(X)
        X = kernel_pca_rbf.transform(X)

    # log final X shape
    x_shape = [int(dim) for dim in X.shape]

    # write csv: feature columns followed by the label column
    data = np.concatenate((X, Y[:, np.newaxis]), axis=1)
    np.savetxt(output_file, data, delimiter=",")

    metadata = {
        "downsample": downsample,
        "num_features": num_features,
        "scale_features": scale_features,
        "pca_components": pca_components,
        "records": len(Y),
        "percent_positive_class": percent_positive_class,
        "x_shape": x_shape,
        "filename": output_filename + ".csv"
    }

    with open(output_metadata_file, "w") as outfile:
        json.dump(metadata, outfile)

In [None]:
def load_data(
        processed_dir,
        downsample,
        num_features,
        scale_features,
        pca_components):
    """Locate the processed dataset for a configuration, generating it if absent.

    The file stem encodes the configuration as ``d<downsample>_f<num_features>
    _s<scale_features>`` with ``_p<pca_components>`` appended when
    ``pca_components`` is not ``None``. ``prepare_data`` is invoked only when
    the corresponding ``.csv`` does not exist in ``processed_dir``.

    Returns:
        tuple: ``(file stem, path to the .csv, path to the .json)``. The
        ``.json`` path is returned without checking that the file exists.
    """
    pca_suffix = "" if pca_components is None else f"_p{pca_components}"
    processed_filename = f"d{downsample}_f{num_features}_s{scale_features}" + pca_suffix

    processed_file_path, metadata_file_path = (
        os.path.join(processed_dir, processed_filename + extension)
        for extension in (".csv", ".json")
    )

    # Reuse the cached CSV when present; otherwise build it first.
    already_processed = os.path.exists(processed_file_path)
    if not already_processed:
        prepare_data(
            output_dir=processed_dir,
            output_filename=processed_filename,
            downsample=downsample,
            num_features=num_features,
            scale_features=scale_features,
            pca_components=pca_components
        )

    return (processed_filename, processed_file_path, metadata_file_path)

In [None]:
def parity(x):
    """Map an integer (read as a bitstring) to 0 or 1: the parity of its set bits."""
    binary_digits = format(x, "b")
    ones = binary_digits.count("1")
    return ones % 2

In [None]:
# Local directory layout: everything lives under `data/`.
base_data_dir = "data"
processed_dir, result_dir, model_dir = (
    os.path.join(base_data_dir, subdir)
    for subdir in ("processed", "results", "models")
)

# Create each directory up front; already-existing ones are left alone.
for _required_dir in (processed_dir, result_dir, model_dir):
    os.makedirs(_required_dir, exist_ok=True)
del _required_dir

In [None]:

# Load Data: resolve the processed CSV for this configuration
# (load_data builds it first when it is not on disk yet).
data_config = dict(
    processed_dir=processed_dir,
    downsample=True,
    num_features=28,
    scale_features=True,
    pca_components=3,
)
filename, processed_file_path, metadata_file_path = load_data(**data_config)

# The processed CSV is written without a header row.
df = pd.read_csv(processed_file_path, header=None)
df.head()

In [None]:
# Split the frame into a feature matrix (all but the last column)
# and an integer label vector (the last column).
feature_frame = df.iloc[:, :-1]
label_frame = df.iloc[:, -1:]
X = feature_frame.to_numpy()
Y = label_frame.to_numpy().astype("int").flatten()

# 80/20 train/test split with a fixed seed
split_parts = train_test_split(X, Y, train_size=0.8, random_state=42)  # noqa: E501
train_features, test_features, train_labels, test_labels = split_parts

In [None]:
# Fix the seed on qiskit's algorithm_globals.
algorithm_globals.random_seed = 42

# Both circuits are sized by the number of feature columns.
num_qubits = train_features.shape[1]

feature_map = ZZFeatureMap(feature_dimension=num_qubits, reps=3)
ansatz = RealAmplitudes(num_qubits=num_qubits, reps=3)


In [None]:
# Construct the quantum circuit: the feature map followed by the ansatz,
# both applied across every qubit.
circuit_width = train_features.shape[1]
all_qubits = range(circuit_width)

qc = QuantumCircuit(circuit_width)
qc.append(feature_map, all_qubits)
qc.append(ansatz, all_qubits)

In [None]:
# Runtime options handed to the Sampler below.
options = Options()
options.execution.shots = 100  # shots per circuit execution
options.optimization_level = 3

In [None]:
# Train the sampler-based QNN classifier inside a runtime session and predict
# labels for the held-out test features.
#
# NOTE(review): `backend` is an AerSimulator instance handed to a
# qiskit_ibm_runtime Session - confirm the installed runtime version accepts a
# local simulator object here rather than a backend name.
backend = AerSimulator()
service = QiskitRuntimeService()

# Defined before the session so the name exists even if the block below fails.
y_pred = None

with Session(service, backend=backend) as session:
    sampler = Sampler(session=session, options=options)

    # corresponds to the number of classes,
    # possible outcomes of the (parity) mapping
    output_shape = 2

    # construct QNN
    sampler_qnn = SamplerQNN(
        circuit=qc,
        input_params=feature_map.parameters,
        weight_params=ansatz.parameters,
        interpret=parity,
        output_shape=output_shape,
        sampler=sampler
    )

    # The optimizer is fixed to COBYLA. The previous, unused lookup of
    # wandb.config["quantum_params"]["optimizer"] was removed: no wandb run is
    # initialised in this notebook, so the lookup failed on a fresh kernel.
    optimizer = COBYLA(
        maxiter=100
    )

    # construct classifier
    sampler_classifier = NeuralNetworkClassifier(
        neural_network=sampler_qnn,
        optimizer=optimizer
    )

    sampler_classifier.fit(train_features, train_labels)

    y_pred = sampler_classifier.predict(test_features)

In [None]:
# Score the test-set predictions against the true test labels.
metrics = get_performance_metrics(test_labels=test_labels, preds=y_pred)

In [None]:
# Show the metrics dictionary as this cell's output.
metrics