# Quantum Neural Networks (QNNs) for Genomic Pattern Detection
**Project By [Adnan Sami Bhuiyan](https://muhammedadnansami.com), and [Hasan Khan](https://osu.github.io/portfolio)**

---
This project introduces Quantum Neural Networks (QNNs) to analyze genomic data for personalized medicine. With the rise of genetic sequencing, QNNs can detect complex patterns in genetic variants to predict disease risks, drug responses, and optimal treatment paths. By leveraging quantum computation, the project tackles the high-dimensional complexity of genomic pattern recognition, which classical neural networks struggle to handle efficiently.


In [9]:
!pip install pennylane scikit-learn openvino-dev pandas numpy onnx skl2onnx joblib onnx onnxruntime fastatocsv torch imbalanced-learn



In [3]:
import pandas as pd
import numpy as np

# Set random seed for reproducibility
np.random.seed(42)

# Create random SNP data
num_snps = 100  # Number of SNPs
num_samples = 10  # Number of samples

# Randomly generate SNP IDs (e.g., rsIDs)
snp_ids = [f"rs{1000000 + i}" for i in range(num_snps)]

# Randomly assign chromosomes (1-22, X, Y)
chromosomes = np.random.choice([f"chr{i}" for i in range(1, 23)] + ["chrX", "chrY"], size=num_snps)

# Randomly assign positions (1-100,000,000 for demonstration)
positions = np.random.randint(1, 100000001, size=num_snps)

# Reference and alternative alleles (A, C, G, T)
alleles = ["A", "C", "G", "T"]
reference_alleles = np.random.choice(alleles, size=num_snps)
alternative_alleles = np.random.choice(alleles, size=num_snps)

# Generate random genotype data for samples (0, 1, 2 representing alleles)
genotypes = np.random.randint(0, 3, size=(num_snps, num_samples))

# Create sample IDs
sample_ids = [f"Sample_{i+1}" for i in range(num_samples)]

# Create a DataFrame with SNP data
df_snp = pd.DataFrame({
    "SNP_ID": snp_ids,
    "Chromosome": chromosomes,
    "Position": positions,
    "Reference_Allele": reference_alleles,
    "Alternative_Allele": alternative_alleles
})

# Add genotype data for each sample
for i, sample_id in enumerate(sample_ids):
    df_snp[sample_id] = genotypes[:, i]

# Save to CSV
csv_path = "a.csv"
df_snp.to_csv(csv_path, index=False)

print(f"Generated SNP data saved to {csv_path}")

Generated SNP data saved to a.csv


In [10]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Step 1: Load the dataset
file_path = 'a.csv'  # Replace with your actual file path
snp_data = pd.read_csv(file_path)

# Print the original dataset's structure
print("Original Dataset Columns:")
print(snp_data.columns)

# Step 2: Select numeric SNP columns (assuming SNP data starts after metadata columns)
# Replace 'start_column' with the actual index where SNP numeric data starts
# Example: If numeric data starts at column 6, adjust the index accordingly

# Let's try to infer the numeric columns and select them automatically
snp_numeric_data = snp_data.select_dtypes(include=[np.number])

# Debugging: Print if any numeric columns were found
print("\nSelected numeric SNP data (if any):")
print(snp_numeric_data.head())

# Step 3: Check for missing values in the numeric columns
print("\nMissing values in SNP data before cleaning:")
print(snp_numeric_data.isnull().sum())

# Step 4: Data Cleaning - Handle missing values (if any)
snp_numeric_data.fillna(snp_numeric_data.mean(), inplace=True)

# Step 5: Data Normalization - Standardize the SNP data (only if numeric data exists)
if not snp_numeric_data.empty:
    scaler = StandardScaler()
    snp_array_normalized = scaler.fit_transform(snp_numeric_data)

    # Print a sample of the normalized data to check scaling
    print("\nNormalized Data Sample:")
    print(snp_array_normalized[:5])  # First 5 rows of normalized data

    # Step 6: Dimensionality Reduction - Apply PCA
    # Changed n_components to be within the valid range (min(n_samples, n_features))
    n_components = min(snp_array_normalized.shape[0], snp_array_normalized.shape[1])
    pca = PCA(n_components=n_components)  # Reducing to n_components principal components
    snp_array_reduced = pca.fit_transform(snp_array_normalized)

    # Output the shape of the original and reduced datasets
    print(f"\nOriginal data shape: {snp_array_normalized.shape}")
    print(f"Reduced data shape (after PCA): {snp_array_reduced.shape}")

    # If you want to inspect the explained variance for each principal component:
    explained_variance = pca.explained_variance_ratio_
    print("\nExplained variance of each principal component:")
    print(explained_variance)
else:
    print("\nNo numeric SNP data found. Please verify the dataset.")

Original Dataset Columns:
Index(['SNP_ID', 'Chromosome', 'Position', 'Reference_Allele',
       'Alternative_Allele', 'Sample_1', 'Sample_2', 'Sample_3', 'Sample_4',
       'Sample_5', 'Sample_6', 'Sample_7', 'Sample_8', 'Sample_9',
       'Sample_10'],
      dtype='object')

Selected numeric SNP data (if any):
   Position  Sample_1  Sample_2  Sample_3  Sample_4  Sample_5  Sample_6  \
0  52562568         0         2         2         0         0         1   
1  23717336         2         0         0         0         0         1   
2  60472383         2         0         0         2         0         2   
3  12719243         1         1         2         1         1         2   
4  48715251         2         1         1         1         0         0   

   Sample_7  Sample_8  Sample_9  Sample_10  
0         0         1         0          1  
1         0         2         2          0  
2         1         0         1          2  
3         1         2         1          2  
4         0

In [16]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np
from openvino.runtime import Core

# Assuming `snp_array_reduced` is your preprocessed data and `y` are the labels (0 or 1 for disease risk)
X = snp_array_reduced
y = np.random.randint(0, 2, size=(X.shape[0],))  # Replace with actual labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Convert the data into PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32).unsqueeze(1)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32).unsqueeze(1)

In [17]:
# Define a simple feedforward neural network for disease prediction
class SNPNet(nn.Module):
    def __init__(self):
        super(SNPNet, self).__init__()
        self.fc1 = nn.Linear(X_train.shape[1], 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc2(x))
        x = torch.sigmoid(self.fc3(x))
        return x

# Initialize the network, loss function, and optimizer
snp_net_model = SNPNet() # Renamed to avoid name conflict
criterion = nn.BCELoss()  # Binary classification
optimizer = optim.Adam(snp_net_model.parameters(), lr=0.001)

# Train the model
n_epochs = 20
for epoch in range(n_epochs):
    snp_net_model.train()
    optimizer.zero_grad()
    outputs = snp_net_model(X_train_tensor)
    loss = criterion(outputs, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Print training progress
    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{n_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model on test data
snp_net_model.eval()  # Set model to evaluation mode
with torch.no_grad():
    predictions = snp_net_model(X_test_tensor)
    predicted_classes = (predictions >= 0.5).float()  # Binary classification
    accuracy = accuracy_score(y_test_tensor, predicted_classes)
    print(f"Test Accuracy: {accuracy * 100:.2f}%")



Epoch [5/20], Loss: 0.6747
Epoch [10/20], Loss: 0.6500
Epoch [15/20], Loss: 0.6255
Epoch [20/20], Loss: 0.5986
Test Accuracy: 60.00%


In [18]:
# Export the trained model to ONNX format
dummy_input = torch.randn(1, X_train.shape[1])  # Adjust based on input size
torch.onnx.export(snp_net_model, dummy_input, "snp_model.onnx", opset_version=11) # Use the correct model name

In [19]:
import openvino as ov
import os

# Load the ONNX model
core = ov.Core()
model = core.read_model("snp_model.onnx")

# Specify input and output data types
input_shape = ov.PartialShape([1, X_train.shape[1]])  # Input shape
input_type = ov.Type.f32            # Input data type (FP32)
output_type = ov.Type.f32           # Output data type (FP32)

# Convert the model to OpenVINO IR with FP32 data type
compiled_model = ov.compile_model(model, "CPU") # Compile for CPU

# Create the output directory if it doesn't exist
output_dir = "openvino_model"
os.makedirs(output_dir, exist_ok=True)

# Specify the output file paths
xml_path = os.path.join(output_dir, "snp_model.xml")
bin_path = os.path.join(output_dir, "snp_model.bin")

# Save the converted model
ov.save_model(model, xml_path)  # Save the model
print(f"Model converted and saved to {xml_path}")


Model converted and saved to openvino_model/snp_model.xml


In [22]:
import numpy as np
from openvino.runtime import Core
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE, RandomOverSampler


# Assuming SNP data is preprocessed and reduced via PCA
# Generate synthetic disease labels for demonstration
# 0 = 'No Disease', 1 = 'Heart Disease', 2 = 'Cancer Risk'
disease_labels = np.random.choice([0, 1, 2], size=snp_array_reduced.shape[0])

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(snp_array_reduced, disease_labels, test_size=0.3, random_state=42)

# Standardize the input data
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Check distribution of labels in training data
unique, counts = np.unique(y_train, return_counts=True)
print("Training label distribution:", dict(zip(unique, counts)))

# Dynamically decide between SMOTE or RandomOverSampler based on class sizes
if min(counts) < 2:
    print("Using RandomOverSampler due to small class sizes.")
    oversampler = RandomOverSampler(random_state=42)
    X_train_res, y_train_res = oversampler.fit_resample(X_train, y_train)
else:
    print("Using SMOTE for balancing.")
    k_neighbors_value = min(counts) - 1  # To avoid the error, set neighbors based on smallest class size
    sm = SMOTE(random_state=42, k_neighbors=k_neighbors_value)
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Load the OpenVINO model
ie = Core()
model_ir = ie.read_model(model="openvino_model/snp_model.xml")
compiled_model = ie.compile_model(model=model_ir, device_name="CPU")  # Use CPU, GPU, etc.

# Prepare the input and output layers
input_layer = compiled_model.input(0)
output_layer = compiled_model.output(0)

# RandomForest Classifier for better generalization
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
ensemble_model = VotingClassifier(estimators=[('rf', rf_model)], voting='soft')

# Train ensemble model with cross-validation
cv_scores = cross_val_score(ensemble_model, X_train_res, y_train_res, cv=5)
print(f"Cross-Validation Accuracy: {cv_scores.mean() * 100:.2f}%")

# Train the ensemble model on the full training set
ensemble_model.fit(X_train_res, y_train_res)

# Run inference on the test data using OpenVINO
input_data = X_test  # Ensure test data is in the correct format
predictions = []

# Iterate over each test data point
for i in range(input_data.shape[0]):
    single_input = input_data[i].reshape(1, -1)  # Reshape to (1, num_features) for model input
    result = compiled_model([single_input])
    predictions.append(result[output_layer])

# Stack predictions and convert to NumPy array
predictions = np.vstack(predictions)

# For multiclass, use argmax to get the predicted class index
predicted_classes = np.argmax(predictions, axis=1)

# Map predicted classes to disease names
disease_mapping = {0: 'No Disease', 1: 'Heart Disease', 2: 'Cancer Risk'}
predicted_diseases = [disease_mapping.get(int(pred), 'Unknown') for pred in predicted_classes]

# Evaluate accuracy
accuracy = accuracy_score(y_test, predicted_classes)
print(f"Test Accuracy with OpenVINO: {accuracy * 100:.2f}%")

# Confidence thresholding for better predictions
def get_confident_predictions(output_value, confidence_threshold=0.7):
    if np.max(output_value) > confidence_threshold:
        return np.argmax(output_value)  # Confident prediction
    else:
        return -1  # Uncertain prediction

# Adjust predictions based on confidence levels
confident_predictions = [get_confident_predictions(result) for result in predictions]

# Define the threshold for disease prediction
disease_threshold = 0.6

# Function to determine the disease risk based on raw output
def get_disease_risk(output_value):
    if output_value < 0.4:
        return "No Disease"
    elif 0.4 <= output_value < 0.6:
        return "Possible Heart Disease"
    elif 0.6 <= output_value < 0.7:
        return "Possible Cancer Risk"
    else:
        return "High Risk of Severe Disease"

# Display the predicted disease name for each patient
for i, result in enumerate(predictions):
    raw_output = result[0]  # Extract raw output
    disease_risk = get_disease_risk(raw_output)  # Get disease risk label based on threshold
    print(f"Patient {i+1}: Predicted Disease Risk - {disease_risk}")
    print(f"Patient {i+1} raw output: {raw_output:.6f}")


Training label distribution: {0: 19, 1: 33, 2: 18}
Using SMOTE for balancing.
Cross-Validation Accuracy: 93.58%
Test Accuracy with OpenVINO: 85.00%
Patient 1: Predicted Disease Risk - Possible Heart Disease
Patient 1 raw output: 0.561910
Patient 2: Predicted Disease Risk - Possible Heart Disease
Patient 2 raw output: 0.554574
Patient 3: Predicted Disease Risk - Possible Heart Disease
Patient 3 raw output: 0.584371
Patient 4: Predicted Disease Risk - Possible Heart Disease
Patient 4 raw output: 0.565677
Patient 5: Predicted Disease Risk - Possible Heart Disease
Patient 5 raw output: 0.424276
Patient 6: Predicted Disease Risk - Possible Heart Disease
Patient 6 raw output: 0.466892
Patient 7: Predicted Disease Risk - No Disease
Patient 7 raw output: 0.279908
Patient 8: Predicted Disease Risk - Possible Heart Disease
Patient 8 raw output: 0.592820
Patient 9: Predicted Disease Risk - Possible Heart Disease
Patient 9 raw output: 0.504283
Patient 10: Predicted Disease Risk - Possible Cancer R