In [26]:
from typing import List
import random
import numpy as np
from hdcam import Controller as HDCAM
# Base directory that the dataset file paths are built from (note the trailing slash,
# which the string concatenations that use it rely on).
# NOTE(review): absolute, machine-specific path — the notebook cannot locate its data elsewhere.
drive_path = "C:/Users/bromotec/leo_2/leo_2_bringup/smart_uart_win_env/pyshell/"

In [27]:
# A segment is a single sequence held as a plain string.
Segment = str
# A genome is a list of such segments.
Genome = List[Segment]

In [28]:
# Path of the .fna file for each virus, all inside the "dataset/" folder under drive_path
influenza = drive_path + "dataset/influenza.fna"
lassa = drive_path + "dataset/lassa.fna"
measles = drive_path + "dataset/measles.fna"
rotavirus = drive_path + "dataset/rotavirus.fna"
sars_cov_2 = drive_path + "dataset/sars_cov_2.fna"

# Every dataset path collected in one list so they can be processed in a loop
all_viruses = [influenza, lassa, measles, rotavirus, sars_cov_2]

In [29]:
def buildGenome(filename):
    """Read a FASTA-style text file and return its sequences, one string per record.

    A line starting with ">" opens a new record (the header text itself is
    discarded). Every other line is appended, without its line terminator, to
    the record currently being built.

    Args:
        filename: Path of the text file to read.

    Returns:
        A list of strings, one per ">" header, in file order.

    Raises:
        IndexError: If a non-header line appears before the first ">" header.
        OSError: If the file cannot be opened.
    """
    genome = []
    # The context manager guarantees the handle is closed, even if parsing fails.
    with open(filename, 'r') as genome_file:
        for line in genome_file:
            if line.startswith(">"):
                genome.append("")
            else:
                # Strip only the line terminator, so a final line that has no
                # trailing newline keeps its last character.
                genome[-1] += line.rstrip("\r\n")
    return genome

In [30]:
def getSample(genome, fragment_len=150, sample_prob=0.05):
    """Randomly pick fixed-length fragments (sliding windows) from a sequence.

    Every start position that leaves room for a full fragment is kept
    independently with probability ``sample_prob``.

    Args:
        genome: The sequence to sample from (anything supporting ``len`` and
            slicing; the notebook passes one segment string at a time).
        fragment_len: Length of each fragment. Defaults to 150.
        sample_prob: Probability of keeping each candidate start position.
            Defaults to 0.05.

    Returns:
        A list of the selected fragments, ordered by start position. The list
        is empty when the sequence is shorter than ``fragment_len``.
    """
    num_of_frags_in_seg = len(genome) - fragment_len + 1
    # A sequence shorter than one fragment has no candidate windows; without
    # this guard np.random.uniform would be asked for a negative size and raise.
    if num_of_frags_in_seg <= 0:
        return []
    # One uniform draw in [0, 1) per candidate start position.
    should_sample = np.random.uniform(size=num_of_frags_in_seg) < sample_prob
    return [
        genome[i:i + fragment_len]
        for i in range(num_of_frags_in_seg)
        if should_sample[i]
    ]

In [31]:
# Dictionary mapping each virus name to the list of fragments sampled from its genome
virus_datasets = {}

for virus_path in all_viruses:
    virus_name = virus_path.split("/")[-1].split(".")[0]  # Last path component, up to its first dot
    virus_genome = buildGenome(virus_path)

    # Sample every segment separately and pool the fragments of the whole genome
    samples_full = []
    for g in virus_genome:
        sample_full = getSample(g)
        samples_full += sample_full

    virus_datasets[virus_name] = samples_full

# Bind each virus's dataset to its own variable; the keys are the virus names
# derived from the file names in the loop above.
influenza_dataset = virus_datasets["influenza"]
lassa_dataset = virus_datasets["lassa"]
measles_dataset = virus_datasets["measles"]
rotavirus_dataset = virus_datasets["rotavirus"]
sars_cov_2_dataset = virus_datasets["sars_cov_2"]

# Print the number of sampled fragments per virus for verification
print("Influenza Dataset Length:", len(influenza_dataset))
print("Lassa Dataset Length:", len(lassa_dataset))
print("Measles Dataset Length:", len(measles_dataset))
print("Rotavirus Dataset Length:", len(rotavirus_dataset))
print("SARS-CoV-2 Dataset Length:", len(sars_cov_2_dataset))


Influenza Dataset Length: 650
Lassa Dataset Length: 512
Measles Dataset Length: 1482
Rotavirus Dataset Length: 822
SARS-CoV-2 Dataset Length: 2991


In [32]:
def kmer_fun(seq, K):
    """Return every length-K window of ``seq``, ordered by start position.

    If ``seq`` is None, a notice is printed and None is returned.
    """
    if seq is None:
        print('Variable is None')
        return None
    window_count = len(seq) - K + 1
    return [seq[start:start + K] for start in range(window_count)]

# Define k-mer length
kmer_length = 21

# Divide each dataset into k-mers: each result holds one list of k-mers per sampled fragment
kmer_influenza = [kmer_fun(seq, kmer_length) for seq in influenza_dataset]
kmer_lassa = [kmer_fun(seq, kmer_length) for seq in lassa_dataset]
kmer_measles = [kmer_fun(seq, kmer_length) for seq in measles_dataset]
kmer_rotavirus = [kmer_fun(seq, kmer_length) for seq in rotavirus_dataset]
kmer_sars_cov_2 = [kmer_fun(seq, kmer_length) for seq in sars_cov_2_dataset]

# Print sizes for verification
# NOTE(review): the Influenza line prints the number of k-mers in the FIRST fragment,
# while the other four lines print the number of fragments — confirm which is intended.
print("Length of k-mers for Influenza:", len(kmer_influenza[0]))
print("Length of k-mers for Lassa:", len(kmer_lassa))
print("Length of k-mers for Measles:", len(kmer_measles))
print("Length of k-mers for Rotavirus:", len(kmer_rotavirus))
print("Length of k-mers for SARS-CoV-2:", len(kmer_sars_cov_2))


Length of k-mers for Influenza: 130
Length of k-mers for Lassa: 512
Length of k-mers for Measles: 1482
Length of k-mers for Rotavirus: 822
Length of k-mers for SARS-CoV-2: 2991


In [33]:
from itertools import chain

# Flatten the per-fragment k-mer lists into one flat list of k-mers per virus
flat_kmer_influenza = list(chain.from_iterable(kmer_influenza))
flat_kmer_lassa = list(chain.from_iterable(kmer_lassa))
flat_kmer_measles = list(chain.from_iterable(kmer_measles))
flat_kmer_rotavirus = list(chain.from_iterable(kmer_rotavirus))
flat_kmer_sars_cov_2 = list(chain.from_iterable(kmer_sars_cov_2))

# Print the total number of k-mers (duplicates included) per virus for verification
print("Length of flattened k-mers for Influenza:", len(flat_kmer_influenza))
print("Length of flattened k-mers for Lassa:", len(flat_kmer_lassa))
print("Length of flattened k-mers for Measles:", len(flat_kmer_measles))
print("Length of flattened k-mers for Rotavirus:", len(flat_kmer_rotavirus))
print("Length of flattened k-mers for SARS-CoV-2:", len(flat_kmer_sars_cov_2))


Length of flattened k-mers for Influenza: 84500
Length of flattened k-mers for Lassa: 66560
Length of flattened k-mers for Measles: 192660
Length of flattened k-mers for Rotavirus: 106860
Length of flattened k-mers for SARS-CoV-2: 388830


In [34]:
# Convert flattened k-mers to sets to get unique values
# (sets are unordered, so anything iterating over them sees an arbitrary order)
unique_kmer_influenza = set(flat_kmer_influenza)
unique_kmer_lassa = set(flat_kmer_lassa)
unique_kmer_measles = set(flat_kmer_measles)
unique_kmer_rotavirus = set(flat_kmer_rotavirus)
unique_kmer_sars_cov_2 = set(flat_kmer_sars_cov_2)

# Print the number of distinct k-mers per virus for verification
print("Length of unique k-mers for Influenza:", len(unique_kmer_influenza))
print("Length of unique k-mers for Lassa:", len(unique_kmer_lassa))
print("Length of unique k-mers for Measles:", len(unique_kmer_measles))
print("Length of unique k-mers for Rotavirus:", len(unique_kmer_rotavirus))
print("Length of unique k-mers for SARS-CoV-2:", len(unique_kmer_sars_cov_2))


Length of unique k-mers for Influenza: 13256
Length of unique k-mers for Lassa: 10606
Length of unique k-mers for Measles: 15862
Length of unique k-mers for Rotavirus: 17713
Length of unique k-mers for SARS-CoV-2: 58028


In [35]:
def encode_sequence(sequence, encoding_dict):
    """Concatenate the code of every symbol in ``sequence``, looked up in ``encoding_dict``.

    A symbol missing from ``encoding_dict`` raises KeyError.
    """
    codes = [encoding_dict[symbol] for symbol in sequence]
    return ''.join(codes)

# Encode each dataset
# Each nucleotide maps to a 3-character bit string; only A, G, C and T have an entry,
# so any other symbol in a k-mer makes encode_sequence raise KeyError.
encoding_dict = {'A': '000', 'G': '011', 'C': '101', 'T': '110'}
# The inputs are sets, so the order of each encoded list is arbitrary.
encoded_influenza = [encode_sequence(seq, encoding_dict) for seq in unique_kmer_influenza]
encoded_lassa = [encode_sequence(seq, encoding_dict) for seq in unique_kmer_lassa]
encoded_measles = [encode_sequence(seq, encoding_dict) for seq in unique_kmer_measles]
encoded_rotavirus = [encode_sequence(seq, encoding_dict) for seq in unique_kmer_rotavirus]
encoded_sars_cov_2 = [encode_sequence(seq, encoding_dict) for seq in unique_kmer_sars_cov_2]

# Print the first 10 encoded sequences of each dataset for verification
print("Encoded Influenza Sample:", encoded_influenza[:10])
print("Encoded Lassa Sample:", encoded_lassa[:10])
print("Encoded Measles Sample:", encoded_measles[:10])
print("Encoded Rotavirus Sample:", encoded_rotavirus[:10])
print("Encoded SARS-CoV-2 Sample:", encoded_sars_cov_2[:10])


Encoded Influenza Sample: ['011101000000101110110011110110101011000011000000000110110110110', '101000110011110000110110101000011000110110110110101000110110110', '011101110000110110110000101101000110000000011000101000000011000', '000110110011000101110110101011000011110101110011011000101011011', '101101000000000110101000000000000011000110000000110000000101011', '000110101000000000000110110101110011000011011011000000101000011', '000011101000011000110101110101000000000000011101000101110101000', '110011011110110101000110011101110000000110011101101101000000011', '011101000011110000110000110000101000110110011000000011110101110', '011011000011011110101000011110011000000000101000101000011011011']
Encoded Lassa Sample: ['110011011011110110101000110011110011110101101110000101110011110', '000110101000011000011000101110011000000110011011000101110110000', '101000011110110011101101000101011011101101110011000011110101000', '101000000110110000000000110000000011000110000011101101110110000', '10100001100

In [37]:
def binary_to_decimal(encoded_list):
    """Parse every bit string in ``encoded_list`` as a base-2 integer.

    Returns a new list of ints in the same order as the input.
    """
    return [int(bits, 2) for bits in encoded_list]

# Apply the conversion to each encoded dataset
decimal_influenza = binary_to_decimal(encoded_influenza)
decimal_lassa = binary_to_decimal(encoded_lassa)
decimal_measles = binary_to_decimal(encoded_measles)
decimal_rotavirus = binary_to_decimal(encoded_rotavirus)
# Only the first 20000 SARS-CoV-2 values are kept; the other datasets are used whole.
decimal_sars_cov_2 = binary_to_decimal(encoded_sars_cov_2)[:20000]

# Print the first 10 elements for verification
print("Decimal Influenza Sample:", decimal_influenza[:10])
print("Decimal Lassa Sample:", decimal_lassa[:10])
print("Decimal Measles Sample:", decimal_measles[:10])
print("Decimal Rotavirus Sample:", decimal_rotavirus[:10])
print("Decimal SARS-CoV-2 Sample:", decimal_sars_cov_2[:10])


Decimal Influenza Sample: [4180987442496540086, 5881168197140042166, 4289353576981745688, 979738897510150491, 6485416560293863467, 954766881656242499, 523291574030732200, 7419296912098908675, 4187811509579810734, 3898943259804979419]
Decimal Lassa Sample: [7412595578207931294, 955621138443897776, 5833980706102544296, 5779807584508828592, 5818830200392768536, 553500249117884440, 1644901319501702, 4338571771869671798, 554985464546475328, 4325089368212144494]
Decimal Measles Sample: [7837124708215665077, 15122589346916077, 486629897171647320, 7039143177786765317, 6978435967506738589, 13125889954604784, 6745948097955254683, 7439959836551711558, 3514618814789538030, 7638951434650570989]
Decimal Rotavirus Sample: [841846464557714822, 7038566042680647899, 6644511890399255070, 7405807762771765958, 115888552155507251, 3460478161531822400, 7405434484914064437, 6973465035722391603, 103688720772446720, 6971805737831387190]
Decimal SARS-CoV-2 Sample: [549008548014853440, 68413312253048734, 55478992

In [38]:
# Report how many items a dataset holds and how many of them are distinct
def print_dataset_info(name, dataset):
    """Print the total and the distinct element counts of ``dataset``, labelled with ``name``."""
    item_count = len(dataset)
    distinct_count = len(set(dataset))
    print(f"{name} - Total Elements: {item_count}, Unique Elements: {distinct_count}")

# Print total vs. distinct counts for each decimal dataset
print_dataset_info("Influenza", decimal_influenza)
print_dataset_info("Lassa", decimal_lassa)
print_dataset_info("Measles", decimal_measles)
print_dataset_info("Rotavirus", decimal_rotavirus)
print_dataset_info("SARS-CoV-2", decimal_sars_cov_2)


Influenza - Total Elements: 13256, Unique Elements: 13256
Lassa - Total Elements: 10606, Unique Elements: 10606
Measles - Total Elements: 15862, Unique Elements: 15862
Rotavirus - Total Elements: 17713, Unique Elements: 17713
SARS-CoV-2 - Total Elements: 20000, Unique Elements: 20000


In [39]:
import random

# Define the size of the mini-dataset (number of values sampled per virus)
# NOTE(review): 1920 is also the batch size hard-coded in calculate_confusion_matrix —
# presumably a hardware limit; confirm and keep the two in sync.
mini_dataset_size = 1920

# Draw a random subset of a dataset
def create_mini_dataset(dataset, size):
    """Return ``size`` elements chosen from ``dataset`` by ``random.sample``."""
    chosen = random.sample(dataset, size)
    return chosen

# Create mini-datasets for each decimal dataset (each is a random subset of its full dataset)
mini_decimal_influenza = create_mini_dataset(decimal_influenza, mini_dataset_size)
mini_decimal_lassa = create_mini_dataset(decimal_lassa, mini_dataset_size)
mini_decimal_measles = create_mini_dataset(decimal_measles, mini_dataset_size)
mini_decimal_rotavirus = create_mini_dataset(decimal_rotavirus, mini_dataset_size)
mini_decimal_sars_cov_2 = create_mini_dataset(decimal_sars_cov_2, mini_dataset_size)

# Print the lengths of each mini-dataset
print("Length of Mini Decimal Influenza:", len(mini_decimal_influenza))
print("Length of Mini Decimal Lassa:", len(mini_decimal_lassa))
print("Length of Mini Decimal Measles:", len(mini_decimal_measles))
print("Length of Mini Decimal Rotavirus:", len(mini_decimal_rotavirus))
print("Length of Mini Decimal SARS-CoV-2:", len(mini_decimal_sars_cov_2))


Length of Mini Decimal Influenza: 1920
Length of Mini Decimal Lassa: 1920
Length of Mini Decimal Measles: 1920
Length of Mini Decimal Rotavirus: 1920
Length of Mini Decimal SARS-CoV-2: 1920


In [40]:
# Calculate what share of its full decimal dataset each mini-dataset represents, in percent
percentage_influenza = (len(mini_decimal_influenza) / len(decimal_influenza)) * 100
percentage_lassa = (len(mini_decimal_lassa) / len(decimal_lassa)) * 100
percentage_measles = (len(mini_decimal_measles) / len(decimal_measles)) * 100
percentage_rotavirus = (len(mini_decimal_rotavirus) / len(decimal_rotavirus)) * 100
percentage_sars_cov_2 = (len(mini_decimal_sars_cov_2) / len(decimal_sars_cov_2)) * 100

# Print the percentages for each mini-dataset
print("Percentage of Mini Decimal Influenza compared to Original Dataset:", percentage_influenza)
print("Percentage of Mini Decimal Lassa compared to Original Dataset:", percentage_lassa)
print("Percentage of Mini Decimal Measles compared to Original Dataset:", percentage_measles)
print("Percentage of Mini Decimal Rotavirus compared to Original Dataset:", percentage_rotavirus)
print("Percentage of Mini Decimal SARS-CoV-2 compared to Original Dataset:", percentage_sars_cov_2)


Percentage of Mini Decimal Influenza compared to Original Dataset: 14.48400724200362
Percentage of Mini Decimal Lassa compared to Original Dataset: 18.10296058834622
Percentage of Mini Decimal Measles compared to Original Dataset: 12.104400453915016
Percentage of Mini Decimal Rotavirus compared to Original Dataset: 10.839496415062383
Percentage of Mini Decimal SARS-CoV-2 compared to Original Dataset: 9.6


In [41]:
# def create_dataset_excluding_mini_dataset(full_dataset, mini_dataset):
#     return [element for element in full_dataset if element not in mini_dataset]

# # Create new datasets excluding elements from the mini-datasets
# decimal_influenza = create_dataset_excluding_mini_dataset(decimal_influenza, mini_decimal_influenza)
# decimal_lassa = create_dataset_excluding_mini_dataset(decimal_lassa, mini_decimal_lassa)
# decimal_measles = create_dataset_excluding_mini_dataset(decimal_measles, mini_decimal_measles)
# decimal_rotavirus = create_dataset_excluding_mini_dataset(decimal_rotavirus, mini_decimal_rotavirus)
# decimal_sars_cov_2 = create_dataset_excluding_mini_dataset(decimal_sars_cov_2, mini_decimal_sars_cov_2)

# # Print the lengths of each new dataset
# print("Length of New Dataset Excluding Mini Decimal Influenza:", len(decimal_influenza))
# print("Length of New Dataset Excluding Mini Decimal Lassa:", len(decimal_lassa))
# print("Length of New Dataset Excluding Mini Decimal Measles:", len(decimal_measles))
# print("Length of New Dataset Excluding Mini Decimal Rotavirus:", len(decimal_rotavirus))
# print("Length of New Dataset Excluding Mini Decimal SARS-CoV-2:", len(decimal_sars_cov_2))


In [42]:
def create_dataset_excluding_virus(virus_dataset, all_datasets_except_virus):
    """Build a shuffled dataset of values drawn equally from the other viruses.

    The result is sized to match ``virus_dataset``: each dataset in
    ``all_datasets_except_virus`` contributes
    ``len(virus_dataset) // len(all_datasets_except_virus)`` randomly chosen
    elements, so the total is ``len(virus_dataset)`` rounded down to a multiple
    of the number of other datasets.

    Args:
        virus_dataset: Dataset of the excluded virus; only its length is used.
        all_datasets_except_virus: List of the datasets to draw from.

    Returns:
        A new, shuffled list of the drawn elements (empty if no other datasets
        are given).

    Raises:
        ValueError: If one of the other datasets holds fewer elements than its
            share (raised by ``random.sample``).
    """
    # Nothing to draw from; also avoids dividing by zero below.
    if not all_datasets_except_virus:
        return []

    # Equal share per other virus, derived from how many datasets were passed
    # rather than assuming there are always four.
    size_per_other_virus = len(virus_dataset) // len(all_datasets_except_virus)

    # Randomly select elements from each other virus
    new_dataset = []
    for other_virus_dataset in all_datasets_except_virus:
        new_dataset += random.sample(other_virus_dataset, size_per_other_virus)

    # Shuffle the new dataset to mix elements from different viruses
    random.shuffle(new_dataset)

    return new_dataset

# Create datasets excluding each virus: each one is sized from that virus's own
# decimal dataset and drawn from the decimal datasets of the other four viruses.
datasets_except_influenza = create_dataset_excluding_virus(decimal_influenza, [decimal_lassa, decimal_measles, decimal_rotavirus, decimal_sars_cov_2])
datasets_except_lassa = create_dataset_excluding_virus(decimal_lassa, [decimal_influenza, decimal_measles, decimal_rotavirus, decimal_sars_cov_2])
datasets_except_measles = create_dataset_excluding_virus(decimal_measles, [decimal_influenza, decimal_lassa, decimal_rotavirus, decimal_sars_cov_2])
datasets_except_rotavirus = create_dataset_excluding_virus(decimal_rotavirus, [decimal_influenza, decimal_lassa, decimal_measles, decimal_sars_cov_2])
datasets_except_sars_cov_2 = create_dataset_excluding_virus(decimal_sars_cov_2, [decimal_influenza, decimal_lassa, decimal_measles, decimal_rotavirus])

# Print the lengths of each dataset
print("Length of Datasets Except Influenza:", len(datasets_except_influenza))
print("Length of Datasets Except Lassa:", len(datasets_except_lassa))
print("Length of Datasets Except Measles:", len(datasets_except_measles))
print("Length of Datasets Except Rotavirus:", len(datasets_except_rotavirus))
print("Length of Datasets Except SARS-CoV-2:", len(datasets_except_sars_cov_2))


Length of Datasets Except Influenza: 13256
Length of Datasets Except Lassa: 10604
Length of Datasets Except Measles: 15860
Length of Datasets Except Rotavirus: 17712
Length of Datasets Except SARS-CoV-2: 20000


In [93]:
# Create the HDCAM controller object used by calculate_confusion_matrix.
# Per the recorded output, construction lists the active serial ports and reports the one it found.
# NOTE(review): this cell's execution count is out of sequence with its neighbours — re-check on a fresh kernel.
hdcam = HDCAM()


List of active serial ports:

port:COM9 ; desc:USB Serial Port (COM9) ; hwid:USB VID:PID=0403:6011 SER=6
USB Serial Port (COM9) OK
port:COM10 ; desc:USB Serial Port (COM10) ; hwid:USB VID:PID=0403:6011 SER=6
USB Serial Port (COM10) OK
port:COM11 ; desc:USB Serial Port (COM11) ; hwid:USB VID:PID=0403:6011 SER=6
USB Serial Port (COM11) OK
port:COM12 ; desc:USB Serial Port (COM12) ; hwid:USB VID:PID=0403:6011 SER=6
USB Serial Port (COM12) OK

Found USB Serial Port at COM11



In [49]:
# Read a dataset from the HDCAM in fixed-size batches and total the reported hits
def _count_hits(dataset, chunk_size):
    """Sum ``hdcam.read`` over consecutive ``chunk_size``-long slices of ``dataset``."""
    return sum(
        hdcam.read(dataset[start:start + chunk_size])
        for start in range(0, len(dataset), chunk_size)
    )


# Function to calculate TP, FN, TN, FP for a given virus
def calculate_confusion_matrix(virus_name, mini_virus_dataset, virus_dataset, all_datasets_except_virus, chunk_size=1920):
    """Write a virus's mini-dataset to the HDCAM and count hits for positives and negatives.

    Args:
        virus_name: Label used in the printed summary.
        mini_virus_dataset: Values written to the HDCAM before any read.
        virus_dataset: Values of the target virus; hits count as TP, misses as FN.
        all_datasets_except_virus: Values from other viruses; hits count as FP,
            misses as TN.
        chunk_size: Number of values passed to each ``hdcam.read`` call. The
            default of 1920 is the batch size this function always used
            (presumably a device limit — confirm against the HDCAM driver).

    Returns:
        The tuple ``(tp, fn, fp, tn)``. Note that the printed summary lists the
        values in a different order (TP, FN, TN, FP).

    Raises:
        ValueError: If ``chunk_size`` is 0 (``range`` rejects a zero step).
    """
    # Write the virus dataset to hdcam
    hdcam.write(mini_virus_dataset)

    # Calculate TP and FN for the virus
    tp = _count_hits(virus_dataset, chunk_size)
    fn = len(virus_dataset) - tp

    # Calculate TN and FP for the virus by reading other datasets combined
    fp = _count_hits(all_datasets_except_virus, chunk_size)
    tn = len(all_datasets_except_virus) - fp

    print(f"For {virus_name}: TP is {tp}, FN is {fn}, TN is {tn}, FP is {fp}")
    return tp, fn, fp, tn


In [90]:
def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or NaN when the denominator is zero."""
    return numerator / denominator if denominator else float("nan")


def calculate_metrics(virus_name, tp, fn, fp, tn):
    """Print accuracy, sensitivity, specificity, precision and F1 for one confusion matrix.

    A metric whose denominator is zero is undefined and is printed as ``nan%``
    instead of raising ZeroDivisionError. F1 is reported as 0 when precision
    and sensitivity are both defined and both zero.

    Args:
        virus_name: Label printed in the header line.
        tp: Number of true positives.
        fn: Number of false negatives.
        fp: Number of false positives.
        tn: Number of true negatives.

    Returns:
        None; the metrics are only printed.
    """
    accuracy = _safe_ratio(tp + tn, tp + tn + fn + fp)
    sensitivity = _safe_ratio(tp, tp + fn)
    specificity = _safe_ratio(tn, tn + fp)
    precision = _safe_ratio(tp, tp + fp)

    # NaN is truthy, so an undefined precision or sensitivity propagates to F1
    # as NaN; only a genuine 0 + 0 takes the 0.0 branch.
    precision_plus_sensitivity = precision + sensitivity
    if precision_plus_sensitivity:
        f1 = 2 * (precision * sensitivity) / precision_plus_sensitivity
    else:
        f1 = 0.0

    print(f'Metrics for {virus_name}:')
    print('------------------------')
    print('Accuracy: {:.2%}'.format(accuracy))
    print('Sensitivity: {:.2%}'.format(sensitivity))
    print('Specificity: {:.2%}'.format(specificity))
    print('Precision: {:.2%}'.format(precision))
    print('F1 Score: {:.2%}'.format(f1))
    print()


In [135]:
# Evaluate each virus in turn. Every call first writes that virus's mini-dataset to the
# HDCAM, then reads the virus's full dataset and its "all other viruses" dataset.
# calculate_confusion_matrix returns (tp, fn, fp, tn), which matches the unpacking order used here.
TP_Influenza, FN_Influenza, FP_Influenza, TN_Influenza = calculate_confusion_matrix("Influenza", mini_decimal_influenza, decimal_influenza, datasets_except_influenza)
TP_Lassa, FN_Lassa, FP_Lassa, TN_Lassa = calculate_confusion_matrix("Lassa", mini_decimal_lassa, decimal_lassa, datasets_except_lassa)
TP_Measles, FN_Measles, FP_Measles, TN_Measles = calculate_confusion_matrix("Measles", mini_decimal_measles, decimal_measles, datasets_except_measles)
TP_Rotavirus, FN_Rotavirus, FP_Rotavirus, TN_Rotavirus = calculate_confusion_matrix("Rotavirus", mini_decimal_rotavirus, decimal_rotavirus, datasets_except_rotavirus)
TP_SARS_COV_2, FN_SARS_COV_2, FP_SARS_COV_2, TN_SARS_COV_2 = calculate_confusion_matrix("SARS-CoV-2", mini_decimal_sars_cov_2, decimal_sars_cov_2, datasets_except_sars_cov_2)

For Influenza: TP is 5857, FN is 7399, TN is 10064, FP is 3192
For Lassa: TP is 4005, FN is 6601, TN is 7412, FP is 3192
For Measles: TP is 4742, FN is 11120, TN is 11095, FP is 4765
For Rotavirus: TP is 13794, FN is 3919, TN is 10218, FP is 7494
For SARS-CoV-2: TP is 7095, FN is 12905, TN is 13802, FP is 6198


In [136]:
# Pool the five per-virus confusion matrices by summing each count
overall_TP = TP_Influenza + TP_Lassa + TP_Measles + TP_Rotavirus + TP_SARS_COV_2
overall_FN = FN_Influenza + FN_Lassa + FN_Measles + FN_Rotavirus + FN_SARS_COV_2
overall_FP = FP_Influenza + FP_Lassa + FP_Measles + FP_Rotavirus + FP_SARS_COV_2
overall_TN = TN_Influenza + TN_Lassa + TN_Measles + TN_Rotavirus + TN_SARS_COV_2

# Report the pooled metrics first, then each virus on its own.
# Arguments follow calculate_metrics' (name, tp, fn, fp, tn) order.
calculate_metrics("Overall", overall_TP, overall_FN, overall_FP, overall_TN)
calculate_metrics("Influenza", TP_Influenza, FN_Influenza, FP_Influenza, TN_Influenza)
calculate_metrics("Lassa",TP_Lassa, FN_Lassa, FP_Lassa, TN_Lassa)
calculate_metrics("Measles",TP_Measles, FN_Measles, FP_Measles, TN_Measles)
calculate_metrics("Rotavirus",TP_Rotavirus, FN_Rotavirus, FP_Rotavirus, TN_Rotavirus)
# Label spelled the same way as in the confusion-matrix report for this virus
calculate_metrics("SARS-CoV-2",TP_SARS_COV_2, FN_SARS_COV_2, FP_SARS_COV_2, TN_SARS_COV_2)

Metrics for Overall:
------------------------
Accuracy: 56.88%
Sensitivity: 45.83%
Specificity: 67.92%
Precision: 58.83%
F1 Score: 51.52%

Metrics for Influenza:
------------------------
Accuracy: 60.05%
Sensitivity: 44.18%
Specificity: 75.92%
Precision: 64.73%
F1 Score: 52.52%

Metrics for Lassa:
------------------------
Accuracy: 53.83%
Sensitivity: 37.76%
Specificity: 69.90%
Precision: 55.65%
F1 Score: 44.99%

Metrics for Measles:
------------------------
Accuracy: 49.92%
Sensitivity: 29.90%
Specificity: 69.96%
Precision: 49.88%
F1 Score: 37.38%

Metrics for Rotavirus:
------------------------
Accuracy: 67.78%
Sensitivity: 77.88%
Specificity: 57.69%
Precision: 64.80%
F1 Score: 70.74%

Metrics for Sarc_cov_2:
------------------------
Accuracy: 52.24%
Sensitivity: 35.48%
Specificity: 69.01%
Precision: 53.37%
F1 Score: 42.62%

