In [1]:

from time import time
import logging

In [2]:
from seq_utils import read_fasta
# Load all human cDNA records from the FASTA file.
# Later cells read each record's 'sequence' entry.
cdna_fasta_path = './data/Homo_sapiens.GRCh38.cdna.all.fa'
cdnas = read_fasta(cdna_fasta_path)

In [3]:
# Cap every cDNA at 1000 bases so the benchmark inputs are of consistent size.
# Sequencing is done on the 3' end, so the tail of each cDNA is the part that is kept.
for record in cdnas:
    sequence = record['sequence']
    if len(sequence) > 1000:
        record['sequence'] = sequence[-1000:]

In [4]:
from hdfs import InsecureClient

# HDFS endpoint. 'host.docker.internal' lets code running inside a Docker container
# reach services on the host machine; port 9870 is typically used for the HDFS web UI.
hdfs_url = 'http://host.docker.internal:9870'
# User name the HDFS client acts as.
ubuntu_Benutzer = 'alfa'

# Client used by the benchmark helpers to write and delete test sets in HDFS.
client = InsecureClient(hdfs_url, user=ubuntu_Benutzer)

In [5]:
from random import sample

from benchmark_utils import (
    blastn_hadoop,
    blastn_pyspark,
    deleteTestSet,
    generate_random_dna_sequence,
    writeToHadoop,
)

# Location passed to writeToHadoop, both BLASTN runners and deleteTestSet
# for every benchmark test set.
filepath = '/bigdata/cdna_blastn'

In [8]:
# Set up logging
start_time = f'{time():.0f}'  # Current Unix time in whole seconds; tags this run's log and runtimes files
logger = logging.getLogger(__name__)
# NOTE: basicConfig() does nothing when the root logger already has handlers, so
# re-running this cell in the same kernel keeps writing to the first log file.
logging.basicConfig(filename='./benchmark/blastn_benchmark_' + start_time + '.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# One (engine, run index, number of cDNAs, query length, runtime in seconds)
# tuple per engine and benchmark configuration.
runtimes = []
nrun = 0  # Only a single repetition is run; wrap the loops below in a loop over nrun to repeat.
try:
    for len_query in [80, 60, 40, 20, 10]:  # Loop over different query sequence lengths
        for len_arr in [20000, 15000, 10000, 5000, 2000, 1000, 100, 10, 1]:  # Loop over different cdna array lengths
            # Generate random DNA sequence with specified length and weights for nucleotides
            query_sequence = generate_random_dna_sequence(len_query, weights={'A': 6, 'C': 4, 'G': 4, 'T': 6})
            # Sample a subset of cDNAs from the loaded dataset
            cdnas_ds = sample(cdnas, len_arr)
            # Write the sampled cDNAs to Hadoop HDFS using the client and specified filepath
            writeToHadoop(client, filepath, cdnas_ds)

            try:
                # Log information about the current run
                logging.info(f"Run number {nrun+1:.0f} with {len_arr:.0f} sequences and {query_sequence} as query sequence")

                # Execute BLASTN using PySpark and measure runtime
                spark_time, spark_result = blastn_pyspark(query_sequence, filepath)
                logging.info(f"Spark BLASTN runtime: {spark_time:.4f} seconds")
                logging.info("Spark BLASTN results: " + str(spark_result))
                runtimes.append(('Spark', nrun, len_arr, len_query, spark_time))

                # Execute BLASTN using Hadoop and measure runtime
                hadoop_time, hadoop_result = blastn_hadoop(query_sequence, filepath)
                logging.info(f"Hadoop BLASTN runtime: {hadoop_time:.4f} seconds")
                logging.info("Hadoop BLASTN results: " + str(hadoop_result))
                runtimes.append(('Hadoop', nrun, len_arr, len_query, hadoop_time))

                # Compare results from Spark and Hadoop BLASTN (order-independent)
                if sorted(spark_result) == sorted(hadoop_result):
                    logging.info("Spark and Hadoop BLASTN results are matching")
                else:
                    logging.warning("Spark and Hadoop BLASTN results does NOT match")
            finally:
                # Always delete the test set from Hadoop HDFS, even when a BLASTN run
                # fails, so an aborted benchmark does not leave its data behind.
                deleteTestSet(client, filepath)
finally:
    # Write the collected runtimes to a text file. Doing this in a finally block
    # keeps the measurements taken so far when a later configuration fails or the
    # (multi-hour) benchmark is interrupted.
    with open('./benchmark/blastn_runtimes_' + start_time + '.txt', 'w') as f:
        for line in runtimes:
            f.write(f"{line}\n")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

In [9]:
# Show the collected (engine, run index, number of cDNAs, query length, runtime) tuples.
runtimes

[('Spark', 0, 20000, 80, 2556.827931880951),
 ('Hadoop', 0, 20000, 80, 1388.7604849338531),
 ('Spark', 0, 15000, 80, 1707.871378660202),
 ('Hadoop', 0, 15000, 80, 980.1165933609009),
 ('Spark', 0, 10000, 80, 1170.049753189087),
 ('Hadoop', 0, 10000, 80, 661.6926681995392),
 ('Spark', 0, 5000, 80, 548.5153164863586),
 ('Hadoop', 0, 5000, 80, 345.3649334907532),
 ('Spark', 0, 2000, 80, 221.74194884300232),
 ('Hadoop', 0, 2000, 80, 166.25478982925415),
 ('Spark', 0, 1000, 80, 116.97669506072998),
 ('Hadoop', 0, 1000, 80, 100.02249145507812),
 ('Spark', 0, 100, 80, 13.289900779724121),
 ('Hadoop', 0, 100, 80, 43.869657039642334),
 ('Spark', 0, 10, 80, 3.1744937896728516),
 ('Hadoop', 0, 10, 80, 37.24662685394287),
 ('Spark', 0, 1, 80, 1.965494155883789),
 ('Hadoop', 0, 1, 80, 36.30191206932068),
 ('Spark', 0, 20000, 60, 1639.8343772888184),
 ('Hadoop', 0, 20000, 60, 976.2617943286896),
 ('Spark', 0, 15000, 60, 1269.9237580299377),
 ('Hadoop', 0, 15000, 60, 756.3479599952698),
 ('Spark', 0,

In [18]:
# Smoke test: run both engines once on a tiny test set and print what each returns.
len_query = 5
len_arr = 10

query_sequence = generate_random_dna_sequence(len_query, weights={'A': 6, 'C': 4, 'G': 4, 'T': 6})
cdnas_ds = sample(cdnas, len_arr)
writeToHadoop(client, filepath, cdnas_ds)

try:
    # Each call returns a (runtime, results) pair.
    temp_p = blastn_pyspark(query_sequence, filepath)
    temp_h = blastn_hadoop(query_sequence, filepath)
finally:
    # Remove the test set even if one of the BLASTN runs raises, so a failed
    # experiment does not leave data behind for the next run.
    deleteTestSet(client, filepath)

print(temp_p)
print(temp_h)

                                                                                

(2.715787410736084, [('ENST00000476472.5', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000410037.5', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000427765.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000531951.6', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000552917.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000550912.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000676748.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000633875.4', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000360096.3', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000556353.1', 10.0, 'CTGCC', 'CTGCC')])
(63.40727353096008, [('ENST00000552917.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000531951.6', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000427765.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000410037.5', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000476472.5', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000556353.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000360096.3', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000633875.4', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000676748.1', 10.0, 'CTGCC', 'CTGCC'), ('ENST00000550912.1', 10.0, 'CTGCC', 'CTGCC')])


In [19]:
# Compare the result lists (element 1 of each returned pair) from Spark and Hadoop;
# sorting both makes the comparison independent of the order the hits were returned in.
sorted(temp_p[1]) == sorted(temp_h[1])

True

In [9]:

            # Stand-alone call of the same cleanup helper the benchmark loop uses on `filepath`.
            # NOTE(review): the indentation and out-of-order execution count suggest this was
            # copied out of the loop body for manual cleanup after an interrupted run —
            # confirm it is still needed and dedent it, or remove the cell.
            deleteTestSet(client, filepath)