In [1]:
from time import time
import logging

In [2]:
from seq_utils import read_fasta, read_fastq_barcode
# Input locations of the 10x single-cell sequencing experiment:
# the R2 file is passed as the sequence file, the R1 file as the barcode file.
seq_path = './data/500_PBMC_3p_LT_Chromium_X_S4_L004_R2_001.fastq_ds100000'
bc_path = './data/500_PBMC_3p_LT_Chromium_X_S4_L004_R1_001.fastq_ds100000'

# Load all human cDNA sequences (reference) from the FASTA file.
cdnas = read_fasta('./data/Homo_sapiens.GRCh38.cdna.all.fa')

# Load the reads together with their barcodes from the two FASTQ files.
sequences = read_fastq_barcode(seq_path, bc_path)

In [3]:
## Reduce the size of the cDNAs to make benchmarking more consistent.
## Since the sequencing is done on the 3' end, the tail of each cDNA is kept.
MAX_CDNA_LENGTH = 1000  # maximum number of trailing bases kept per cDNA

for cdna_record in cdnas:
    # Records at or below the limit are left untouched; longer ones are
    # replaced in place by their last MAX_CDNA_LENGTH bases.
    if len(cdna_record['sequence']) > MAX_CDNA_LENGTH:
        cdna_record['sequence'] = cdna_record['sequence'][-MAX_CDNA_LENGTH:]

In [4]:
from hdfs import InsecureClient

# HDFS endpoint. 'host.docker.internal' lets the Docker container reach the
# host machine; 9870 is typically the port of the HDFS web interface.
hdfs_url = 'http://host.docker.internal:9870'

# User name on the host under which HDFS is accessed ("Benutzer" = user).
ubuntu_Benutzer = 'alfa'

# Client object used by the following cells to talk to HDFS.
client = InsecureClient(hdfs_url, user=ubuntu_Benutzer)

In [5]:
from random import sample
# HDFS paths under which the sampled cDNA subset and the sampled read subset
# are written before each alignment run and deleted again afterwards.
filepath_cdna = '/bigdata/cdna_align'
filepath_seq = '/bigdata/seq_align'
from benchmark_utils import writeToHadoop, deleteTestSet, align_pyspark, align_hadoop

In [7]:
# Benchmark driver: for every combination of cDNA-subset size and read-subset
# size, upload both samples to HDFS, align them once with PySpark and once with
# Hadoop, log runtimes and results, and finally dump all runtimes to a text file.
import os

# Make sure the output directory exists; otherwise logging.basicConfig(filename=...)
# and the final open(..., 'w') fail with FileNotFoundError on a fresh checkout.
os.makedirs('./benchmark', exist_ok=True)

# Set up logging
start_time = f'{time():.0f}'  # current Unix time in whole seconds; tags the log and runtime files
logger = logging.getLogger(__name__)
# NOTE: basicConfig does nothing when the root logger already has handlers, so
# re-running this cell in the same kernel keeps appending to the first log file.
logging.basicConfig(filename='./benchmark/align_benchmark_' + start_time + '.log', level=logging.INFO,
                    format='%(asctime)s - %(levelname)s - %(message)s')

# Subset sizes tried for both the cDNA sample and the read sample.
subset_sizes = [100, 80, 60, 40, 20, 10, 5, 1]

runtimes = []  # one (engine, run index, n cdnas, n sequences, seconds) tuple per alignment
nrun = 0       # zero-based index of the current (len_cdna, len_seq) combination
for len_cdna in subset_sizes:      # loop over the number of sampled cDNAs
    for len_seq in subset_sizes:   # loop over the number of sampled reads

        # Sample a subset of cDNAs from the loaded dataset
        cdnas_ds = sample(cdnas, len_cdna)
        writeToHadoop(client, filepath_cdna, cdnas_ds)
        # Sample a subset of sequences from the loaded dataset
        sequences_ds = sample(sequences, len_seq)
        writeToHadoop(client, filepath_seq, sequences_ds)

        # Log information about the current run (1-based in the message)
        logger.info(f"Run number {nrun+1:.0f} with {len_cdna:.0f} cdnas and {len_seq:.0f} as sequences")

        # Execute sequence alignment using PySpark and record its runtime
        spark_time, spark_result = align_pyspark(filepath_cdna, filepath_seq)
        logger.info(f"Spark alignment runtime: {spark_time:.4f} seconds")
        logger.info("Spark alignment results: " + str(spark_result))
        runtimes.append(('Spark', nrun, len_cdna, len_seq, spark_time))

        # Execute sequence alignment using Hadoop and record its runtime
        hadoop_time, hadoop_result = align_hadoop(filepath_cdna, filepath_seq)
        logger.info(f"Hadoop alignment runtime: {hadoop_time:.4f} seconds")
        logger.info("Hadoop alignment results: " + str(hadoop_result))
        runtimes.append(('Hadoop', nrun, len_cdna, len_seq, hadoop_time))

        # Compare the Spark and Hadoop results independent of their ordering
        if sorted(spark_result) == sorted(hadoop_result):
            logger.info("Spark and Hadoop alignment results are matching")
        else:
            logger.warning("Spark and Hadoop alignment results does not match")

        # Delete the test set from HDFS so the next run starts from clean paths
        deleteTestSet(client, filepath_cdna)
        deleteTestSet(client, filepath_seq)

        # Advance the run counter; without this every run was logged as
        # "Run number 1" and stored with run index 0.
        nrun += 1

# Write the collected runtimes to a text file, one tuple per line
with open('./benchmark/align_runtimes_' + start_time + '.txt', 'w') as f:
    for line in runtimes:
        f.write(f"{line}\n")

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

In [7]:
# One-off check: a single alignment of 5 cDNAs against 5 reads with both engines.
# The measured span covers sampling, HDFS upload, both alignments and HDFS cleanup.
len_cdna = len_seq = 5

ts = time()

# Stage a random cDNA subset and a random read subset in HDFS
cdnas_ds = sample(cdnas, len_cdna)
writeToHadoop(client, filepath_cdna, cdnas_ds)
sequences_ds = sample(sequences, len_seq)
writeToHadoop(client, filepath_seq, sequences_ds)

# Each call returns a (runtime, result) pair: PySpark first, then Hadoop
temp_p = align_pyspark(filepath_cdna, filepath_seq)
temp_h = align_hadoop(filepath_cdna, filepath_seq)

# Remove both staged test sets from HDFS again
for hdfs_path in (filepath_cdna, filepath_seq):
    deleteTestSet(client, hdfs_path)

te = time()
elapsed = te - ts

for outcome in (temp_p, temp_h):
    print(outcome)
print('func took: %2.4f sec' % elapsed)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
                                                                                

(20.25559091567993, [('ENST00000507135.5', 1), ('ENST00000611279.4', 1), ('ENST00000640046.1', 2), ('ENST00000559515.1', 1)])
(111.48353052139282, [])
func took: 131.8595 sec


In [8]:
# Order-insensitive check that both engines produced the same alignment hits.
# Index 1 of each returned pair is the result list (index 0 is the runtime).
sorted(temp_p[1]) == sorted(temp_h[1])

True

In [1]:
# Scratch check: length of a single nucleotide string literal.
# NOTE(review): presumably one read taken from the FASTQ sample — confirm, or drop this cell.
len('TATGGGCCTAGGTCTTCGGTGTTTACACATTAATGGCAAGAAAACGGTAGGGAGAGCGTG')

60