In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build (or reuse) the Spark session for this notebook; the driver is bound
# to the loopback address via spark.driver.bindAddress.
spark = (
    SparkSession.builder
    .appName('DNABaseCount')
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

### 1. Create an RDD[String] from the Input

In [3]:
# Relative path of the FASTA-formatted text file that is loaded into an RDD below.
input_path = "fasta_example.txt"

In [4]:
# Read the input file as an RDD[String], one element per line of text.
records_rdd = spark.sparkContext.textFile(input_path)

In [5]:
# Show every input line. collect() returns the whole RDD to the driver, which
# is acceptable here only because the example file is tiny.
records_rdd.collect()


['>Seq1 [organism=Carpodacus mexicanus] [clone=6b] actin (act) mRNA, partial cds',
 'CCTTTATCTAATCTTTGGAGCATGAGCTGGCATAGTTGGAACCGCCCTCAGCCTCCTCATCCGTGCAGAA',
 'TAATAATTTTCTTTATAGTAATACCAATCATGATCGGTGGTTTCGGAAACTGACTAGTCCCACTCATAAT',
 '>Seq2 [organism=uncultured bacillus sp.] [isolate=A2] corticotropin (CT) gene, complete cds',
 'GGTAGGTACCGCCCTAAGNCTCCTAATCCGAGCAGAACTANGCCAACCCGGAGCCCTTCTGGGAGACGAC',
 'TCAACACCACCTTCTTTGACCCAGCAGGAGGAGGAGACCCAGTACTATACCAGCACCTATTCTGATTCTT',
 '>Seq3 [organism=Phalaenopsis equestris var. leucaspis]',
 'CCTATACCTAATTTTCGGCGCATGAGCCGGAATGGTGGGTACCGCTCTAAGCCTCCTCATTCGAGCAGAA',
 'CTAGGCCAACCCGGAGCCCTTCTGGGAGACGACCAAGTCTACAACGTGGTTGTCACGGCCCATGCCTTCG',
 '>Seq9 [organism=Petunia integrifolia subsp. inflata]',
 'TAGTTGGAACAGCCCTCAGCCTACTCATCCGAGCAGAACTAGGCCAACCCGGAACCCTCCTGGGAGATGA',
 'CCAAATCTACAATGTAATCGTCACTGCCCATGCCTTCGTAATAATCTTCTTCATAGTAATACCAGTCATA']

### 2. Define a Mapper Function

In [9]:
# defaultdict is kept in scope in case other cells rely on this import.
from collections import Counter, defaultdict


def process_FASTA_as_hashmap(fasta_record):
    """Map one line of a FASTA file to a list of (key, count) pairs.

    Parameters
    ----------
    fasta_record : str
        A single line of the FASTA input: either a description line that
        starts with ">" or a line of sequence letters.

    Returns
    -------
    list of (str, int)
        * ``[("seq", 1)]`` for a description line, so that summing the
          values for the key "seq" yields the number of sequences.
        * Otherwise one ``(letter, frequency)`` pair per distinct character
          of the lower-cased line, in order of first appearance. Whitespace
          characters (e.g. a trailing carriage return) are not counted.
          An empty line yields an empty list.
    """
    if fasta_record.startswith(">"):
        return [("seq", 1)]

    # No debug printing here: this function runs once per input line inside
    # the Spark mapper, so any print would flood the logs.
    letter_counts = Counter(
        letter for letter in fasta_record.lower() if not letter.isspace()
    )
    return list(letter_counts.items())

In [10]:
# Apply the mapper to every input line and flatten the per-line lists into a
# single RDD of (key, count) pairs.
pairs_rdd = records_rdd.flatMap(process_FASTA_as_hashmap)

In [18]:
# Peek at the first 10 (key, count) pairs emitted by the mapper.
pairs_rdd.take(10)

[('z', 1),
 ('c', 21),
 ('t', 19),
 ('a', 15),
 ('g', 15),
 ('t', 25),
 ('a', 22),
 ('c', 13),
 ('g', 10),
 ('z', 1)]

### 3. Find the Frequencies of DNA Letters

In [17]:
# Add up the per-line counts of each key to obtain its overall frequency.
frequencies_rdd = pairs_rdd.reduceByKey(lambda total, count: total + count)
frequencies_rdd.collect()

[('c', 165), ('g', 115), ('z', 4), ('t', 134), ('a', 144), ('n', 2)]