In [1]:
from pyspark.sql import SparkSession

In [2]:
# Build the session for this notebook (getOrCreate returns an existing one if
# present); the driver bind address is pinned to the loopback interface.
spark = (
    SparkSession.builder
    .appName('DNABaseCount')
    .config("spark.driver.bindAddress", "127.0.0.1")
    .getOrCreate()
)

### 1. Create an RDD[String] from the input

In [3]:
# FASTA input file (relative path) that is loaded into an RDD below.
input_path = "fasta_example.txt"

In [4]:
    # Load the input as an RDD of text lines (one element per line of the file).
    recordsRDD = spark.sparkContext.textFile(input_path)

In [5]:
# recordsRDD.collect() would return every line to the driver; checking the
# number of partitions is enough at this point.
recordsRDD.getNumPartitions()

2

### 2. Define a function to handle a partition

In [6]:
from collections import defaultdict

def process_FASTA_partition(iterator):
    """Count DNA letters and FASTA records within a single partition.

    Written to be passed to ``RDD.mapPartitions``: it receives all lines of
    one partition and returns one small list of counts for that partition.

    Args:
        iterator: iterable of text lines (str) that make up one partition.

    Returns:
        A list of ``(key, count)`` tuples. Each lower-cased, non-whitespace
        character of a sequence line is a key; the special key ``"seq"``
        holds the number of header lines (lines starting with ``">"``), i.e.
        the number of FASTA records that begin in this partition. An empty
        partition yields an empty list.
    """
    # defaultdict(int) lets every unseen key start at 0.
    hashmap = defaultdict(int)

    for fasta_record in iterator:
        # Strip surrounding whitespace so a trailing "\r" (CRLF files) or
        # stray spaces are not counted as letters, and skip blank lines.
        record = fasta_record.strip()
        if not record:
            continue

        if record.startswith(">"):
            # Header line: count the record, not its characters.
            hashmap["seq"] += 1
        else:
            for c in record.lower():
                # Interior whitespace is not a DNA letter either.
                if not c.isspace():
                    hashmap[c] += 1

    return list(hashmap.items())
    

### 3. Apply the custom function to each partition

In [7]:
    # Each partition is reduced locally to its own list of (key, count) pairs.
    pairsRDD = recordsRDD.mapPartitions(process_FASTA_partition)

In [12]:
# Peek at a few per-partition pairs; the same key can appear once per partition.
pairsRDD.take(5)

[('seq', 2), ('c', 79), ('t', 71), ('a', 73), ('g', 55)]

### 4.  Get the frequencies of DNA letters

In [9]:
 # Add up the per-partition counts that share the same key.
 frequenciesRDD = pairsRDD.reduceByKey(lambda left, right: left + right)

In [11]:
# Bring the final (key, total) pairs back to the driver for display.
frequenciesRDD.collect()

[('seq', 4), ('c', 165), ('g', 115), ('t', 134), ('a', 144), ('n', 2)]