In [1]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Stop the Spark context and then the Spark job, best effort.

    Intended to be registered with ``atexit``. Each shutdown step is
    attempted independently, so a failure while stopping the context never
    prevents the job from being stopped.

    Args:
        sj: Object exposing ``stop()`` for the Spark job.
        sc: Object exposing ``stop()`` for the Spark context.

    Failures raised by either ``stop()`` call are reported on stdout instead
    of being silently discarded, so a cluster that could not be shut down does
    not go unnoticed.
    """
    try:
        try:
            print('Trapped Exit cleaning up Spark Context')
            sc.stop()
        except Exception as exc:
            print('Could not stop Spark Context: {!r}'.format(exc))
    finally:
        # Always attempt to stop the job, even if the step above was
        # interrupted by something that is not an ordinary Exception.
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as exc:
            print('Could not stop Spark Job: {!r}'.format(exc))

findspark.init()

# ---- Parameters for the Spark cluster ----
nodes = 3
tasks_per_node = 8
memory_per_task = 1024  # 1 gig per process, adjust accordingly
# Estimate the walltime carefully: an unused Spark cluster that sits idle
# blocks resources that other users could be running on.
walltime = "1:00"  # 1 hour
os.environ['SBATCH_PARTITION'] = 'single'  # Set the appropriate ARC partition

# Collect the job options in one place, then hand them to sparkhpc.
cluster_options = dict(
    ncores=nodes * tasks_per_node,
    cores_per_executor=tasks_per_node,
    memory_per_core=memory_per_task,
    walltime=walltime,
)
sj = sparkjob.sparkjob(**cluster_options)

# Block until the job has started, then obtain the SparkContext from it.
sj.wait_to_start()
sc = sj.start_spark()

# Register the exit handler so the context and job are stopped at exit.
atexit.register(exitHandler, sj, sc)

# You need this line if you want to use SparkSQL
sqlCtx = SQLContext(sc)


INFO:sparkhpc.sparkjob:Submitted batch job 632797

INFO:sparkhpc.sparkjob:Submitted cluster 2


In [None]:
# Small sample list distributed as an RDD.
data = [1, 5, 4, 4, 5, 5]
distData = sc.parallelize(data)

# Return the distinct elements.
distinctOutput = distData.distinct()
# Multiply each element in the list by itself.
mapOutput = distData.map(lambda value: value * value)

# Multiply the elements pairwise; the output of this action is one value.
reduceOutput = distData.reduce(lambda left, right: left * right)
# Keep only the distinct elements that are divisible by 3.
filterOutput = distinctOutput.filter(lambda value: value % 3 == 0)

# Retrieve the first 3 elements.
takeOutput = distData.take(3)

# Sort the data in descending order and then take the top 5.
takeOrderedOutput = distData.takeOrdered(5, key=lambda value: -1 * value)

# Print the structure of the RDD: one list of elements per partition.
partitioned_squares = mapOutput.glom().collect()
print(partitioned_squares)


In [None]:
# Load the text file as an RDD.
myInfo = sc.textFile("./SparkTestFile.txt")

# Number of partitions.
partition_total = myInfo.getNumPartitions()
print(partition_total)

# Partition structure: the elements grouped per partition.
partition_layout = myInfo.glom().collect()
print(partition_layout)

# Number of rows.
row_total = myInfo.count()
print(row_total)

In [2]:
# Two sentences used to contrast map() with flatMap().
data = ["hello world", "how are you"]

# map(): each sentence becomes one list of words.
mapData = sc.parallelize(data).map(lambda sentence: sentence.split(" "))
# flatMap(): the per-sentence word lists are flattened into a single RDD of words.
flatMapData = sc.parallelize(data).flatMap(lambda sentence: sentence.split(" "))

partition_total = mapData.getNumPartitions()
print(partition_total)

# Elements of mapData grouped per partition.
partition_layout = mapData.glom().collect()
print(partition_layout)

flat_words = flatMapData.collect()
print(flat_words)

24
[[], [], [], [], [], [], [], [], [], [], [], [['hello', 'world']], [], [], [], [], [], [], [], [], [], [], [], [['how', 'are', 'you']]]
['hello', 'world', 'how', 'are', 'you']


In [None]:
# Bring the elements of mapData back to the driver and show them.
collected_map_data = mapData.collect()
print(collected_map_data)

In [None]:
# Count how often each word occurs, starting from 2 partitions.
myData = ["abed", "abed", "data", "ENSF", "SPAR"]

# Pair every word with a count of 1 ...
word_pairs = sc.parallelize(myData, 2).map(lambda word: (word, 1))
# ... then add up the counts that share the same word.
data = word_pairs.reduceByKey(lambda total, count: total + count)

# Show the (word, count) pairs grouped per partition.
partition_layout = data.glom().collect()
print(partition_layout)

In [None]:
# (key, value) pairs in which several keys occur more than once.
myData = [('k', 5), ('s', 3), ('s', 4), ('p', 7), ('p', 5), ('t', 8), ('k', 6)]
data = sc.parallelize(myData, 3)

# Gather all values that share a key, then bring the result to the driver.
grouped_rdd = data.groupByKey()
group = grouped_rdd.collect()

# Each entry is a (key, values) pair; the values are printed as iterable
# objects rather than as their contents.
print(group)

In [None]:
# (key, value) pairs in which several keys occur more than once.
myData = [('k', 5), ('s', 3), ('s', 4), ('p', 7), ('p', 5), ('t', 8), ('k', 6)]
data = sc.parallelize(myData, 3)

# RDDs have no agg() method, so groupByKey().agg() raises AttributeError.
# Instead, turn each key's grouped values into a plain list and collect the
# result, giving a list of (key, [values]) pairs that prints readably and can
# still be indexed as group[i][1].
group = data.groupByKey().mapValues(list).collect()
print(group)

In [None]:
# Print every value stored under the second (key, values) entry of `group`,
# one value per line.
second_entry_values = group[1][1]
for value in second_entry_values:
    print(value)

In [None]:
# Same word count as before, but starting from 5 partitions.
myData = ["abed", "abed", "data", "ENSF", "SPAR"]

# Pair every word with a count of 1 ...
word_pairs = sc.parallelize(myData, 5).map(lambda word: (word, 1))
# ... then add up the counts that share the same word.
data = word_pairs.reduceByKey(lambda first, second: first + second)

# Show the (word, count) pairs grouped per partition.
partition_layout = data.glom().collect()
print(partition_layout)