In [1]:
import os
import atexit
import sys
import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob
import pandas
from pyspark.sql import SparkSession

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    try:
        print('Trapped Exit cleaning up Spark Context')
        sc.stop()
    except:
        pass
    try:
        print('Trapped Exit cleaning up Spark Job')
        sj.stop()
    except:
        pass

findspark.init()

#Parameters for the Spark cluster
nodes=3
tasks_per_node=8 
memory_per_task=1024 #1 gig per process, adjust accordingly
# Please estimate walltime carefully to keep unused Spark clusters from sitting 
# idle so that others may use the resources.
walltime="60:00" #60 min 
os.environ['SBATCH_PARTITION']='single' #Set the appropriate ARC partition

sj = sparkjob.sparkjob(
     ncores=nodes*tasks_per_node,
     cores_per_executor=tasks_per_node,
     memory_per_core=memory_per_task,
     walltime=walltime
    )

try:
    print('Cleaning up Spark Job')
    sj.stop()
except:
    pass

sj.wait_to_start()

try:
    print('Cleaning up Spark Context')
    sc.stop()
except:
    pass

sc = sj.start_spark()

#Register the exit handler                                                                                                     
atexit.register(exitHandler,sj,sc)

#You need this line if you want to use SparkSQL
sqlCtx=SQLContext(sc)


INFO:sparkhpc.sparkjob:Submitted batch job 521240



Cleaning up Spark Job


INFO:sparkhpc.sparkjob:Submitted cluster 3


Cleaning up Spark Context


In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('aggs').getOrCreate()
df = spark.read.csv('./TestCSV.csv',inferSchema=True,header=True)

print(df.show(n=2))

+-----+---+-----+
|index|age|grade|
+-----+---+-----+
|    1|  2|    3|
|    5|  6|    7|
+-----+---+-----+
only showing top 2 rows

None


In [3]:
 df.select("index").show() 

+-----+
|index|
+-----+
|    1|
|    5|
|    5|
+-----+



In [4]:
myInfo=sc.textFile("./name-age.csv")
data = myInfo.map(lambda x: x.split(","))
aggregatedData=data.map(lambda x: (x[0], [int(x[1]),1])).reduceByKey(lambda x,y:[x[0]+y[0],x[1]+y[1]]).map(lambda x:[x[0],x[1][0]/x[1][1]])
print(aggregatedData.collect())

[['mike', 18.5], ['sara', 20.5], ['alex', 15.0]]


In [7]:
from pyspark.sql import SparkSession
#from pyspark.sql.functions import col, avg
spark = SparkSession.builder.appName('aggs').getOrCreate()
df = spark.read.csv('./name-age-Header.csv',inferSchema=True,header=True)
df.createOrReplaceTempView("people")
aggregatedData2=sqlCtx.table("people").groupBy("name").agg({"age": "avg"}).collect()
print(aggregatedData2)



[Row(name='alex', avg(age)=15.0), Row(name='sara', avg(age)=20.5), Row(name='mike', avg(age)=18.5)]


In [8]:
aggregatedData2=sqlCtx.table("people").groupBy("name").agg({"age": "avg"}).collect()
print(aggregatedData2)

[Row(name='alex', avg(age)=15.0), Row(name='sara', avg(age)=20.5), Row(name='mike', avg(age)=18.5)]


In [9]:
from pyspark.sql.functions import col, avg, min
aggregatedData3=sqlCtx.table("people").groupBy("name").agg(avg("age"),min("age")).where(avg("age") >15).collect()
print(aggregatedData3)

[Row(name='sara', avg(age)=20.5, min(age)=15), Row(name='mike', avg(age)=18.5, min(age)=12)]


In [10]:
aggregatedData4=sqlCtx.sql("select * from people")
print(aggregatedData4.collect())


[Row(name='mike', age=25), Row(name='sara', age=26), Row(name='alex', age=15), Row(name='mike', age=12), Row(name='sara', age=15)]


In [11]:
print(df.select("age","name").collect())

print(df.select(df["age"]).collect())




[Row(age=25, name='mike'), Row(age=26, name='sara'), Row(age=15, name='alex'), Row(age=12, name='mike'), Row(age=15, name='sara')]
[Row(age=25), Row(age=26), Row(age=15), Row(age=12), Row(age=15)]


In [12]:
print(df.where((df.age == 15) & (df.age>10)).collect())

[Row(name='alex', age=15), Row(name='sara', age=15)]


In [14]:
spark = SparkSession.builder.appName('aggs').getOrCreate()
coursesDf = spark.read.csv('./name-course.csv',inferSchema=True,header=True)
coursesDf.createOrReplaceTempView("courses")

print(coursesDf.collect())


[Row(name='mike', course='cpsc 447'), Row(name='sara', course='ensf 619'), Row(name='alex', course='stat 213'), Row(name='mike', course='ensf 619.28'), Row(name='sara', course='ensf 19.25')]


In [15]:
print(df.join(coursesDf , df.name==coursesDf.name, 'inner').collect())

[Row(name='mike', age=12, name='mike', course='cpsc 447'), Row(name='mike', age=25, name='mike', course='cpsc 447'), Row(name='sara', age=15, name='sara', course='ensf 619'), Row(name='sara', age=26, name='sara', course='ensf 619'), Row(name='alex', age=15, name='alex', course='stat 213'), Row(name='mike', age=12, name='mike', course='ensf 619.28'), Row(name='mike', age=25, name='mike', course='ensf 619.28'), Row(name='sara', age=15, name='sara', course='ensf 19.25'), Row(name='sara', age=26, name='sara', course='ensf 19.25')]
