In [1]:
import os
import atexit
import sys

import pyspark
from pyspark.context import SparkContext
from pyspark.sql import SQLContext
import findspark
from sparkhpc import sparkjob

#Exit handler to clean up the Spark cluster if the script exits or crashes
def exitHandler(sj,sc):
    """Best-effort shutdown of the Spark context and the Spark cluster job.

    Intended to be registered with ``atexit``. Both objects only need a
    ``stop()`` method.

    Parameters:
        sj: the Spark job handle; ``sj.stop()`` is called last.
        sc: the Spark context; ``sc.stop()`` is called first.

    Ordinary errors raised by either ``stop()`` are reported on stderr and
    do not prevent the other resource from being stopped. Interpreter-level
    signals (``KeyboardInterrupt``, ``SystemExit``) are no longer swallowed,
    but the ``finally`` clause still attempts to stop the job before they
    propagate, so the cluster is not left running.
    """
    try:
        try:
            print('Trapped Exit cleaning up Spark Context')
            sc.stop()
        except Exception as err:
            # Report instead of silently hiding the failure; cleanup stays best-effort.
            print('Failed to stop Spark Context: %r' % (err,), file=sys.stderr)
    finally:
        # Always try to release the cluster job, whatever happened above.
        try:
            print('Trapped Exit cleaning up Spark Job')
            sj.stop()
        except Exception as err:
            print('Failed to stop Spark Job: %r' % (err,), file=sys.stderr)

# Let findspark set up the environment before any Spark objects are created.
findspark.init()

# --- Sizing of the Spark cluster to request ---
nodes = 2
tasks_per_node = 3
memory_per_task = 1024  # 1 gig per process, adjust accordingly
# Estimate the walltime carefully: an idle Spark cluster holds resources
# that other users could otherwise be using.
walltime = "1:00"  # 1 hour
# Select the appropriate ARC partition for the batch submission
os.environ.update(SBATCH_PARTITION='single')

# Total core count is one task per core across all requested nodes.
total_cores = nodes * tasks_per_node
sj = sparkjob.sparkjob(
    ncores=total_cores,
    cores_per_executor=tasks_per_node,
    memory_per_core=memory_per_task,
    walltime=walltime,
)

# Wait for the submitted job, then obtain the SparkContext from it.
sj.wait_to_start()
sc = sj.start_spark()

# Make sure the context and the job are stopped when the interpreter exits.
atexit.register(exitHandler, sj, sc)

# SparkSQL entry point; needed only when SQL queries are used.
sqlCtx = SQLContext(sc)

INFO:sparkhpc.sparkjob:Submitted batch job 630576

INFO:sparkhpc.sparkjob:Submitted cluster 1


In [2]:
# Read the CSV as an RDD of raw text lines and drop the header row,
# which is identified as the first line of the file.
myInfo = sc.textFile("./adult.data.csv")
header = myInfo.first()
datawithoutHeader = myInfo.filter(lambda line: line != header)

# Peek at the first remaining record.
first_rows = datawithoutHeader.take(1)
print(first_rows)


['39, State-gov,77516, Bachelors,13, Never-married, Adm-clerical, Not-in-family, White, Male,2174,0,40, United-States']


In [3]:
from pyspark.sql import SparkSession

# Get (or create) the Spark session and load the CSV with header and inferred column types.
spark = SparkSession.builder.appName('aggs').getOrCreate()
df = spark.read.csv('./adult.data.csv',inferSchema=True,header=True)
# Register the DataFrame under the name that the SQL queries in this notebook use.
df.createOrReplaceTempView("adultsTbl")

# Ten distinct native countries.
# NOTE(review): distinct() runs after the sort and is not guaranteed to keep
# the hoursperweek ordering, so these may not be the countries with the
# highest hours -- confirm the intent (a groupBy/max + orderBy may be wanted).
xx=df.sort(df["hoursperweek"].desc()).select(df["nativecountry"]).distinct().limit(10)

# show() prints the table itself and returns None; wrapping it in print()
# only added a stray "None" line to the output.
xx.show()
#df.printSchema()

+-------------------+
|      nativecountry|
+-------------------+
| Dominican-Republic|
|            Ireland|
|               Cuba|
|          Guatemala|
|               Iran|
|             Taiwan|
|        El-Salvador|
|      United-States|
|              South|
|              Japan|
+-------------------+

None


In [12]:
# Average of field 12 (hoursperweek) per value of field 9 (sex), using RDD operations only.
myInfo = sc.textFile("./adult.data.csv")
header = myInfo.first()
datawithoutHeader = myInfo.filter(lambda line: line != header)

# Break every record into its comma-separated fields.
data = datawithoutHeader.map(lambda line: line.split(","))

# Carry a running [sum, count] pair per key, then divide to get the mean.
hours_by_key = data.map(lambda fields: (fields[9], [int(fields[12]), 1]))
sum_and_count = hours_by_key.reduceByKey(
    lambda left, right: [left[0] + right[0], left[1] + right[1]]
)
aggregatedData = sum_and_count.map(lambda entry: [entry[0], entry[1][0] / entry[1][1]])
print(aggregatedData.collect())

[[' Male', 42.42808627810923], [' Female', 36.410361154953115]]


In [11]:
# Selecting the age column three ways: RDD operations, the DataFrame API, and SQL.

#Using RDD Operation
myInfo=sc.textFile("./adult.data.csv")
header=myInfo.first()
datawithoutHeader=myInfo.filter(lambda x: x != header)
# Field 0 of each comma-separated record is the age (still a string here).
ageColumn=datawithoutHeader.map(lambda x:x.split(",")[0])
#print(ageColumn.collect())

#Using Dataframe API

#print(df.select("age").show())

#Using SQL 

# createOrReplaceTempView() requires the view name; calling it with no
# argument raises TypeError. Register under the name the SQL query uses.
df.createOrReplaceTempView("adultsTbl")

#sqlAge=sqlCtx.sql("select * from adultsTbl where age < 50")

#print(sqlAge.show())

In [10]:
# Filtering on age three ways: RDD operations, the DataFrame API, and SQL.

#Using RDD Operation
#ageColumn=datawithoutHeader.map(lambda x:x.split(",")[0]).filter(lambda x: int(x)>10)
#print(ageColumn.collect())

#Using Dataframe API

#print(df.select("age").where(df.age>10).show())

#Using SQL 

# (Re-)register the DataFrame so the query below can refer to it by name.
df.createOrReplaceTempView("adultsTbl")

sqlAge=sqlCtx.sql("select * from adultsTbl where age > 50")

# show() prints the rows itself and returns None, so it must not be wrapped
# in print() (that would append a stray "None" to the output).
sqlAge.show()

+---+-----------------+------+-------------+------------+-------------------+------------------+---------------+-------------------+-------+-----------+-----------+------------+--------------+
|age|        workclass|fnlwgt|    education|educationnum|      maritalstatus|        occupation|   relationship|               race|    sex|capitalgain|capitalloss|hoursperweek| nativecountry|
+---+-----------------+------+-------------+------------+-------------------+------------------+---------------+-------------------+-------+-----------+-----------+------------+--------------+
| 53|          Private|234721|         11th|           7| Married-civ-spouse| Handlers-cleaners|        Husband|              Black|   Male|          0|          0|          40| United-States|
| 52| Self-emp-not-inc|209642|      HS-grad|           9| Married-civ-spouse|   Exec-managerial|        Husband|              White|   Male|          0|          0|          45| United-States|
| 54|          Private|302146|     

In [8]:
#RDD Operations
# Count, per occupation, the records whose marital status starts with "Married-".
myInfo = sc.textFile('./adult.data.csv')
header = myInfo.first()
datawithoutHeader = myInfo.filter(lambda row: row != header)


def _is_married(row):
    """True when field 5 (maritalstatus), ignoring leading blanks, starts with 'Married-'."""
    return row.split(",")[5].lstrip().startswith("Married-")


def _occupation_pair(row):
    """Map a record to (normalised field 6 / occupation, 1) for counting."""
    return (row.split(",")[6].lower().strip(), 1)


# Word-count style aggregation, ordered by ascending count.
data = (
    datawithoutHeader
    .filter(_is_married)
    .map(_occupation_pair)
    .reduceByKey(lambda left, right: left + right)
    .sortBy(lambda pair: pair[1])
)


print(data.collect())

#print("hello")

#data = datawithoutHeader.map(lambda x: x.split(",")).filter(lambda x: x[5].strip().startswith("Married-")).Flatmap(lambda w: (w[6].strip(),1)).reduceByKey(lambda i,j: i+j) 

#print(data.collect())


#sqlOcc=sqlCtx.sql("""SELECT occupation,ROUND(AVG(if(LTRIM(maritalstatus) LIKE 'Married-%',1,0)),2) as marriedrate FROM adultsTbl group by occupation""")

#print(sqlOcc.show())

[('armed-forces', 3), ('priv-house-serv', 20), ('protective-serv', 389), ('tech-support', 410), ('handlers-cleaners', 490), ('farming-fishing', 600), ('', 668), ('other-service', 786), ('transport-moving', 1014), ('machine-op-inspct', 1017), ('adm-clerical', 1050), ('sales', 1699), ('prof-specialty', 2176), ('exec-managerial', 2476), ('craft-repair', 2619)]


In [12]:
# Per occupation: number of "Married-*" records (xx) and their share of the
# occupation's records (marriedrate, rounded to 2 decimals), most married first.
sqlOcc=sqlCtx.sql("""SELECT occupation,sum(if(LTRIM(maritalstatus) LIKE 'Married-%',1,0)) as xx,ROUND(AVG(if(LTRIM(maritalstatus) LIKE 'Married-%',1,0)),2) as marriedrate FROM adultsTbl GROUP BY occupation
  ORDER BY xx DESC""")

# show() prints the table itself and returns None; print(show()) only added
# a stray "None" line after the table.
sqlOcc.show()

+------------------+----+-----------+
|        occupation|  xx|marriedrate|
+------------------+----+-----------+
|      Craft-repair|2619|       0.64|
|   Exec-managerial|2476|       0.61|
|    Prof-specialty|2176|       0.53|
|             Sales|1699|       0.47|
|      Adm-clerical|1050|       0.28|
| Machine-op-inspct|1017|       0.51|
|  Transport-moving|1014|       0.63|
|     Other-service| 786|       0.24|
|              null| 668|       0.36|
|   Farming-fishing| 600|        0.6|
| Handlers-cleaners| 490|       0.36|
|      Tech-support| 410|       0.44|
|   Protective-serv| 389|        0.6|
|   Priv-house-serv|  20|       0.13|
|      Armed-Forces|   3|       0.33|
+------------------+----+-----------+

None


In [26]:
# import what we will need
from pyspark.sql.functions import when, col, mean, desc, round

# 1/0 indicator column: 1 where maritalstatus is exactly ' Divorced', else 0
is_divorced = when(col('maritalstatus') == ' Divorced', 1).otherwise(0).alias('isdivorced')
# mean of the indicator, rounded to two decimals
divorced_rate = round(mean('isdivorced'), 2).alias('divorcedrate')

# select -> group by occupation -> aggregate -> order by descending rate
df_result = (
    df.select(df['occupation'], is_divorced)
      .groupBy('occupation')
      .agg(divorced_rate)
      .orderBy(desc('divorcedrate'))
)
# show the five highest rates
df_result.show(5)

+----------------+------------+
|      occupation|divorcedrate|
+----------------+------------+
|    Adm-clerical|        0.22|
| Priv-house-serv|        0.19|
|    Tech-support|        0.15|
|   Other-service|        0.15|
| Exec-managerial|        0.15|
+----------------+------------+
only showing top 5 rows

