In [23]:
import pandas as pd
import numpy as np
import datetime as dt
import os

import findspark
findspark.init()
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql.functions import *#avg, count, expr
from pyspark.sql.types import *

In [10]:
# initialize
sc = pyspark.SparkContext()
spark = SparkSession(sc)
spark.sparkContext.appName = 'joinAppend'
# show the number of cores
print('%d cores'%spark._jsc.sc().getExecutorMemoryStatus().keySet().size())
spark

1 cores


## make / get some fake data

In [12]:
'''herb = (('Elephant', 1, True), ('Sea Turtle', 2, True), ('Caterpillar', 3, True), ('Human', 4, True))
herbivores = spark.createDataFrame(herb, ['Animal', 'Id', 'EatsPlants'])
herbivores.show()

carn = (('Shark', 5, True), ('Sea Turtle', 2, True), ('Tiger', 6, True), ('Human', 4, True))
carnivores = spark.createDataFrame(carn, ['Animal', 'Id', 'EatsMeat'])
carnivores.show()'''

schem = StructType([StructField('Animal', StringType()), StructField('Id', IntegerType()), StructField('EatsPlants', BooleanType())])
herbivores = spark.read.format('csv').options(header=True).schema(schem).load('../data/herbivores.csv')
herbivores.show()

schem = StructType([StructField('Animal', StringType()), StructField('Id', IntegerType()), StructField('EatsMeat', BooleanType())])
carnivores = spark.read.format('csv').options(header=True).schema(schem).load('../data/carnivores.csv')
carnivores.show()

+-----------+---+----------+
|     Animal| Id|EatsPlants|
+-----------+---+----------+
|   Elephant|  1|      true|
| Sea Turtle|  2|      true|
|Caterpillar|  3|      true|
|      Human|  4|      true|
+-----------+---+----------+

+----------+---+--------+
|    Animal| Id|EatsMeat|
+----------+---+--------+
|     Shark|  5|    true|
|Sea Turtle|  2|    true|
|     Tiger|  6|    true|
|     Human|  4|    true|
+----------+---+--------+



In [15]:
# append (union or pd.concat)
eatsall = herbivores.select('Id', 'Animal').union(carnivores.select('Id', 'Animal')).distinct()
eatsall.orderBy('Id').show()

+---+-----------+
| Id|     Animal|
+---+-----------+
|  1|   Elephant|
|  2| Sea Turtle|
|  3|Caterpillar|
|  4|      Human|
|  5|      Shark|
|  6|      Tiger|
+---+-----------+



In [17]:
# inner join
omnivores = herbivores.join(carnivores, how='inner', on=['Id', 'Animal'])
omnivores.show()

+---+----------+----------+--------+
| Id|    Animal|EatsPlants|EatsMeat|
+---+----------+----------+--------+
|  2|Sea Turtle|      true|    true|
|  4|     Human|      true|    true|
+---+----------+----------+--------+



In [19]:
# full outer join
eatsall = herbivores.join(carnivores, how='outer', on=['Id', 'Animal'])
eatsall.orderBy('Id').show()

+---+-----------+----------+--------+
| Id|     Animal|EatsPlants|EatsMeat|
+---+-----------+----------+--------+
|  1|   Elephant|      true|    null|
|  2| Sea Turtle|      true|    true|
|  3|Caterpillar|      true|    null|
|  4|      Human|      true|    true|
|  5|      Shark|      null|    true|
|  6|      Tiger|      null|    true|
+---+-----------+----------+--------+



In [22]:
# just get records in herbivores not in carnivores
herbonly = herbivores.join(carnivores, how='left', on=['Id', 'Animal']).where(isnull('EatsMeat'))
herbonly.show()

+---+-----------+----------+--------+
| Id|     Animal|EatsPlants|EatsMeat|
+---+-----------+----------+--------+
|  1|   Elephant|      true|    null|
|  3|Caterpillar|      true|    null|
+---+-----------+----------+--------+



In [30]:
# load univ of wisconsin madison data into a dict of dataframes
path = '../data/uw-madison-courses/'
uofw = {f.split('.')[0]:spark.read.format('csv').options(header=True, inferSchema=True).load(path+f) for f in os.listdir(path) if f[-4:] == '.csv'}

In [31]:
# show the head of each
for (key, val) in uofw.items():
    print(key)
    val.show(5, truncate=False)

subject_memberships
+------------+------------------------------------+
|subject_code|course_offering_uuid                |
+------------+------------------------------------+
|220         |344b3ebe-da7e-314c-83ed-9425269695fd|
|320         |344b3ebe-da7e-314c-83ed-9425269695fd|
|346         |344b3ebe-da7e-314c-83ed-9425269695fd|
|612         |344b3ebe-da7e-314c-83ed-9425269695fd|
|636         |344b3ebe-da7e-314c-83ed-9425269695fd|
+------------+------------------------------------+
only showing top 5 rows

instructors
+-------+------------------+
|id     |name              |
+-------+------------------+
|761703 |JOHN ARCHAMBAULT  |
|3677061|STEPHANIE KANN    |
|788586 |KATHY PREM        |
|1600463|KRISTIN KLARKOWSKI|
|693634 |DAVID BOHNHOFF    |
+-------+------------------+
only showing top 5 rows

rooms
+------------------------------------+-------------+---------+
|uuid                                |facility_code|room_code|
+------------------------------------+-------------+-----

In [54]:
thisID = '344b3ebe-da7e-314c-83ed-9425269695fd' # in case I want to filter
joined = uofw['course_offerings'].withColumnRenamed('uuid', 'course_offering_uuid').withColumnRenamed('name', 'course_name')\
    .join(uofw['sections'].withColumnRenamed('uuid', 'section_uuid'), how='inner', on='course_offering_uuid')\
    .join(uofw['teachings'].withColumnRenamed('uuid', 'teaching_uuid'), how='inner', on='section_uuid')\
    .join(uofw['instructors'].withColumnRenamed('id', 'instructor_id').withColumnRenamed('name', 'instructor_name'),
          how='inner', on='instructor_id')\
    .join(uofw['schedules'].withColumn('days', array('mon', 'tues', 'wed', 'thurs', 'fri'))\
          .withColumnRenamed('uuid', 'schedule_uuid').select('schedule_uuid', 'days', 'start_time', 'end_time'),
          how='inner', on='schedule_uuid')

display(joined.select('course_name', 'instructor_name', 'days', 'start_time', 'end_time', 'section_type', 'number', ).limit(4).toPandas())

Unnamed: 0,course_name,instructor_name,days,start_time,end_time,section_type,number
0,Master's Research or Thesis,THOMAS JAHNS,"[False, False, False, False, False]",-1,-1,IND,45
1,Wrkshp-Industrl Organizatn,JEAN-FRANCOIS HOUDE,"[False, False, True, False, False]",945,1050,LEC,1
2,Workshop - Public Economics,CHRISTOPHER R TABER,"[False, True, False, False, False]",945,1050,LEC,1
3,Plant Cell Biology,MARISA S OTEGUI,"[False, True, False, True, False]",595,645,LEC,1


In [55]:
sc.stop()