In [2]:
from pyspark.sql import SparkSession 

In [3]:
spark = SparkSession.builder.appName("7_8").getOrCreate()

In [4]:
spark 

In [5]:
df = spark.read.format("csv")\
    .option("header","true") \
    .option("inferschema","true")\
    .load("./data/retail-data/all/*.csv")\
    .coalesce(5)
df.cache()

DataFrame[InvoiceNo: string, StockCode: string, Description: string, Quantity: int, InvoiceDate: string, UnitPrice: double, CustomerID: int, Country: string]

In [6]:
df.createGlobalTempView("dftable")

In [52]:
df.tail(20)

[Row(InvoiceNo='581585', StockCode='22466', Description='FAIRY TALE COTTAGE NIGHT LIGHT', Quantity=12, InvoiceDate='12/9/2011 12:31', UnitPrice=1.95, CustomerID=15804, Country='United Kingdom'),
 Row(InvoiceNo='581586', StockCode='22061', Description='LARGE CAKE STAND  HANGING STRAWBERY', Quantity=8, InvoiceDate='12/9/2011 12:49', UnitPrice=2.95, CustomerID=13113, Country='United Kingdom'),
 Row(InvoiceNo='581586', StockCode='23275', Description='SET OF 3 HANGING OWLS OLLIE BEAK', Quantity=24, InvoiceDate='12/9/2011 12:49', UnitPrice=1.25, CustomerID=13113, Country='United Kingdom'),
 Row(InvoiceNo='581586', StockCode='21217', Description='RED RETROSPOT ROUND CAKE TINS', Quantity=24, InvoiceDate='12/9/2011 12:49', UnitPrice=8.95, CustomerID=13113, Country='United Kingdom'),
 Row(InvoiceNo='581586', StockCode='20685', Description='DOORMAT RED RETROSPOT', Quantity=10, InvoiceDate='12/9/2011 12:49', UnitPrice=7.08, CustomerID=13113, Country='United Kingdom'),
 Row(InvoiceNo='581587', Stoc

In [13]:
df.count()

541909

# ch7. AGGREGATION

In [14]:
# count
from pyspark.sql.functions import count ,countDistinct

df.select(count("StockCode")).show()

+----------------+
|count(StockCode)|
+----------------+
|          541909|
+----------------+



In [16]:
from pyspark.sql.functions import count ,countDistinct
df.select(countDistinct("StockCode")).show()

+-------------------------+
|count(DISTINCT StockCode)|
+-------------------------+
|                     4070|
+-------------------------+



In [17]:
from pyspark.sql.functions import approx_count_distinct
df.select(approx_count_distinct("StockCode",0.1)).show()

+--------------------------------+
|approx_count_distinct(StockCode)|
+--------------------------------+
|                            3364|
+--------------------------------+



In [19]:
from pyspark.sql.functions import first,last

df.select(first("StockCode"),last("StockCode")).show()

+----------------+---------------+
|first(StockCode)|last(StockCode)|
+----------------+---------------+
|          85123A|          22138|
+----------------+---------------+



In [20]:
from pyspark.sql.functions import min, max 
df.select(min("Quantity"),max("Quantity")).show()

+-------------+-------------+
|min(Quantity)|max(Quantity)|
+-------------+-------------+
|       -80995|        80995|
+-------------+-------------+



In [22]:
from pyspark.sql.functions import sum 
df.select(sum("Quantity")).show()

+-------------+
|sum(Quantity)|
+-------------+
|      5176450|
+-------------+



In [24]:
# in python
from pyspark.sql.functions import var_pop, stddev_pop 
from pyspark.sql.functions import var_samp,stddev_samp

df.select(var_pop("Quantity"),var_samp("Quantity"),stddev_pop("Quantity"),stddev_samp("Quantity")).show()

+------------------+------------------+--------------------+---------------------+
| var_pop(Quantity)|var_samp(Quantity)|stddev_pop(Quantity)|stddev_samp(Quantity)|
+------------------+------------------+--------------------+---------------------+
|47559.303646609056|47559.391409298754|  218.08095663447796|   218.08115785023418|
+------------------+------------------+--------------------+---------------------+



In [25]:
from pyspark.sql.functions import kurtosis,skewness
df.select(skewness("Quantity"),kurtosis("Quantity")).show()

+-------------------+------------------+
| skewness(Quantity)|kurtosis(Quantity)|
+-------------------+------------------+
|-0.2640755761052562|119768.05495536952|
+-------------------+------------------+



In [26]:
# in Python
from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
covar_pop("InvoiceNo", "Quantity")).show()

+-------------------------+-------------------------------+------------------------------+
|corr(InvoiceNo, Quantity)|covar_samp(InvoiceNo, Quantity)|covar_pop(InvoiceNo, Quantity)|
+-------------------------+-------------------------------+------------------------------+
|     4.912186085635685E-4|             1052.7280543902734|            1052.7260778741693|
+-------------------------+-------------------------------+------------------------------+



In [27]:
from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"),collect_list("Country")).show()

+--------------------+---------------------+
|collect_set(Country)|collect_list(Country)|
+--------------------+---------------------+
|[Portugal, Italy,...| [United Kingdom, ...|
+--------------------+---------------------+



In [28]:
df.groupBy("InvoiceNo","CustomerId").count().show()

+---------+----------+-----+
|InvoiceNo|CustomerId|count|
+---------+----------+-----+
|   536846|     14573|   76|
|   537026|     12395|   12|
|   537883|     14437|    5|
|   538068|     17978|   12|
|   538279|     14952|    7|
|   538800|     16458|   10|
|   538942|     17346|   12|
|  C539947|     13854|    1|
|   540096|     13253|   16|
|   540530|     14755|   27|
|   541225|     14099|   19|
|   541978|     13551|    4|
|   542093|     17677|   16|
|   536596|      null|    6|
|   537252|      null|    1|
|   538041|      null|    1|
|   543188|     12567|   63|
|   543590|     17377|   19|
|  C543757|     13115|    1|
|  C544318|     12989|    1|
+---------+----------+-----+
only showing top 20 rows



In [7]:
spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")
from pyspark.sql.functions import col, to_date,to_timestamp

df_date = df.withColumn("date",to_date(col("InvoiceDate"),"MM/d/yyyy H:mm"))
df_date.createOrReplaceTempView("dfdate")

In [64]:
df_date.select("date").show()

+----------+
|      date|
+----------+
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
|2010-12-01|
+----------+
only showing top 20 rows



In [9]:

from pyspark.sql.window import Window
from pyspark.sql.functions import desc 

windo_spec = Window\
    .partitionBy("CustomerId","date") \
    .orderBy(desc("Quantity"))\
    .rowsBetween(Window.unboundedPreceding,Window.currentRow)

from pyspark.sql.functions import max, dense_rank, rank  
max_purchaseQuantity = max(col("Quantity")).over(windo_spec)

purchaseDenseRank = dense_rank().over(windo_spec)
purchaseRank = rank().over(windo_spec)

In [66]:
from pyspark.sql.functions import col 

df_date.where("CustomerId IS NOT NULL").orderBy("CustomerId")\
    .select(
        col("CustomerId"),
        col("date"),
        col("Quantity"),
        purchaseRank.alias("quantity_rank"),
        purchaseDenseRank.alias("quantity_dense_rank"),
        max_purchaseQuantity.alias("max_purchaseQuan")
    ).show()

+----------+----------+--------+-------------+-------------------+----------------+
|CustomerId|      date|Quantity|quantity_rank|quantity_dense_rank|max_purchaseQuan|
+----------+----------+--------+-------------+-------------------+----------------+
|     12346|2011-01-18|   74215|            1|                  1|           74215|
|     12346|2011-01-18|  -74215|            2|                  2|           74215|
|     12347|2010-12-07|      36|            1|                  1|              36|
|     12347|2010-12-07|      30|            2|                  2|              36|
|     12347|2010-12-07|      24|            3|                  3|              36|
|     12347|2010-12-07|      12|            4|                  4|              36|
|     12347|2010-12-07|      12|            4|                  4|              36|
|     12347|2010-12-07|      12|            4|                  4|              36|
|     12347|2010-12-07|      12|            4|                  4|          

In [10]:
df_date.select(max_purchaseQuantity).show()

+-------------------------------------------------------------------------------------------------------------------------------------+
|max(Quantity) OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)|
+-------------------------------------------------------------------------------------------------------------------------------------+
|                                                                                                                                  100|
|                                                                                                                                  100|
|                                                                                                                                  100|
|                                                                                                                                  100|
|                                               

In [11]:
no_null_df = df_date.dropna()

In [12]:
no_null_df.count()

406829

In [13]:
no_null = df_date.drop()

In [16]:
from pyspark.sql.functions import sum 
roll_up_df = no_null.rollup("date","Country").agg(sum("Quantity"))\
    .select("date","Country",("sum(Quantity)").alias("total_quan")).orderBy("date")

In [17]:
roll_up_df.show()

+----------+--------------+----------+
|      date|       Country|total_quan|
+----------+--------------+----------+
|      null|          null|   5176450|
|2010-12-01|   Netherlands|        97|
|2010-12-01|United Kingdom|     23949|
|2010-12-01|        Norway|      1852|
|2010-12-01|        France|       449|
|2010-12-01|          EIRE|       243|
|2010-12-01|       Germany|       117|
|2010-12-01|     Australia|       107|
|2010-12-01|          null|     26814|
|2010-12-02|          EIRE|         4|
|2010-12-02|          null|     21023|
|2010-12-02|       Germany|       146|
|2010-12-02|United Kingdom|     20873|
|2010-12-03|       Belgium|       528|
|2010-12-03|        France|       239|
|2010-12-03|       Germany|       170|
|2010-12-03|         Spain|       400|
|2010-12-03|         Italy|       164|
|2010-12-03|   Switzerland|       110|
|2010-12-03|          null|     14830|
+----------+--------------+----------+
only showing top 20 rows



In [18]:
# find the grand total where null is 
roll_up_df.where("Country is Null").show()

+----------+-------+----------+
|      date|Country|total_quan|
+----------+-------+----------+
|      null|   null|   5176450|
|2010-12-01|   null|     26814|
|2010-12-02|   null|     21023|
|2010-12-03|   null|     14830|
|2010-12-05|   null|     16395|
|2010-12-06|   null|     21419|
|2010-12-07|   null|     24995|
|2010-12-08|   null|     22741|
|2010-12-09|   null|     18431|
|2010-12-10|   null|     20297|
|2010-12-12|   null|     10565|
|2010-12-13|   null|     17623|
|2010-12-14|   null|     20098|
|2010-12-15|   null|     18229|
|2010-12-16|   null|     29632|
|2010-12-17|   null|     16069|
|2010-12-19|   null|      3795|
|2010-12-20|   null|     14965|
|2010-12-21|   null|     15467|
|2010-12-22|   null|      3192|
+----------+-------+----------+
only showing top 20 rows



In [19]:
# where 
roll_up_df.where("Date is null").show()

+----+-------+----------+
|date|Country|total_quan|
+----+-------+----------+
|null|   null|   5176450|
+----+-------+----------+



In [20]:
no_null.cube("date","Country").agg(sum(col("Quantity")))\
    .select("date","Country","sum(Quantity)").orderBy("date").show()

+----+--------------------+-------------+
|date|             Country|sum(Quantity)|
+----+--------------------+-------------+
|null|                null|      5176450|
|null|               Japan|        25218|
|null|         Unspecified|         3300|
|null|           Australia|        83653|
|null|            Portugal|        16180|
|null|             Finland|        10666|
|null|                 RSA|          352|
|null|             Germany|       117448|
|null|             Lebanon|          386|
|null|              Cyprus|         6317|
|null|                 USA|         1034|
|null|United Arab Emirates|          982|
|null|           Hong Kong|         4769|
|null|           Singapore|         5234|
|null|              Norway|        19247|
|null|               Spain|        26824|
|null|     Channel Islands|         9479|
|null|  European Community|          497|
|null|      Czech Republic|          592|
|null|             Denmark|         8188|
+----+--------------------+-------

In [25]:
# grouping id 
from pyspark.sql.functions import grouping_id,sum, expr 
no_null.cube("customerId","StockCode").agg(grouping_id(), sum("Quantity"))\
    .orderBy(grouping_id()).show()

+----------+---------+-------------+-------------+
|customerId|StockCode|grouping_id()|sum(Quantity)|
+----------+---------+-------------+-------------+
|     13767|    21484|            0|            8|
|     15862|    22384|            0|            1|
|     16218|    22383|            0|          150|
|     14729|    22919|            0|            2|
|     15525|    22411|            0|            2|
|     15485|    22819|            0|           36|
|     12433|    21981|            0|           96|
|     13093|    22960|            0|           48|
|     16274|    21147|            0|            2|
|     13576|    21756|            0|            7|
|     18011|    22910|            0|            1|
|     15658|    21756|            0|            3|
|     14901|    22301|            0|            6|
|     13117|    84879|            0|            8|
|     15574|    22659|            0|            1|
|     15574|    21587|            0|            6|
|     14775|    22405|         

In [26]:
pivoted = df_date.groupBy("date").pivot("Country").sum()

In [38]:
pivoted.where("date > '2011-12-05'").select("date" ,"`USA_sum(CAST(Quantity AS BIGINT))`").show()

+----------+---------------------------------+
|      date|USA_sum(CAST(Quantity AS BIGINT))|
+----------+---------------------------------+
|2011-12-06|                             null|
|2011-12-09|                             null|
|2011-12-08|                             -196|
|2011-12-07|                             null|
+----------+---------------------------------+



In [30]:
pivoted.columns

['date',
 'Australia_sum(CAST(Quantity AS BIGINT))',
 'Australia_sum(UnitPrice)',
 'Australia_sum(CAST(CustomerID AS BIGINT))',
 'Austria_sum(CAST(Quantity AS BIGINT))',
 'Austria_sum(UnitPrice)',
 'Austria_sum(CAST(CustomerID AS BIGINT))',
 'Bahrain_sum(CAST(Quantity AS BIGINT))',
 'Bahrain_sum(UnitPrice)',
 'Bahrain_sum(CAST(CustomerID AS BIGINT))',
 'Belgium_sum(CAST(Quantity AS BIGINT))',
 'Belgium_sum(UnitPrice)',
 'Belgium_sum(CAST(CustomerID AS BIGINT))',
 'Brazil_sum(CAST(Quantity AS BIGINT))',
 'Brazil_sum(UnitPrice)',
 'Brazil_sum(CAST(CustomerID AS BIGINT))',
 'Canada_sum(CAST(Quantity AS BIGINT))',
 'Canada_sum(UnitPrice)',
 'Canada_sum(CAST(CustomerID AS BIGINT))',
 'Channel Islands_sum(CAST(Quantity AS BIGINT))',
 'Channel Islands_sum(UnitPrice)',
 'Channel Islands_sum(CAST(CustomerID AS BIGINT))',
 'Cyprus_sum(CAST(Quantity AS BIGINT))',
 'Cyprus_sum(UnitPrice)',
 'Cyprus_sum(CAST(CustomerID AS BIGINT))',
 'Czech Republic_sum(CAST(Quantity AS BIGINT))',
 'Czech Republic_

In [37]:
df.select("Country").distinct().orderBy(col("Country").desc()).show()

+--------------------+
|             Country|
+--------------------+
|         Unspecified|
|      United Kingdom|
|United Arab Emirates|
|                 USA|
|         Switzerland|
|              Sweden|
|               Spain|
|           Singapore|
|        Saudi Arabia|
|                 RSA|
|            Portugal|
|              Poland|
|              Norway|
|         Netherlands|
|               Malta|
|           Lithuania|
|             Lebanon|
|               Japan|
|               Italy|
|              Israel|
+--------------------+
only showing top 20 rows



# 8. Joins

- 1. Inner Joins 
- 2. Outer Joins 
- 3. Left Outer joins 
- 4. Right Outer joins
- 5. Left semi joins  
- 6. Left anti joins
- 7. Natural Joins 
- 8. Cross joins    

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  0|Michael Armbrust|               1|     [250, 100]|
+---+----------------+----------------+---------------+



In [5]:
# person data
person =  spark.createDataFrame(
    [
        (0,"Bill Chambers",0,[100]),
        (1,"Matei Zaharia",1,[500,250,100]),
        (2,"Michael Armbrust",1,[250,100]),(3,"avi",4,[100])
    ],schema=["id","name","graduate_program","spark_status"])
person.show()



# graduated df 
grad_data = [
    (0, "Masters", "School of Information", "UC Berkeley"),
(2, "Masters", "EECS", "UC Berkeley"),
(1, "Ph.D.", "EECS", "UC Berkeley")]
col = ["id", "degree", "department", "school"]

graduated_df = spark.createDataFrame(grad_data,schema=col)

# spark status df 
spark_data = [(500, "Vice President"),
(250, "PMC Member"),
(100, "Contributor")]
spark_schema =["id",'status']

spark_status_df = spark.createDataFrame(spark_data,schema=spark_schema)

+---+----------------+----------------+---------------+
| id|            name|graduate_program|   spark_status|
+---+----------------+----------------+---------------+
|  0|   Bill Chambers|               0|          [100]|
|  1|   Matei Zaharia|               1|[500, 250, 100]|
|  2|Michael Armbrust|               1|     [250, 100]|
|  3|             avi|               4|          [100]|
+---+----------------+----------------+---------------+



In [6]:
person.createOrReplaceTempView("person")
graduated_df.createOrReplaceTempView("graduateProgram")
spark_status_df.createOrReplaceTempView("sparkStatus")

In [60]:
# inner join
join_expr = person["graduate_program"]==graduated_df['id']

person.join(graduated_df,person.graduate_program==graduated_df.id).show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  0|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [56]:
# outer join
person.join(graduated_df,person.graduate_program==graduated_df.id,"outer").show()

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   0|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [61]:
# left outer join 
person.join(graduated_df,join_expr,"left_outer").show()

+---+----------------+----------------+---------------+----+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|  id| degree|          department|     school|
+---+----------------+----------------+---------------+----+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|   0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|   1|  Ph.D.|                EECS|UC Berkeley|
|  0|Michael Armbrust|               1|     [250, 100]|   1|  Ph.D.|                EECS|UC Berkeley|
|  3|             avi|               4|          [100]|null|   null|                null|       null|
+---+----------------+----------------+---------------+----+-------+--------------------+-----------+



In [62]:
# right_outer jon
person.join(graduated_df,join_expr,"right_outer").show()

+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+
|   0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|   1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|   0|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|null|            null|            null|           null|  2|Masters|                EECS|UC Berkeley|
+----+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [68]:
# left_semi
graduated_df.join(person,graduated_df.id==person.graduate_program,'left_semi').show()

+---+-------+--------------------+-----------+
| id| degree|          department|     school|
+---+-------+--------------------+-----------+
|  0|Masters|School of Informa...|UC Berkeley|
|  1|  Ph.D.|                EECS|UC Berkeley|
+---+-------+--------------------+-----------+



In [69]:
# left antijoin
graduated_df.join(person,graduated_df.id==person.graduate_program,'left_anti').show()

+---+-------+----------+-----------+
| id| degree|department|     school|
+---+-------+----------+-----------+
|  2|Masters|      EECS|UC Berkeley|
+---+-------+----------+-----------+



In [73]:
spark.sql('SELECT * FROM graduateProgram NATURAL JOIN person').show(
    
)

+---+-------+--------------------+-----------+----------------+----------------+---------------+
| id| degree|          department|     school|            name|graduate_program|   spark_status|
+---+-------+--------------------+-----------+----------------+----------------+---------------+
|  0|Masters|School of Informa...|UC Berkeley|   Bill Chambers|               0|          [100]|
|  0|Masters|School of Informa...|UC Berkeley|Michael Armbrust|               1|     [250, 100]|
|  1|  Ph.D.|                EECS|UC Berkeley|   Matei Zaharia|               1|[500, 250, 100]|
+---+-------+--------------------+-----------+----------------+----------------+---------------+



In [78]:
join_expr = person["graduate_program"]==graduated_df['id']
joinType = "cross"
graduated_df.join(person,how=joinType).show()

IllegalArgumentException: requirement failed: Unsupported using join type Cross

In [79]:
person.crossJoin(graduated_df).show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  0|   Bill Chambers|               0|          [100]|  2|Masters|                EECS|UC Berkeley|
|  0|   Bill Chambers|               0|          [100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  2|Masters|                EECS|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  0|Masters|School of Informa...|UC 

In [80]:
# 
from pyspark.sql.functions import expr 

person.withColumnRenamed("id","personId")\
    .join(spark_status_df,expr("array_contains(spark_status,id)")).show()

+--------+----------------+----------------+---------------+---+--------------+
|personId|            name|graduate_program|   spark_status| id|        status|
+--------+----------------+----------------+---------------+---+--------------+
|       0|   Bill Chambers|               0|          [100]|100|   Contributor|
|       1|   Matei Zaharia|               1|[500, 250, 100]|500|Vice President|
|       1|   Matei Zaharia|               1|[500, 250, 100]|250|    PMC Member|
|       1|   Matei Zaharia|               1|[500, 250, 100]|100|   Contributor|
|       2|Michael Armbrust|               1|     [250, 100]|250|    PMC Member|
|       2|Michael Armbrust|               1|     [250, 100]|100|   Contributor|
|       3|             avi|               4|          [100]|100|   Contributor|
+--------+----------------+----------------+---------------+---+--------------+



In [8]:
graduate_dum1 = graduated_df.withColumnRenamed("id","graduate_program")

graduate_dum1.columns

['graduate_program', 'degree', 'department', 'school']

In [7]:
person.join(graduated_df,person.graduate_program==graduated_df.id).show()

+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status| id| degree|          department|     school|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|  0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|  1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+---+-------+--------------------+-----------+



In [10]:
person.join(graduate_dum1,person.graduate_program==graduate_dum1.graduate_program).show()

+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
| id|            name|graduate_program|   spark_status|graduate_program| degree|          department|     school|
+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+
|  0|   Bill Chambers|               0|          [100]|               0|Masters|School of Informa...|UC Berkeley|
|  1|   Matei Zaharia|               1|[500, 250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
|  2|Michael Armbrust|               1|     [250, 100]|               1|  Ph.D.|                EECS|UC Berkeley|
+---+----------------+----------------+---------------+----------------+-------+--------------------+-----------+



In [12]:
person.join(graduate_dum1,"graduate_program").select("graduate_program").show()

+----------------+
|graduate_program|
+----------------+
|               0|
|               1|
|               1|
+----------------+



In [13]:
spark.stop()