# Spark 101 Exercises

In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pydataset import data

spark = pyspark.sql.SparkSession.builder.getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/05/19 08:59:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/05/19 08:59:41 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


## 1

Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
# create dataframe and view schema
df = spark.createDataFrame(pd.DataFrame(['Python', 'C++', 'C#', 'BF', 'TheOneWithTheCode'], columns = ['language']))
df.printSchema()

root
 |-- language: string (nullable = true)



In [3]:
# print shape of the dataframe
print((df.count(), len(df.columns)))

[Stage 0:>                                                          (0 + 8) / 8]

(5, 1)


                                                                                

In [4]:
df.show(5)

+-----------------+
|         language|
+-----------------+
|           Python|
|              C++|
|               C#|
|               BF|
|TheOneWithTheCode|
+-----------------+



## 2

Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:

        The 1999 audi a4 has a 4 cylinder engine.

    For each vehicle.

- Transform the trans column so that it only contains either manual or auto.

In [5]:
mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [6]:
# Create a column in the form "The (year) (manufacturer) (model) has a (cyl) engine."

mpg.select(
    F.concat(
        F.lit('The '),
        mpg.year,
        F.lit(' '),
        mpg.manufacturer,
        F.lit(' '),
        mpg.model,
        F.lit(' has a '),
        mpg.cyl,
        F.lit(' engine.')
    ).alias('Vehicle Description')
).show(5, truncate = False)

+--------------------------------+
|Vehicle Description             |
+--------------------------------+
|The 1999 audi a4 has a 4 engine.|
|The 1999 audi a4 has a 4 engine.|
|The 2008 audi a4 has a 4 engine.|
|The 2008 audi a4 has a 4 engine.|
|The 1999 audi a4 has a 6 engine.|
+--------------------------------+
only showing top 5 rows



In [7]:
# Remove the text in parantheses from the trans column.
mpg.select(F.regexp_replace('trans', r'\(\w+\)', '')).show(5)

+-----------------------------------+
|regexp_replace(trans, \(\w+\), , 1)|
+-----------------------------------+
|                               auto|
|                             manual|
|                             manual|
|                               auto|
|                               auto|
+-----------------------------------+
only showing top 5 rows



## 3

Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [10]:
df = spark.createDataFrame(data('tips'))

In [11]:
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [12]:
# Calculate the percentage of observations that are smokers.
df.where(df.smoker == 'Yes').count() / df.count()

0.38114754098360654

In [13]:
# Create tip percentage column.
df = df.withColumn('tip_percentage', df.tip / df.total_bill)
df.show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



In [15]:
# Calculate average tip percentage for groups of sex and smoker.
df.groupBy('sex', 'smoker').agg(F.mean(df.tip_percentage)).show()

[Stage 17:>                                                         (0 + 8) / 8]

+------+------+-------------------+
|   sex|smoker|avg(tip_percentage)|
+------+------+-------------------+
|  Male|    No| 0.1606687151291298|
|Female|    No| 0.1569209707691836|
|  Male|   Yes|0.15277117520248512|
|Female|   Yes|0.18215035269941032|
+------+------+-------------------+



                                                                                

## 4

Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [18]:
from vega_datasets import data

In [26]:
df = spark.createDataFrame(data.seattle_weather())
df.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [27]:
# Convert temperatures to fahrenheit.

df = df.withColumn('temp_max', df.temp_max * 1.8 + 32).withColumn('temp_min', df.temp_min * 1.8 + 32)
df.show(5)

+-------------------+-------------+------------------+--------+----+-------+
|               date|precipitation|          temp_max|temp_min|wind|weather|
+-------------------+-------------+------------------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|55.040000000000006|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|             51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|             53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|             53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|48.019999999999996|   37.04| 6.1|   rain|
+-------------------+-------------+------------------+--------+----+-------+
only showing top 5 rows



In [34]:
# Calculate month with most rain on average.

df.withColumn('month', F.month('date'))\
    .where(df.weather == 'rain')\
    .groupBy('month')\
    .agg(F.mean('precipitation'))\
    .sort('avg(precipitation)', ascending = False).show(1)

+-----+------------------+
|month|avg(precipitation)|
+-----+------------------+
|   10|             9.675|
+-----+------------------+
only showing top 1 row



In [35]:
# Calculate windiest year.

df.withColumn('year', F.year('date'))\
    .groupBy('year')\
    .agg(F.sum('wind'))\
    .sort('sum(wind)', ascending = False).show(1)

+----+---------+
|year|sum(wind)|
+----+---------+
|2012|   1244.7|
+----+---------+
only showing top 1 row



In [47]:
# Calculate most frequent type of weather in January.

df.where(F.month('date') == 1)\
    .groupBy('weather')\
    .count()\
    .sort('count', ascending = False).show(1)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
+-------+-----+
only showing top 1 row



In [56]:
# What is the average high and low temperature on sunny days in July in 2013 and 2014?

df.where((df.weather == 'sun')\
         & (F.month('date') == 7)\
         & ((F.year('date') == 2013)\
         | (F.year('date') == 2014)))\
    .select(F.mean('temp_max'), F.mean('temp_min')).show()

+-----------------+-----------------+
|    avg(temp_max)|    avg(temp_min)|
+-----------------+-----------------+
|80.29192307692308|57.52884615384615|
+-----------------+-----------------+



In [62]:
# What percentage of days were rainy in q3 of 2015?

from pyspark.sql.window import Window

df.where((F.year('date') == 2015) & (F.quarter('date') == 3))\
    .groupBy('weather')\
    .count()\
    .withColumn('percentage', F.col('count') / F.sum('count').over(Window.partitionBy())).show()

22/05/19 10:37:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/19 10:37:33 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/19 10:37:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.
22/05/19 10:37:34 WARN WindowExec: No Partition Defined for Window operation! Moving all data to a single partition, this can cause serious performance degradation.


+-------+-----+--------------------+
|weather|count|          percentage|
+-------+-----+--------------------+
|    fog|   21| 0.22826086956521738|
|drizzle|    5| 0.05434782608695652|
|   rain|    2|0.021739130434782608|
|    sun|   64|  0.6956521739130435|
+-------+-----+--------------------+



                                                                                

In [71]:
# For each year, find what percentage of days it rained (had non-zero precipitation).

df.withColumn('year', F.year('date'))\
    .withColumn('non_zero_precipitation', (df.precipitation > 0).cast('int'))\
    .select('year', 'non_zero_precipitation')\
    .groupBy('year')\
    .agg(F.mean('non_zero_precipitation')).show()

[Stage 96:>                                                         (0 + 8) / 8]

+----+---------------------------+
|year|avg(non_zero_precipitation)|
+----+---------------------------+
|2012|        0.48360655737704916|
|2013|        0.41643835616438357|
|2014|          0.410958904109589|
|2015|        0.39452054794520547|
+----+---------------------------+



                                                                                