# Spark Exercises

In [1]:
import numpy as np
import pandas as pd

import pyspark
from pyspark.sql.functions import *
# Imported after the wildcard import so `data` always refers to pydataset's loader.
from pydataset import data

# Reuse the active SparkSession if one exists, otherwise start a new one.
spark = pyspark.sql.SparkSession.builder.getOrCreate()
# Bare last expression: display the session object as the cell output.
spark

## 1. Create a Spark DataFrame that contains your favorite programming languages.

- The name of the column should be `language`
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
# Seed NumPy's global RNG *before* sampling so the 100 random picks are
# reproducible on every Restart & Run All. Seeding after the draw (as this
# cell previously did) has no effect on the values already sampled.
np.random.seed(56)

# Candidate values for the single `language` column.
languages = [
    "Python", "JavaScript", "SQL", "Go", "Java",
    "R", "Julia", "C#", "PHP", "C/C++",
]

# One-column pandas frame holding 100 randomly chosen languages.
pdf = pd.DataFrame({"language": np.random.choice(languages, size=100)})

pdf.head()

Unnamed: 0,language
0,Go
1,JavaScript
2,SQL
3,C#
4,C/C++


In [3]:
# The name of the column should be language.
# Convert the pandas frame to a Spark DataFrame; the single column is named
# `language`, as the repr displayed by the bare `sdf` expression shows.
sdf = spark.createDataFrame(pdf)
sdf

DataFrame[language: string]

In [4]:
# View the schema of the dataframe: printSchema() prints each column with its
# type and nullability as a tree.
sdf.printSchema()

root
 |-- language: string (nullable = true)



In [5]:
# Output the shape of the dataframe.
# A Spark DataFrame has no `.shape`, so the row count comes from count()
# and the column count from the length of the column-name list.
n_rows = sdf.count()
n_cols = len(sdf.columns)
print(f"""The spark DataFrame is composed of {n_rows} rows
and {n_cols} column(s).""")

The spark DataFrame is composed of 100 rows
and 1 column(s).


In [6]:
# Show the first 5 records in the dataframe.
# show(5) prints the first 5 rows as an ASCII table.
sdf.show(5)

+----------+
|  language|
+----------+
|        Go|
|JavaScript|
|       SQL|
|        C#|
|     C/C++|
+----------+
only showing top 5 rows



## 2. Load the `mpg` dataset as a spark dataframe.

1. For each vehicle, create 1 column of output that contains a message like the one below:

    `The 1999 audi a4 has a 4 cylinder engine.`

2. Transform the `trans` column so that it only contains either `manual` or `auto`.

In [7]:
# Load the pydataset `mpg` table into a Spark DataFrame.
# `data` is imported from pydataset in the notebook's top import cell, so all
# dependencies are declared in one place.
mpg = spark.createDataFrame(data("mpg"))
# print() emits the one-line DataFrame[...] column/type summary; show(5) renders the first 5 rows.
print(mpg)
mpg.show(5)

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [8]:
# Build the sentence "The <year> <manufacturer> <model> has a <cyl> cylinder engine."
# for each vehicle by concatenating literal text with column values.
sentence_parts = [
    lit("The "), col("year"),
    lit(" "), col("manufacturer"),
    lit(" "), col("model"),
    lit(" has a "), col("cyl"),
    lit(" cylinder engine."),
]
vehicle_cyl_desc = concat(*sentence_parts).alias("vehicle_cyl_desc")

# truncate=False keeps each full sentence visible instead of cutting long values.
mpg.select(vehicle_cyl_desc).show(truncate=False)

+--------------------------------------------------------------+
|vehicle_cyl_desc                                              |
+--------------------------------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 2008 audi a4 has a 4 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 has a 6 cylinder engine.                     |
|The 2008 audi a4 has a 6 cylinder engine.                     |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 2008 audi a4 quattro has a 4 cylinder engine.             |
|The 1999 audi a4 quattro has a 6 cylinder engine.             |
|The 1999 audi a4 quattro

In [9]:
# Peek at the first 5 rows again; the `trans` values have the form
# auto(l5) / manual(m5), i.e. a transmission kind followed by a parenthesised code.
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [10]:
# Transform the trans column so that it only contains either
# manual or auto. Three approaches are shown side by side for comparison.

# 1) Capture the leading lowercase word that precedes the opening parenthesis.
trans_extract = regexp_extract("trans", r"^([a-z]+)\(", 1).alias("trans_extract")

# 2) Strip everything from the opening parenthesis to the end of the string.
trans_replace = regexp_replace("trans", r"\(.+$", "").alias("trans_replace")

# 3) Label values starting with "auto" as auto and everything else as manual.
trans_when = (
    when(mpg.trans.like("auto%"), "auto")
    .otherwise("manual")
    .alias("trans_when")
)

mpg.select(trans_extract, trans_replace, trans_when).show()

+-------------+-------------+----------+
|trans_extract|trans_replace|trans_when|
+-------------+-------------+----------+
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|       manual|       manual|    manual|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
|         auto|         auto|      auto|
+-------------+-------------+----------+
only showing top

## 3. Load the `tips` dataset as a spark dataframe.
1. What percentage of observations are smokers?
2. Create a column that contains the tip percentage
3. Calculate the average tip percentage for each combination of sex and smoker.

In [11]:
# Load the pydataset `tips` table into a Spark DataFrame.
tips = spark.createDataFrame(data("tips"))
# print() emits the one-line DataFrame[...] column/type summary; show(5) renders the first 5 rows.
print(tips)
tips.show(5)

DataFrame[total_bill: double, tip: double, sex: string, smoker: string, day: string, time: string, size: bigint]
+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [12]:
# What percentage of observations are smokers?
