# Spark 101 Exercises

In [19]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import pyspark
import pyspark.sql.functions as F
from pydataset import data

spark = pyspark.sql.SparkSession.builder.getOrCreate()

## 1

Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [11]:
# create dataframe and view schema
df = spark.createDataFrame(pd.DataFrame(['Python', 'C++', 'C#', 'BF', 'TheOneWithTheCode'], columns = ['language']))
df.printSchema()

root
 |-- language: string (nullable = true)



In [12]:
# print shape of the dataframe
print((df.count(), len(df.columns)))

(5, 1)


In [13]:
df.show(5)

+-----------------+
|         language|
+-----------------+
|           Python|
|              C++|
|               C#|
|               BF|
|TheOneWithTheCode|
+-----------------+



## 2

Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:

        The 1999 audi a4 has a 4 cylinder engine.

    For each vehicle.

- Transform the trans column so that it only contains either manual or auto.

In [18]:
mpg = spark.createDataFrame(data('mpg'))
mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [24]:
mpg.select(
    F.concat(
        F.lit('The '),
        mpg.year,
        F.lit(' '),
        mpg.manufacturer,
        F.lit(' '),
        mpg.model,
        F.lit(' has a '),
        mpg.cyl,
        F.lit(' engine.')
    ).alias('Vehicle Description')
).show(5, truncate = False)

+--------------------------------+
|Vehicle Description             |
+--------------------------------+
|The 1999 audi a4 has a 4 engine.|
|The 1999 audi a4 has a 4 engine.|
|The 2008 audi a4 has a 4 engine.|
|The 2008 audi a4 has a 4 engine.|
|The 1999 audi a4 has a 6 engine.|
+--------------------------------+
only showing top 5 rows



In [26]:
mpg.select(F.regexp_replace('trans', r'\(\w+\)', '')).show(5)

+-----------------------------------+
|regexp_replace(trans, \(\w+\), , 1)|
+-----------------------------------+
|                               auto|
|                             manual|
|                             manual|
|                               auto|
|                               auto|
+-----------------------------------+
only showing top 5 rows



## 3

Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.