In [43]:
# Import libraries

import re

import pandas as pd
import pyspark
from pyspark.sql.functions import *

# Both packages export a loader named `data`; the later import rebinds the
# bare name, so `data` ends up referring to the vega_datasets loader.
# The aliases give each loader an unambiguous name.
from pydataset import data
from pydataset import data as pydataset_data
from vega_datasets import data
from vega_datasets import data as vega_data

# Create (or reuse) the Spark session used to build the dataframes below
session_builder = pyspark.sql.SparkSession.builder
spark = session_builder.getOrCreate()

### 1. Create a spark data frame that contains your favorite programming languages.

1. The name of the column should be language
1. View the schema of the dataframe
1. Output the shape of the dataframe
1. Show the first 5 records in the dataframe

In [6]:
# Build the list of languages as a one-column pandas dataframe first
favorite_languages = [
    'Python', 'JavaScript', 'Java', 'C#', 'C',
    'C++', 'Go', 'R', 'Swift', 'PHP',
]
languages_pd = pd.DataFrame({'language': favorite_languages})

# Convert the pandas dataframe to a spark dataframe
df = spark.createDataFrame(languages_pd)

# View the schema of the spark dataframe
df.printSchema()

# Output the shape of the dataframe (rows x columns)
n_rows, n_cols = df.count(), len(df.columns)
print("DataFrame shape:", n_rows, " x ", n_cols)

# Show the first 5 records in the dataframe
df.show(5)

root
 |-- language: string (nullable = true)

DataFrame shape: 10  x  1
+----------+
|  language|
+----------+
|    Python|
|JavaScript|
|      Java|
|        C#|
|         C|
+----------+
only showing top 5 rows



### 2. Load the `mpg` dataset as a spark dataframe.

In [8]:
# Load the mpg dataset as a spark dataframe.
# The pydataset loader is called through its alias because the bare name
# `data` is rebound by the later vega_datasets import, so on a fresh kernel
# `data('mpg')` would not reach pydataset.
mpg = spark.createDataFrame(pydataset_data('mpg'))

# Print the schema
mpg.printSchema()

# Print the first 3 rows
mpg.show(3)

root
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- displ: double (nullable = true)
 |-- year: long (nullable = true)
 |-- cyl: long (nullable = true)
 |-- trans: string (nullable = true)
 |-- drv: string (nullable = true)
 |-- cty: long (nullable = true)
 |-- hwy: long (nullable = true)
 |-- fl: string (nullable = true)
 |-- class: string (nullable = true)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 3 rows



### 2-a. Create 1 column of output that contains a message like the one below:
`The 1999 audi a4 has a 4 cylinder engine.`

In [19]:
# Assemble the sentence from literal fragments and column values,
# e.g. "The 1999 audi a4 has a 4 cylinder engine"
sentence_parts = [
    lit("The "),
    col("year"),
    lit(" "),
    col("manufacturer"),
    lit(" "),
    col("model"),
    lit(" has a "),
    col("cyl"),
    lit(" cylinder engine"),
]
vehicle_desc = concat(*sentence_parts).alias("vehicle_cylinder_desc")

mpg.select(vehicle_desc).show(5, truncate=False)

+----------------------------------------+
|vehicle_cylinder_desc                   |
+----------------------------------------+
|The 1999 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 2008 audi a4 has a 4 cylinder engine|
|The 1999 audi a4 has a 6 cylinder engine|
+----------------------------------------+
only showing top 5 rows



### 2-b. Transform the trans column so that it only contains either manual or auto.
- This can be done in many ways:
    - `regexp_extract`
    - `regexp_replace`
    - `when`

In [20]:
# Peek at the raw `trans` values before cleaning them
mpg.select("trans").show(5)

+----------+
|     trans|
+----------+
|  auto(l5)|
|manual(m5)|
|manual(m6)|
|  auto(av)|
|  auto(l5)|
+----------+
only showing top 5 rows



In [45]:
# Four ways to reduce `trans` to just "auto" or "manual", shown side by side
leading_letters = regexp_extract("trans", r"^([a-z]*)", 1)   # lowercase letters at the start
word_before_paren = regexp_extract("trans", r"^(\w+)\(", 1)  # word characters right before "("
paren_removed = regexp_replace("trans", r"\(.+$", "")        # drop "(" through the end
# `when` produces a value where its condition holds; `otherwise` covers the rest
auto_or_manual = when(col("trans").like("auto%"), "auto").otherwise("manual")

mpg.select(
    mpg.trans,
    leading_letters.alias("transmission"),
    word_before_paren.alias("regexp_extract"),
    paren_removed.alias("regexp_replace"),
    auto_or_manual.alias("when + like"),
).show()

+----------+------------+--------------+--------------+-----------+
|     trans|transmission|regexp_extract|regexp_replace|when + like|
+----------+------------+--------------+--------------+-----------+
|  auto(l5)|        auto|          auto|          auto|       auto|
|manual(m5)|      manual|        manual|        manual|     manual|
|manual(m6)|      manual|        manual|        manual|     manual|
|  auto(av)|        auto|          auto|          auto|       auto|
|  auto(l5)|        auto|          auto|          auto|       auto|
|manual(m5)|      manual|        manual|        manual|     manual|
|  auto(av)|        auto|          auto|          auto|       auto|
|manual(m5)|      manual|        manual|        manual|     manual|
|  auto(l5)|        auto|          auto|          auto|       auto|
|manual(m6)|      manual|        manual|        manual|     manual|
|  auto(s6)|        auto|          auto|          auto|       auto|
|  auto(l5)|        auto|          auto|        

In [43]:
import re

def trans(string):
    """Match the leading word of a label such as "manual(m5)".

    Parameters
    ----------
    string : str
        Text to search.

    Returns
    -------
    re.Match or None
        Match for one or more word characters at the start of `string`
        immediately followed by "(" (the word itself is capture group 1),
        or None when `string` does not start that way.
    """
    leading_word = re.compile(r'^(\w+)\(')
    return leading_word.search(string)

In [44]:
# Try the helper on one sample label; the cell displays the returned match object
trans("manual(m5)")

<re.Match object; span=(0, 7), match='manual('>

### 3. Load the `tips` dataset as a spark dataframe

In [5]:
# Load the tips dataset as a spark dataframe.
# The pydataset loader is called through its alias because the bare name
# `data` is rebound by the later vega_datasets import.

tips = spark.createDataFrame(pydataset_data('tips'))

# Print the schema
tips.printSchema()

# Print the first 5 rows
tips.show(5)

root
 |-- total_bill: double (nullable = true)
 |-- tip: double (nullable = true)
 |-- sex: string (nullable = true)
 |-- smoker: string (nullable = true)
 |-- day: string (nullable = true)
 |-- time: string (nullable = true)
 |-- size: long (nullable = true)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



### 3-a. What percentage of observations are smokers?

In [39]:
# Use roll up
# rollup('smoker') adds a grand-total row (smoker = null) to the per-group counts.

# Take the denominator from the data instead of hard-coding the row count
total_rows = tips.count()

(
    tips.rollup('smoker')
        .count()
        .sort('smoker')
        .select(
            col("smoker"),
            concat(
                round(col("count") / total_rows * 100, 0).cast("int"),
                lit("%")
            ).alias("percentage")
        )
).show()

+------+----------+
|smoker|percentage|
+------+----------+
|  null|      100%|
|    No|       62%|
|   Yes|       38%|
+------+----------+



In [35]:
# Use groupby

n_observations = tips.count()  # denominator for the percentages
smoker_share = concat(
    round(col("count") / n_observations * 100, 0).cast("int"),
    lit("%"),
)

# Add a new column named 'percentage' next to each group's count
tips.groupby("smoker").count().withColumn("percentage", smoker_share).show()

+------+-----+----------+
|smoker|count|percentage|
+------+-----+----------+
|    No|  151|       62%|
|   Yes|   93|       38%|
+------+-----+----------+



### 3-b. Create a column that contains the tip percentage

In [23]:
# Show bill and tip next to tip / total_bill (a fraction, despite the column name)
tip_share = (col("tip") / col("total_bill")).alias("percentage")
tips.select("total_bill", "tip", tip_share).show(5)

+----------+----+-------------------+
|total_bill| tip|         percentage|
+----------+----+-------------------+
|     16.99|1.01|0.05944673337257211|
|     10.34|1.66|0.16054158607350097|
|     21.01| 3.5|0.16658733936220846|
|     23.68|3.31| 0.1397804054054054|
|     24.59|3.61|0.14680764538430255|
+----------+----+-------------------+
only showing top 5 rows



In [40]:
# Append tip / total_bill to the full table as `tip_percentage`
tip_ratio = tips.tip / tips.total_bill
tips.withColumn("tip_percentage", tip_ratio).show(5)

+----------+----+------+------+---+------+----+-------------------+
|total_bill| tip|   sex|smoker|day|  time|size|     tip_percentage|
+----------+----+------+------+---+------+----+-------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|0.05944673337257211|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|0.16054158607350097|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|0.16658733936220846|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1397804054054054|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|0.14680764538430255|
+----------+----+------+------+---+------+----+-------------------+
only showing top 5 rows



### 3-c. Calculate the average tip percentage for each combination of sex and smoker.

In [32]:
# groupby alone

# Average of tip / total_bill within each (sex, smoker) combination
mean_tip_share = mean(tips.tip / tips.total_bill).alias("percentage")
tips.groupby("sex", "smoker").agg(mean_tip_share).show()

+------+------+-------------------+
|   sex|smoker|         percentage|
+------+------+-------------------+
|  Male|    No| 0.1606687151291298|
|  Male|   Yes|0.15277117520248512|
|Female|    No| 0.1569209707691836|
|Female|   Yes|0.18215035269941032|
+------+------+-------------------+



In [42]:
# Groupby + pivot

# One row per `sex`, one column per `smoker` value, cells hold the mean tip ratio
tips_with_ratio = tips.withColumn("tip_percentage", col("tip") / col("total_bill"))
tips_with_ratio.groupby("sex").pivot("smoker").agg(mean("tip_percentage")).show(5)

+------+------------------+-------------------+
|   sex|                No|                Yes|
+------+------------------+-------------------+
|Female|0.1569209707691836|0.18215035269941032|
|  Male|0.1606687151291298|0.15277117520248512|
+------+------------------+-------------------+



### 4. Use the seattle weather dataset referenced in the lesson to answer the questions below.

In [44]:
# Load the seattle weather dataset in a pyspark dataframe.
# The vega_datasets loader is called through its explicit alias so this cell
# does not depend on which `data` import happened to run last.
weather = spark.createDataFrame(vega_data.seattle_weather())

# Print the Schema
weather.printSchema()

# Print the first 5 rows
weather.show(5)

root
 |-- date: timestamp (nullable = true)
 |-- precipitation: double (nullable = true)
 |-- temp_max: double (nullable = true)
 |-- temp_min: double (nullable = true)
 |-- wind: double (nullable = true)
 |-- weather: string (nullable = true)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



### 4-a. Convert the temperatures to Fahrenheit.

In [48]:
# Add Fahrenheit versions of the max and min temperature as two new columns

max_fahrenheit = col("temp_max") * 9 / 5 + 32
min_fahrenheit = col("temp_min") * 9 / 5 + 32

(
    weather
    .withColumn("temp_max_f", max_fahrenheit)
    .withColumn("temp_min_f", min_fahrenheit)
    .show(5)
)

+-------------------+-------------+--------+--------+----+-------+----------+----------+
|               date|precipitation|temp_max|temp_min|wind|weather|temp_max_f|temp_min_f|
+-------------------+-------------+--------+--------+----+-------+----------+----------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|     55.04|      41.0|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|     51.08|     37.04|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|     53.06|     44.96|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|     53.96|     42.08|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|     48.02|     37.04|
+-------------------+-------------+--------+--------+----+-------+----------+----------+
only showing top 5 rows



In [109]:
# The question asks to convert, not to add columns: reuse the original column
# names so the displayed frame carries Fahrenheit values (`weather` itself is
# not reassigned).

fahrenheit_max = col("temp_max") * 9 / 5 + 32
fahrenheit_min = col("temp_min") * 9 / 5 + 32

(
    weather
    .withColumn("temp_max", fahrenheit_max)
    .withColumn("temp_min", fahrenheit_min)
    .show(5)
)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



### 4-b. Which month has the most rain, on average?

In [122]:
# Mean daily precipitation per calendar month, wettest month first
days_by_month = weather.withColumn("month", month("date")).groupby("month")
avg_daily_rain = days_by_month.agg(mean("precipitation").alias("average_rain_per_day"))
avg_daily_rain.orderBy(desc("average_rain_per_day")).show()

+-----+--------------------+
|month|average_rain_per_day|
+-----+--------------------+
|   11|   5.354166666666667|
|   12|   5.021774193548389|
|    3|   4.888709677419355|
|   10|   4.059677419354839|
|    1|  3.7580645161290316|
|    2|   3.734513274336283|
|    4|   3.128333333333333|
|    9|  1.9624999999999997|
|    5|  1.6733870967741935|
|    8|  1.3201612903225806|
|    6|  1.1075000000000002|
|    7| 0.38870967741935486|
+-----+--------------------+



In [118]:
# Total precipitation for each (month, year) pair, then the mean of those
# totals per calendar month; only the top row is shown.
# Note: `sum` here is pyspark.sql.functions.sum (the star import shadows the builtin).
monthly_totals = (
    weather.groupby(month("date"), year("date"))
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
)
avg_of_totals = (
    monthly_totals.groupby("month(date)")  # name Spark generated for the unaliased month(...) column
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
)
avg_of_totals.orderBy(desc("avg_monthly_rain")).show(1)

+-----------+----------------+
|month(date)|avg_monthly_rain|
+-----------+----------------+
|         11|         160.625|
+-----------+----------------+
only showing top 1 row



### 4-c. Which year was the windiest?

In [64]:
# Mean wind reading per year, windiest year first
wind_by_year = weather.withColumn("year", year("date")).groupby("year")
wind_by_year.agg(mean("wind").alias("avg_wind")).orderBy(desc("avg_wind")).show()

+----+------------------+
|year|          avg_wind|
+----+------------------+
|2012| 3.400819672131148|
|2014| 3.387671232876714|
|2015| 3.159726027397261|
|2013|3.0158904109589058|
+----+------------------+



In [123]:
# Same ranking, using the yearly sum of wind readings instead of the mean
readings_by_year = weather.withColumn("year", year("date")).groupby("year")
yearly_wind_sum = sum(col("wind")).alias("total_wind")
readings_by_year.agg(yearly_wind_sum).orderBy(desc("total_wind")).show()

+----+------------------+
|year|        total_wind|
+----+------------------+
|2012|            1244.7|
|2014|1236.5000000000007|
|2015|1153.3000000000002|
|2013|1100.8000000000006|
+----+------------------+



### 4-d. What is the most frequent type of weather in January?

In [66]:
# Frequency of each weather type over all months, as a baseline
weather_counts = weather.groupby("weather").count()
weather_counts.show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|  411|
|drizzle|   54|
|   rain|  259|
|    sun|  714|
|   snow|   23|
+-------+-----+



In [72]:
# Count weather types over January rows, most frequent first.
# Grouping by "month" is not needed after the filter; it only keeps the column in the output.
january_rows = weather.withColumn("month", month("date")).where(col("month") == 1)
january_rows.groupby("month", "weather").count().orderBy(desc("count")).show()

+-----+-------+-----+
|month|weather|count|
+-----+-------+-----+
|    1|    fog|   38|
|    1|   rain|   35|
|    1|    sun|   33|
|    1|drizzle|   10|
|    1|   snow|    8|
+-----+-------+-----+



In [124]:
# Same January tally, grouped by weather type only
january_days = weather.withColumn("month", month("date")).where(col("month") == 1)
january_tally = january_days.groupby("weather").count()
january_tally.orderBy(desc("count")).show()

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



### 4-e. What is the average high and low temperature on sunny days in July in 2013 and 2014?
- Conditions: sunny days, July, 2013 and 2014
- aggregation: high, low

In [82]:
# Sunny days in July of 2013 and 2014.
# Column conditions can be combined with & and | (wrap each comparison in
# parentheses), so the four chained filters collapse into one predicate.
sunny_july_2013_2014 = (
    year("date").isin(2013, 2014)
    & (month("date") == 7)
    & (col("weather") == "sun")
)

(
    weather.filter(sunny_july_2013_2014)
    .agg(
        mean("temp_max").alias("average_high"),  # avg() works here as well
        mean("temp_min").alias("average_low"),   # avg() works here as well
    )
).show()

+------------------+-----------------+
|      average_high|      average_low|
+------------------+-----------------+
|26.828846153846158|14.18269230769231|
+------------------+-----------------+



### 4-f. What percentage of days were rainy in q3 of 2015?
- Condition: q3 in 2015

In [98]:
# Rows for the third quarter of 2015
q3_2015 = weather.filter(year("date") == 2015).filter(quarter("date") == 3)

# Take the denominator from the filtered data instead of hard-coding the day count
total_days = q3_2015.count()

(
    q3_2015.rollup("weather")  # rollup adds a grand-total row (weather = null)
    .count()
    .withColumn(
        "percentage",
        round(col("count") / total_days * 100, 0)
    )
).show()

+-------+-----+----------+
|weather|count|percentage|
+-------+-----+----------+
|   null|   92|     100.0|
|    fog|   21|      23.0|
|    sun|   64|      70.0|
|   rain|    2|       2.0|
|drizzle|    5|       5.0|
+-------+-----+----------+



In [141]:
# Take the advange of 0 and 1

(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(
        when(col("weather") == "rain", 1).otherwise(0).alias("rain")
        )
    .agg(mean("rain"))
).show()

+--------------------+
|           avg(rain)|
+--------------------+
|0.021739130434782608|
+--------------------+



### 4-g. For each year, find what percentage of days it rained (had non-zero precipitation).
- All row-level processing (the rained / did-not-rain flag) is done before the `groupby`.

In [146]:
# Flag each day as 1 (non-zero precipitation) or 0, then average the flag per year.
# The output keeps Spark's generated column names ("year(date)", "avg(rained_days)");
# calling .alias(...) on the grouping / aggregate expressions would rename them.
rained_flag = when(col("precipitation") != 0, 1).otherwise(0)
flagged_days = weather.withColumn("rained_days", rained_flag)
flagged_days.groupby(year("date")).agg(mean("rained_days")).orderBy("year(date)").show()

+----------+-------------------+
|year(date)|   avg(rained_days)|
+----------+-------------------+
|      2012|0.48360655737704916|
|      2013|0.41643835616438357|
|      2014|  0.410958904109589|
|      2015|0.39452054794520547|
+----------+-------------------+



In [151]:
# A different approach: keep only the two columns needed, then average per year

rain_indicator = when(col("precipitation") > 0, 1).otherwise(0).alias("rain")
year_and_rain = weather.select(year("date").alias("year"), rain_indicator)
year_and_rain.groupby("year").agg(avg("rain")).show()

+----+-------------------+
|year|          avg(rain)|
+----+-------------------+
|2015|0.39452054794520547|
|2013|0.41643835616438357|
|2014|  0.410958904109589|
|2012|0.48360655737704916|
+----+-------------------+



### Sidebar: How to get values out of a spark dataframe
- `.show()` -- shows the information, but doesn't produce a value that can be stored in a variable
- `.first()` -- produces the first result as a Row object
- `.head(n)` -- produces a list of the first n Row objects