In [1]:
import pandas as pd

import pyspark
from pyspark.sql.functions import *

spark = pyspark.sql.SparkSession.builder.getOrCreate() #method to create spark session

### Create a spark data frame that contains your favorite programming languages.

- The name of the column should be language
- View the schema of the dataframe
- Output the shape of the dataframe
- Show the first 5 records in the dataframe

In [2]:
#First create a pandas dataframe
languages = ['Python', 'SQL', 'Spark', 'HTML', 'Julia', 'R', 'Java']
pandas_dataframe = pd.DataFrame({
    "language": languages
})

#Convert Dataframe to a Spark Onject
df = spark.createDataFrame(pandas_dataframe)


In [3]:

df.printSchema()

root
 |-- language: string (nullable = true)



In [4]:
df.show(5)

+--------+
|language|
+--------+
|  Python|
|     SQL|
|   Spark|
|    HTML|
|   Julia|
+--------+
only showing top 5 rows



### Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below:


- The 1999 audi a4 has a 4 cylinder engine.
- For each vehicle.

    - Transform the trans column so that it only contains either manual or auto.



In [5]:
from pydataset import data

mpg = spark.createDataFrame(data('mpg'))

mpg.show(5)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [6]:
from pyspark.sql.functions import concat
from pyspark.sql.functions import lit

In [7]:
mpg.select(concat(lit('The '), mpg.year, lit(' '), mpg.manufacturer, lit(' '), mpg.model, lit(' has a'), mpg.cyl, lit('cylinder engine.')).alias('string')).show(truncate=False)


+------------------------------------------------------------+
|string                                                      |
+------------------------------------------------------------+
|The 1999 audi a4 has a4cylinder engine.                     |
|The 1999 audi a4 has a4cylinder engine.                     |
|The 2008 audi a4 has a4cylinder engine.                     |
|The 2008 audi a4 has a4cylinder engine.                     |
|The 1999 audi a4 has a6cylinder engine.                     |
|The 1999 audi a4 has a6cylinder engine.                     |
|The 2008 audi a4 has a6cylinder engine.                     |
|The 1999 audi a4 quattro has a4cylinder engine.             |
|The 1999 audi a4 quattro has a4cylinder engine.             |
|The 2008 audi a4 quattro has a4cylinder engine.             |
|The 2008 audi a4 quattro has a4cylinder engine.             |
|The 1999 audi a4 quattro has a6cylinder engine.             |
|The 1999 audi a4 quattro has a6cylinder engine.       

In [8]:
from pyspark.sql.functions import when, sum
mpg.select(mpg.trans,
          when((mpg.trans.contains('auto')), 'auto')
          .otherwise('manual')
          .alias('trans')).show(20)

+----------+------+
|     trans| trans|
+----------+------+
|  auto(l5)|  auto|
|manual(m5)|manual|
|manual(m6)|manual|
|  auto(av)|  auto|
|  auto(l5)|  auto|
|manual(m5)|manual|
|  auto(av)|  auto|
|manual(m5)|manual|
|  auto(l5)|  auto|
|manual(m6)|manual|
|  auto(s6)|  auto|
|  auto(l5)|  auto|
|manual(m5)|manual|
|  auto(s6)|  auto|
|manual(m6)|manual|
|  auto(l5)|  auto|
|  auto(s6)|  auto|
|  auto(s6)|  auto|
|  auto(l4)|  auto|
|  auto(l4)|  auto|
+----------+------+
only showing top 20 rows



### Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [9]:
tips = spark.createDataFrame(data('tips'))

tips.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [37]:
from pyspark.sql.functions import avg, round
print((tips.filter(tips.smoker == 'Yes').count() / tips.count() * 100), 'percent of observations are smokers.')
      

38.114754098360656 percent of observations are smokers.


In [41]:
tips = tips.select(
        tips.total_bill, tips.tip, tips.sex, tips.smoker, tips.day, tips.time, tips.size,
        (tips.total_bill / tips.tip).alias('tip_percent')
)

In [42]:
tips.show(5)

+----------+----+------+------+---+------+----+------------------+
|total_bill| tip|   sex|smoker|day|  time|size|       tip_percent|
+----------+----+------+------+---+------+----+------------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 16.82178217821782|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 6.228915662650603|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3| 6.002857142857144|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|7.1540785498489425|
|     24.59|3.61|Female|    No|Sun|Dinner|   4| 6.811634349030471|
+----------+----+------+------+---+------+----+------------------+
only showing top 5 rows



In [43]:

tips.rollup('sex','smoker').agg(avg(tips.tip_percent)).show()

+------+------+------------------+
|   sex|smoker|  avg(tip_percent)|
+------+------+------------------+
|Female|  null| 6.714850156922292|
|  Male|    No|6.6877637800136105|
|  Male|   Yes| 8.117240108903532|
|  null|  null| 7.048932200195786|
|Female|   Yes| 6.448210654922861|
|Female|    No| 6.877796519255277|
|  Male|  null|  7.23406046621358|
+------+------+------------------+



### Use the seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [15]:
from vega_datasets import data

weather = spark.createDataFrame(data.seattle_weather())

In [16]:
weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|     8.9|     2.8| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [17]:
weather = weather.withColumn(
    "temp_max", (col("temp_max") * 9 / 5 + 32)
).withColumn("temp_min", (col("temp_min") * 9 / 5 + 32))

In [18]:
weather.show(5)

+-------------------+-------------+--------+--------+----+-------+
|               date|precipitation|temp_max|temp_min|wind|weather|
+-------------------+-------------+--------+--------+----+-------+
|2012-01-01 00:00:00|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02 00:00:00|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03 00:00:00|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04 00:00:00|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05 00:00:00|          1.3|   48.02|   37.04| 6.1|   rain|
+-------------------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [19]:
(
    weather.withColumn("month", month("date"))
    .withColumn("year", year("date"))
    .groupBy("month", "year")
    .agg(sum("precipitation").alias("total_monthly_precipitation"))
    .groupBy("month")
    .agg(mean("total_monthly_precipitation").alias("avg_monthly_rain"))
    .sort(col("avg_monthly_rain").desc())
    .first()
)

Row(month=11, avg_monthly_rain=160.625)

In [20]:
(
    weather.withColumn("year", year("date"))
    .groupBy("year")
    .agg(sum("wind").alias("total_winds"))
    .sort(col("total_winds").desc())
    .first()
)

Row(year=2012, total_winds=1244.6999999999998)

In [21]:
(
    weather.withColumn("month", month("date"))
    .filter(col("month") == 1)
    .groupBy("weather")
    .count()
    .sort(col("count").desc())
    .show()
)

+-------+-----+
|weather|count|
+-------+-----+
|    fog|   38|
|   rain|   35|
|    sun|   33|
|drizzle|   10|
|   snow|    8|
+-------+-----+



In [25]:
(
    weather.filter(month("date") == 7)
    .filter(year("date") > 2012)
    .filter(year("date") < 2015)
    .filter(col("weather") == lit("sun"))
    .agg(
        round(avg("temp_max"),2).alias("average_high_temp"),
        round(avg("temp_min"),2).alias("average_low_temp"),
    )
    .show()
)

+-----------------+----------------+
|average_high_temp|average_low_temp|
+-----------------+----------------+
|            80.29|           57.53|
+-----------------+----------------+



In [44]:
(
    weather.filter(year("date") == 2015)
    .filter(quarter("date") == 3)
    .select(when(col("weather") == "rain", 1).otherwise(0).alias("rain"))
    .agg(round(mean("rain"),2).alias("ave_rain"))
    .show()
)


+--------+
|ave_rain|
+--------+
|    0.02|
+--------+



In [50]:
(
    weather.select(when(col("precipitation") > 0, 1).otherwise(0).alias("rain"))
    .agg(round(mean("rain"),2).alias("ave_rain"))
    .show()
)

+--------+
|ave_rain|
+--------+
|    0.43|
+--------+

