# Spark

---
## Exercises

In [5]:
import pyspark
import pandas as pd
import numpy as np
from pydataset import data as pyd
from vega_datasets import data as vega
import pyspark.sql.functions as F

### 1. 

In [7]:
#create spark dataframe 
df = spark.createDataFrame(
    pd.DataFrame(
        dict(language=[
            'python',
            'sql',
            'html',
            'css',
            'javascript',
            'ruby',
            'c++'])
    )
)
df

DataFrame[language: string]

In [2]:
# View the schema of the dataframe
# Output the shape of the dataframe
# Show the first 5 records in the dataframe

# view the schema of the dataframe
df.printSchema()

# output the shape of the dataframe
print("Number of rows: ", df.count())
print("Number of columns: ", len(df.columns))

# show the first 5 records in the dataframe
df.show(5)

root
 |-- Language: string (nullable = true)
 |-- Popularity: long (nullable = true)



                                                                                

Number of rows:  10
Number of columns:  2
+----------+----------+
|  Language|Popularity|
+----------+----------+
|      Java|         1|
|    Python|         2|
|JavaScript|         3|
|        C#|         4|
|       PHP|         5|
+----------+----------+
only showing top 5 rows




### 2. 

Load the mpg dataset as a spark dataframe.

- Create 1 column of output that contains a message like the one below for each vehicle.

```

The 1999 audi a4 has a 4 cylinder engine.

```

- Transform the trans column so that it only contains either manual or auto.


In [8]:
df = spark.createDataFrame(pyd('mpg'))
df

DataFrame[manufacturer: string, model: string, displ: double, year: bigint, cyl: bigint, trans: string, drv: string, cty: bigint, hwy: bigint, fl: string, class: string]

In [9]:
df.show(5)


+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|manufacturer|model|displ|year|cyl|     trans|drv|cty|hwy| fl|  class|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
|        audi|   a4|  1.8|1999|  4|  auto(l5)|  f| 18| 29|  p|compact|
|        audi|   a4|  1.8|1999|  4|manual(m5)|  f| 21| 29|  p|compact|
|        audi|   a4|  2.0|2008|  4|manual(m6)|  f| 20| 31|  p|compact|
|        audi|   a4|  2.0|2008|  4|  auto(av)|  f| 21| 30|  p|compact|
|        audi|   a4|  2.8|1999|  6|  auto(l5)|  f| 16| 26|  p|compact|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+
only showing top 5 rows



In [10]:
df.withColumn(
    'vehicle_info',
    F.concat(
    F.lit('The '),
    df.year,
    F.lit(' '),
    df.manufacturer,
    F.lit(' '),
    df.model,
    F.lit(' has a '),
    df.cyl,
    F.lit(' cylinder engine.')
    )
).show(5,truncate=False)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+-----------------------------------------+
|manufacturer|model|displ|year|cyl|trans     |drv|cty|hwy|fl |class  |vehicle_info                             |
+------------+-----+-----+----+---+----------+---+---+---+---+-------+-----------------------------------------+
|audi        |a4   |1.8  |1999|4  |auto(l5)  |f  |18 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.|
|audi        |a4   |1.8  |1999|4  |manual(m5)|f  |21 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.0  |2008|4  |manual(m6)|f  |20 |31 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.0  |2008|4  |auto(av)  |f  |21 |30 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.8  |1999|6  |auto(l5)  |f  |16 |26 |p  |compact|The 1999 audi a4 has a 6 cylinder engine.|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------------

In [11]:
df.withColumn(
    'vehicle_info',
    F.concat(
    F.lit('The '),
    df.year,
    F.lit(' '),
    df.manufacturer,
    F.lit(' '),
    df.model,
    F.lit(' has a '),
    df.cyl,
    F.lit(' cylinder engine.')
    )
).show(5,truncate=False)

+------------+-----+-----+----+---+----------+---+---+---+---+-------+-----------------------------------------+
|manufacturer|model|displ|year|cyl|trans     |drv|cty|hwy|fl |class  |vehicle_info                             |
+------------+-----+-----+----+---+----------+---+---+---+---+-------+-----------------------------------------+
|audi        |a4   |1.8  |1999|4  |auto(l5)  |f  |18 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.|
|audi        |a4   |1.8  |1999|4  |manual(m5)|f  |21 |29 |p  |compact|The 1999 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.0  |2008|4  |manual(m6)|f  |20 |31 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.0  |2008|4  |auto(av)  |f  |21 |30 |p  |compact|The 2008 audi a4 has a 4 cylinder engine.|
|audi        |a4   |2.8  |1999|6  |auto(l5)  |f  |16 |26 |p  |compact|The 1999 audi a4 has a 6 cylinder engine.|
+------------+-----+-----+----+---+----------+---+---+---+---+-------+--------------------------

In [12]:
df.withColumn(
    'trans',
    F.regexp_replace(
    'trans',
    r'(\(\w+\))',
    '')
).show(truncate=False)

+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|manufacturer|model             |displ|year|cyl|trans |drv|cty|hwy|fl |class  |
+------------+------------------+-----+----+---+------+---+---+---+---+-------+
|audi        |a4                |1.8  |1999|4  |auto  |f  |18 |29 |p  |compact|
|audi        |a4                |1.8  |1999|4  |manual|f  |21 |29 |p  |compact|
|audi        |a4                |2.0  |2008|4  |manual|f  |20 |31 |p  |compact|
|audi        |a4                |2.0  |2008|4  |auto  |f  |21 |30 |p  |compact|
|audi        |a4                |2.8  |1999|6  |auto  |f  |16 |26 |p  |compact|
|audi        |a4                |2.8  |1999|6  |manual|f  |18 |26 |p  |compact|
|audi        |a4                |3.1  |2008|6  |auto  |f  |18 |27 |p  |compact|
|audi        |a4 quattro        |1.8  |1999|4  |manual|4  |18 |26 |p  |compact|
|audi        |a4 quattro        |1.8  |1999|4  |auto  |4  |16 |25 |p  |compact|
|audi        |a4 quattro        |2.0  |2


### 3. 


Load the tips dataset as a spark dataframe.

- What percentage of observations are smokers?
- Create a column that contains the tip percentage
- Calculate the average tip percentage for each combination of sex and smoker.

In [13]:
df = spark.createDataFrame(pyd('tips'))
df.show(5)

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
+----------+----+------+------+---+------+----+
only showing top 5 rows



In [14]:
((df.where(F.col('smoker')=='Yes').count())
/(df.count()))

                                                                                

0.38114754098360654

In [15]:
df1 = pyd('tips')
df1.smoker.value_counts(normalize=True)

smoker
No     0.618852
Yes    0.381148
Name: proportion, dtype: float64

In [16]:
df.withColumn(
    'tip_pct',
    F.round(df.tip/df.total_bill,4)
).show(5)

+----------+----+------+------+---+------+----+-------+
|total_bill| tip|   sex|smoker|day|  time|size|tip_pct|
+----------+----+------+------+---+------+----+-------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2| 0.0594|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3| 0.1605|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3| 0.1666|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2| 0.1398|
|     24.59|3.61|Female|    No|Sun|Dinner|   4| 0.1468|
+----------+----+------+------+---+------+----+-------+
only showing top 5 rows



In [17]:
df.groupBy('sex','smoker').agg(
    F.round(F.avg(df.tip/df.total_bill),4).alias('avg_tip_pct')
).show(5)



+------+------+-----------+
|   sex|smoker|avg_tip_pct|
+------+------+-----------+
|  Male|    No|     0.1607|
|Female|    No|     0.1569|
|  Male|   Yes|     0.1528|
|Female|   Yes|     0.1822|
+------+------+-----------+



                                                                                

### 4. 

Use the Seattle weather dataset referenced in the lesson to answer the questions below.

- Convert the temperatures to fahrenheit.
- Which month has the most rain, on average?
- Which year was the windiest?
- What is the most frequent type of weather in January?
- What is the average high and low temperature on sunny days in July in 2013 and 2014?
- What percentage of days were rainy in Q3 of 2015?
- For each year, find what percentage of days it rained (had non-zero precipitation).

In [18]:
weather = vega.seattle_weather().assign(date=lambda df: df.date.astype(str))
df = spark.createDataFrame(weather)
df.show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|    12.8|     5.0| 4.7|drizzle|
|2012-01-02|         10.9|    10.6|     2.8| 4.5|   rain|
|2012-01-03|          0.8|    11.7|     7.2| 2.3|   rain|
|2012-01-04|         20.3|    12.2|     5.6| 4.7|   rain|
|2012-01-05|          1.3|     8.9|     2.8| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [19]:
# f = c * 9/5) + 32
df.withColumn(
    'temp_min',
    ((df.temp_min * 9) / 5) + 32
    ).withColumn(
        'temp_max',
        ((df.temp_max * 9) / 5) + 32
    ).show(5)

+----------+-------------+--------+--------+----+-------+
|      date|precipitation|temp_max|temp_min|wind|weather|
+----------+-------------+--------+--------+----+-------+
|2012-01-01|          0.0|   55.04|    41.0| 4.7|drizzle|
|2012-01-02|         10.9|   51.08|   37.04| 4.5|   rain|
|2012-01-03|          0.8|   53.06|   44.96| 2.3|   rain|
|2012-01-04|         20.3|   53.96|   42.08| 4.7|   rain|
|2012-01-05|          1.3|   48.02|   37.04| 6.1|   rain|
+----------+-------------+--------+--------+----+-------+
only showing top 5 rows



In [20]:
(df.withColumn('mo',F.month('date')
    ).groupBy('mo'
        ).agg(F.round(F.avg('precipitation'),1
            ).alias('avg_rain')
            ).sort(F.col('avg_rain').desc())
            ).show(1)

[Stage 25:>                                                         (0 + 8) / 8]

+---+--------+
| mo|avg_rain|
+---+--------+
| 11|     5.4|
+---+--------+
only showing top 1 row



                                                                                

In [21]:
((df.withColumn('yr',F.year('date')
    ).groupBy('yr'
        ).agg(F.round(F.avg('wind'),4
                ).alias('avg_wind')
            ).sort(F.col('avg_wind').desc())
            )).show(1)

[Stage 28:>                                                         (0 + 8) / 8]

+----+--------+
|  yr|avg_wind|
+----+--------+
|2012|  3.4008|
+----+--------+
only showing top 1 row



                                                                                

In [22]:
df = df.withColumn('mo',F.month('date'))

In [23]:
df.where(df.mo==1
    ).groupBy(df.weather
        ).agg(F.count(df.weather
            ).alias('freq')).sort(F.col('freq').desc()).show(1)



+-------+----+
|weather|freq|
+-------+----+
|    fog|  38|
+-------+----+
only showing top 1 row



                                                                                

In [24]:
df = df.withColumn('yr',F.year('date'))

In [25]:
(df.where(df.mo==7
    ).where(df.yr>2012
    ).where(df.yr<2015
    ).where(df.weather=='sun'
    ).select(
        F.round(F.avg('temp_min'),2).alias('avg_low'),
        F.round(F.avg('temp_max'),2).alias('avg_high')
    )
).show()



+-------+--------+
|avg_low|avg_high|
+-------+--------+
|  14.18|   26.83|
+-------+--------+



                                                                                

In [26]:
(df.where(df.weather=='rain')).count()

259

In [27]:
(df.where(df.mo>9).where(df.yr==2015
    ).select(
        F.round((df.where(df.weather=='rain')).count()
        /F.count(df.weather),4).alias('pct_rainy_days')
    )
).show()



+--------------+
|pct_rainy_days|
+--------------+
|        2.8152|
+--------------+



                                                                                

In [28]:
(df.where(df.precipitation>0
    ).groupBy('yr'
        ).agg(F.count(df.precipitation).alias('avg_rain_days')
            ).select('yr',F.round(F.expr('avg_rain_days / 365'),4
                ).alias('pct_rain_days'))
                ).show()

[Stage 46:>                                                         (0 + 8) / 8]

+----+-------------+
|  yr|pct_rain_days|
+----+-------------+
|2012|       0.4849|
|2013|       0.4164|
|2014|        0.411|
|2015|       0.3945|
+----+-------------+



                                                                                

---
## Notes

In [1]:
# Installation test
import pyspark
spark = pyspark.sql.SparkSession.builder.getOrCreate()
spark.range(5).show()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/10/25 09:31:21 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
[Stage 0:>                                                          (0 + 8) / 8]

+---+
| id|
+---+
|  0|
|  1|
|  2|
|  3|
|  4|
+---+



                                                                                

In [2]:
import multiprocessing
import pyspark

nprocs = multiprocessing.cpu_count()

spark = (pyspark.sql.SparkSession.builder
 .master('local')
 .config('spark.jars.packages', 'mysql:mysql-connector-java:8.0.16')
 .config('spark.driver.memory', '4G')
 .config('spark.driver.cores', nprocs)
 .config('spark.sql.shuffle.partitions', nprocs)
 .appName('MySparkApplication')
 .getOrCreate())

23/10/25 09:31:43 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [3]:
import pyspark.sql.functions as F