In [1]:
from pyspark.sql import SparkSession

In [2]:
spark = SparkSession.builder.appName("WeatherPySpark").getOrCreate()

In [3]:
! ls -l rawweather

total 0
drwxr-xr-x  3 adammcquistan  staff  96 Feb 21 21:49 [34mlocation=lincoln[m[m


In [4]:
weather = spark.read.json("./rawweather")

In [5]:
weather.printSchema()

root
 |-- request: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- query: string (nullable = true)
 |    |    |-- type: string (nullable = true)
 |-- weather: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- astronomy: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |    |    |    |    |-- moon_illumination: string (nullable = true)
 |    |    |    |    |-- moon_phase: string (nullable = true)
 |    |    |    |    |-- moonrise: string (nullable = true)
 |    |    |    |    |-- moonset: string (nullable = true)
 |    |    |    |    |-- sunrise: string (nullable = true)
 |    |    |    |    |-- sunset: string (nullable = true)
 |    |    |-- avgtempC: string (nullable = true)
 |    |    |-- avgtempF: string (nullable = true)
 |    |    |-- date: string (nullable = true)
 |    |    |-- hourly: array (nullable = true)
 |    |    |    |-- element: struct (containsNull = true)
 |  

In [16]:
from pyspark.sql import Row
from pyspark.sql.functions import udf, explode, explode_outer
from pyspark.sql.types import FloatType
import numpy as np

In [13]:
eDF = spark.createDataFrame([Row(a=1, intlist=[1,2,3], mapfield={"a": "b"})])
eDF.show()

+---+---------+--------+
|  a|  intlist|mapfield|
+---+---------+--------+
|  1|[1, 2, 3]|[a -> b]|
+---+---------+--------+



In [21]:
eDF.select(explode(eDF.intlist).alias("anInt")).collect()

[Row(anInt=1), Row(anInt=2), Row(anInt=3)]

In [22]:
eDF.select(explode(eDF.intlist).alias("anInt")).mean().collect()

AttributeError: 'DataFrame' object has no attribute 'mean'

In [19]:
weather.select('location', 'year', 'month', 'day', explode_outer('weather'))\
    .select('location', 'year', 'month', 'day', 'col.avgtempC', 'col.hourly').show()

+--------+----+-----+---+--------+--------------------+
|location|year|month|day|avgtempC|              hourly|
+--------+----+-----+---+--------+--------------------+
| lincoln|2020|    1| 17|      -1|[[-12, 11, -14, 7...|
| lincoln|2020|    1| 11|     -10|[[-15, 5, -20, -3...|
| lincoln|2020|    1| 16|      -9|[[-19, -2, -18, 0...|
| lincoln|2020|    1| 10|      -4|[[-1, 30, -3, 27,...|
| lincoln|2020|    1|  8|       1|[[-5, 22, -4, 25,...|
| lincoln|2020|    1|  7|       3|[[-5, 23, -4, 24,...|
| lincoln|2020|    1|  3|       0|[[-2, 28, -5, 23,...|
| lincoln|2020|    1| 12|      -9|[[-16, 4, -17, 1,...|
| lincoln|2020|    1|  6|       2|[[-6, 22, -3, 28,...|
| lincoln|2020|    1| 13|      -5|[[-10, 14, -12, 1...|
| lincoln|2020|    1|  4|       1|[[-8, 18, -9, 16,...|
| lincoln|2020|    1|  9|       5|[[1, 33, 1, 34, 4...|
| lincoln|2020|    1|  2|       3|[[-2, 28, -2, 28,...|
| lincoln|2020|    1| 15|      -5|[[-3, 27, -6, 22,...|
| lincoln|2020|    1|  5|       4|[[-1, 30, -0, 

In [7]:
def avg_precip(items):
    nums = [float(item['precipMM']) for item in items if item['precipMM']]
    return np.mean(nums)

In [8]:
avg_precip_udf = udf(lambda x: avg_precip(x), FloatType())

In [11]:
weather = weather.withColumn('avg_precip_mm', avg_precip_udf('hourly'))

AnalysisException: cannot resolve '`hourly`' given input columns: [day, location, month, request, weather, year];;
'Project [request#7, weather#8, location#9, year#10, month#11, day#12, <lambda>('hourly) AS avg_precip_mm#20]
+- Relation[request#7,weather#8,location#9,year#10,month#11,day#12] json


In [12]:
weather.select('weather').show()

+--------------------+
|             weather|
+--------------------+
|[[[[47, Last Quar...|
|[[[[100, Full Moo...|
|[[[[54, Waning Gi...|
|[[[[100, Full Moo...|
|[[[[88, Waxing Gi...|
|[[[[80, Waxing Gi...|
|[[[[51, First Qua...|
|[[[[83, Waxing Gi...|
|[[[[73, Waxing Gi...|
|[[[[76, Waning Gi...|
|[[[[59, First Qua...|
|[[[[95, Waxing Gi...|
|[[[[44, First Qua...|
|[[[[61, Waning Gi...|
|[[[[66, First Qua...|
|[[[[69, Waning Gi...|
+--------------------+



In [13]:
df = spark.createDataFrame(
    [(1, ["foo", "bar"], {"x": 1.0}), (2, [], {}), (3, None, None)],
    ("id", "an_array", "a_map")
)
df.show()

+---+----------+----------+
| id|  an_array|     a_map|
+---+----------+----------+
|  1|[foo, bar]|[x -> 1.0]|
|  2|        []|        []|
|  3|      null|      null|
+---+----------+----------+

