In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=b2df7825c28b93508f2452869f75750067f34cf0228175ac3f9e4b10d6cd742f
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, max, min, avg, count, countDistinct, lit, sum, when, year, month, dayofmonth
from pyspark.sql.window import Window
from pyspark.sql.types import DateType
import pyspark.sql.functions as F

# Initialize Spark Session
spark = SparkSession.builder \
    .appName("Weather Data Analysis") \
    .getOrCreate()


In [3]:
import zipfile
import os

# Path to the zip file
zip_file_path = 'data.zip'

# Directory where the zip file will be extracted
extract_to_dir = 'data'

# Check if the extraction directory exists, create if it doesn't
if not os.path.exists(extract_to_dir):
    os.makedirs(extract_to_dir)

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all the contents into the directory
    zip_ref.extractall(extract_to_dir)

print(f"Extracted all files in {zip_file_path} to {extract_to_dir}")


Extracted all files in data.zip to data


In [5]:
# Assuming the data.zip has been extracted into a folder named 'data'
weather_df = spark.read.csv("data/data/*/*.csv", header=True, inferSchema=True)

# Show the DataFrame schema to understand your data
weather_df.printSchema()


root
 |-- STATION: long (nullable = true)
 |-- DATE: date (nullable = true)
 |-- LATITUDE: double (nullable = true)
 |-- LONGITUDE: double (nullable = true)
 |-- ELEVATION: double (nullable = true)
 |-- NAME: string (nullable = true)
 |-- TEMP: double (nullable = true)
 |-- TEMP_ATTRIBUTES: double (nullable = true)
 |-- DEWP: double (nullable = true)
 |-- DEWP_ATTRIBUTES: double (nullable = true)
 |-- SLP: double (nullable = true)
 |-- SLP_ATTRIBUTES: double (nullable = true)
 |-- STP: double (nullable = true)
 |-- STP_ATTRIBUTES: double (nullable = true)
 |-- VISIB: double (nullable = true)
 |-- VISIB_ATTRIBUTES: double (nullable = true)
 |-- WDSP: double (nullable = true)
 |-- WDSP_ATTRIBUTES: double (nullable = true)
 |-- MXSPD: double (nullable = true)
 |-- GUST: double (nullable = true)
 |-- MAX: double (nullable = true)
 |-- MAX_ATTRIBUTES: string (nullable = true)
 |-- MIN: double (nullable = true)
 |-- MIN_ATTRIBUTES: string (nullable = true)
 |-- PRCP: double (nullable = true)

In [6]:
hottest_days = weather_df.withColumn("YEAR", year("DATE")) \
    .groupBy("YEAR", "STATION", "NAME") \
    .agg(max("MAX").alias("MAX_TEMP")) \
    .orderBy("YEAR", "MAX_TEMP", ascending=False)

hottest_days.show(13)


+----+----------+----------------+--------+
|YEAR|   STATION|            NAME|MAX_TEMP|
+----+----------+----------------+--------+
|2022|2095099999|      PAJALA, SW|    85.5|
|2022|1241099999|      ORLAND, NO|    82.4|
|2021|1065099999|    KARASJOK, NO|    88.3|
|2021|1062099999|HOPEN ISLAND, NO|    47.3|
|2020|1023099999|   BARDUFOSS, NO|    79.9|
|2020|1008099999|    LONGYEAR, SV|    71.1|
|2019|1023099999|   BARDUFOSS, NO|    78.8|
|2019|1008099999|    LONGYEAR, SV|    61.0|
|2018|1025099999|      TROMSO, NO|    84.2|
|2018|1008099999|    LONGYEAR, SV|    59.2|
|2017|1023099999|   BARDUFOSS, NO|    78.6|
|2017|1008099999|    LONGYEAR, SV|    55.4|
|2016|1023199999|     DRAUGEN, NO|  9999.9|
+----+----------+----------------+--------+
only showing top 13 rows



In [7]:
coldest_january = weather_df.withColumn("MONTH", month("DATE")) \
    .filter(col("MONTH") == 1) \
    .select("STATION", "NAME", "DATE", "MIN") \
    .orderBy("MIN").limit(1)

coldest_january.show()


+----------+-------------+----------+-----+
|   STATION|         NAME|      DATE|  MIN|
+----------+-------------+----------+-----+
|1023099999|BARDUFOSS, NO|2017-01-05|-28.3|
+----------+-------------+----------+-----+



In [8]:
precipitation_2015 = weather_df.filter(year("DATE") == 2015) \
    .select("STATION", "NAME", "DATE", "PRCP") \
    .agg(max("PRCP").alias("MAX_PRCP"), min("PRCP").alias("MIN_PRCP"))

precipitation_2015.show()


+--------+--------+
|MAX_PRCP|MIN_PRCP|
+--------+--------+
|   99.99|     0.0|
+--------+--------+



In [9]:
gust_missing_2019 = weather_df.filter(year("DATE") == 2019) \
    .select((count(when(col("GUST").isNull(), True)) / count("*")).alias("MISSING_GUST_PERCENTAGE"))

gust_missing_2019.show()


+-----------------------+
|MISSING_GUST_PERCENTAGE|
+-----------------------+
|                    0.0|
+-----------------------+



In [13]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, month, max, min, avg, stddev

spark = SparkSession.builder \
    .appName("Weather Data Analysis") \
    .getOrCreate()

# Assuming the CSV files have been loaded into `weather_df`
temp_stats_2020 = weather_df.filter(year("DATE") == 2020) \
    .withColumn("MONTH", month("DATE")) \
    .groupBy("MONTH") \
    .agg(
        avg("TEMP").alias("MEAN_TEMP"),
        F.expr("percentile_approx(TEMP, 0.5)").alias("MEDIAN_TEMP"),
        stddev("TEMP").alias("STDDEV_TEMP")
    )

temp_stats_2020.show(12)


+-----+------------------+-----------+------------------+
|MONTH|         MEAN_TEMP|MEDIAN_TEMP|       STDDEV_TEMP|
+-----+------------------+-----------+------------------+
|   12| 19.95483870967742|       20.2| 8.854464048157649|
|    1|15.896774193548385|       14.9|12.805172721989297|
|    6|47.429999999999986|       46.0| 8.877190347997288|
|    3|14.653225806451614|       18.6|15.784789500893568|
|    5| 36.21935483870968|       36.0| 8.077246704851957|
|    9| 41.84500000000001|       42.5| 5.887660897797833|
|    4|23.329999999999995|       26.0|13.022097256170087|
|    8| 49.28709677419354|       48.7| 6.548594740281951|
|    7| 52.88709677419355|       51.4| 6.663787232915164|
|   10|31.529032258064525|       30.7| 9.609052888228808|
|   11|29.246666666666663|       29.8|  8.14344837353497|
|    2|13.358620689655174|       15.3| 13.09180853418292|
+-----+------------------+-----------+------------------+



In [16]:
from pyspark.sql.functions import concat_ws

# Assuming hottest_days DataFrame is already defined and has the columns you mentioned
# Concatenate all the columns into a single string column
hottest_days_string = hottest_days.withColumn(
    "result",
    concat_ws(", ", *[col(c).cast("string") for c in hottest_days.columns])
)

# Select only the concatenated string column
hottest_days_single_column = hottest_days_string.select("result")

# Now you can save it to a text file
hottest_days_single_column.coalesce(1).write.text("results/hottest_days.txt")
