In [1]:
import os
import math

import altair as alt
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window
from pyspark.sql.types import ShortType

In [2]:
# Location of the hourly weather CSV. Written as a raw string so the Windows
# backslashes are taken literally instead of being parsed as escape sequences
# (a folder starting with e.g. "t" or "n" would otherwise corrupt the path).
# NOTE(review): this is an absolute, machine-specific path — adjust per machine.
df_path = r"F:\Datasets\CSV datasets\Weather Data 1990-1999\hourly_data.csv"

In [3]:
# Environment variables for PySpark: the local Spark installation directory
# and the Python executables to use. Set before the session is created below.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.5-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [4]:
# Settings applied to the local Spark session, in the order listed.
_spark_settings = {
    "spark.driver.memory": "60g",
    "spark.driver.maxResultSize": "4g",
    'spark.sql.adaptive.enabled': 'true',
    'spark.sql.adaptive.coalescePartitions.enabled': 'true',
    'spark.sql.adaptive.advisoryPartitionSizeInBytes': '128mb',
    'spark.sql.shuffle.partitions': '100',
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    'spark.sql.autoBroadcastJoinThreshold': '256mb',
}

# NOTE(review): the app name mentions "MOMA art collection" while this notebook
# loads weather data — looks like a leftover from another notebook; confirm
# before renaming.
_builder = (
    SparkSession.builder
    .appName('MOMA art collection - Optimized Local')
    .master('local[*]')
)
for _key, _value in _spark_settings.items():
    _builder = _builder.config(_key, _value)

# Reuses an already-running session if there is one, otherwise starts a new one.
spark = _builder.getOrCreate()

# Echo the driver memory reported by the session configuration.
_driver_memory = spark.conf.get('spark.driver.memory')
print(f"SparkSession configured with Driver Memory: {_driver_memory}")

SparkSession configured with Driver Memory: 60g


In [5]:
# Load the CSV at `df_path`: the first row supplies the column names and
# column types are inferred from the data.
df = (
    spark.read
    .options(header="true", inferSchema="true")
    .csv(df_path)
)

In [9]:
# Preview the first 5 rows without truncating cell contents.
df.show(5, truncate=False)

+---------+-------------------+--------------+--------------------+------------+--------------------+-------------+----+--------+----------+------------+------------+----------------+-----------+---------------+---------------+----------------+--------------------------+-----------------------+--------------+---------------+------------------+-------------------+--------------+-------------------------+--------------------------+----------------------------+-----------------------------+----------------------+-----------------------+-------------------------+--------------------------+-------------------+----------------+-----------------+------------------------+------------------------+---------------------+---------------------------+------------------------+-------------------------+--------------------------------+--------------------------------+-----------------------------+
|city_name|datetime           |temperature_2m|relative_humidity_2m|dew_point_2m|apparent_temperature|prec

In [11]:
# Count missing values per column: each null contributes 1 to that column's sum,
# and every output column keeps the name of the column it summarises.
_null_count_exprs = [
    F.sum(F.col(name).isNull().cast("int")).alias(name)
    for name in df.columns
]
df.select(*_null_count_exprs).show()

+---------+--------+--------------+--------------------+------------+--------------------+-------------+----+--------+----------+------------+------------+----------------+-----------+---------------+---------------+----------------+--------------------------+-----------------------+--------------+---------------+------------------+-------------------+--------------+-------------------------+--------------------------+----------------------------+-----------------------------+----------------------+-----------------------+-------------------------+--------------------------+-------------------+----------------+-----------------+------------------------+------------------------+---------------------+---------------------------+------------------------+-------------------------+--------------------------------+--------------------------------+-----------------------------+
|city_name|datetime|temperature_2m|relative_humidity_2m|dew_point_2m|apparent_temperature|precipitation|rain|snowfal

In [12]:
# Print the leading rows using show()'s default settings, spelled out here.
df.show(n=20, truncate=True)

+---------+-------------------+--------------+--------------------+------------+--------------------+-------------+----+--------+----------+------------+------------+----------------+-----------+---------------+---------------+----------------+--------------------------+-----------------------+--------------+---------------+------------------+-------------------+--------------+-------------------------+--------------------------+----------------------------+-----------------------------+----------------------+-----------------------+-------------------------+--------------------------+-------------------+----------------+-----------------+------------------------+------------------------+---------------------+---------------------------+------------------------+-------------------------+--------------------------------+--------------------------------+-----------------------------+
|city_name|           datetime|temperature_2m|relative_humidity_2m|dew_point_2m|apparent_temperature|prec