In [1]:
import os
import math

import altair as alt
import pandas as pd

import plotly.express as px
import plotly.graph_objects as go

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql import Window

In [2]:
# Set the four Spark/PySpark environment variables for this process in one call.
os.environ.update({
    'SPARK_HOME': r'C:\spark\spark-3.5.5-bin-hadoop3',
    'PYSPARK_DRIVER_PYTHON': 'jupyter',
    'PYSPARK_DRIVER_PYTHON_OPTS': 'lab',
    'PYSPARK_PYTHON': 'python',
})

In [6]:
# Spark options for this notebook, applied to the builder in the order listed.
# NOTE(review): the master is 'local[*]'; confirm the executor and
# dynamic-allocation settings below are actually honoured in local mode.
_spark_options = {
    'spark.executor.memory': '1g',
    'spark.executor.cores': '3',
    "spark.dynamicAllocation.enabled": "true",
    "spark.dynamicAllocation.minExecutors": "1",
    "spark.dynamicAllocation.maxExecutors": "4",
    'spark.executor.memoryOverhead': '256m',
    "spark.driver.memory": "1g",
    "spark.driver.maxResultSize": "1g",
    'spark.sql.adaptive.enabled': 'true',
    'spark.sql.adaptive.coalescePartitions.enabled': 'true',
    'spark.sql.adaptive.advisoryPartitionSizeInBytes': '16mb',
    "spark.serializer": "org.apache.spark.serializer.KryoSerializer",
    'spark.dynamicAllocation.executorIdleTimeout': '60s',
    'spark.sql.autoBroadcastJoinThreshold': '128mb',
}

# Name the application and pin the master, then layer every option on top.
_session_builder = SparkSession.builder.appName('MOMA art collection').master('local[*]')
for _option_name, _option_value in _spark_options.items():
    _session_builder = _session_builder.config(_option_name, _option_value)

# Reuse an already-running session if one exists, otherwise start a new one.
spark = _session_builder.getOrCreate()

In [7]:
# Folder that holds every MoMA collection file; edit this single constant to
# point the notebook at a different copy of the dataset.
DATA_DIR = r"F:\Datasets\CSV datasets\MoMA+Art+Collection"

# All paths are raw strings: Windows paths contain backslashes, and without the
# r-prefix sequences such as "\D" or "\M" are invalid escape sequences that
# Python 3.12+ reports as a SyntaxWarning.
artists_path = rf"{DATA_DIR}\Artists.csv"
artworks_path = rf"{DATA_DIR}\Artworks.csv"
moma_data_dictionary_path = rf"{DATA_DIR}\MoMA_data_dictionary.csv"
moma_onview_path = rf"{DATA_DIR}\MoMA_OnView.xlsx"

In [18]:
# Load the "Artists" sheet of the on-view workbook into pandas first.
moma_onview_pd = pd.read_excel(
    io=moma_onview_path,
    sheet_name="Artists",
)

# Hand the pandas frame to Spark so it can be used alongside the other Spark DataFrames.
moma_onview = spark.createDataFrame(data=moma_onview_pd)

In [19]:
# Read the artworks CSV; the first row supplies column names and Spark infers column types.
# NOTE(review): no multiLine/escape options are set - confirm the file has no
# quoted fields that span several lines, otherwise rows may be split.
artworks_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(artworks_path)
)

In [22]:
# Read the artists CSV; the first row supplies column names and Spark infers column types.
artists_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(artists_path)
)

In [24]:
# Number of rows loaded from the artists CSV (sanity check on the read).
artists_df.count()

15639

In [25]:
# Number of rows loaded from the artworks CSV (sanity check on the read).
artworks_df.count()

169806

In [30]:
# Inner-join artworks to artists on ConstituentID, with the artists frame
# marked for broadcast.
# The condition is a column-equality expression, so the joined frame keeps the
# ConstituentID column from both sides.
artworks_artists = artworks_df.join(
    F.broadcast(artists_df),
    artworks_df["ConstituentID"] == artists_df["ConstituentID"],
    "inner",
)

In [31]:
# Read the data-dictionary CSV; the first row supplies column names and Spark infers column types.
moma_data_dictionary = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(moma_data_dictionary_path)
)

In [32]:
# Preview the first five data-dictionary rows (long cell values are truncated by default).
moma_data_dictionary.show(5)

+--------+-------------+--------------------+
|   Table|        Field|         Description|
+--------+-------------+--------------------+
|Artworks|        Title|The name or title...|
|Artworks|       Artist|The name of the a...|
|Artworks|ConstituentID|Unique identifier...|
|Artworks|    ArtistBio|A short biography...|
|Artworks|  Nationality|The country or re...|
+--------+-------------+--------------------+
only showing top 5 rows



In [34]:
# Preview five joined rows with full, untruncated cell values; the output is very wide.
artworks_artists.show(n=5, truncate=False)

+-------------------------------------------------------------------------------------------------------------+------------------------+-------------+-----------------------------------------+-----------+---------+-------+------+----+--------------------------------------------------------------+-------------------------------------+-----------------------------------------------------------------------+---------------+--------------+---------------------+------------+---------+--------+---------------------------------------+----------------------------------------------------------------------------------------------------------------------------------+------+------------------+----------+-------------+-----------+-----------+-----------+----------+----------------+---------------+-------------+------------------------+---------------------------------------+-----------+------+---------+-------+--------+---------+
|Title                                                                