In [None]:
# Import Libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load Dataset
# The three date-like columns are handed to the CSV reader for parsing at load time.
date_columns = ['a.birthdate', 'a.deathdate', 'e.startdate']
df = pd.read_csv('../artvis_dump_NEW.csv', parse_dates=date_columns)

In [None]:
# Data Cleaning
# Reduce the artist birth/death dates to plain calendar years (unparseable
# values become NaN). The numeric-dtype guard keeps this cell safe to re-run:
# once a column already holds years, passing it to pd.to_datetime again would
# read the numbers as nanosecond offsets from the Unix epoch and collapse
# every year to 1970.
for year_col in ('a.birthdate', 'a.deathdate'):
    if not pd.api.types.is_numeric_dtype(df[year_col]):
        df[year_col] = pd.to_datetime(df[year_col], errors='coerce').dt.year

# Coordinates: anything that cannot be read as a number becomes NaN.
for coord_col in ('e.latitude', 'e.longitude'):
    df[coord_col] = pd.to_numeric(df[coord_col], errors='coerce')

In [None]:
# Exploratory Data Analysis
# Summary statistics of df, printed as plain text.
summary_stats = df.describe()
print(summary_stats)

In [None]:
# Number of rows carrying each 'a.gender' value.
gender_counts = df['a.gender'].value_counts()
print(gender_counts)

In [None]:
# Number of rows carrying each 'e.type' value.
type_counts = df['e.type'].value_counts()
print(type_counts)

In [None]:
# NOTE(review): this prints the same 'e.type' tally as the cell directly
# above — possibly a different column was intended; confirm or remove.
type_counts_repeat = df['e.type'].value_counts()
print(type_counts_repeat)

In [None]:
# Visualizations
# Bar chart of row counts per 'a.gender' value, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=df, x='a.gender', ax=ax)
ax.set_title('Gender Distribution of Artists')
plt.show()

In [None]:
# Histogram of 'e.startdate' split into 10 bins.
fig, ax = plt.subplots()
sns.histplot(data=df, x='e.startdate', bins=10, ax=ax)
ax.set_title('Exhibitions Over Time')
plt.show()

In [None]:
# Longitude on x, latitude on y; point colour follows 'e.type'.
fig, ax = plt.subplots()
sns.scatterplot(data=df, x='e.longitude', y='e.latitude', hue='e.type', ax=ax)
ax.set_title('Geographic Distribution of Exhibitions')
plt.show()

In [None]:
# Prerequisite for the map cells below — install once in this kernel's environment:
#   %pip install geopandas folium

In [None]:
import folium
from folium.plugins import MarkerCluster

In [None]:
# Create a base map
# Start zoomed out (zoom level 2) and centred on latitude 0, longitude 0.
map_center = [0, 0]
m = folium.Map(location=map_center, zoom_start=2)

In [None]:
# Create a marker cluster
# Build the cluster layer first, then attach it to the base map.
marker_cluster = MarkerCluster()
marker_cluster.add_to(m)

In [None]:
# Add points to the map
# One marker per row of df that has both coordinates; the popup shows the
# title and type, and the pin colour encodes 'e.type'.
type_colors = {'group': 'blue', 'solo': 'green'}  # any other value -> red
located = df.dropna(subset=['e.latitude', 'e.longitude'])

for lat, lon, title, kind in zip(
    located['e.latitude'],
    located['e.longitude'],
    located['e.title'],
    located['e.type'],
):
    marker = folium.Marker(
        location=[lat, lon],
        popup=f"{title} ({kind})",
        icon=folium.Icon(color=type_colors.get(kind, 'red')),
    )
    marker.add_to(marker_cluster)

In [None]:
# Save the map to an HTML file
# The relative file name resolves against the current working directory.
map_file = 'exhibitions_map.html'
m.save(outfile=map_file)

In [None]:
# Display the map in Jupyter Notebook: the bare trailing expression is
# rendered as this cell's output.
m

In [None]:
# Plotting the distribution of exhibition types
# Bar chart of row counts per 'e.type' value, drawn on an explicit Axes.
fig, ax = plt.subplots()
sns.countplot(data=df, x='e.type', ax=ax)
ax.set_title('Distribution of Exhibition Types')
ax.set_xlabel('Exhibition Type')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Count the number of exhibitions per city
# NOTE(review): value_counts tallies rows of df; if a row is not exactly one
# exhibition (e.g. one row per artist-exhibition pair) these are row counts —
# confirm against the dataset layout.
city_counts = (
    df['e.city']
    .value_counts()
    .rename_axis('City')
    .reset_index(name='Exhibition Count')
)

# Plotting the top 10 cities with the most exhibitions
top_cities = city_counts.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_cities, x='City', y='Exhibition Count', ax=ax)
ax.set_title('Top 10 Cities by Number of Exhibitions')
ax.set_xlabel('City')
ax.set_ylabel('Number of Exhibitions')
plt.xticks(rotation=45)  # tilt the city names so they do not overlap
plt.show()

In [None]:
# Per-city row counts, printed as plain text.
city_tally = df['e.city'].value_counts()
print(city_tally)

In [None]:
# Count the number of exhibitions per country
# NOTE(review): value_counts tallies rows of df; if a row is not exactly one
# exhibition (e.g. one row per artist-exhibition pair) these are row counts —
# confirm against the dataset layout.
country_counts = (
    df['e.country']
    .value_counts()
    .rename_axis('Country')
    .reset_index(name='Exhibition Count')
)

# Plotting the top 10 countries with the most exhibitions
top_countries = country_counts.head(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(data=top_countries, x='Country', y='Exhibition Count', ax=ax)
ax.set_title('Top 10 Countries by Number of Exhibitions')
ax.set_xlabel('Country')
ax.set_ylabel('Number of Exhibitions')
plt.xticks(rotation=45)  # tilt the country names so they do not overlap
plt.show()

In [None]:
# Data Cleaning: Extract distinct artists
# One row per distinct (id, birth, death) combination found in df.
artists = df[['a.id', 'a.birthdate', 'a.deathdate']].drop_duplicates()


def _to_year(values):
    """Return calendar years for a column of dates or of already-extracted years.

    A numeric column is taken to hold years already and is returned unchanged:
    passing numbers to pd.to_datetime would interpret them as nanosecond
    offsets from the Unix epoch, so every value would come back as 1970.
    Any other column is parsed as dates (unparseable entries become NaN) and
    reduced to its year component.
    """
    if pd.api.types.is_numeric_dtype(values):
        return values
    return pd.to_datetime(values, errors='coerce').dt.year


# Convert birthdate and deathdate to year
artists['birth_year'] = _to_year(artists['a.birthdate'])
artists['death_year'] = _to_year(artists['a.deathdate'])

In [None]:
# Plotting: birth-year and death-year distributions side by side.
# Both panels are built in a single cell on one figure. With the inline
# backend a figure is rendered and closed when its cell finishes, so drawing
# the second subplot from a later cell would start a new, separate figure
# instead of filling the right-hand panel.
fig, (birth_ax, death_ax) = plt.subplots(1, 2, figsize=(14, 6))

# Distribution of Birth Years
sns.histplot(artists['birth_year'].dropna(), bins=30, kde=True, color='skyblue', ax=birth_ax)
birth_ax.set_title('Distribution of Birth Years')
birth_ax.set_xlabel('Year')
birth_ax.set_ylabel('Number of Artists')

# Distribution of Death Years
sns.histplot(artists['death_year'].dropna(), bins=30, kde=True, color='salmon', ax=death_ax)
death_ax.set_title('Distribution of Death Years')
death_ax.set_xlabel('Year')
death_ax.set_ylabel('Number of Artists')

fig.tight_layout()
plt.show()