# OmniArt dataset insights

In [None]:
import pandas as pd
import numpy as nd
import matplotlib.pyplot as plt

In [None]:
# Relative path of the OmniArt CSV dump that the rest of the notebook analyses
data_root = "data/omniart_v3_datadump.csv"
# Read the complete dump into a single DataFrame
dataset = pd.read_csv(filepath_or_buffer=data_root)

In [None]:
# Show every column of the dataset together with its number of non-null entries
dataset.count(axis=0)

### Artist information

In [None]:
# Combine the first and last names to obtain an artist_full_name.
# A plain `first + ' ' + last` evaluates to NaN as soon as either part is missing,
# which silently drops single-name artists from the column and from the unique
# count. Missing parts are therefore treated as empty strings (na_rep=''), the
# leftover separator is stripped, and rows with no name at all stay NaN so that
# nunique() keeps ignoring them.
dataset['artist_full_name'] = (
    dataset['artist_first_name']
    .str.cat(dataset['artist_last_name'], sep=' ', na_rep='')
    .str.strip()
    .where(lambda names: names != '')
)
print(f"Unique artists: {dataset['artist_full_name'].nunique()}")

In [None]:
# Get the average number of works made by an artist.
# Works are grouped by artist name first: groupby() leaves out rows whose
# artist_full_name is missing, so numerator and denominator cover the same set of
# works. Dividing the total number of ids by nunique() instead would attribute
# works without a known artist to the named artists and inflate the average.
works_per_artist = dataset.groupby('artist_full_name')['id'].count()
print(f"Average number of works per artist: {works_per_artist.mean():.2f}")

### Art type information

In [None]:
# Collect the distinct values found in the general_type column and display them
unique_types = dataset.loc[:, 'general_type'].unique()
print(f"Unique types: {unique_types}")

### Art year distribution

In [None]:
# Convert creation_year to numeric data; values that cannot be parsed become NaN
# (errors='coerce') and are dropped so that only usable years remain.
numeric_creation_year = pd.to_numeric(dataset['creation_year'], errors='coerce').dropna()

# Only works dated between start_year and end_year are plotted; anything outside
# that range (e.g. older works) is left out of the histogram via `range=`.
start_year = 1500
end_year = 2020
bins = 100
interval = (end_year - start_year) / bins  # width of a single bin, in years

plt.hist(numeric_creation_year, range=(start_year, end_year), bins=bins)
plt.title(f"Number of works per bin of {interval} years from {start_year} to {end_year}")
plt.xlabel('year')
plt.ylabel("number of works")
plt.show()