# ---------------------------------------------------------------
# DataSplitting.ipynb
# ---------------------------------------------------------------
This script prepares the post-war dataset for modeling by:
1. Log-transforming the target variable (Price Sold USD)
2. Exploring sale date distribution
3. Splitting the data chronologically into Train, Validation, and Test sets
4. Creating artist-level history encodings (cumulative price, expanding mean price, and prior-sale count) based on past sales

The chronological split ensures realistic forecasting conditions where 
future sales are predicted based only on past information.

In [None]:
# Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
import os
# Switch the working directory so the relative "Datasets/..." paths used by
# this notebook resolve. The target is itself a relative path, so it is looked
# up from the current working directory.
# NOTE(review): presumably the notebook is launched from the parent folder of
# the repository; re-running this cell after it has succeeded would look for a
# nested folder of the same name — confirm the intended launch location.
os.chdir('Art-Valuation-in-Auction')

In [None]:
# Read the pickled post-war dataset and shorten the two imputed-feature
# column names in a single chained expression.
df_postwar = pd.read_pickle("Datasets/df_postwar.pkl").rename(
    columns={
        'Paint Final Imputed Collapsed': 'Paint Imputed',
        'Material Final Imputed Collapsed': 'Material Imputed',
    }
)

# Show the resulting column names as a sanity check.
print(df_postwar.columns.to_list())

In [None]:
# ---------------------------------------------------------------
# Step 1: Log-transform the Target Variable
# ---------------------------------------------------------------
# Natural log of the sale price (np.log, not np.log1p).
# NOTE(review): np.log yields -inf for 0 and NaN for negative values;
# presumably every 'Price Sold USD' is strictly positive — confirm upstream.
df_postwar['Log Price'] = np.log(df_postwar['Price Sold USD'])

# Visualize log-transformed price distribution
plt.figure(figsize=(10, 6))
plt.hist(df_postwar['Log Price'], bins=100, color='skyblue', edgecolor='black')
plt.title("Distribution of Log-Transformed Artwork Prices")
# Axis label describes the transform actually applied above (plain log).
plt.xlabel("Log(Price Sold USD)")
plt.ylabel("Number of Artworks")
plt.grid(True)
plt.show()

In [None]:
# ---------------------------------------------------------------
# Step 2: Prepare Sale Dates for Chronological Splitting
# ---------------------------------------------------------------
# Parse the sale-date column into pandas datetimes.
df_postwar['Sale Date Cleaned'] = pd.to_datetime(df_postwar['Sale Date Cleaned'])

# Report how many rows have no sale date (informational only).
num_missing_dates = df_postwar['Sale Date Cleaned'].isna().sum()
print(f"Missing sale dates: {num_missing_dates}")

# Order the rows chronologically and renumber the index from zero.
df_postwar = df_postwar.sort_values(by='Sale Date Cleaned', ignore_index=True)

# Earliest and latest sale dates in the data.
print("Min sale date:", df_postwar['Sale Date Cleaned'].min())
print("Max sale date:", df_postwar['Sale Date Cleaned'].max())

# Per-year sale counts, indexed by year in ascending order.
df_postwar['Sale Year'] = df_postwar['Sale Date Cleaned'].dt.year
year_counts = df_postwar['Sale Year'].value_counts().sort_index()

# Bar chart of yearly sale volume.
year_counts.plot.bar(figsize=(15, 5))
plt.title("Number of Artworks Sold per Year")
plt.xlabel("Year")
plt.ylabel("Number of Artworks")
plt.grid(True)
plt.show()

In [None]:
# ---------------------------------------------------------------
# Step 3: Compute Cut-off Years for Train / Val / Test Split
# ---------------------------------------------------------------
# Running total of sales by year; its last entry is the overall row count.
cumulative = year_counts.cumsum()
total = cumulative.iloc[-1]

# Latest year whose running total is still within 75% (train) and
# 90% (train + validation) of all sales.
train_year = cumulative.loc[cumulative.le(0.75 * total)].index.max()
val_year = cumulative.loc[cumulative.le(0.90 * total)].index.max()

print("Suggested chronological split:")
print(f" - Train: <= {train_year}")
print(f" - Validation: {train_year + 1} to {val_year}")
print(f" - Test: > {val_year}")

In [None]:
# ---------------------------------------------------------------
# Step 4: Apply Train / Validation / Test Splits
# ---------------------------------------------------------------
# Train: sale years up to and including train_year.
train_df = df_postwar.loc[df_postwar['Sale Year'].le(train_year)].copy()

# Validation: sale years after train_year, up to and including val_year.
val_df = df_postwar.loc[
    df_postwar['Sale Year'].gt(train_year) & df_postwar['Sale Year'].le(val_year)
].copy()

# Test: every sale year after val_year.
test_df = df_postwar.loc[df_postwar['Sale Year'].gt(val_year)].copy()

print(f"Train size: {len(train_df)}")
print(f"Validation size: {len(val_df)}")
print(f"Test size: {len(test_df)}")

In [None]:
# ---------------------------------------------------------------
# Step 5: Encode Artist-Level Rolling Avg Sale Prices
# ---------------------------------------------------------------
def add_artist_history_features(df):
    """Add per-artist price-history features built only from earlier sales.

    Rows are ordered by "Sale Date Cleaned". For each sale, the features
    summarise the same artist's rows that precede it in that order, so a
    sale's own price never feeds its own features. Rows with no earlier
    sale by the same artist inside ``df`` (each artist's first appearance)
    are dropped.

    Columns added:
        "Artist Cumulative Price": sum of the artist's earlier prices.
        "Artist Cumulative Price Log": natural log of that sum.
        "Artist Ordered Avg Price": natural log of the mean of the artist's
            earlier prices (the raw mean is overwritten by its log).
        "Artist Sale Count": log1p of the number of earlier rows for the
            artist.

    Args:
        df: DataFrame with "Artist Name", "Sale Date Cleaned" and
            "Price Sold USD" columns.

    Returns:
        A new DataFrame sorted by sale date with the columns above; the
        input frame is left untouched.
    """
    df = df.sort_values("Sale Date Cleaned").copy()

    # Cumulative price before artwork t (shift excludes the current sale)
    df["Artist Cumulative Price"] = (
        df.groupby("Artist Name")["Price Sold USD"]
          .transform(lambda prices: prices.shift().cumsum())
    )

    # Ordered (expanding) average price before artwork t
    df["Artist Ordered Avg Price"] = (
        df.groupby("Artist Name")["Price Sold USD"]
          .transform(lambda prices: prices.shift().expanding().mean())
    )

    # Count of prior sales. This must be taken while each artist's first row
    # is still present: counting after the drop below would restart at 0 and
    # report one sale too few for every remaining row.
    df["Artist Sale Count"] = df.groupby("Artist Name").cumcount()

    # Drop first appearance of each artist (where shift leads to NaN)
    has_history = (
        df["Artist Cumulative Price"].notnull()
        & df["Artist Ordered Avg Price"].notnull()
    )
    df = df[has_history].copy()

    # Log transforms. Column names are kept as they were so downstream code
    # that reads them keeps working.
    df["Artist Cumulative Price Log"] = np.log(df["Artist Cumulative Price"])
    df["Artist Ordered Avg Price"] = np.log(df["Artist Ordered Avg Price"])
    df["Artist Sale Count"] = np.log1p(df["Artist Sale Count"])

    return df

# Build the artist-history features for each split independently.
# Each call sees only its own split, so validation/test histories start
# fresh within that split and each artist's first sale in a split is dropped.
train_df, val_df, test_df = map(
    add_artist_history_features, (train_df, val_df, test_df)
)

In [None]:
# Save each split as a pickle file under Datasets/.
# Plain string literals: the paths contain no placeholders, so no f-prefix.
train_df.to_pickle("Datasets/train_df.pkl")
val_df.to_pickle("Datasets/val_df.pkl")
test_df.to_pickle("Datasets/test_df.pkl")

# Show the final training columns as a sanity check.
print(train_df.columns.tolist())