"""
---------------------------------------------------------
Data Imputation and Grouping Script (Step 2): Paint & Material
---------------------------------------------------------

This script performs grouping, cleaning, and imputation of missing values for the 'Paint' and 'Material' columns 
in the cleaned art auction dataset from Step 1. It ensures consistent terminology, reduces cardinality, 
and imputes missing values using artist-specific distributions.

Main Tasks:
-----------
1. **Load Data**:
   - Loads the processed DataFrame from Step 1 (`DataProcessing_Step1_df.pkl`).

2. **Paint Grouping**:
   - Maps exact and composite paint descriptions (e.g., 'ink, watercolor') into consistent grouped labels.
   - Uses rule-based and fallback keyword matching to assign a single paint group per artwork.
   - Collapses rarely used paint types into 'other', keeping top 28 types.

3. **Material Grouping**:
   - Maps specific and compound terms (e.g., 'canvas, paper', 'fiberboard') into unified categories.
   - Applies multi-stage refinement to consolidate materials like 'panel', 'board', and 'canvas'.
   - Collapses rarely used materials into 'other', keeping top 20 types.

4. **Imputation Strategy**:
   - Identifies artists with less than 35% missing data in both 'Paint' and 'Material' and imputes their
     missing values based on observed co-occurrences of paint and material combinations.
   - A second round of imputation targets high-volume artists (≥150 known entries) not covered in round one.
   - Imputation is performed using artist-specific conditional distributions (probabilistic sampling).

5. **Final Cleanup**:
   - Drops remaining rows with missing 'Paint', 'Material', or 'Area'.
   - Outputs a cleaned DataFrame (`df_final`) ready for modeling.

Outputs:
--------
- `df_final` : Final cleaned DataFrame with imputed and grouped `Paint` and `Material` columns:
    - `Paint Final Imputed Collapsed`
    - `Material Final Imputed Collapsed`
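- `missing_summary`: Summary of missing values per column after all filtering and imputation.
- `df_subset`: `df_final` restricted to the columns selected for modeling/export.
- `Datasets/df_postwar.pkl`: Pickled subset of `df_subset` with artworks created between 1945 and 1970.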

Next Step:
----------
- Feature engineering and modeling using the cleaned and imputed dataset.

"""

In [None]:
# Libraries
import pandas as pd
import numpy as np
import pickle

In [None]:
# Switch the working directory to the project folder so that the relative
# 'Datasets/...' paths used later in this script resolve.
# The target is itself a relative path, so this only succeeds while the
# current directory contains an 'Art-Valuation-in-Auction' folder.
import os
os.chdir('Art-Valuation-in-Auction')

In [None]:
# Reading Pickle File Function
def read_pickle_to_dataframe(filepath):
    """
    Read every object pickled into `filepath` and return them as one DataFrame.

    The file may hold several consecutive pickle dumps. A dumped list contributes
    each of its items as a record; any other object contributes one record.

    Parameters:
        filepath: Path of the pickle file to read.

    Returns:
        A DataFrame built from the collected records, or an empty DataFrame if
        anything goes wrong (the error is printed, not raised).

    Note: pickle.load executes arbitrary code from the file; only use this on
    files from a trusted source.
    """
    records = []
    try:
        with open(filepath, "rb") as handle:
            while True:
                # Keep unpickling until the stream is exhausted.
                try:
                    loaded = pickle.load(handle)
                except EOFError:
                    break
                if isinstance(loaded, list):
                    records += loaded
                else:
                    records.append(loaded)
        print(f"Loaded {len(records)} records.")
        frame = pd.DataFrame(records)
        print("Columns:", frame.columns.tolist())
        return frame
    except Exception as err:
        # Best-effort reader: report the problem and hand back an empty frame.
        print(f"Error reading pickle file: {err}")
        return pd.DataFrame()

In [None]:
# Load cleaned data from Step 1
# (read with pandas' own unpickler; the path is relative to the working directory)
df = pd.read_pickle("Datasets/DataProcessing_Step1_df.pkl")

# Check: report the (rows, columns) shape and preview the first rows
print("Data Shape:", df.shape)
df.head()

In [None]:
# Unique Values of Paint
# Count the distinct raw paint strings, ignoring missing values.
unique_paints = df['Paint'].dropna().unique()
print(f"Unique Paints ({len(unique_paints)}):")

# Frequency table after trimming whitespace and lower-casing; show the 20 most common.
top_paints = df['Paint'].str.strip().str.lower().value_counts()
print(top_paints[:20])
# Note: this count is taken on the raw column, not on the normalized strings above.
print(f"Number of unique paint categories: {df['Paint'].nunique()}")

In [None]:
# ----------------------------------------
# Define Grouping Rules for Paint
# ----------------------------------------

    ## Exact matches ##
# These sets contain known variants/synonyms of key paint categories.
# Each key in the dictionary is a standardized category label (e.g., 'oil', 'acrylic'),
# and the associated set contains all strings that should be grouped into that label.
# Synonym table: standardized paint label -> set of raw strings that map to it.
# Lookups are done on the lower-cased, stripped paint string.
exact_matches = {
    'oil': {'oil', 'oil wash', 'oil, s', 'huile sur toile', 'oil painting', 'each oil'},
    'acrylic': {'acrylic', 'acrylic paint', 'acrylic, s', 'acrylics'},
    'watercolor': {'watercolor', 'watercolor heightened with white', 'opaque white, watercolor', 'watercolor drawing', 'watercolor pair'},
    'mixed media': {'mixed media', 'collage', 'collage, mixed media', 'mixed technique', 'paper collage'},
    'ink': {'ink', 'india ink', 'ink wash', 'color, ink', 'ink, wash', 'colour, ink'},
    'pencil': {'pencil', 'graphite', 'lead pencil', 'pencil drawing', 'black pencil', 'colored pencil'},
    'pastel': {'pastel', 'crayon', 'oil pastel', 'crayon drawing', 'wax crayon'},
    'chalk': {'chalk', 'red chalk', 'black chalk', 'color chalk'},
    'lithography': {'lithograph', 'color lithograph', 'lithograph in colors'},
    'pen': {'ballpoint pen', 'pen', 'black pen', 'colored pen', 'felt-tip pen'},
    'ink, watercolor': {'ink, watercolor', 'india ink, watercolor', 'ink, pen, watercolor'},
    'ink, pen': {'ink, pen', 'brown ink, pen'},
    'watercolor, pencil': {'watercolor, pencil', 'watercolor over pencil', 'pencil, watercolor heightened with white'},
    'oil, mixed media': {'mixed media, oil', 'collage, oil'},
}

# Composite rules: combined label -> components that must ALL occur in the string.
# Rules are tried in the order written here; the first satisfied rule wins.
double_paint_categories = {
    'acrylic, gouache': {'acrylic', 'gouache'},
    'gouache, watercolor': {'gouache', 'watercolor'},
    'pencil, watercolor': {'pencil', 'watercolor'},
    'ink, watercolor': {'ink', 'watercolor'},
    'ink, pen': {'ink', 'pen'},
    'oil, mixed media': {'oil', 'mixed media'},
    'acrylic, mixed media': {'acrylic', 'mixed media'},
}

# Fallback keywords, highest priority first. When no composite rule applies,
# the first keyword found inside the string becomes the label.
fallback_priority = ['oil', 'watercolor', 'acrylic', 'ink', 'mixed media', 'gouache', 'pencil', 'pastel', 'charcoal', 'tempera', 'chalk', 'enamel', 'pen', 'lithograph']


def group_paint(paint):
    """
    First pass: map a raw paint string to its standardized label via exact_matches.

    Returns None for missing input. Otherwise returns the matching label, or the
    lower-cased/stripped string itself when no synonym set contains it.
    """
    if pd.isnull(paint):
        return None
    normalized = paint.lower().strip()
    return next(
        (label for label, variants in exact_matches.items() if normalized in variants),
        normalized,
    )


def group_paint_complex(paint):
    """
    Second pass: resolve composite descriptions, then fall back to keyword search.

    Returns None for missing input. Otherwise returns the first composite label
    whose components all occur in the string, else the first fallback keyword
    occurring in it, else the normalized string unchanged.

    NOTE(review): all checks are plain substring tests, so e.g. the component
    'pen' is also satisfied by 'pencil'.
    """
    if pd.isnull(paint):
        return None
    normalized = paint.lower().strip()

    for label, parts in double_paint_categories.items():
        if not any(part not in normalized for part in parts):
            return label

    keyword_hits = [keyword for keyword in fallback_priority if keyword in normalized]
    return keyword_hits[0] if keyword_hits else normalized

# Run both passes in sequence: exact-synonym mapping first, then
# composite/keyword matching on the output of the first pass.
df['Paint Grouped'] = df['Paint'].apply(group_paint).apply(group_paint_complex)

# Inspect the 20 most frequent grouped labels and the resulting cardinality.
top_paints = df['Paint Grouped'].value_counts()
print(top_paints[:20])
print(f"Number of unique paint categories: {df['Paint Grouped'].nunique()}")


In [None]:
# Unique Values of Material
# Count the distinct raw material strings, ignoring missing values.
unique_materials = df['Material'].dropna().unique()
print(f"Unique Material ({len(unique_materials)}):")

# Frequency table after trimming whitespace and lower-casing; show the 20 most common.
top_materials = df['Material'].str.strip().str.lower().value_counts()
print(top_materials[0:20])
# Note: this count is taken on the raw column, not on the normalized strings above.
print(f"Number of unique material categories: {df['Material'].nunique()}")

In [None]:
# ----------------------------------------
# Define Grouping Rules for Material
# ----------------------------------------

# Exact matching sets for key material types
# These sets contain known variants/synonyms that map to standardized material categories
# Exact-match sets: lower-cased, stripped material strings that map to 'board'.
exact_matches_board = {
    "artist's board", "fiberboard", "artist board", "composition board",
    "fibreboard", "hardboard", "illustration board", "academy board",
    "board, oval", "mdf", "pavatex", "light board", "board, in artist's frame",
    "thin board", "cut out board", "hard fiberboard", "high-density fiberboard"
}

# Exact-match strings that map to 'panel'.
exact_matches_panel = {
    "oak panel", "cradled panel", "mahogany panel", "birch panel",
    "panel, unframed", "panel, en grisaille", "shaped panel",
    "cradled oak panel", "uncradled panel", "uncradled oak panel",
    "gold ground panel"
}

# Paper variants that map to 'paper, other'.
# Entries must not carry leading/trailing whitespace: group_material strips its
# input before the lookup, so a padded entry could never match.
paper_other_terms = {
    "laid paper", "wove paper", "cream wove paper", "buff paper",
    "handmade paper", "tracing paper", "arches paper", "cream laid paper",
    "paper, hanging scroll", "japanese paper", "japan paper", "newspaper"
}

# Paper variants that map to plain 'paper'.
paper_terms = {
    "watercolor paper", "beige paper", "blue paper", "brown paper",
    "paper, framed", "black paper", "grey paper", "paper laid down on paper",
    "light brown paper", "gray paper"
}

# Fabrics sometimes used as canvas
canvas_fabrics = {"hessian", "jute", "burlap"}

# Terms to preserve — prevent them from being collapsed in the next steps
preserve_canvas_terms = {
    'canvas', 'canvas, board', 'canvas, panel', 'canvas, paper'
}

preserve_paper_terms = {
    'paper', 'canvas, paper', 'paper, board', 'paper, panel', 'wove paper'
}

preserve_board_terms = {
    'canvas, board', 'paper, board'
}

preserve_panel_terms = {
    'canvas, panel', 'paper, panel', 'panel'
}

# Composite rules, checked in order: (first keyword, second keyword, label).
_MATERIAL_COMPOSITES = (
    ('paper', 'canvas', 'canvas, paper'),
    ('canvas', 'panel', 'canvas, panel'),
    ('canvas', 'board', 'canvas, board'),
    ('paper', 'board', 'paper, board'),
    ('paper', 'panel', 'paper, panel'),
)

# Keyword fallbacks, checked in order: (label, keywords); any keyword occurring
# in the string selects the label. 'plexiglass' needs no entry of its own
# because it already contains 'glass'.
_MATERIAL_KEYWORDS = (
    ('wood', ('wood', 'oak', 'mahogany')),
    ('masonite', ('masonite', 'isorel')),
    ('silk', ('silk', 'satin')),
    ('vellum', ('vellum', 'velum')),
    ('glass', ('glass', 'plexiglas', 'perspex')),
    ('cardboard', ('cardboard',)),
    ('linen', ('linen',)),
    ('copper', ('copper',)),
)


# Function to group materials based on known matches and keywords
def group_material(material):
    """
    Assigns a standardized material label based on known matches and keywords.

    The input is lower-cased and stripped, then resolved in three stages:
    1. composite rules (both keywords occur as substrings),
    2. exact membership in the predefined term sets,
    3. keyword fallbacks (substring tests).

    Parameters:
        material: Raw material description, or a missing value.

    Returns:
        None for missing input; otherwise the standardized label, or the
        normalized string itself when no rule applies.
    """
    if pd.isnull(material):
        return None
    mat = material.lower().strip()

    # Composite materials (multi-material combinations)
    for first, second, label in _MATERIAL_COMPOSITES:
        if first in mat and second in mat:
            return label

    # Exact matches to predefined categories
    if mat in exact_matches_board:
        return 'board'
    if mat in exact_matches_panel:
        return 'panel'
    if mat in paper_other_terms:
        return 'paper, other'
    if mat in paper_terms:
        return 'paper'
    if mat in canvas_fabrics:
        return 'canvas'

    # Keyword-based fallback matching
    for label, keywords in _MATERIAL_KEYWORDS:
        if any(keyword in mat for keyword in keywords):
            return label

    return mat

# Apply initial grouping
# (element-wise; missing materials stay missing because group_material returns None for them)
df['Material Grouped'] = df['Material'].apply(group_material)


# These functions further reduce granularity by consolidating near-duplicate terms
# while preserving key terms like 'canvas, panel', etc.
def final_canvas_grouping(mat):
    """
    Fold any label mentioning 'canvas' into plain 'canvas', unless the label is
    one of preserve_canvas_terms. Missing input yields None; other labels are
    returned lower-cased and stripped.
    """
    if pd.isnull(mat):
        return None
    label = mat.lower().strip()
    collapsible = 'canvas' in label and label not in preserve_canvas_terms
    return 'canvas' if collapsible else label

def final_paper_grouping(mat):
    """
    Fold any label mentioning 'paper' into 'paper, other', unless the label is
    one of preserve_paper_terms. Missing input yields None; other labels are
    returned lower-cased and stripped.
    """
    if pd.isnull(mat):
        return None
    label = mat.lower().strip()
    collapsible = 'paper' in label and label not in preserve_paper_terms
    return 'paper, other' if collapsible else label

def final_board_grouping(mat):
    """
    Fold any label mentioning 'board' into plain 'board', unless the label is
    one of preserve_board_terms or mentions 'cardboard'. Missing input yields
    None; other labels are returned lower-cased and stripped.
    """
    if pd.isnull(mat):
        return None
    label = mat.lower().strip()
    if 'board' not in label:
        return label
    # 'cardboard' contains 'board' but is kept as its own category.
    if label in preserve_board_terms or 'cardboard' in label:
        return label
    return 'board'

def final_panel_grouping(mat):
    """
    Fold any label mentioning 'panel' into plain 'panel', unless the label is
    one of preserve_panel_terms. Missing input yields None; other labels are
    returned lower-cased and stripped.
    """
    if pd.isnull(mat):
        return None
    label = mat.lower().strip()
    collapsible = 'panel' in label and label not in preserve_panel_terms
    return 'panel' if collapsible else label

# Run the four refinement stages back to back (canvas -> paper -> board -> panel),
# each stage consuming the output of the previous one.
df['Material Final'] = (
    df['Material Grouped']
    .apply(final_canvas_grouping)
    .apply(final_paper_grouping)
    .apply(final_board_grouping)
    .apply(final_panel_grouping)
)


# Inspect the 20 most frequent refined labels and the resulting cardinality.
top_materials = df['Material Final'].value_counts()
print(top_materials[:20])
print(f"Number of unique material categories: {df['Material Final'].nunique()}")

In [None]:
# ----------------------------------------
# Finalizing Material and Paint Grouping
# ----------------------------------------

# Reduce material cardinality: the 20 most frequent labels survive,
# every other non-missing label becomes 'other'.
material_frequency = df['Material Final'].value_counts()
top_20_materials = material_frequency.head(20).index.tolist()

def collapse_material_to_top_20(mat):
    """
    Map a material label onto the top-20 vocabulary.

    Missing input yields None. Otherwise the label is lower-cased and stripped,
    and returned unchanged when it is listed in top_20_materials, else "other".
    """
    if pd.isnull(mat):
        return None
    label = mat.lower().strip()
    if label in top_20_materials:
        return label
    return "other"

df['Material Final Cleaned'] = df['Material Final'].apply(collapse_material_to_top_20)

# Reduce paint cardinality: the 28 most frequent labels survive,
# every other non-missing label becomes 'other'.
paint_frequency = df['Paint Grouped'].value_counts()
top_28_paints = paint_frequency.head(28).index.tolist()

def collapse_paint_to_top_28(paint):
    """
    Map a paint label onto the top-28 vocabulary.

    Missing input yields None. Otherwise the label is lower-cased and stripped,
    and returned unchanged when it is listed in top_28_paints, else "other".
    """
    if pd.isnull(paint):
        return None
    label = paint.lower().strip()
    if label in top_28_paints:
        return label
    return "other"

df['Paint Final'] = df['Paint Grouped'].apply(collapse_paint_to_top_28)

# Cardinality check after collapsing (kept labels plus 'other').
print(f"Number of unique material categories: {df['Material Final Cleaned'].nunique()}") # 21
print(f"Number of unique paint categories: {df['Paint Final'].nunique()}") # 29


In [None]:
# ----------------------------------------
# Paint-Material Co-Occurrence Table
# ----------------------------------------

## Table showing the percentage of each material used with each paint type ##

from collections import defaultdict

# Columns holding the collapsed labels
paint_col = 'Paint Final'
material_col = 'Material Final Cleaned'

# Most frequent labels in the collapsed columns (these fix the row/column order below)
top_paints = df[paint_col].value_counts().head(28).index.tolist()
top_materials = df[material_col].value_counts().head(20).index.tolist()

# paint label -> {material label -> % of that paint's rows using the material}
paint_to_material_pct = defaultdict(dict)

for paint_label in top_paints:
    rows_for_paint = df[df[paint_col] == paint_label]
    n_rows = len(rows_for_paint)
    if n_rows == 0:
        continue

    counts_by_material = rows_for_paint[material_col].value_counts()
    # Denominator is every row with this paint, including rows whose material is missing.
    paint_to_material_pct[paint_label].update({
        material_label: round(100 * counts_by_material.get(material_label, 0) / n_rows, 2)
        for material_label in top_materials
    })

# Materials as rows, paints as columns, both in frequency order
paint_material_pct_df = (
    pd.DataFrame(paint_to_material_pct)
    .fillna(0)
    .loc[top_materials, top_paints]
)

#  Display Paint–Material Matrix
print("Paint–Material Conditional Percentage Matrix:")
print(paint_material_pct_df.round(2))

# Missing Values After Collapsing
print("Missing values in 'Paint Final':", df['Paint Final'].isnull().sum())
print("Missing values in 'Material Final Cleaned':", df['Material Final Cleaned'].isnull().sum())

In [None]:
# ----------------------------------------
# Examine Missing Paint and Material Data by Artist
# ----------------------------------------

# Per-artist number of rows whose material is missing, largest first
material_is_missing = df['Material Final Cleaned'].isnull()
missing_material_counts = (
    df.loc[material_is_missing]
    .groupby('Artist Name')
    .size()
    .sort_values(ascending=False)
    .rename('Missing Material Count')
)

# Per-artist number of rows whose paint is missing, largest first
paint_is_missing = df['Paint Final'].isnull()
missing_paint_counts = (
    df.loc[paint_is_missing]
    .groupby('Artist Name')
    .size()
    .sort_values(ascending=False)
    .rename('Missing Paint Count')
)

# Side-by-side table; an artist absent from one Series gets 0 in that column
missing_counts = pd.concat([missing_material_counts, missing_paint_counts], axis=1)
missing_counts = missing_counts.fillna(0).astype(int)

# Display the first 20 rows of the combined table
print("Top 20 artists with missing paint or material data:")
print(missing_counts.head(20))


## Artists missing exactly one paint or material entry ##
total_missing_per_artist = (
    missing_counts['Missing Material Count'] + missing_counts['Missing Paint Count']
)
artists_missing_one = missing_counts[total_missing_per_artist.eq(1)]

print(f"Artists missing exactly 1 paint or material entry: {len(artists_missing_one)}")


In [None]:
# ----------------------------------------
# First Round of Imputation: Artist-Based
# ----------------------------------------

# This step imputes missing values in 'Paint' and 'Material' based
# on observed co-occurrence patterns within each artist's known works.
#
# For artists with less than 35% missing data in both fields:
#   - If 'Paint' is missing but 'Material' is known:
#       → Sample a likely paint type used with that material
#         based on the artist's other artworks.
#   - If 'Material' is missing but 'Paint' is known:
#       → Sample a likely material used with that paint
#         based on the same artist's known combinations.
#
# This approach ensures imputations are:
#   - Artist-specific
#   - Reflective of actual historical usage patterns

# Step 1: Count Missing Paint and Material Entries Per Artist
# Only artists with at least one missing value appear in these Series.
missing_material_counts = (
    df[df['Material Final Cleaned'].isnull()]
    .groupby('Artist Name').size()
    .sort_values(ascending=False)
)
missing_material_counts.name = 'Missing Material Count'

missing_paint_counts = (
    df[df['Paint Final'].isnull()]
    .groupby('Artist Name').size()
    .sort_values(ascending=False)
)
missing_paint_counts.name = 'Missing Paint Count'

# Combine counts and compute total + missing %
# (an artist missing only one of the two fields gets 0 for the other via fillna)
combined_missing = pd.concat([missing_material_counts, missing_paint_counts], axis=1).fillna(0).astype(int)
# Index-aligned assignment: per-artist row counts over the whole DataFrame
combined_missing['Total Artworks'] = df.groupby('Artist Name').size()
# Despite the '%' in the column names, these are fractions (count / total) rounded to 3 decimals
combined_missing['Missing Material %'] = (combined_missing['Missing Material Count'] / combined_missing['Total Artworks']).round(3)
combined_missing['Missing Paint %'] = (combined_missing['Missing Paint Count'] / combined_missing['Total Artworks']).round(3)

# Step 2: Identify Eligible Artists (less than 35% missing in both)
# The threshold is compared against the fractions above, so 0.35 means 35%.
threshold = 0.35
eligible_artists = combined_missing[
    (combined_missing['Missing Material %'] < threshold) &
    (combined_missing['Missing Paint %'] < threshold)
].index.tolist()

# Step 3: Imputation Function Using Artist-Specific Co-occurrence
def impute_paint_or_material(row, df, rng=None):
    """
    Sample a value for the one missing field (paint or material) of an artwork.

    The value is drawn from the same artist's other rows in `df` that share the
    known field, with probability proportional to how often each candidate
    occurs (columns 'Paint Final' and 'Material Final Cleaned').

    Parameters:
        row: One artwork; must provide 'Artist Name', 'Paint Final' and
            'Material Final Cleaned'.
        df: DataFrame with the same three columns, used as the reference data.
        rng: Optional random source exposing `choice(a, p=...)`, e.g.
            `np.random.default_rng(seed)`, for reproducible sampling.
            Defaults to the global `np.random` state.

    Returns:
        - paint missing, material known: a sampled paint label;
        - material missing, paint known: a sampled material label;
        - no matching rows for that artist: the (still missing) value of the
          field being imputed, so the caller never receives a label that
          belongs to the other column;
        - both fields known: the known material (legacy behaviour);
        - both fields missing: the missing paint value.
    """
    known_material = row['Material Final Cleaned']
    known_paint = row['Paint Final']
    paint_missing = pd.isnull(known_paint)
    material_missing = pd.isnull(known_material)

    # Nothing to condition on (both missing) or nothing to impute (both known):
    # keep the values the function has always returned for these cases.
    if paint_missing == material_missing:
        return known_paint if material_missing else known_material

    if paint_missing:
        target_col, given_col, given_value = 'Paint Final', 'Material Final Cleaned', known_material
    else:
        target_col, given_col, given_value = 'Material Final Cleaned', 'Paint Final', known_paint

    # Same artist, same known field, and an observed value in the target field.
    candidates = df.loc[
        (df['Artist Name'] == row['Artist Name'])
        & (df[given_col] == given_value)
        & df[target_col].notnull(),
        target_col,
    ]

    if candidates.empty:
        # Leave the field missing. Returning the known field here would write a
        # material label into the paint column (or vice versa).
        return row[target_col]

    dist = candidates.value_counts(normalize=True)
    chooser = np.random if rng is None else rng
    return chooser.choice(dist.index.to_numpy(), p=dist.to_numpy())

# Step 4: Apply Imputation
# Paint imputation
# Rows of eligible artists where paint is missing but material is known
missing_paint_mask = (
    df['Artist Name'].isin(eligible_artists) &
    df['Paint Final'].isnull() &
    df['Material Final Cleaned'].notnull()
)

# Creates 'Paint Final Imputed' and fills it only on the masked rows;
# all other rows are filled in Step 5 below.
df.loc[missing_paint_mask, 'Paint Final Imputed'] = df[missing_paint_mask].apply(
    lambda row: impute_paint_or_material(row, df), axis=1
)

# Material imputation
# Rows of eligible artists where material is missing but paint is known
missing_material_mask = (
    df['Artist Name'].isin(eligible_artists) &
    df['Material Final Cleaned'].isnull() &
    df['Paint Final'].notnull()
)

df.loc[missing_material_mask, 'Material Final Imputed'] = df[missing_material_mask].apply(
    lambda row: impute_paint_or_material(row, df), axis=1
)

# Step 5: Fill in All Remaining Nulls with Original Values (for completeness)
# After this, the '... Imputed' columns equal the originals wherever no value was imputed.
df['Paint Final Imputed'] = df['Paint Final Imputed'].fillna(df['Paint Final'])
df['Material Final Imputed'] = df['Material Final Imputed'].fillna(df['Material Final Cleaned'])

## Summary of Imputation ##
# Per artist (for diagnostics, optional to print)
final_missing_material = df[df['Material Final Imputed'].isnull()]\
    .groupby('Artist Name').size().sort_values(ascending=False)
final_missing_material.name = 'Missing Material After'

final_missing_paint = df[df['Paint Final Imputed'].isnull()]\
    .groupby('Artist Name').size().sort_values(ascending=False)
final_missing_paint.name = 'Missing Paint After'

# Computed for inspection only; it is not printed in this cell.
final_combined_missing = pd.concat([final_missing_material, final_missing_paint], axis=1).fillna(0).astype(int)

# Overall missing value summary
overall_missing_summary = pd.DataFrame({
    'Missing Values': df[['Paint Final Imputed', 'Material Final Imputed']].isnull().sum(),
    'Percent Missing (%)': df[['Paint Final Imputed', 'Material Final Imputed']].isnull().mean() * 100
}).round(2)

# Step 6: Collapse Long-Tail Categories After Imputation
# Re-applies the same top-28 / top-20 vocabularies used before imputation.
df['Paint Final Imputed Collapsed'] = df['Paint Final Imputed'].apply(collapse_paint_to_top_28)
df['Material Final Imputed Collapsed'] = df['Material Final Imputed'].apply(collapse_material_to_top_20)

# Print Summary
print("Imputation Summary:")
print(overall_missing_summary)
print("Unique Paints After Imputation:", df['Paint Final Imputed Collapsed'].nunique(dropna=True))
print("Unique Materials After Imputation:", df['Material Final Imputed Collapsed'].nunique(dropna=True))

In [None]:
# ---------------------------------------------------------------
# Second Round of Imputation: High-Volume Artists (≥150 entries)
# ---------------------------------------------------------------
# This second imputation step targets artists who were not included 
# in the first round but have a large number of artworks with known 
# paint or material labels (≥150 entries).
#
# For each of these high-volume artists:
#   - If 'Paint' is missing but 'Material' is known, we impute paint 
#     by sampling from the artist's observed paint distribution for 
#     that material.
#   - If 'Material' is missing but 'Paint' is known, we impute material 
#     using the artist's distribution for that paint.


## Identify High-Volume Artists ##
# Count how many non-missing paint and material entries each artist has
known_paint_counts = df[df['Paint Final Imputed Collapsed'].notnull()]\
    .groupby('Artist Name').size()

known_material_counts = df[df['Material Final Imputed Collapsed'].notnull()]\
    .groupby('Artist Name').size()

# Select artists with ≥150 known entries in either paint or material
# (.get(artist, 0) covers artists with no known entries at all)
second_pass_artists = [
    artist for artist in df['Artist Name'].unique()
    if known_paint_counts.get(artist, 0) >= 150 or known_material_counts.get(artist, 0) >= 150
]

# Remove artists already imputed in the first round
first_pass_artists = set(eligible_artists)  # from previous imputation step
second_pass_only = [a for a in second_pass_artists if a not in first_pass_artists]

## Define Masks for Missing Paint or Material ##
# These masks identify which entries are still missing and can now be imputed
second_missing_paint_mask = (
    df['Artist Name'].isin(second_pass_only) &
    df['Paint Final Imputed Collapsed'].isnull() &
    df['Material Final Imputed Collapsed'].notnull()
)

second_missing_material_mask = (
    df['Artist Name'].isin(second_pass_only) &
    df['Material Final Imputed Collapsed'].isnull() &
    df['Paint Final Imputed Collapsed'].notnull()
)

## Apply Imputation ##
# NOTE(review): impute_paint_or_material reads 'Paint Final' / 'Material Final Cleaned',
# not the '... Imputed Collapsed' columns used in the masks above. For artists outside the
# first round these columns should carry the same values — confirm before changing either side.
df.loc[second_missing_paint_mask, 'Paint Final Imputed Collapsed'] = df[second_missing_paint_mask].apply(
    lambda row: impute_paint_or_material(row, df), axis=1
)

df.loc[second_missing_material_mask, 'Material Final Imputed Collapsed'] = df[second_missing_material_mask].apply(
    lambda row: impute_paint_or_material(row, df), axis=1
)

# Recalculate Overall Missingness After Second Imputation
overall_missing_summary = pd.DataFrame({
    'Missing Values': df[['Paint Final Imputed Collapsed', 'Material Final Imputed Collapsed']].isnull().sum(),
    'Percent Missing (%)': df[['Paint Final Imputed Collapsed', 'Material Final Imputed Collapsed']].isnull().mean() * 100
}).round(2)
# Re-collapse so any newly written label outside the top lists becomes 'other'
df['Paint Final Imputed Collapsed'] = df['Paint Final Imputed Collapsed'].apply(collapse_paint_to_top_28)
df['Material Final Imputed Collapsed'] = df['Material Final Imputed Collapsed'].apply(collapse_material_to_top_20)

# Print Final Summary
print("Second Imputation Summary:")
print(overall_missing_summary)
print("Unique Paints After 2nd Imputation:", df['Paint Final Imputed Collapsed'].nunique(dropna=True))
print("Unique Materials After 2nd Imputation:", df['Material Final Imputed Collapsed'].nunique(dropna=True))

In [None]:
# ---------------------------------------------------------------
# Final Dataset Cleanup: Remove Remaining Missing Values
# ---------------------------------------------------------------

# Keep only rows with both paint and material fully imputed
# (a row is dropped when either label is still missing)
df_final = df[
    df['Paint Final Imputed Collapsed'].notnull() &
    df['Material Final Imputed Collapsed'].notnull()
].copy()

# Summary after removing artworks missing either label
# (both "remaining missing" counts are 0 by construction of the filter above)
print(f"Final dataset size after imputation filter: {len(df_final)} artworks")
print(f"Remaining missing in 'Paint': {df_final['Paint Final Imputed Collapsed'].isnull().sum()}")
print(f"Remaining missing in 'Material': {df_final['Material Final Imputed Collapsed'].isnull().sum()}")

# Remove entries with missing 'Area' values (important for modeling)
df_final = df_final[df_final['Area'].notnull()].copy()

# Summary of remaining missing values in other columns
# (counts and percentages are computed per column over the filtered df_final)
missing_counts = df_final.isnull().sum()
missing_percent = df_final.isnull().mean() * 100

missing_summary = pd.DataFrame({
    'Missing Values': missing_counts,
    'Percent Missing (%)': missing_percent.round(2)
})

# Trailing expression: displayed as the notebook cell output
missing_summary

In [None]:
# ---------------------------------------------------------------
# Subset Final DataFrame to Relevant Columns for Modeling/Export
# ---------------------------------------------------------------

# Columns carried forward; the order here is the column order of df_subset.
# Selecting by list raises a KeyError if any of these columns is absent from df_final.
selected_columns = [
    'Title Cleaned', 'Year of Creation', 'Artist ID', 'Artist Name', 'Dimensions', 'Log Area',
    'Sale Date Cleaned', 'Auction House', 'Sale Location', 'Sale Name', 'Lot Number',
    'Price Sold USD', 'Image url better quality', 'Country', 'Year of Birth', 'Year of Death',
    'Birth Period', 'Birth Period Ordinal' ,'Sale Year', 'CPI_US', 'Alive Status',
    'Paint Final Imputed Collapsed', 'Material Final Imputed Collapsed'
]

# Independent copy, so later edits to df_subset do not touch df_final
df_subset = df_final[selected_columns].copy()

print(f"Subset shape: {df_subset.shape}")
df_subset.head()

In [None]:
# Report the number of distinct non-missing values for every column from the
# fifth onward (columns[4:] skips the first four entries of selected_columns).
for col in df_subset.columns[4:]:
    unique_vals = df_subset[col].nunique()
    print(f"{col}: {unique_vals} unique values")

In [None]:
# ---------------------------------------------------------------
# Filter Artworks to Post-War Period (1945–1970) and Save to File
# ---------------------------------------------------------------

# Missing creation years become the sentinel 9999 so the column can be cast
# to int; the sentinel falls outside the range tested below.
year_created = df_subset['Year of Creation'].fillna(9999).astype(int)

# Post-War artworks: 1945 through 1970, both ends included
post_war_mask = year_created.between(1945, 1970)

# Keep the matching rows with a fresh 0..n-1 index
df_postwar = df_subset.loc[post_war_mask].reset_index(drop=True).copy()

# Persist the filtered dataset as a pickle file
df_postwar.to_pickle("Datasets/df_postwar.pkl")