In [294]:
# https://platform.olimpiada-ai.ro/en/problems/81

import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
import warnings 
warnings.filterwarnings('ignore')

In [295]:
# Read the competition's train and test tables from the Kaggle input directory.
train = pd.read_csv("/kaggle/input/famous-paintings/train.csv")
test = pd.read_csv("/kaggle/input/famous-paintings/test.csv")

# Show both shapes as the cell output (the recorded run shows one extra column in train).
train.shape, test.shape

((960, 24), (240, 23))

In [296]:
# Preview the first rows of the training frame.
train.head()

Unnamed: 0,SampleID,canvas_size,is_oil_painting,brush_type,num_colors,colorfulness,complexity,brightness,contrast,stroke_density,...,dominant_warm_colors,dominant_color,art_period_hint,auction_house,image_quality,brightness_log,complexity_x_stroke,fake_style_score,painter_style_score,target_price
0,332,60x50,True,medium,71,0.61624,0.755582,0.647338,0.587923,0.702118,...,False,red,baroque,Online,low,0.499161,0.530508,0.349718,0.536437,50800
1,410,80x90,True,medium,60,0.660715,0.474923,0.538822,0.599076,0.528112,...,False,yellow,modern,Online,low,0.431017,0.250813,0.258722,0.163906,37400
2,77,80x50,True,fine,64,0.684877,0.380591,0.608029,0.500152,0.508521,...,False,mixed,baroque,Sothebys,low,0.475009,0.193539,0.797662,0.137732,35500
3,869,80x50,True,medium,56,0.427938,0.581636,0.562086,0.483896,0.550152,...,False,blue,modern,Local,low,0.446022,0.319988,0.569981,0.394542,43200
4,139,80x130,True,fine,55,0.481406,0.62978,0.476093,0.493429,0.68171,...,False,red,modern,Local,medium,0.389398,0.429328,0.723147,0.321858,54500


In [297]:
# Print per-column dtypes and non-null counts for the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 960 entries, 0 to 959
Data columns (total 24 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SampleID              960 non-null    int64  
 1   canvas_size           960 non-null    object 
 2   is_oil_painting       960 non-null    bool   
 3   brush_type            960 non-null    object 
 4   num_colors            960 non-null    int64  
 5   colorfulness          960 non-null    float64
 6   complexity            960 non-null    float64
 7   brightness            960 non-null    float64
 8   contrast              960 non-null    float64
 9   stroke_density        960 non-null    float64
 10  has_signature         960 non-null    bool   
 11  is_framed             960 non-null    bool   
 12  uses_gold_leaf        960 non-null    bool   
 13  is_restored           960 non-null    bool   
 14  dominant_warm_colors  960 non-null    bool   
 15  dominant_color        9

In [298]:
def process_df(df):
    """Add engineered features and build the numeric matrix used for clustering.

    The input frame is copied first, so the caller's object is never modified
    and the function can be called repeatedly on the same raw frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``canvas_size`` (strings of the form ``"<int>x<int>"``),
        ``stroke_density``, ``complexity``, ``uses_gold_leaf``,
        ``has_signature``, ``num_colors``, ``colorfulness``, ``contrast`` and
        ``brightness``. ``SampleID`` and ``target_price`` are optional.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``df``: a copy of the input with boolean columns cast to integers and
        the columns ``canvas_h``, ``canvas_w``, ``canvas_area``,
        ``canvas_perimeter`` and ``AAS`` added.
        ``df_ohe``: the non-object columns of that frame, minus the canvas
        columns, ``target_price`` and ``SampleID``, passed through
        ``pd.get_dummies``.
    """
    # Work on a copy: the original mutated the caller's frame in place.
    df = df.copy()

    bool_cols = df.select_dtypes('bool').columns.tolist()
    if bool_cols:
        # Plain column assignment replaces the columns (and their dtype).
        # ``df.loc[:, cols] = ...`` is an in-place setitem instead, which newer
        # pandas versions flag as an incompatible-dtype upcast.
        df[bool_cols] = df[bool_cols].astype(int)

    # "AxB" -> the first number is stored as canvas_h, the second as canvas_w.
    df['canvas_h'] = df['canvas_size'].map(lambda size: int(size.split('x')[0]))
    df['canvas_w'] = df['canvas_size'].map(lambda size: int(size.split('x')[1]))
    df['canvas_area'] = df['canvas_h'] * df['canvas_w']
    df['canvas_perimeter'] = 2 * (df['canvas_h'] + df['canvas_w'])

    # AAS: additive rule-based score (the acronym is not expanded in this
    # notebook). Threshold rules add 2 points each, the two flags add 1 point
    # each, and out-of-range contrast/brightness subtracts 1 point.
    dense_strokes = (df['stroke_density'] > 0.7).astype(int)
    high_complexity = (df['complexity'] > 0.65).astype(int)
    rich_palette = ((df['num_colors'] > 65) & (df['colorfulness'] > 0.7)).astype(int)
    poor_lighting = (
        (df['contrast'] < 0.4) | (df['brightness'] < 0.45) | (df['brightness'] > 0.75)
    ).astype(int)
    df['AAS'] = (
        dense_strokes * 2
        + high_complexity * 2
        + df['uses_gold_leaf']
        + df['has_signature']
        + rich_palette * 2
        - poor_lighting
    )

    # NOTE(review): object columns are removed *before* get_dummies, so text
    # features such as brush_type never reach the encoder and this matrix is
    # numeric-only. Kept as in the original; confirm that this is intended.
    numeric_part = df.select_dtypes(exclude=['object']).drop(
        columns=['canvas_h', 'canvas_w', 'canvas_area', 'canvas_perimeter', 'target_price', 'SampleID'],
        errors='ignore',
    )
    # dtype=int makes any dummy columns integers directly, replacing the
    # original's second bool -> int pass.
    df_ohe = pd.get_dummies(numeric_part, dtype=int)
    return df, df_ohe

# Rebind each frame to its processed version and keep the matching numeric
# matrix, which the scaling and clustering cells consume.
train, train_ohe = process_df(train)
test, test_ohe = process_df(test)

In [299]:
from sklearn.preprocessing import StandardScaler

# Learn the per-column mean and variance on the training matrix only, then
# apply that same transformation to both matrices.
scaler = StandardScaler()
scaler.fit(train_ohe)

train_ohe = scaler.transform(train_ohe)
test_ohe = scaler.transform(test_ohe)

In [300]:
from sklearn.cluster import KMeans

# Seed the clusterer (42, as in the split and CatBoost cells) so the cluster
# ids written to the submission are reproducible between runs. n_init is
# pinned explicitly because its default differs across scikit-learn versions.
clusterer = KMeans(n_clusters=5, n_init=10, random_state=42)

# Centroids are learned on the scaled training matrix only.
clusterer.fit(train_ohe)

# Assign every test row to its nearest learned centroid.
test_clusters = clusterer.predict(test_ohe)

In [301]:
from sklearn.model_selection import train_test_split
from catboost import Pool

target_col = 'target_price'

# Every column except the row id and the target is a model input; the order
# follows train.columns.
features = [col for col in train.columns if col not in ('SampleID', target_col)]

# Object-dtype inputs are handed to CatBoost as categorical features.
object_cols = set(train.select_dtypes('object').columns)
cat_features = [col for col in features if col in object_cols]

X = train[features]
y = train[target_col]
X_test = test[features]

# 90/10 hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

train_pool = Pool(X_train, y_train, cat_features=cat_features)
valid_pool = Pool(X_valid, y_valid, cat_features=cat_features)

In [302]:
from catboost import CatBoostRegressor

# CatBoost settings: MAE is both the training loss and the reported metric,
# trees are depth 2, and progress is logged every 5000 iterations.
# NOTE(review): no early-stopping option is set here, so all 100000 iterations run.
params = dict(
    iterations=100_000,
    loss_function='MAE',
    eval_metric='MAE',
    metric_period=5000,
    max_depth=2,
    random_state=42,
)

model = CatBoostRegressor(**params)

# The hold-out pool is passed as eval_set, so the log reports its metric next to the training one.
model.fit(train_pool, eval_set=valid_pool)

0:	learn: 10487.6319487	test: 9433.1875043	best: 9433.1875043 (0)	total: 1.61ms	remaining: 2m 40s
5000:	learn: 422.9361764	test: 730.6064288	best: 730.6064288 (5000)	total: 4.82s	remaining: 1m 31s
10000:	learn: 350.2718815	test: 698.8285146	best: 698.8285146 (10000)	total: 9.52s	remaining: 1m 25s
15000:	learn: 321.6166858	test: 694.0751577	best: 694.0751577 (15000)	total: 14.3s	remaining: 1m 20s
20000:	learn: 300.2718212	test: 684.2925620	best: 684.2925620 (20000)	total: 19s	remaining: 1m 16s
25000:	learn: 285.5130793	test: 679.9777652	best: 679.9777652 (25000)	total: 23.8s	remaining: 1m 11s
30000:	learn: 274.9430070	test: 678.1330396	best: 678.1330396 (30000)	total: 28.5s	remaining: 1m 6s
35000:	learn: 265.2086618	test: 677.3966643	best: 677.3966643 (35000)	total: 33.2s	remaining: 1m 1s
40000:	learn: 257.2767475	test: 675.3776490	best: 675.3776490 (40000)	total: 37.9s	remaining: 56.8s
45000:	learn: 251.1997631	test: 673.9995583	best: 673.9995583 (45000)	total: 42.6s	remaining: 52s
500

<catboost.core.CatBoostRegressor at 0x7c80f58e0470>

In [303]:
from sklearn.metrics import mean_absolute_error

# Predict on the hold-out rows that were also passed to fit() as eval_set.
y_pred = model.predict(X_valid)

# scikit-learn metrics take (y_true, y_pred). MAE is symmetric, so the value
# is unchanged, but the documented order keeps the call correct if the metric
# is ever swapped for an asymmetric one.
score = mean_absolute_error(y_valid, y_pred)

print(f'Score: {score:.5f}')

Score: 671.63804


In [304]:
# Reuse the name y_pred for the test-set predictions that go into the submission.
y_pred = model.predict(X_test)

In [305]:
# The submission stacks three answer blocks, one per subtask, each covering
# every test row in the same order.
n_test_rows = len(test)
sample_ids = test['SampleID'].tolist()

# Task1: label derived from the AAS score with a threshold of 5.
task1_answers = np.where(test['AAS'] >= 5, 'Autentic', 'Incert').tolist()
# Task2: KMeans cluster id of each test row.
task2_answers = test_clusters.tolist()
# Task3: predicted price from the CatBoost model.
task3_answers = y_pred.tolist()

subm = pd.DataFrame({
    'SampleID': sample_ids + sample_ids + sample_ids,
    'subtaskID': [task for task in ('Task1', 'Task2', 'Task3') for _ in range(n_test_rows)],
    'Answer': task1_answers + task2_answers + task3_answers,
})

subm.to_csv("submission.csv", index=False)
subm.head()

Unnamed: 0,SampleID,subtaskID,Answer
0,1179,Task1,Incert
1,866,Task1,Autentic
2,102,Task1,Incert
3,440,Task1,Incert
4,59,Task1,Incert
