In [239]:
import os
import time
import random

import numpy as np
import pandas as pd

from PIL import Image

import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor
import joblib
import optuna

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

import open_clip
from sentence_transformers import SentenceTransformer

import pickle
import json

# Data Preparation

In [32]:
profiles = pd.read_csv('../data/profiles_presampled.csv')

def is_valid_row(row):
    try:
        int(row['preference_1'])
        int(row['preference_2'])
        int(row['preference_3'])
        return True
    except:
        return False

profiles = profiles[profiles.apply(is_valid_row, axis=1)].reset_index(drop=True)
preference_cols = ['preference_1', 'preference_2', 'preference_3']

for col in preference_cols:
    profiles[col] = pd.to_numeric(profiles[col], errors='coerce').astype('Int64')

profiles = profiles[
    profiles[preference_cols].apply(lambda x: x.between(1, 17)).all(axis=1)
].reset_index(drop=True)

profiles

Unnamed: 0,age,gender,education,income,marital_status,risk_preference,preference_1,preference_2,preference_3
0,49,female,bachelor,medium,divorced,-0.70,13,6,8
1,61,male,master,medium,single,0.18,6,8,13
2,41,female,bachelor,medium,married,-0.32,6,13,8
3,45,female,master,medium,married,0.39,6,9,13
4,64,male,bachelor,medium,single,-0.71,8,6,13
...,...,...,...,...,...,...,...,...,...
9987,31,female,high school,high,single,0.04,6,8,13
9988,76,male,master,medium,single,0.12,6,13,8
9989,39,male,bachelor,high,single,-0.14,6,8,14
9990,35,male,bachelor,high,married,0.27,10,6,13


In [33]:
def preprocess_data(df, categorical_cols, features, target, test_size=0.2, random_state=42):
    df = df.copy()
    for col in categorical_cols:
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])
    X = df[features]
    y = df[target]
    return train_test_split(X, y, test_size=test_size, random_state=random_state)


def train_and_evaluate(model, model_name, X_train, X_test, y_train, y_test):
    start_train = time.time()
    model.fit(X_train, y_train)
    train_time = time.time() - start_train

    start_infer = time.time()
    y_pred = model.predict(X_test)
    inference_time = time.time() - start_infer

    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)

    print(f"{model_name}: MSE={mse:.4f}, R2={r2:.4f}, train_time={train_time:.4f}s, inference_time={inference_time:.4f}s")

    return {
        'model_name': model_name,
        'mse': mse,
        'r2': r2,
        'train_time': train_time,
        'inference_time': inference_time
    }

In [34]:
categorical_cols = ['gender', 'education', 'income', 'marital_status']
target = 'risk_preference'
results = []

# Baseline

In [35]:
X_train, X_test, y_train, y_test = train_test_split(profiles.drop(columns=[target]), profiles[target], test_size=0.2, random_state=42)

start_train = time.time()
baseline_pred = np.ones_like(y_test) * y_train.mean()
train_time = time.time() - start_train

baseline_mse = mean_squared_error(y_test, baseline_pred)

results.append({
    'model': 'Baseline (mean prediction)',
    'mse': baseline_mse,
    'r2': 0,
    'train_time': train_time,
    'inference_time': 0
})

print(f"Baseline MSE: {baseline_mse:.4f}")

Baseline MSE: 0.2395


# Random forest

In [36]:
def optimize_rf_with_optuna(X_train, y_train, n_trials=50, random_state=42):
    def objective(trial):
        n_estimators = trial.suggest_int('n_estimators', 20, 300)
        max_depth = trial.suggest_int('max_depth', 2, 5)
        min_samples_split = trial.suggest_int('min_samples_split', 15, 60)
        min_samples_leaf = trial.suggest_int('min_samples_leaf', 10, 30)

        model = RandomForestRegressor(
            n_estimators=n_estimators,
            max_depth=max_depth,
            min_samples_split=min_samples_split,
            min_samples_leaf=min_samples_leaf,
            random_state=random_state
        )

        scores = cross_val_score(model, X_train, y_train, cv=5, scoring='neg_mean_squared_error')
        mse = -np.mean(scores)
        return mse

    study = optuna.create_study(direction='minimize')
    study.optimize(objective, n_trials=n_trials)

    print("Best params:", study.best_params)
    print("Best CV MSE:", study.best_value)

    return study.best_params


## RF: only socio-demographic features

In [37]:
features = ['age', 'gender', 'education', 'income', 'marital_status']
X_train, X_test, y_train, y_test = preprocess_data(profiles, categorical_cols, features, target)

best_params = optimize_rf_with_optuna(X_train, y_train, n_trials=50)

[I 2025-06-21 23:14:59,407] A new study created in memory with name: no-name-6a5c234a-5839-4ace-b417-57644dca09c0
[I 2025-06-21 23:14:59,584] Trial 0 finished with value: 0.23007781868665086 and parameters: {'n_estimators': 28, 'max_depth': 2, 'min_samples_split': 51, 'min_samples_leaf': 21}. Best is trial 0 with value: 0.23007781868665086.
[I 2025-06-21 23:15:00,643] Trial 1 finished with value: 0.22997527227577078 and parameters: {'n_estimators': 188, 'max_depth': 2, 'min_samples_split': 47, 'min_samples_leaf': 12}. Best is trial 1 with value: 0.22997527227577078.
[I 2025-06-21 23:15:02,750] Trial 2 finished with value: 0.23003208843086012 and parameters: {'n_estimators': 270, 'max_depth': 3, 'min_samples_split': 38, 'min_samples_leaf': 11}. Best is trial 1 with value: 0.22997527227577078.
[I 2025-06-21 23:15:03,875] Trial 3 finished with value: 0.23023146732772498 and parameters: {'n_estimators': 124, 'max_depth': 4, 'min_samples_split': 48, 'min_samples_leaf': 20}. Best is trial 1 

Best params: {'n_estimators': 265, 'max_depth': 2, 'min_samples_split': 45, 'min_samples_leaf': 25}
Best CV MSE: 0.22994579474138116


In [38]:
best_rf = RandomForestRegressor(**best_params, random_state=42)
results.append(train_and_evaluate(best_rf, "RandomForest (socdem only)", X_train, X_test, y_train, y_test))

RandomForest (socdem only): MSE=0.2397, R2=-0.0039, train_time=0.3062s, inference_time=0.0120s


## RF: socdem fetures + weighted meta

In [39]:
preference_mapping = {
    1:  (1, 1, 1, 1),
    2:  (1, 1, 1, 2),
    3:  (1, 1, 2, 1),
    4:  (1, 1, 2, 2),
    5:  (1, 2, 1, 1),
    6:  (1, 2, 1, 2),
    7:  (1, 2, 2, 1),
    8:  (1, 2, 2, 2),
    9:  (2, 1, 1, 1),
    10: (2, 1, 1, 2),
    11: (2, 1, 2, 1),
    12: (2, 1, 2, 2),
    13: (2, 2, 1, 1),
    14: (2, 2, 1, 2),
    15: (2, 2, 2, 1),
    16: (2, 2, 2, 2),
    17: (0, 0, 0, 0)
}

def get_meta_features(preference_id):
    return preference_mapping[preference_id]

def add_weighted_meta_features(row):
    prefs = [row['preference_1'], row['preference_2'], row['preference_3']]
    weights = [0.5, 0.3, 0.2]
    
    S_sum = A_sum = O_sum = I_sum = 0.0
    
    for pref, w in zip(prefs, weights):
        S, A, O, I = get_meta_features(pref)
        S_sum += w * S
        A_sum += w * A
        O_sum += w * O
        I_sum += w * I
    
    row['S_weighted'] = S_sum
    row['A_weighted'] = A_sum
    row['O_weighted'] = O_sum
    row['I_weighted'] = I_sum
    
    return row


df = profiles.copy()
df = df.apply(add_weighted_meta_features, axis=1)

features = ['age', 'gender', 'education', 'income', 'marital_status',
            'S_weighted', 'A_weighted', 'O_weighted', 'I_weighted']

X_train, X_test, y_train, y_test = preprocess_data(df, categorical_cols, features, target)

best_params = optimize_rf_with_optuna(X_train, y_train, n_trials=50)

[I 2025-06-21 23:16:14,704] A new study created in memory with name: no-name-72cbb6e4-c6e8-45b3-8194-bfb483bd6bcf
[I 2025-06-21 23:16:15,151] Trial 0 finished with value: 0.22261351515311573 and parameters: {'n_estimators': 35, 'max_depth': 5, 'min_samples_split': 35, 'min_samples_leaf': 14}. Best is trial 0 with value: 0.22261351515311573.
[I 2025-06-21 23:16:17,823] Trial 1 finished with value: 0.22336306333912354 and parameters: {'n_estimators': 221, 'max_depth': 3, 'min_samples_split': 25, 'min_samples_leaf': 18}. Best is trial 0 with value: 0.22261351515311573.
[I 2025-06-21 23:16:19,223] Trial 2 finished with value: 0.22289376435328817 and parameters: {'n_estimators': 115, 'max_depth': 4, 'min_samples_split': 36, 'min_samples_leaf': 23}. Best is trial 0 with value: 0.22261351515311573.
[I 2025-06-21 23:16:20,153] Trial 3 finished with value: 0.22250498101091756 and parameters: {'n_estimators': 75, 'max_depth': 5, 'min_samples_split': 20, 'min_samples_leaf': 30}. Best is trial 3 w

Best params: {'n_estimators': 175, 'max_depth': 5, 'min_samples_split': 19, 'min_samples_leaf': 10}
Best CV MSE: 0.22224378976086254


In [40]:
best_rf = RandomForestRegressor(**best_params, random_state=42)
results.append(train_and_evaluate(best_rf, "RandomForest (socdem + weighted meta)", X_train, X_test, y_train, y_test))

RandomForest (socdem + weighted meta): MSE=0.2325, R2=0.0266, train_time=0.5402s, inference_time=0.0130s


## RF: socdem fetures + one-hot meta

In [41]:
def add_full_meta_features(row):
    prefs = [row['preference_1'], row['preference_2'], row['preference_3']]
    
    for idx, pref in enumerate(prefs, start=1):
        S, A, O, I = preference_mapping[pref]
        row[f'S{idx}'] = S
        row[f'A{idx}'] = A
        row[f'O{idx}'] = O
        row[f'I{idx}'] = I
        
    return row

df_onehot = profiles.copy()
df_onehot = df_onehot.apply(add_full_meta_features, axis=1)

features_onehot = [
    'age', 'gender', 'education', 'income', 'marital_status',
    'S1', 'A1', 'O1', 'I1',
    'S2', 'A2', 'O2', 'I2',
    'S3', 'A3', 'O3', 'I3'
]
X_train, X_test, y_train, y_test = preprocess_data(df_onehot, categorical_cols, features_onehot, target)

best_params = optimize_rf_with_optuna(X_train, y_train, n_trials=50)

[I 2025-06-21 23:18:34,477] A new study created in memory with name: no-name-5dd44ed6-2f4e-4ea1-9d5e-2e1da8fa835d
[I 2025-06-21 23:18:35,744] Trial 0 finished with value: 0.22529774363155383 and parameters: {'n_estimators': 177, 'max_depth': 2, 'min_samples_split': 16, 'min_samples_leaf': 13}. Best is trial 0 with value: 0.22529774363155383.
[I 2025-06-21 23:18:36,315] Trial 1 finished with value: 0.22269639033413494 and parameters: {'n_estimators': 40, 'max_depth': 5, 'min_samples_split': 58, 'min_samples_leaf': 14}. Best is trial 1 with value: 0.22269639033413494.
[I 2025-06-21 23:18:36,610] Trial 2 finished with value: 0.22379708573018836 and parameters: {'n_estimators': 30, 'max_depth': 3, 'min_samples_split': 44, 'min_samples_leaf': 16}. Best is trial 1 with value: 0.22269639033413494.
[I 2025-06-21 23:18:37,611] Trial 3 finished with value: 0.22529821189450328 and parameters: {'n_estimators': 138, 'max_depth': 2, 'min_samples_split': 40, 'min_samples_leaf': 26}. Best is trial 1 w

Best params: {'n_estimators': 204, 'max_depth': 5, 'min_samples_split': 26, 'min_samples_leaf': 28}
Best CV MSE: 0.2226567202160532


In [42]:
best_rf = RandomForestRegressor(**best_params, random_state=42)
results.append(train_and_evaluate(best_rf, "RandomForest (socdem + one-hot meta)", X_train, X_test, y_train, y_test))

RandomForest (socdem + one-hot meta): MSE=0.2320, R2=0.0284, train_time=0.7132s, inference_time=0.0161s


## RF: only meta

In [43]:
df_metaonly = profiles.copy()
df_metaonly = df_metaonly.apply(add_full_meta_features, axis=1)

features_metaonly = [
    'S1', 'A1', 'O1', 'I1',
    'S2', 'A2', 'O2', 'I2',
    'S3', 'A3', 'O3', 'I3'
]

categorical_cols_metaonly = []
X_train, X_test, y_train, y_test = preprocess_data(df_metaonly, categorical_cols_metaonly, features_metaonly, target)

best_params = optimize_rf_with_optuna(X_train, y_train, n_trials=50)

[I 2025-06-21 23:20:50,670] A new study created in memory with name: no-name-a9da37de-1a7d-4b3d-bab6-f0020b304049
[I 2025-06-21 23:20:51,299] Trial 0 finished with value: 0.22377564498998934 and parameters: {'n_estimators': 118, 'max_depth': 3, 'min_samples_split': 44, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.22377564498998934.
[I 2025-06-21 23:20:52,534] Trial 1 finished with value: 0.2230041338337811 and parameters: {'n_estimators': 208, 'max_depth': 4, 'min_samples_split': 30, 'min_samples_leaf': 29}. Best is trial 1 with value: 0.2230041338337811.
[I 2025-06-21 23:20:54,039] Trial 2 finished with value: 0.22298172394132765 and parameters: {'n_estimators': 257, 'max_depth': 4, 'min_samples_split': 50, 'min_samples_leaf': 21}. Best is trial 2 with value: 0.22298172394132765.
[I 2025-06-21 23:20:54,811] Trial 3 finished with value: 0.22527157431628067 and parameters: {'n_estimators': 172, 'max_depth': 2, 'min_samples_split': 40, 'min_samples_leaf': 20}. Best is trial 2 w

Best params: {'n_estimators': 197, 'max_depth': 5, 'min_samples_split': 39, 'min_samples_leaf': 30}
Best CV MSE: 0.22266214804586948


In [44]:
best_rf = RandomForestRegressor(**best_params, random_state=42)
results.append(train_and_evaluate(best_rf, "RandomForest (meta only)", X_train, X_test, y_train, y_test))

RandomForest (meta only): MSE=0.2320, R2=0.0284, train_time=0.3075s, inference_time=0.0137s


## RF: text embeddings

In [45]:
def prepare_features(
    df, embeddings, 
    num_columns, 
    cat_columns, 
    target_column, 
    test_size=0.2, 
    random_state=42
):
    pref_1_embs = np.array([embeddings[x] for x in df['preference_1']])
    pref_2_embs = np.array([embeddings[x] for x in df['preference_2']])
    pref_3_embs = np.array([embeddings[x] for x in df['preference_3']])
    emb_features = np.hstack([pref_1_embs, pref_2_embs, pref_3_embs])

    if num_columns:
        numeric_features = df[num_columns].values
    else:
        numeric_features = np.zeros((emb_features.shape[0], 0))

    if cat_columns:
        categorical_features_list = []
        for col in cat_columns:
            le = LabelEncoder()
            encoded_col = le.fit_transform(df[col])
            encoded_col = encoded_col.reshape(-1, 1)
            categorical_features_list.append(encoded_col)
        categorical_features = np.hstack(categorical_features_list)
    else:
        categorical_features = np.zeros((emb_features.shape[0], 0))

    X = np.hstack([emb_features, numeric_features, categorical_features])
    y = df[target_column].values

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [46]:
def create_text_embeddings(description_mapping):

    model = SentenceTransformer('all-MiniLM-L6-v2')

    text_embeddings = {}
    for img_id, description in description_mapping.items():
        embedding = model.encode(description)
        text_embeddings[img_id] = embedding

    return text_embeddings


In [47]:
description_mapping = {
    1: "Orderly courtyard. Knights march in sync. Donkey gives commands. Farquaad measures with ruler. Clean pastel tones.",
    2: "Shrek trains alone in clean gym. Hits straw dummy. Fight schedule on wall. Dull skyline. Pastel light.",
    3: "Chaos in throne room after party. Broken dishes. Princess on chandelier. Donkey DJs. Shrek facepalms.",
    4: "Shrek in dungeon. Broken cages. Graffiti 'Puss was here'. Dramatic shadows from torchlight.",
    5: "Farquaad drinks tea in empty hall. Parade outside. Banner 'Order is Power'.",
    6: "Shrek reads 'How to Be Human' in library. Warm lamp. Coffee cup.",
    7: "Everyone sleeps after party. Shrek in crown. Donkey hugs keg.",
    8: "Shrek naps in carriage. Unicorns pull. Books and teacup inside.",
    9: "Shrek, Fiona, Donkey race in swamp. Flags. Cartoon lighting.",
    10: "Shrek chops wood neatly. Fiona watches. Calm forest.",
    11: "Tavern party. Trolls dance. Donkey sings. Spilled drinks.",
    12: "Shrek puts out fire. Puss cleans himself nearby. 'No Entry' sign.",
    13: "Family picnic in swamp. Donkey entertains piglets. Lanterns, baskets.",
    14: "Shrek naps in hammock. Bird holds sign. Puss asleep.",
    15: "Campfire talk. Group laughs and argues. Snacks around.",
    16: "Shrek lies in puddle. Frogs jump. 'Life is good' sign.",
    17: "Everyone stands in line near outhouse. No action. Donkey stares at hourglass."
    }

text_embeddings = create_text_embeddings(description_mapping)

cat_columns = ['gender', 'education', 'income', 'marital_status']
num_columns = ['age']

X_train, X_test, y_train, y_test = prepare_features(
    profiles, text_embeddings, num_columns, cat_columns, target
)

best_params = optimize_rf_with_optuna(X_train, y_train, n_trials=50)


[I 2025-06-21 23:21:44,223] A new study created in memory with name: no-name-8bb0e349-bf67-4e24-bf78-adc53d408fde
[I 2025-06-21 23:24:48,611] Trial 0 finished with value: 0.22187386422018704 and parameters: {'n_estimators': 300, 'max_depth': 4, 'min_samples_split': 38, 'min_samples_leaf': 30}. Best is trial 0 with value: 0.22187386422018704.
[I 2025-06-21 23:25:53,754] Trial 1 finished with value: 0.22154451329139224 and parameters: {'n_estimators': 88, 'max_depth': 5, 'min_samples_split': 24, 'min_samples_leaf': 29}. Best is trial 1 with value: 0.22154451329139224.
[I 2025-06-21 23:27:31,049] Trial 2 finished with value: 0.22418113582706164 and parameters: {'n_estimators': 277, 'max_depth': 2, 'min_samples_split': 32, 'min_samples_leaf': 14}. Best is trial 1 with value: 0.22154451329139224.
[I 2025-06-21 23:28:17,749] Trial 3 finished with value: 0.22195677246800835 and parameters: {'n_estimators': 80, 'max_depth': 4, 'min_samples_split': 38, 'min_samples_leaf': 27}. Best is trial 1 w

Best params: {'n_estimators': 188, 'max_depth': 5, 'min_samples_split': 60, 'min_samples_leaf': 29}
Best CV MSE: 0.22150710661926346


In [48]:
best_rf = RandomForestRegressor(**best_params, random_state=42)
results.append(train_and_evaluate(best_rf, "RandomForest (text embeddings)", X_train, X_test, y_train, y_test))

RandomForest (text embeddings): MSE=0.2303, R2=0.0356, train_time=30.7492s, inference_time=0.0177s


## RF: img embeddings

In [61]:
def create_image_embeddings(image_folder_path):
    model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
    model.eval()

    image_embeddings = {}

    for img_id in range(1, 18):
        img_path = f"{image_folder_path}/{img_id}.png"
        image = preprocess(Image.open(img_path)).unsqueeze(0)
        with torch.no_grad():
            embedding = model.encode_image(image)
        image_embeddings[img_id] = embedding.cpu().numpy().flatten()

    return image_embeddings

In [62]:
image_embeddings = create_image_embeddings('../images')

cat_columns = ['gender', 'education', 'income', 'marital_status']
num_columns = ['age']

X_train, X_test, y_train, y_test = prepare_features(
    profiles, image_embeddings, num_columns, cat_columns, target_column=target
)

best_params = optimize_rf_with_optuna(X_train, y_train, n_trials=50)

[I 2025-06-22 01:50:47,057] A new study created in memory with name: no-name-ca450e7c-2e21-4e7e-9a89-a13aa2d21e57
[I 2025-06-22 01:55:22,259] Trial 0 finished with value: 0.22201929532692533 and parameters: {'n_estimators': 287, 'max_depth': 5, 'min_samples_split': 29, 'min_samples_leaf': 16}. Best is trial 0 with value: 0.22201929532692533.
[I 2025-06-22 01:55:59,924] Trial 1 finished with value: 0.22299963357261637 and parameters: {'n_estimators': 58, 'max_depth': 3, 'min_samples_split': 47, 'min_samples_leaf': 14}. Best is trial 0 with value: 0.22201929532692533.
[I 2025-06-22 01:56:28,614] Trial 2 finished with value: 0.22192376185704615 and parameters: {'n_estimators': 31, 'max_depth': 5, 'min_samples_split': 21, 'min_samples_leaf': 29}. Best is trial 2 with value: 0.22192376185704615.
[I 2025-06-22 01:58:05,680] Trial 3 finished with value: 0.22430267352122524 and parameters: {'n_estimators': 207, 'max_depth': 2, 'min_samples_split': 17, 'min_samples_leaf': 20}. Best is trial 2 w

Best params: {'n_estimators': 189, 'max_depth': 5, 'min_samples_split': 21, 'min_samples_leaf': 29}
Best CV MSE: 0.22168053609881114


In [63]:
best_rf = RandomForestRegressor(**best_params, random_state=42)
results.append(train_and_evaluate(best_rf, "RandomForest (img embeddings)", X_train, X_test, y_train, y_test))

RandomForest (img embeddings): MSE=0.2303, R2=0.0356, train_time=46.5691s, inference_time=0.0241s


# MLP

In [None]:
def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    os.environ["PYTHONHASHSEED"] = str(seed)

In [95]:
device = "cuda" if torch.cuda.is_available() else "cpu"


def create_text_embeddings(description_mapping):
    model = SentenceTransformer('all-MiniLM-L6-v2')

    with torch.no_grad():
        text_embeddings = {k: model.encode(v, normalize_embeddings=True) for k, v in description_mapping.items()}
    return text_embeddings


def create_image_embeddings(image_folder_path):
    model, _, preprocess = open_clip.create_model_and_transforms('ViT-B-32', pretrained='laion2b_s34b_b79k')
    model = model.to(device)
    model.eval()

    image_embeddings = {}

    for img_id in range(1, 18):
        img_path = f"{image_folder_path}/{img_id}.png"
        image = preprocess(Image.open(img_path)).unsqueeze(0).to(device)
        with torch.no_grad():
            embedding = model.encode_image(image)
            embedding /= embedding.norm(dim=-1, keepdim=True)  # нормализация
        image_embeddings[img_id] = embedding.cpu().numpy().flatten()

    return image_embeddings


In [None]:
class MyDataset(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32).unsqueeze(-1)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


In [None]:
class MLPRegressor(nn.Module):
    
    def __init__(self, input_dim, hidden_dims=[256, 64], dropout=0.2):
        super().__init__()
        layers = []
        prev = input_dim
        for h in hidden_dims:
            layers += [
                nn.Linear(prev, h),
                nn.ReLU(),
                nn.Dropout(dropout)
            ]
            prev = h
        layers.append(nn.Linear(prev, 1))
        layers.append(nn.Tanh())
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x)


In [227]:
def train_and_evaluate_mlp(
    X_train, X_test, y_train, y_test,
    model_name="MLPRegressor", hidden_dims=[256,64],
    n_epochs=20, lr=1e-3, batch_size=32
):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_ds = MyDataset(X_train, y_train)
    test_ds  = MyDataset(X_test, y_test)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    test_loader  = DataLoader(test_ds, batch_size=batch_size)

    model = MLPRegressor(input_dim=X_train.shape[1], hidden_dims=hidden_dims).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    start_train = time.time()

    for epoch in range(n_epochs):
        model.train()
        losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {np.mean(losses):.4f}")
        
    train_time = time.time() - start_train

    model.eval()
    y_preds, y_true = [], []
    start_inf = time.time()
    with torch.no_grad():
        for xb, yb in test_loader:
            xb = xb.to(device)
            out = model(xb).cpu().numpy().flatten()
            y_preds.extend(out)
            y_true.extend(yb.numpy().flatten())
    inf_time = time.time() - start_inf

    mse = mean_squared_error(y_true, y_preds)
    r2  = r2_score(y_true, y_preds)

    print(f"{model_name}: MSE={mse:.4f}, R2={r2:.4f}, train_time={train_time:.2f}s, inference_time={inf_time:.2f}s")

    return {
        'model_name': model_name,
        'mse': mse,
        'r2': r2,
        'train_time': train_time,
        'inference_time': inf_time
    }

## MLP: text embeddings

In [232]:
set_seed(17)

text_embeddings = create_text_embeddings(description_mapping)

cat_columns = ['gender', 'education', 'income', 'marital_status']
num_columns = ['age']

X_train, X_test, y_train, y_test = prepare_features(
    profiles, text_embeddings, num_columns, cat_columns, target
)

result = train_and_evaluate_mlp(
    X_train, X_test, y_train, y_test,
    model_name="MLP on text+socdem",
    hidden_dims=[512, 128],
    n_epochs=55,
    lr=1e-3,
    batch_size=64
)
results.append(result)

Epoch 1/55 - Train Loss: 0.2328
Epoch 2/55 - Train Loss: 0.2253
Epoch 3/55 - Train Loss: 0.2240
Epoch 4/55 - Train Loss: 0.2217
Epoch 5/55 - Train Loss: 0.2226
Epoch 6/55 - Train Loss: 0.2215
Epoch 7/55 - Train Loss: 0.2216
Epoch 8/55 - Train Loss: 0.2206
Epoch 9/55 - Train Loss: 0.2207
Epoch 10/55 - Train Loss: 0.2201
Epoch 11/55 - Train Loss: 0.2214
Epoch 12/55 - Train Loss: 0.2204
Epoch 13/55 - Train Loss: 0.2201
Epoch 14/55 - Train Loss: 0.2197
Epoch 15/55 - Train Loss: 0.2197
Epoch 16/55 - Train Loss: 0.2199
Epoch 17/55 - Train Loss: 0.2199
Epoch 18/55 - Train Loss: 0.2198
Epoch 19/55 - Train Loss: 0.2187
Epoch 20/55 - Train Loss: 0.2191
Epoch 21/55 - Train Loss: 0.2182
Epoch 22/55 - Train Loss: 0.2193
Epoch 23/55 - Train Loss: 0.2188
Epoch 24/55 - Train Loss: 0.2183
Epoch 25/55 - Train Loss: 0.2186
Epoch 26/55 - Train Loss: 0.2182
Epoch 27/55 - Train Loss: 0.2185
Epoch 28/55 - Train Loss: 0.2178
Epoch 29/55 - Train Loss: 0.2182
Epoch 30/55 - Train Loss: 0.2180
Epoch 31/55 - Train

## MLP: img embeddings

In [114]:
set_seed(17)

image_embeddings = create_image_embeddings('../images')

cat_columns = ['gender', 'education', 'income', 'marital_status']
num_columns = ['age']

X_train, X_test, y_train, y_test = prepare_features(
    profiles, image_embeddings, num_columns, cat_columns, target
)

result = train_and_evaluate_mlp(
    X_train, X_test, y_train, y_test,
    model_name="MLP on img+socdem",
    hidden_dims=[512, 128],
    n_epochs=50,
    lr=1e-3,
    batch_size=64
)
results.append(result)

Epoch 1/50 - Train Loss: 0.2319
Epoch 2/50 - Train Loss: 0.2264
Epoch 3/50 - Train Loss: 0.2239
Epoch 4/50 - Train Loss: 0.2241
Epoch 5/50 - Train Loss: 0.2225
Epoch 6/50 - Train Loss: 0.2219
Epoch 7/50 - Train Loss: 0.2221
Epoch 8/50 - Train Loss: 0.2215
Epoch 9/50 - Train Loss: 0.2214
Epoch 10/50 - Train Loss: 0.2206
Epoch 11/50 - Train Loss: 0.2212
Epoch 12/50 - Train Loss: 0.2202
Epoch 13/50 - Train Loss: 0.2206
Epoch 14/50 - Train Loss: 0.2197
Epoch 15/50 - Train Loss: 0.2194
Epoch 16/50 - Train Loss: 0.2201
Epoch 17/50 - Train Loss: 0.2199
Epoch 18/50 - Train Loss: 0.2193
Epoch 19/50 - Train Loss: 0.2188
Epoch 20/50 - Train Loss: 0.2187
Epoch 21/50 - Train Loss: 0.2187
Epoch 22/50 - Train Loss: 0.2188
Epoch 23/50 - Train Loss: 0.2185
Epoch 24/50 - Train Loss: 0.2194
Epoch 25/50 - Train Loss: 0.2189
Epoch 26/50 - Train Loss: 0.2175
Epoch 27/50 - Train Loss: 0.2193
Epoch 28/50 - Train Loss: 0.2187
Epoch 29/50 - Train Loss: 0.2186
Epoch 30/50 - Train Loss: 0.2183
Epoch 31/50 - Train

## Clusterization + MLP: text embeddings

In [204]:
def train_mlp(X_train, y_train, input_dim, hidden_dims=[512, 128], lr=1e-3, batch_size=64, n_epochs=25):
    
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_ds = MyDataset(X_train, y_train)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    model = MLPRegressor(input_dim=input_dim, hidden_dims=hidden_dims).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        model.train()
        losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print(f"Epoch {epoch+1}/{n_epochs} - Loss: {np.mean(losses):.4f}")

    return model

In [205]:
def train_mlp_with_clustering(profiles, description_mapping, target):
    start_train = time.time()

    text_embeddings = create_text_embeddings(description_mapping)
    cat_columns = ['gender', 'education', 'income', 'marital_status']
    num_columns = ['age']

    X_train, X_test, y_train, y_test = prepare_features(
        profiles, text_embeddings, num_columns, cat_columns, target_column=target
    )

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    embedding_dim = list(text_embeddings.values())[0].shape[0] * 3
    socdem_start = embedding_dim
    socdem_end = embedding_dim + len(num_columns) + len(cat_columns)
    X_socdem_train = X_train_scaled[:, socdem_start:socdem_end]

    n_clusters = 3
    kmeans = KMeans(n_clusters=n_clusters, random_state=17)
    cluster_train = kmeans.fit_predict(X_socdem_train)

    mlp_models = {}

    for cluster_id in range(n_clusters):
        idx = np.where(cluster_train == cluster_id)[0]
        X_cluster = X_train_scaled[idx]
        y_cluster = y_train[idx]

        print(f"\nОбучаем MLP для кластера {cluster_id}, объектов: {len(idx)}")
        model = train_mlp(
            X_cluster, y_cluster,
            input_dim=X_train.shape[1],
            hidden_dims=[512, 128],
            lr=1e-3, batch_size=64, n_epochs=10
        )
        mlp_models[cluster_id] = model

    joblib.dump(scaler, 'scaler.pkl')
    joblib.dump(kmeans, 'kmeans.pkl')

    for cluster_id, model in mlp_models.items():
        torch.save(model.state_dict(), f'mlp_model_cluster_{cluster_id}.pth')

    end_train = time.time()
    train_time = end_train - start_train

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    y_preds = []
    y_true = []

    start_inference = time.time()

    for i in range(len(X_test)):
        x_sample = X_test_scaled[i].reshape(1, -1)
        socdem_sample = x_sample[:, socdem_start:socdem_end]
        cluster_id = kmeans.predict(socdem_sample)[0]

        model = MLPRegressor(input_dim=X_train.shape[1], hidden_dims=[512, 128])
        model.load_state_dict(torch.load(f'mlp_model_cluster_{cluster_id}.pth'))
        model.to(device)
        model.eval()

        x_tensor = torch.tensor(x_sample, dtype=torch.float32).to(device)
        with torch.no_grad():
            pred = model(x_tensor).cpu().numpy().flatten()[0]

        y_preds.append(pred)
        y_true.append(y_test[i])

    end_inference = time.time()
    inference_time = end_inference - start_inference

    mse = mean_squared_error(y_true, y_preds)
    r2 = r2_score(y_true, y_preds)

    print(f"\nFinal evaluation: MSE={mse:.4f}, R2={r2:.4f}")

    return {
        'model_name': 'MLP with clustering on socdem',
        'mse': mse,
        'r2': r2,
        'train_time': train_time,
        'inference_time': inference_time
    }

In [206]:
set_seed(17)

results.append(train_mlp_with_clustering(profiles, description_mapping, target))


Обучаем MLP для кластера 0, объектов: 3014
Epoch 1/10 - Loss: 0.2497
Epoch 2/10 - Loss: 0.2257
Epoch 3/10 - Loss: 0.2188
Epoch 4/10 - Loss: 0.2176
Epoch 5/10 - Loss: 0.2191
Epoch 6/10 - Loss: 0.2162
Epoch 7/10 - Loss: 0.2185
Epoch 8/10 - Loss: 0.2155
Epoch 9/10 - Loss: 0.2187
Epoch 10/10 - Loss: 0.2176

Обучаем MLP для кластера 1, объектов: 2449
Epoch 1/10 - Loss: 0.2521
Epoch 2/10 - Loss: 0.2191
Epoch 3/10 - Loss: 0.2171
Epoch 4/10 - Loss: 0.2130
Epoch 5/10 - Loss: 0.2125
Epoch 6/10 - Loss: 0.2112
Epoch 7/10 - Loss: 0.2088
Epoch 8/10 - Loss: 0.2130
Epoch 9/10 - Loss: 0.2080
Epoch 10/10 - Loss: 0.2065

Обучаем MLP для кластера 2, объектов: 2530
Epoch 1/10 - Loss: 0.2735
Epoch 2/10 - Loss: 0.2439
Epoch 3/10 - Loss: 0.2355
Epoch 4/10 - Loss: 0.2369
Epoch 5/10 - Loss: 0.2337
Epoch 6/10 - Loss: 0.2335
Epoch 7/10 - Loss: 0.2341
Epoch 8/10 - Loss: 0.2322
Epoch 9/10 - Loss: 0.2312
Epoch 10/10 - Loss: 0.2316

Final evaluation: MSE=0.2298, R2=0.0378


# Results

In [215]:
df_results = pd.DataFrame(results)

cols = ['model_name'] + [col for col in df_results.columns if col != 'model_name']
df_results = df_results[cols]

format_dict = {
    'mse': "{:.4f}",
    'r2': "{:.4f}",
    'train_time': "{:.4f}",
    'inference_time': "{:.4f}"
}

def make_pastel_cmap(base_cmap_name, n_colors=256, alpha=0.5):
    base = plt.get_cmap(base_cmap_name, n_colors)
    colors = base(np.linspace(0, 1, n_colors))
    from matplotlib.colors import rgb_to_hsv, hsv_to_rgb
    hsv_colors = rgb_to_hsv(colors[:, :3])
    hsv_colors[:, 1] = hsv_colors[:, 1] * alpha
    hsv_colors[:, 2] = hsv_colors[:, 2] + (1 - hsv_colors[:, 2]) * 0.1
    new_rgb = hsv_to_rgb(hsv_colors)
    colors[:, :3] = new_rgb
    return LinearSegmentedColormap.from_list(f"pastel_{base_cmap_name}", colors)

pastel_rdylgn = make_pastel_cmap('RdYlGn')
pastel_rdylgn_r = make_pastel_cmap('RdYlGn_r')

styled = df_results.style.format(format_dict)

styled = styled.background_gradient(subset=['mse'], cmap=pastel_rdylgn_r)
styled = styled.background_gradient(subset=['r2'], cmap=pastel_rdylgn)
styled = styled.background_gradient(subset=['train_time', 'inference_time'], cmap=pastel_rdylgn_r)

styled

Unnamed: 0,model_name,mse,r2,train_time,inference_time
0,Baseline (mean prediction),0.2395,0.0,0.0011,0.0
1,RandomForest (socdem only),0.2397,-0.0039,0.3062,0.012
2,RandomForest (socdem + weighted meta),0.2325,0.0266,0.5402,0.013
3,RandomForest (socdem + one-hot meta),0.232,0.0284,0.7132,0.0161
4,RandomForest (meta only),0.232,0.0284,0.3075,0.0137
5,RandomForest (text embeddings),0.2303,0.0356,30.7492,0.0177
6,RandomForest (img embeddings),0.2303,0.0356,46.5691,0.0241
7,MLP on text+socdem,0.2279,0.0459,21.5193,0.0156
8,MLP on img+socdem,0.2293,0.04,23.8672,0.0189
9,MLP with clustering on socdem,0.2298,0.0378,6.4228,7.3422


# Inference preparation

In [233]:
model = SentenceTransformer('all-MiniLM-L6-v2')

embeddings = {}
for img_id, description in description_mapping.items():
    embeddings[img_id] = model.encode(description)

with open("../inference/embeddings.pkl", "wb") as f:
    pickle.dump(embeddings, f)

In [234]:
def train_mlp_and_save(X, y, hidden_dims=[512, 128], n_epochs=60, lr=1e-3, batch_size=64, save_path="mlp_model.pth"):

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    train_ds = MyDataset(X, y)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

    model = MLPRegressor(input_dim=X.shape[1], hidden_dims=hidden_dims).to(device)
    criterion = nn.MSELoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)

    for epoch in range(n_epochs):
        model.train()
        losses = []
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad()
            pred = model(xb)
            loss = criterion(pred, yb)
            loss.backward()
            optimizer.step()
            losses.append(loss.item())
        print(f"Epoch {epoch+1}/{n_epochs} - Train Loss: {np.mean(losses):.4f}")

    torch.save(model.state_dict(), save_path)
    print(f"Модель сохранена в {save_path}")


text_embeddings = create_text_embeddings(description_mapping)
cat_columns = ['gender', 'education', 'income', 'marital_status']
num_columns = ['age']

X, _, y, _ = prepare_features(
    profiles, text_embeddings, num_columns, cat_columns, target, test_size=1e-10
)

train_mlp_and_save(
    X, y,
    hidden_dims=[512, 128],
    n_epochs=60,
    lr=1e-3,
    batch_size=64,
    save_path="../inference/mlp_model.pth"
)

Epoch 1/60 - Train Loss: 0.2363
Epoch 2/60 - Train Loss: 0.2272
Epoch 3/60 - Train Loss: 0.2256
Epoch 4/60 - Train Loss: 0.2260
Epoch 5/60 - Train Loss: 0.2229
Epoch 6/60 - Train Loss: 0.2234
Epoch 7/60 - Train Loss: 0.2222
Epoch 8/60 - Train Loss: 0.2233
Epoch 9/60 - Train Loss: 0.2232
Epoch 10/60 - Train Loss: 0.2213
Epoch 11/60 - Train Loss: 0.2210
Epoch 12/60 - Train Loss: 0.2214
Epoch 13/60 - Train Loss: 0.2238
Epoch 14/60 - Train Loss: 0.2208
Epoch 15/60 - Train Loss: 0.2213
Epoch 16/60 - Train Loss: 0.2223
Epoch 17/60 - Train Loss: 0.2219
Epoch 18/60 - Train Loss: 0.2222
Epoch 19/60 - Train Loss: 0.2205
Epoch 20/60 - Train Loss: 0.2200
Epoch 21/60 - Train Loss: 0.2202
Epoch 22/60 - Train Loss: 0.2202
Epoch 23/60 - Train Loss: 0.2189
Epoch 24/60 - Train Loss: 0.2205
Epoch 25/60 - Train Loss: 0.2195
Epoch 26/60 - Train Loss: 0.2197
Epoch 27/60 - Train Loss: 0.2203
Epoch 28/60 - Train Loss: 0.2186
Epoch 29/60 - Train Loss: 0.2189
Epoch 30/60 - Train Loss: 0.2179
Epoch 31/60 - Train

In [242]:
label_mappings = {}

for col in ['gender', 'education', 'income', 'marital_status']:
    le = LabelEncoder()
    le.fit(profiles[col])
    label_mappings[col] = {cls: int(idx) for idx, cls in enumerate(le.classes_)}

with open('../inference/label_mappings.json', 'w') as f:
    json.dump(label_mappings, f)
