In [2]:
import numpy as np
import optuna
import pandas as pd
from scipy import stats
from random import sample
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error
from scipy.stats import skew, kurtosis

import tensorflow as tf
from tensorflow.keras.layers import Dense, Flatten, Input, Dropout,\
        Embedding, Concatenate, LayerNormalization, MultiHeadAttention, \
        GlobalAveragePooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau
from sklearn.preprocessing import StandardScaler, OneHotEncoder, MultiLabelBinarizer
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer

  from .autonotebook import tqdm as notebook_tqdm
2025-06-16 14:44:57.189232: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


ModuleNotFoundError: No module named 'matplotlib'

In [None]:
from tensorflow.keras.layers import Lambda

def oversample_high_values(X, y, threshold, factor=3, max_sold_price=None):
    """Oversample rows whose log-target lies above a price threshold.

    Rows with ``y > log(threshold + 1)`` are appended ``int(factor)`` extra
    times. When ``factor`` has a fractional part, an additional random subset
    (without replacement) is appended, drawn from the high rows that are also
    below ``log(max_sold_price * 1.3 + 1)``; its size is the fractional part
    times the number of such rows, truncated to an integer.

    The same row indices are applied to every array in ``X`` and to ``y``, so
    each appended row keeps its features and target together.

    Args:
        X: list of arrays sharing the same first dimension as ``y``.
        y: 1-D array of targets on the ``log(price + 1)`` scale.
        threshold: price (original scale) above which rows are oversampled.
        factor: oversampling factor; integer part = number of full extra
            copies, fractional part = share of the band sampled once more.
        max_sold_price: price cap used for the upper bound of the band.
            Defaults to the notebook-level ``MAX_SOLD_PRICE``.

    Returns:
        ``(X_oversampled, y_oversampled)``: a list of arrays and a 1-D array,
        with the original rows first followed by the appended copies.
    """
    if max_sold_price is None:
        # Fall back to the notebook constant, as the original code did.
        max_sold_price = MAX_SOLD_PRICE

    y = np.asarray(y)
    log_threshold = np.log(threshold + 1)
    log_upper_bound = np.log(max_sold_price * 1.3 + 1)

    mask_high = y > log_threshold
    mask_high_n_low = mask_high & (y < log_upper_bound)
    high_idx = np.flatnonzero(mask_high)
    band_idx = np.flatnonzero(mask_high_n_low)

    int_part = int(factor)
    frac_part = factor - int_part

    # Draw the fractional extra rows ONCE. Sampling separately per array (as
    # before) paired features of one sample with the target of another.
    extra_idx = np.empty(0, dtype=np.intp)
    if frac_part > 0 and len(band_idx) > 0:
        n_extra = int(len(band_idx) * frac_part)
        extra_idx = np.random.choice(band_idx, n_extra, replace=False)

    # Row order: all original rows, then int_part copies of the high rows,
    # then the sampled extras.
    take = np.concatenate(
        [np.arange(len(y), dtype=np.intp)] + [high_idx] * int_part + [extra_idx]
    ).astype(np.intp)

    X_oversampled = [np.asarray(x)[take] for x in X]
    y_oversampled = y[take]

    return X_oversampled, y_oversampled

class TransformerEncoder(tf.keras.layers.Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    Each sub-layer's output goes through dropout, is added back to its input
    (residual connection) and is layer-normalised.

    Args:
        embed_dim: size of the last input dimension; also used as the
            attention ``key_dim`` and as the feed-forward output width.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        dropout_rate: dropout rate applied after each sub-layer.
        **kwargs: forwarded to ``tf.keras.layers.Layer`` (e.g. ``name``).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate=0.3, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can reproduce the constructor call.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.dropout_rate = dropout_rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(dropout_rate)
        self.dropout2 = Dropout(dropout_rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        ``training`` is passed explicitly to the attention and dropout layers
        so dropout is only active in training mode.
        """
        attn_output = self.att(inputs, inputs, training=training)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super().get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "dropout_rate": self.dropout_rate,
        })
        return config

@tf.keras.utils.register_keras_serializable(package="Custom", name="weighted_mse_original_space")
def weighted_mse_original_space(y_true, y_pred):
    """Weighted squared error computed after undoing the log1p transform.

    Both tensors are mapped back with ``expm1``. Each squared error is then
    scaled by a logistic weight ``1 / (1 + exp(sharpness * (price - cutoff)))``
    of the true price, so the weight falls as the true price rises towards
    and past the cutoff.

    Returns the per-element weighted squared errors; no reduction is applied
    inside this function.
    """
    price_cutoff = tf.constant(10, dtype=tf.float32)
    slope = tf.constant(1, dtype=tf.float32)

    true_price = tf.math.expm1(y_true)
    pred_price = tf.math.expm1(y_pred)

    # Logistic down-weighting driven by the true price only.
    gate = tf.exp(slope * (true_price - price_cutoff))
    sample_weight = 1.0 / (1.0 + gate)

    squared_error = tf.square(true_price - pred_price)
    return sample_weight * squared_error
        

def build_mixed_transformer_model(cat_cardinalities, num_numerical_features, item_embed_dim,
                                  embed_dim=16, num_heads=2, ff_dim=64, num_layers=2, dropout_rate=0.3):
    """Build a Keras model mixing categorical, item-sequence and numeric inputs.

    Args:
        cat_cardinalities: one cardinality per categorical input; each gets an
            embedding table with ``cardinality + 1`` rows.
        num_numerical_features: width of the numeric input vector.
        item_embed_dim: passed as ``input_dim`` of the item embedding, i.e. it
            is the number of rows in the item embedding table.
        embed_dim: embedding width for categorical and item embeddings; also
            the attention ``key_dim``.
        num_heads: attention heads per encoder layer.
        ff_dim: hidden width of each encoder feed-forward sub-layer.
        num_layers: number of stacked attention + feed-forward layers.
        dropout_rate: dropout rate used in the dense head.

    Returns:
        A ``Model`` whose inputs are, in order: the categorical inputs, the
        item-sequence input (named ``items_input``) and the numeric input.
        The output is a single linear unit.
    """
    # One (input, embedding) pair per categorical feature.
    cat_inputs = []
    cat_vectors = []
    for n_categories in cat_cardinalities:
        cat_in = Input(shape=(1,), dtype="int32")
        cat_vec = Embedding(input_dim=n_categories + 1, output_dim=embed_dim)(cat_in)
        cat_vec = Flatten()(cat_vec)
        cat_inputs.append(cat_in)
        cat_vectors.append(cat_vec)

    # Variable-length item sequence: embed, then self-attention encoder layers.
    item_input = Input(shape=(None,), dtype="int32", name="items_input")
    seq = Embedding(input_dim=item_embed_dim, output_dim=embed_dim)(item_input)
    for _ in range(num_layers):
        attended = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(seq, seq)
        seq = LayerNormalization(epsilon=1e-6)(seq + attended)
        hidden = Dense(ff_dim, activation='relu')(seq)
        projected = Dense(embed_dim)(hidden)
        seq = LayerNormalization(epsilon=1e-6)(seq + projected)
    # Average over the sequence axis to obtain one vector per sample.
    item_vector = GlobalAveragePooling1D()(seq)

    # Numeric features are fed in directly.
    num_input = Input(shape=(num_numerical_features,), dtype="float32")

    # Dense head over the concatenation of all three feature groups.
    head = Concatenate()(cat_vectors + [item_vector, num_input])
    for units in (128, 64):
        head = Dense(units, activation='relu')(head)
        head = Dropout(dropout_rate)(head)
    prediction = Dense(1)(head)

    model_inputs = cat_inputs + [item_input, num_input]
    return Model(inputs=model_inputs, outputs=prediction)

def exp_clip(np_array):
    """Return ``exp(np_array) - 1`` with values capped above at ``MAX_SOLD_PRICE``.

    No lower bound is applied.
    """
    restored = np.exp(np_array) - 1
    return np.clip(restored, None, MAX_SOLD_PRICE)
def show_model_performance(model, X_test, y_test):
    """Report error statistics and diagnostic plots for a fitted model.

    Predictions and targets are mapped to the price scale with ``exp_clip``
    before residuals (actual - predicted) are computed. The function prints
    residual moments and MAE/MSE/RMSE, then draws three plots: residuals vs.
    predicted, actual vs. predicted, and a histogram of residuals.

    Args:
        model: object exposing ``predict(X_test)``.
        X_test: model inputs for the evaluation set.
        y_test: targets on the same scale as the model output.

    Returns:
        The raw (untransformed, unclipped) flattened predictions.
    """
    predictions = model.predict(X_test).flatten()
    pred_price = exp_clip(predictions)
    true_price = exp_clip(y_test)

    # Residuals on the price scale.
    residuals = true_price - pred_price
    n_res = len(residuals)

    mean_res = np.sum(residuals) / n_res
    std_res = np.sqrt(np.sum((residuals - mean_res) ** 2) / n_res)
    skew_res = skew(residuals, bias=False)
    kurt_res = kurtosis(residuals, bias=False)

    # Error metrics, normalised by the number of targets.
    n_obs = len(y_test)
    mae = np.sum(np.abs(residuals)) / n_obs
    mse = np.sum(residuals**2) / n_obs
    rmse = np.sqrt(mse)

    print("\nResiduals Statistics:")
    for label, value in (("Mean", mean_res), ("Std Dev", std_res),
                         ("Skewness", skew_res), ("Kurtosis", kurt_res)):
        print(f"{label}: {value:.4f}")

    print("\nEvaluation Metrics (on original sold_price scale):")
    for label, value in (("MAE", mae), ("MSE", mse), ("RMSE", rmse)):
        print(f"{label}: ${value:.4f}")

    # Plot 1: residuals against predicted price.
    plt.scatter(pred_price, residuals, alpha=0.2)
    plt.axhline(0, color="red", linestyle="--", lw=2)
    plt.xlabel("Predicted Sold Price")
    plt.ylabel("Residual  (Actual − Predicted)")
    plt.title("Residuals vs. Predicted Sold Price")
    plt.show()

    # Plot 2: actual against predicted, with the y = x reference line.
    plt.figure(figsize=(8, 8))
    plt.scatter(true_price, pred_price, alpha=0.4, c='blue', label="Predictions")

    lo = min(true_price.min(), pred_price.min())
    hi = max(true_price.max(), pred_price.max())
    diagonal = [lo, hi]
    plt.plot(diagonal, diagonal, 'r--', lw=2, label="Ideal Fit")

    plt.xlabel("Actual Sold Price")
    plt.ylabel("Predicted Sold Price")
    plt.title("Actual vs Predicted Sold Price")
    plt.legend()
    plt.grid(True)
    plt.show()

    # Plot 3: residual distribution (left to the backend to display).
    plt.hist(residuals, bins=80, color='gray')
    plt.title("Distribution of Residuals")
    plt.xlabel("Residual")
    plt.ylabel("Frequency")
    plt.grid(True)
    return predictions



In [None]:
import pandas as pd
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
import numpy as np

# Price cap used for clipping predictions; rows are kept up to 1.2x this cap.
MAX_SOLD_PRICE = 8.15
df = pd.read_csv("./training_data.csv")
# .copy() gives an independent frame, so the column assignments below do not
# write into a slice of the raw read (avoids SettingWithCopyWarning).
df = df[df["sold_price"] < MAX_SOLD_PRICE * 1.2].copy()

# Feature groups: categorical columns are label-encoded for embeddings,
# numeric columns are imputed and standardised.
categoric_features = ["pay_frequency", "income_type", "source_subid", "state", "day_hour_cross"]
numeric_features = ["annual_income", "requested_amount", "sold_price"]

# log(x + 1) transform for every numeric column except requested_amount.
for col in numeric_features:
    if col != "requested_amount":
        df[col + '_log'] = np.log(df[col] + 1)

# Missing categorical values become their own 'missing' category.
for cat in categoric_features:
    df[cat] = df[cat].fillna('missing')

# sold_price is the target, so it is removed from the model inputs.
numeric_features = [c for c in numeric_features if c != 'sold_price']
numeric_features.append('annual_income_log')

print(f"Updated numeric_features: {numeric_features}")

# "prefiltered_orders" is a ';'-separated list of integer item ids.
df["items"] = df["prefiltered_orders"].apply(lambda x: [int(y) for y in x.split(";")])

# Flatten all item lists to fit a single encoder over every item id.
all_items = [item for sublist in df['items'] for item in sublist]
le = LabelEncoder()
le.fit(all_items)
# Keep a dedicated handle: `le` is rebound in the categorical loop below.
item_encoder = le

# Encode items as 1..n_items. Index 0 is reserved for padding, because
# pad_sequences fills with 0 and LabelEncoder codes also start at 0; without
# the shift, padding would look like the first real item.
# The item Embedding therefore needs input_dim >= n_items + 1.
df['items_encoded'] = df['items'].apply(lambda lst: item_encoder.transform(lst) + 1)
# Pad every sequence (with trailing zeros) to the longest one.
max_len = df["items_encoded"].apply(len).max()
X_items = pad_sequences(df['items_encoded'], maxlen=max_len, padding='post')
X_num = df[numeric_features].values

# train / validation / test split (80 / 10 / 10) on row positions.
# Done before imputation so the fill statistics come from training rows only.
from sklearn.model_selection import train_test_split

n_samples = len(df)
indices = np.arange(n_samples)
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)
val_idx, test_idx = train_test_split(val_idx, test_size=0.5, random_state=42)

# Median imputation of numeric NaNs, with medians taken from the training
# rows to avoid leaking validation/test information.
col_medians = np.nanmedian(X_num[train_idx], axis=0)
# Locate the NaNs (row positions, column positions).
inds = np.where(np.isnan(X_num))
# Replace each NaN with the median of its column.
X_num[inds] = np.take(col_medians, inds[1])

# Label-encode each categorical column; keep the fitted encoders by name.
cat_encoders = {}
X_cat = []

for col in categoric_features:
    le = LabelEncoder()
    df[col + '_encoded'] = le.fit_transform(df[col])
    cat_encoders[col] = le
    X_cat.append(df[col + '_encoded'].values.reshape(-1, 1))

cat_cardinalities = [df[col].nunique() for col in categoric_features]
y_target = df["sold_price_log"].values

X_cat_tr = [x[train_idx] for x in X_cat]
X_cat_v = [x[val_idx] for x in X_cat]
X_cat_tst = [x[test_idx] for x in X_cat]

X_items_tr = X_items[train_idx]
X_items_v = X_items[val_idx]
X_items_tst = X_items[test_idx]

X_num_tr = X_num[train_idx]
X_num_v = X_num[val_idx]
X_num_tst = X_num[test_idx]

# Standardise numeric features using training statistics only.
scaler = StandardScaler()
scaler.fit(X_num_tr)

X_num_tr = scaler.transform(X_num_tr)
X_num_v = scaler.transform(X_num_v)
X_num_tst = scaler.transform(X_num_tst)

# Model input lists, ordered: categorical inputs, item sequences, numeric block.
X_tr = X_cat_tr + [X_items_tr, X_num_tr]
X_v = X_cat_v + [X_items_v, X_num_v]
X_tst = X_cat_tst + [X_items_tst, X_num_tst]

y_tr = y_target[train_idx]
y_v = y_target[val_idx]
y_tst = y_target[test_idx]

In [None]:
# Number of distinct item ids; lower bound for the item embedding table size.
n_diff_items = len(set(all_items))

def objective(trial):
    """Optuna objective: train one model and return its validation score.

    Samples architecture, optimiser, callback and oversampling
    hyperparameters from ``trial``, trains on an oversampled copy of the
    training split, and scores on the validation split.

    Returns:
        RMSE on the price scale (after ``exp_clip``) plus a squared penalty
        for how far the largest prediction falls short of ``MAX_SOLD_PRICE``.

    Notes:
        * The module-level ``X_tr`` / ``y_tr`` are read but never reassigned.
          Previously they were declared ``global`` and overwritten, so every
          trial oversampled the previous trial's oversampled data.
        * Scoring uses the validation split, keeping the test split unseen
          during hyperparameter search. Trials already stored in a persisted
          study were scored on the test split and are not directly comparable.
        * With ``epochs=10``, patience values close to 10 cannot trigger.
    """
    # Release graph/state left over from the previous trial's model.
    tf.keras.backend.clear_session()

    # Architecture hyperparameters
    embed_dim = trial.suggest_int("embed_dim", 48, 120)
    num_heads = trial.suggest_categorical("num_heads", [1, 2, 4, 8])
    ff_dim = trial.suggest_int("ff_dim", 128, 192)
    item_embed_dim = trial.suggest_int("item_embed_dim", n_diff_items + 1, 5000)
    num_layers = trial.suggest_int("num_layers", 2, 5)
    dropout_rate = trial.suggest_float("dropout_rate", 0.1, 0.4)
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-4, log=True)

    # Training callback hyperparameters
    early_stopping_patience = trial.suggest_int("early_stopping_patience", 3, 10)
    reduce_lr_patience = trial.suggest_int("reduce_lr_patience", 3, 10)

    # Oversampling hyperparameters
    factor_augmentation = trial.suggest_float("factor_augmentation", 0.0, 3.0)
    low_sold_price = trial.suggest_float("low_sold_price", 5.5, 7.0)

    # Trial-local oversampled training data; the shared split is untouched.
    X_tr_trial, y_tr_trial = oversample_high_values(
        X_tr, y_tr, low_sold_price, factor_augmentation
    )

    # ----- Model construction -----
    model = build_mixed_transformer_model(
        cat_cardinalities=cat_cardinalities,
        num_numerical_features=X_num.shape[1],
        embed_dim=embed_dim,
        num_heads=num_heads,
        item_embed_dim=item_embed_dim,
        ff_dim=ff_dim,
        num_layers=num_layers,
        dropout_rate=dropout_rate
    )

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
        loss=weighted_mse_original_space
    )

    model.fit(
        X_tr_trial, y_tr_trial,
        validation_data=(X_v, y_v),
        epochs=10,
        batch_size=64,
        verbose=1,
        callbacks=[
            ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=reduce_lr_patience, verbose=1),
            tf.keras.callbacks.EarlyStopping(patience=early_stopping_patience, restore_best_weights=True)
        ]
    )

    # ----- Scoring on the validation split, on the price scale -----
    val_pred_price = exp_clip(model.predict(X_v).flatten())
    val_true_price = exp_clip(y_v)
    rmse = np.sqrt(mean_squared_error(val_true_price, val_pred_price))

    # Penalise models whose largest prediction never reaches the price cap.
    max_pred = np.max(val_pred_price)
    penalty = max(0, MAX_SOLD_PRICE - max_pred) ** 2

    return rmse + penalty
# Create the study, or reopen it if one with this name already exists in the
# SQLite file, so results accumulate across notebook runs.
study = optuna.create_study(
    study_name="optuna_class_embedings",
    storage="sqlite:///optuna_study.db",
    load_if_exists=True,
    direction="minimize",
)

# Run 20 sequential trials; a trial raising MemoryError is recorded as failed
# and the search continues.
study.optimize(objective, n_jobs=1, n_trials=20, catch=(MemoryError,))

# best_value is the objective's return value for the best trial.
print("Best RMSE:", study.best_value)
print("Best Hyperparameters:", study.best_params)
