In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
stratified_computer_prices = pd.read_csv("../data/stratified_computer_prices_sample.csv")

In [3]:
def extract_cpu_type(cpu_model):
    parts = cpu_model.replace("-", " ").split()

    if parts[0] in ["AMD", "Intel"]:
        parts = parts[:-1]
        
    parts = parts[1:]

    return " ".join(parts).strip()

stratified_computer_prices['cpu_type'] = stratified_computer_prices['cpu_model'].apply(extract_cpu_type)
stratified_computer_prices[['resolution_width', 'resolution_height']] = stratified_computer_prices['resolution'].str.split('x', expand=True).astype(int)
stratified_computer_prices['aspect_ratio'] = np.round(stratified_computer_prices['resolution_width'] / stratified_computer_prices['resolution_height'], 2)

stratified_computer_prices.drop(['model', 'cpu_model', 'resolution', 'wifi', 'warranty_months', 'bluetooth', 'storage_drive_count'], axis=1, inplace=True)

In [4]:
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer, make_column_selector

In [5]:
X = stratified_computer_prices.copy()
y = X.pop('price')

price_bins = pd.qcut(y, q=5, labels=False)

preprocessor = make_column_transformer(
    (StandardScaler(), make_column_selector(dtype_include=np.number)),
    (OneHotEncoder(sparse_output=False), make_column_selector(dtype_include=object))
)

X = preprocessor.fit_transform(X)
y = np.log(y)

X_train, X_temp, y_train, y_temp, bins_train, bins_temp = train_test_split(X, y, price_bins, test_size=0.2, random_state=42, stratify=price_bins)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42, stratify=bins_temp)

In [17]:
from sklearn.model_selection import KFold

from tensorflow import keras
from tensorflow.keras import Input, layers, callbacks, regularizers
import keras_tuner as kt

In [18]:
input_shape = (X_train.shape[1],)

In [19]:
def build_model1(hp):
    model = keras.Sequential();
    model.add(Input(shape=input_shape))

    for units, drop in zip(
        [hp.Int('units1', 64, 256, 64),
         hp.Int('units2', 64, 256, 64),
         hp.Int('units3', 64, 256, 64)],
        [hp.Float('dropout1', 0.1, 0.5, 0.05),
         hp.Float('dropout2', 0.1, 0.5, 0.05),
         hp.Float('dropout3', 0.1, 0.5, 0.05)]
    ):
        model.add(layers.BatchNormalization())
        model.add(layers.Dense(units, activation='relu', kernel_regularizer=regularizers.l2(1e-4)))
        model.add(layers.Dropout(rate=drop))

    model.add(layers.Dense(1, activation='linear'))

    lr = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse']
    )

    return model

tuner1 = kt.RandomSearch(
    build_model1,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory=None,
    project_name=None,
    overwrite=True
)

kf1 = KFold(n_splits=5, shuffle=True, random_state=42)
val_scores1 = []

for train_index, val_index in kf1.split(X_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    tuner1.search(X_tr, y_tr, epochs=50, validation_data=(X_val, y_val), verbose=0)
    best_model1 = tuner1.get_best_models(num_models=1)[0]

    history1 = best_model1.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=32,
        verbose=0,
        callbacks=[
            callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
            callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1)
        ]
    )

    val_scores1.append(min(history1.history['val_mse']))




  saveable.load_own_variables(weights_store.get(inner_path))



Epoch 15: ReduceLROnPlateau reducing learning rate to 0.0016898798057809472.

Epoch 20: ReduceLROnPlateau reducing learning rate to 0.0008449399028904736.

Epoch 25: ReduceLROnPlateau reducing learning rate to 0.0004224699514452368.
Epoch 25: early stopping
Restoring model weights from the end of the best epoch: 10.


  saveable.load_own_variables(weights_store.get(inner_path))



Epoch 18: ReduceLROnPlateau reducing learning rate to 0.0016898798057809472.

Epoch 23: ReduceLROnPlateau reducing learning rate to 0.0008449399028904736.

Epoch 30: ReduceLROnPlateau reducing learning rate to 0.0004224699514452368.

Epoch 36: ReduceLROnPlateau reducing learning rate to 0.0002112349757226184.

Epoch 46: ReduceLROnPlateau reducing learning rate to 0.0001056174878613092.

Epoch 51: ReduceLROnPlateau reducing learning rate to 5.28087439306546e-05.

Epoch 56: ReduceLROnPlateau reducing learning rate to 2.64043719653273e-05.
Epoch 58: early stopping
Restoring model weights from the end of the best epoch: 43.


  saveable.load_own_variables(weights_store.get(inner_path))



Epoch 12: ReduceLROnPlateau reducing learning rate to 0.0016898798057809472.

Epoch 19: ReduceLROnPlateau reducing learning rate to 0.0008449399028904736.

Epoch 24: ReduceLROnPlateau reducing learning rate to 0.0004224699514452368.

Epoch 29: ReduceLROnPlateau reducing learning rate to 0.0002112349757226184.

Epoch 36: ReduceLROnPlateau reducing learning rate to 0.0001056174878613092.

Epoch 41: ReduceLROnPlateau reducing learning rate to 5.28087439306546e-05.

Epoch 46: ReduceLROnPlateau reducing learning rate to 2.64043719653273e-05.
Epoch 46: early stopping
Restoring model weights from the end of the best epoch: 31.


  saveable.load_own_variables(weights_store.get(inner_path))



Epoch 17: ReduceLROnPlateau reducing learning rate to 0.0016898798057809472.

Epoch 30: ReduceLROnPlateau reducing learning rate to 0.0008449399028904736.

Epoch 35: ReduceLROnPlateau reducing learning rate to 0.0004224699514452368.

Epoch 41: ReduceLROnPlateau reducing learning rate to 0.0002112349757226184.

Epoch 46: ReduceLROnPlateau reducing learning rate to 0.0001056174878613092.

Epoch 51: ReduceLROnPlateau reducing learning rate to 5.28087439306546e-05.

Epoch 56: ReduceLROnPlateau reducing learning rate to 2.64043719653273e-05.
Epoch 59: early stopping
Restoring model weights from the end of the best epoch: 44.


  saveable.load_own_variables(weights_store.get(inner_path))



Epoch 7: ReduceLROnPlateau reducing learning rate to 0.0016898798057809472.

Epoch 13: ReduceLROnPlateau reducing learning rate to 0.0008449399028904736.

Epoch 19: ReduceLROnPlateau reducing learning rate to 0.0004224699514452368.

Epoch 24: ReduceLROnPlateau reducing learning rate to 0.0002112349757226184.

Epoch 30: ReduceLROnPlateau reducing learning rate to 0.0001056174878613092.

Epoch 35: ReduceLROnPlateau reducing learning rate to 5.28087439306546e-05.

Epoch 40: ReduceLROnPlateau reducing learning rate to 2.64043719653273e-05.

Epoch 45: ReduceLROnPlateau reducing learning rate to 1.320218598266365e-05.

Epoch 50: ReduceLROnPlateau reducing learning rate to 6.601092991331825e-06.
Epoch 50: early stopping
Restoring model weights from the end of the best epoch: 35.


In [None]:
def build_model2(hp):
    model = keras.Sequential();
    model.add(Input(shape=input_shape))

    for units, drop in zip(
        [hp.Int('units1', 64, 256, 64),
         hp.Int('units2', 64, 256, 64),
         hp.Int('units3', 64, 256, 64)],
        [hp.Float('dropout1', 0.1, 0.5, 0.05),
         hp.Float('dropout2', 0.1, 0.5, 0.05),
         hp.Float('dropout3', 0.1, 0.5, 0.05)]
    ):
        model.add(layers.BatchNormalization())
        model.add(layers.Dense(units, activation='elu', kernel_regularizer=regularizers.l2(1e-4)))
        model.add(layers.Dropout(rate=drop))

    model.add(layers.Dense(1, activation='linear'))

    lr = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse']
    )

    return model

tuner2 = kt.RandomSearch(
    build_model2,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory=None,
    project_name=None,
    overwrite=True
)

kf2 = KFold(n_splits=5, shuffle=True, random_state=42)
val_scores2 = []

for train_index, val_index in kf2.split(X_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    tuner2.search(X_tr, y_tr, epochs=50, validation_data=(X_val, y_val), verbose=0)
    best_model2 = tuner2.get_best_models(num_models=1)[0]

    history2 = best_model2.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=32,
        verbose=0,
        callbacks=[
            callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
            callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1)
        ]
    )

    val_scores2.append(min(history2.history['val_mse']))

In [None]:
def build_model3(hp):
    model = keras.Sequential();
    model.add(Input(shape=input_shape))

    for units, drop in zip(
        [hp.Int('units1', 64, 256, 64),
         hp.Int('units2', 64, 256, 64),
         hp.Int('units3', 64, 256, 64)],
        [hp.Float('dropout1', 0.1, 0.5, 0.05),
         hp.Float('dropout2', 0.1, 0.5, 0.05),
         hp.Float('dropout3', 0.1, 0.5, 0.05)]
    ):
        model.add(layers.BatchNormalization())
        model.add(layers.Dense(units, activation='swish', kernel_regularizer=regularizers.l2(1e-4)))
        model.add(layers.Dropout(rate=drop))

    model.add(layers.Dense(1, activation='linear'))

    lr = hp.Float('learning_rate', 1e-4, 1e-2, sampling='log')

    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse']
    )

    return model

tuner3 = kt.RandomSearch(
    build_model3,
    objective='val_loss',
    max_trials=10,
    executions_per_trial=1,
    directory=None,
    project_name=None,
    overwrite=True
)

best_model3 = tuner3.get_best_models(num_models=1)[0]

history3 = best_model3.fit(
    X_tr, y_tr,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=32,
    callbacks=[
        callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
        callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=10, min_lr=1e-6, verbose=1)
    ]
)

kf3 = KFold(n_splits=5, shuffle=True, random_state=42)
val_scores3 = []

for train_index, val_index in kf3.split(X_train):
    X_tr, X_val = X_train[train_index], X_train[val_index]
    y_tr, y_val = y_train.iloc[train_index], y_train.iloc[val_index]

    tuner3.search(X_tr, y_tr, epochs=50, validation_data=(X_val, y_val), verbose=0)
    best_model3 = tuner3.get_best_models(num_models=1)[0]

    history3 = best_model3.fit(
        X_tr, y_tr,
        validation_data=(X_val, y_val),
        epochs=200,
        batch_size=32,
        verbose=0,
        callbacks=[
            callbacks.EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True, verbose=1),
            callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=1)
        ]
    )

    val_scores3.append(min(history3.history['val_mse']))

In [None]:
from tensorflow.keras import Model

In [None]:
def ensembleModels(models):
    model_input = Input(shape=input_shape)
    y_models = [m(model_input) for m in models]
    y_avg = layers.Average()(y_models)
    return Model(inputs=model_input, outputs=y_avg, name='ensemble')

models_ensemble = ensembleModels([best_model1, best_model2, best_model3])

In [None]:
y_pred_log = models_ensemble.predict(X_test).flatten()

y_pred_price = np.exp(y_pred_log)
y_test_price = np.exp(y_test)

residuals_price = y_test_price - y_pred_price

plt.figure(figsize=(16, 6))

plt.subplot(1, 2, 1)
sns.scatterplot(x=y_pred_price, y=residuals_price, color='#c77dff', alpha=0.5)
plt.axhline(0, color='#9d4edd', linestyle='--')
plt.xlabel('Predicted')
plt.ylabel('Residuals')

plt.subplot(1, 2, 2)
sns.histplot(residuals_price, bins=50, color='#faa307', kde=True, line_kws={'color': '#ffba08'})
plt.xlabel('Residuals')

plt.tight_layout()
plt.show()

In [None]:
print(f"MAE: ${np.abs(residuals_price).mean():.2f}")
print(f"Median Error: ${np.median(residuals_price):.2f}")
print(f"Std Dev: ${residuals_price.std():.2f}")