In [8]:
import numpy as np
import pandas as pd
from pandas import DataFrame
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split

In [9]:
def preprocess_data(X_train, X_test):
    """Standardize both splits with a StandardScaler fitted on the training split.

    The scaler statistics are learned from ``X_train`` only and then applied
    to ``X_test``, so the test split does not influence the scaling.

    Returns:
        A ``(X_train_scaled, X_test_scaled, scaler)`` tuple, where ``scaler``
        is the fitted ``StandardScaler`` instance.
    """
    feature_scaler = StandardScaler()
    feature_scaler.fit(X_train)

    train_scaled = feature_scaler.transform(X_train)
    test_scaled = feature_scaler.transform(X_test)
    return train_scaled, test_scaled, feature_scaler

In [10]:
def build_model(input_dim):
    """Create and compile a feed-forward neural network for regression.

    Architecture: Input -> Dense(128, relu) -> Dropout(0.2) -> Dense(64, relu)
    -> Dropout(0.2) -> Dense(32, relu) -> Dense(1).

    Args:
        input_dim: Number of input features per sample.

    Returns:
        A compiled ``Sequential`` model (Adam with learning rate 0.001, MSE
        loss, MAE tracked as a metric).
    """
    model = Sequential([
        # Declare the input shape with an explicit Input object instead of
        # passing `input_shape=` to the first Dense layer; Keras 3 emits a
        # UserWarning for the latter form.
        tf.keras.Input(shape=(input_dim,)),
        Dense(128, activation='relu'),
        Dropout(0.2),  # Regularization
        Dense(64, activation='relu'),
        Dropout(0.2),
        Dense(32, activation='relu'),
        Dense(1),  # Output layer: single linear unit for regression
    ])

    # Compile the model
    model.compile(optimizer=Adam(learning_rate=0.001), loss='mse', metrics=['mae'])

    return model

In [11]:
def train_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32,
                verbose=2, callbacks=None):
    """Fit ``model`` on the training data and return the object ``model.fit`` returns.

    Args:
        model: Object exposing a Keras-style ``fit`` method.
        X_train, y_train: Training features and targets.
        X_test, y_test: Passed to ``fit`` as ``validation_data``.
            NOTE(review): if this same split is also used for the final
            evaluation, the per-epoch validation numbers are not an independent
            check -- consider passing a separate validation split here.
        epochs: Number of training epochs.
        batch_size: Mini-batch size.
        verbose: Verbosity forwarded to ``fit`` (defaults to 2, the value that
            was previously hard-coded).
        callbacks: Optional list of callbacks forwarded to ``fit`` (for example
            early stopping); ``None`` means no callbacks.

    Returns:
        Whatever ``model.fit`` returns (a ``History`` object for Keras models).
    """
    history = model.fit(
        X_train,
        y_train,
        validation_data=(X_test, y_test),
        epochs=epochs,
        batch_size=batch_size,
        verbose=verbose,
        callbacks=callbacks,
    )
    return history

In [12]:
def evaluate_model(model, X_test, y_test):
    """Predict on ``X_test`` and report regression metrics against ``y_test``.

    The model's predictions are flattened to one dimension, then MAE, MSE,
    R^2 and MAPE are computed, printed with two decimals, and returned.

    Returns:
        dict with keys ``"MAE"``, ``"MSE"``, ``"R2"`` and ``"MAPE"``.
    """
    predictions = model.predict(X_test).flatten()

    # Compute every metric first so nothing is printed if one of them fails.
    scores = {
        "MAE": mean_absolute_error(y_test, predictions),
        "MSE": mean_squared_error(y_test, predictions),
        "R2": r2_score(y_test, predictions),
        "MAPE": mean_absolute_percentage_error(y_test, predictions),
    }

    # Display labels differ from the dict keys only for R2.
    display_names = {"MAE": "MAE", "MSE": "MSE", "R2": "R² Score", "MAPE": "MAPE"}
    for key, value in scores.items():
        print(f"{display_names[key]}: {value:.2f}")

    return scores

In [13]:
def split_and_prepare_data(df: DataFrame, target_column: str, test_size: float = 0.2, random_state: int = 1):
    """Split ``df`` into train/test features and targets.

    The whole frame is split with ``train_test_split`` first; each part then
    has its index reset, its target column extracted as an array, and that
    column dropped from the feature frame.

    Args:
        df: Full dataset, including the target column.
        target_column: Name of the column to predict.
        test_size: Forwarded to ``train_test_split``.
        random_state: Forwarded to ``train_test_split``.

    Returns:
        ``(X_train, X_test, y_train, y_test)`` -- the two feature DataFrames
        followed by the two target arrays (``.values`` of the target column).

    Raises:
        TypeError: If ``df`` is not a pandas DataFrame.
        ValueError: If ``target_column`` is not a column of ``df``.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Input data must be a pandas DataFrame.")
    if target_column not in df.columns:
        raise ValueError(f"The target column '{target_column}' is not in the DataFrame.")

    def _features_and_target(part):
        # Reset the index, pull out the target, then drop it from the features.
        part = part.reset_index(drop=True)
        target_values = part[target_column].values
        return part.drop(columns=[target_column]), target_values

    train_part, test_part = train_test_split(df, test_size=test_size, random_state=random_state)

    X_train, y_train = _features_and_target(train_part)
    X_test, y_test = _features_and_target(test_part)

    return X_train, X_test, y_train, y_test

In [16]:
# Load the label-encoded listings used for modelling and preview the first rows.
LABELED_CSV = '../data/labeled_data/synthetic_product_listings_gpt_4o_mini_encoded_labeled.csv'
data = pd.read_csv(LABELED_CSV)
data.head()

Unnamed: 0,category,brand,condition,seller_reputation,log_price
0,0,0,1,4,4.330733
1,1,1,1,2,7.170881
2,2,10,1,5,7.09091
3,2,8,0,4,5.860786
4,2,32,0,3,8.071219


In [18]:
# Inspect column dtypes and non-null counts before modelling.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 5 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   category           995 non-null    int64  
 1   brand              995 non-null    int64  
 2   condition          995 non-null    int64  
 3   seller_reputation  995 non-null    int64  
 4   log_price          995 non-null    float64
dtypes: float64(1), int64(4)
memory usage: 39.0 KB


In [19]:
# Hold out 20% of the rows for testing; `log_price` is the regression target.
X_train, X_test, y_train, y_test = split_and_prepare_data(
    data,
    target_column="log_price",
    test_size=0.2,
    random_state=1,
)

In [20]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch
# shuffling are repeatable across runs (as far as the compute backend allows).
tf.keras.utils.set_random_seed(1)

# Preprocess data
X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

# Build model
model = build_model(input_dim=X_train.shape[1])

# Train model; keep the returned history so the learning curves can be inspected later
history = train_model(model, X_train_scaled, y_train, X_test_scaled, y_test, epochs=100, batch_size=32)

# Evaluate model (the returned metrics dict is displayed as the cell output)
evaluate_model(model, X_test_scaled, y_test)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)
2025-02-24 17:45:39.427115: I metal_plugin/src/device/metal_device.cc:1154] Metal device set to: Apple M1 Pro
2025-02-24 17:45:39.427350: I metal_plugin/src/device/metal_device.cc:296] systemMemory: 16.00 GB
2025-02-24 17:45:39.427366: I metal_plugin/src/device/metal_device.cc:313] maxCacheSize: 5.33 GB
I0000 00:00:1740408339.447081 5515551 pluggable_device_factory.cc:305] Could not identify NUMA node of platform GPU ID 0, defaulting to 0. Your kernel may not have been built with NUMA support.
I0000 00:00:1740408339.447672 5515551 pluggable_device_factory.cc:271] Created TensorFlow device (/job:localhost/replica:0/task:0/device:GPU:0 with 0 MB memory) -> physical PluggableDevice (device: 0, name: METAL, pci bus id: <undefined>)


Epoch 1/100


2025-02-24 17:45:40.949088: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:117] Plugin optimizer for device_type GPU is enabled.


25/25 - 12s - 481ms/step - loss: 36.4648 - mae: 5.8868 - val_loss: 29.5139 - val_mae: 5.3586
Epoch 2/100
25/25 - 1s - 23ms/step - loss: 26.0896 - mae: 4.9297 - val_loss: 17.7523 - val_mae: 4.1068
Epoch 3/100
25/25 - 1s - 24ms/step - loss: 14.4628 - mae: 3.4337 - val_loss: 5.5879 - val_mae: 2.2612
Epoch 4/100
25/25 - 1s - 23ms/step - loss: 7.5898 - mae: 2.2496 - val_loss: 0.8347 - val_mae: 0.7222
Epoch 5/100
25/25 - 1s - 21ms/step - loss: 6.2572 - mae: 1.9130 - val_loss: 0.5657 - val_mae: 0.6163
Epoch 6/100
25/25 - 1s - 21ms/step - loss: 4.9454 - mae: 1.7409 - val_loss: 0.6983 - val_mae: 0.7212
Epoch 7/100
25/25 - 1s - 21ms/step - loss: 4.5869 - mae: 1.6563 - val_loss: 0.9325 - val_mae: 0.7696
Epoch 8/100
25/25 - 1s - 21ms/step - loss: 3.4245 - mae: 1.4564 - val_loss: 0.4615 - val_mae: 0.5143
Epoch 9/100
25/25 - 1s - 21ms/step - loss: 2.9790 - mae: 1.3478 - val_loss: 0.4660 - val_mae: 0.5099
Epoch 10/100
25/25 - 1s - 22ms/step - loss: 2.6505 - mae: 1.2900 - val_loss: 0.4563 - val_mae: 0

{'MAE': 14.162374861778252,
 'MSE': 229.78141433091344,
 'R2': -124.3507520071642,
 'MAPE': 2.5143238244268686}

In [18]:
import keras_tuner as kt

def build_model_with_tuning(hp):
    """Build and compile a regression network whose sizes are chosen by the tuner.

    Tuned hyperparameters:
        units1, units2: width of the two hidden layers (32..512, step 32).
        dropout1, dropout2: dropout rate after each hidden layer (0.1..0.5, step 0.1).
        learning_rate: Adam learning rate, sampled on a log scale in [1e-5, 1e-2].

    Args:
        hp: Hyperparameter container supplied by the tuner.

    Returns:
        A compiled ``Sequential`` model (MSE loss, MAE metric).

    Note:
        The number of input features is read from the notebook-level
        ``X_train``, so that variable must exist before the tuner calls this.
    """
    model = Sequential()

    # Declare the input shape with an explicit Input object rather than passing
    # `input_shape=` to the first Dense layer, which Keras 3 warns about.
    model.add(tf.keras.Input(shape=(X_train.shape[1],)))

    # Tunable parameters for layers
    model.add(Dense(
        hp.Int('units1', min_value=32, max_value=512, step=32),
        activation='relu'
    ))

    model.add(Dropout(hp.Float('dropout1', min_value=0.1, max_value=0.5, step=0.1)))

    model.add(Dense(
        hp.Int('units2', min_value=32, max_value=512, step=32),
        activation='relu'
    ))

    model.add(Dropout(hp.Float('dropout2', min_value=0.1, max_value=0.5, step=0.1)))

    model.add(Dense(1))  # Output layer

    # Compile model with tunable learning rate
    model.compile(
        optimizer=Adam(learning_rate=hp.Float('learning_rate', min_value=1e-5, max_value=1e-2, sampling='LOG')),
        loss='mse', metrics=['mae']
    )

    return model

In [19]:
# Initialize the tuner
tuner = kt.Hyperband(
    build_model_with_tuning,
    objective='val_loss',
    max_epochs=10,
    factor=3,
    directory='my_dir',
    project_name='price_prediction'
)

# Search for the best hyperparameters
tuner.search(X_train_scaled, y_train, epochs=10, validation_data=(X_test_scaled, y_test))

# Get the best model
best_model = tuner.get_best_models(num_models=1)[0]

# Evaluate the best model
evaluate_model(best_model, X_test_scaled, y_test)

Trial 30 Complete [00h 00m 16s]
val_loss: 1719676.0

Best val_loss So Far: 325861.0
Total elapsed time: 00h 06m 13s
[1m7/7[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
MAE: 391.88
MSE: 325861.01
R² Score: 0.61
MAPE: 0.71


{'MAE': 391.87534406038975,
 'MSE': 325861.00997261406,
 'R2': 0.6143021499043362,
 'MAPE': 0.7051358303896563}

In [3]:
import pandas as pd
# Peek at the un-encoded listings (raw category/brand strings and the original price).
# Bound to its own name so it does not replace `data`, the label-encoded frame that
# the modelling cells read from.
listings_processed = pd.read_csv('../data/processed_data/synthetic_product_listings_gpt_4o_mini.csv')
listings_processed.head()

Unnamed: 0,product_id,category,brand,condition,price,seller_reputation,description,suspected_fraud,log_price
0,2,Clothing,Adidas,used,75.0,4,Brand new Adidas running shorts,True,4.330733
1,3,Electronics,Apple,used,1299.99,2,Brand new Apple iPhone 14 Pro with 256GB storage,False,7.170881
2,4,Furniture,Joybird,used,1200.0,5,Stylish Joybird mid-century modern sofa with p...,False,7.09091
3,5,Furniture,IKEA,New,350.0,4,Brand new IKEA Hemnes dresser with 6 drawers.,False,5.860786
4,6,Furniture,West Elm,New,3200.0,3,Brand new West Elm mid-century dining table in...,True,8.071219


In [2]:
import pandas as pd
# Re-read the label-encoded listings (the same CSV loaded earlier in the notebook)
# and preview the first rows.
labeled_csv_path = '../data/labeled_data/synthetic_product_listings_gpt_4o_mini_encoded_labeled.csv'
data = pd.read_csv(labeled_csv_path)
data.head()

Unnamed: 0,category,brand,condition,seller_reputation,log_price
0,0,0,1,4,4.330733
1,1,1,1,2,7.170881
2,2,10,1,5,7.09091
3,2,8,0,4,5.860786
4,2,32,0,3,8.071219


In [4]:
import numpy as np

In [7]:
# Sanity check on the target transform: log1p(75) evaluates to 4.3307..., which matches
# the log_price shown for the first listing (price 75.0) in the processed-data preview.
# Presumably log_price was derived as log1p(price) -- TODO confirm against the preprocessing code.
n = 75
np.log1p(n)

np.float64(4.330733340286331)

In [None]:
# NOTE(review): this cell has never been executed (In [None]) and looks like a fragment
# pasted from a class method: neither `self` nor `xgb` is defined anywhere in the visible
# part of this notebook, so running it as-is would raise NameError.
# It constructs an XGBRegressor with a squared-error objective and fixed hyperparameters.
# TODO confirm whether this belongs here or should be removed / moved to its module.
self.model = xgb.XGBRegressor(
            objective="reg:squarederror",
            max_depth=9,
            min_child_weight=5,
            learning_rate=0.2,
            colsample_bytree=1.0,
            gamma=0.1,
            n_estimators=100,
            reg_alpha=0.1,
            reg_lambda=0,
            subsample=1.0,
            random_state=1,
        )