In [12]:
import pandas as pd
import numpy as np
import optuna
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Concatenate, Input, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model

In [13]:
# Columns treated as categorical. The order matters: the model receives one
# input per entry, in this order.
cat_types = ["model", "brand", "ext_col", "int_col", "accident",
             "clean_title", "body_style",
             'engine', 'fuel_type']

# A label must occur more than this many times in the training set to keep its
# own category; every other label is pooled into "unknown".
MIN_CATEGORY_COUNT = 50


def load_cars(csv_path):
    """Read a cars CSV, add `miles_per_year`, and cast the categorical columns.

    Args:
        csv_path: path of the CSV file. It must contain `milage`, `age` and
            every column named in `cat_types`.

    Returns:
        The loaded DataFrame with an added `miles_per_year` column and the
        `cat_types` columns cast to the pandas `category` dtype.
    """
    frame = pd.read_csv(csv_path)
    # Column arithmetic instead of a row-wise `apply`: same formula, one
    # vectorised pass. The `+ 1` presumably keeps age-0 cars from dividing by zero.
    frame['miles_per_year'] = frame['milage'] / (frame['age'] + 1)
    return frame.astype({col: "category" for col in cat_types})


df = load_cars('cars_train_enriched_acc_noassumption.csv')
dt = load_cars('cars_test_enriched_acc_noassumption.csv')

for cat in cat_types:
    # Frequencies are taken from the training set only; the test set is
    # bucketed with the same set of frequent labels, so a test label that is
    # rare or absent in training also becomes "unknown".
    value_counts = df[cat].value_counts().to_dict()
    frequent = {label for label, count in value_counts.items() if count > MIN_CATEGORY_COUNT}
    keep_frequent = lambda x: x if x in frequent else "unknown"
    df[cat] = df[cat].apply(keep_frequent)
    dt[cat] = dt[cat].apply(keep_frequent)



In [14]:
# Integer-encode each categorical column of the training frame. The fitted
# encoders are kept, keyed by column name, so the same mapping can be applied
# to the test frame later.
label_encoders = {col: LabelEncoder() for col in cat_types}
for col, le in label_encoders.items():
    df[col] = le.fit_transform(df[col])

# Model inputs are every remaining column except the ones listed here;
# the target is the `price` column.
non_feature_cols = ['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model', 'price']
X = df.drop(columns=non_feature_cols)
y = df['price']

In [4]:
def objective(trial):
    """Optuna objective: train one embedding + dense network, return its hold-out loss.

    Reads the module-level `X`, `y`, `df` and `cat_types`. Each call makes an
    80/20 split of `X`/`y` (fixed `random_state=42`), standardises the
    non-categorical columns using statistics from the training part only,
    builds a Keras model from the hyperparameters sampled through `trial`,
    fits it, and evaluates it on the held-out 20%.

    Args:
        trial: the Optuna trial used to sample hyperparameters
            (`max_embed_dim`, `n_layers`, `dropout_frac`, `base_node_count`,
            `batch_size`, `epochs`).

    Returns:
        The result of `model.evaluate` on the held-out split. The model is
        compiled with `loss='mean_squared_error'` and no extra metrics.
    """
    # Local import so the function carries its own dependency. Clearing the
    # session drops the graph/state left behind by the previous trial's model,
    # which otherwise accumulates over a long study.
    from tensorflow.keras import backend as keras_backend
    keras_backend.clear_session()

    # Split the data; copy so the scaled values are written into frames this
    # function owns rather than into possible views of `X`.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    X_train = X_train.copy()
    X_test = X_test.copy()

    # Normalize numerical features (fit on the training part only)
    numeric_cols = [col for col in X.columns if col not in cat_types]
    scaler = StandardScaler()
    X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
    X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

    # Define hyperparameters to be tuned
    max_embed_dim = trial.suggest_int('max_embed_dim', 10, 200)
    n_layers = trial.suggest_int('n_layers', 1, 5)  # number of dense layers
    inner_act = 'relu'
    output_act = 'relu'
    dropout_frac = trial.suggest_float('dropout_frac', 0.0, 0.7)
    base_node_count = trial.suggest_int('base_node_count', 8, 32)
    batch_size = trial.suggest_int('batch_size', 32, 256)
    epochs = trial.suggest_int('epochs', 3, 20)

    def build_model(n_numeric):
        """Assemble and compile the network for this trial.

        Args:
            n_numeric: width of the numerical input (number of non-categorical columns).
        """
        inputs = []
        embeddings = []

        # One single-integer input and one embedding per categorical feature
        for col in cat_types:
            input_dim = df[col].nunique()
            # `input_dim // 2` is 0 for a column with a single value, which is
            # not a usable embedding width, so never go below 1.
            embed_dim = max(1, min(max_embed_dim, input_dim // 2))
            input_layer = Input(shape=(1,))
            embedding_layer = Embedding(input_dim=input_dim, output_dim=embed_dim, input_length=1)(input_layer)
            embedding_layer = Flatten()(embedding_layer)
            inputs.append(input_layer)
            embeddings.append(embedding_layer)

        # Concatenate embeddings with numerical features
        numerical_inputs = Input(shape=(n_numeric,))
        inputs.append(numerical_inputs)
        concatenated = Concatenate()(embeddings + [numerical_inputs])

        # Dense stack: widths halve from base_node_count * 2**(n_layers-1)
        # down to base_node_count, with dropout before every layer but the first.
        x = Dense(base_node_count * (2 ** (n_layers - 1)), activation=inner_act)(concatenated)
        for i in range(1, n_layers):
            x = Dropout(dropout_frac)(x)
            x = Dense(base_node_count * (2 ** (n_layers - i - 1)), activation=inner_act)(x)
        output = Dense(1, activation=output_act)(x)

        model = Model(inputs=inputs, outputs=output)
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    # Build and train the model
    model = build_model(len(numeric_cols))

    # Prepare inputs for the model: one array per categorical column (in
    # `cat_types` order), followed by the block of numerical columns.
    train_inputs = [X_train[col] for col in cat_types] + [X_train[numeric_cols]]
    test_inputs = [X_test[col] for col in cat_types] + [X_test[numeric_cols]]

    model.fit(train_inputs, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

    # Evaluate the model; verbose=0 keeps the per-trial progress bar out of the study log
    return model.evaluate(test_inputs, y_test, verbose=0)
    

In [5]:
# Hyperparameter search: minimise the value returned by `objective`.
N_TRIALS = 100

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=N_TRIALS)

# Report the winning configuration and how much each hyperparameter mattered
best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

param_importances = optuna.importance.get_param_importances(study)
print("Parameter importance:\n", param_importances)

[I 2024-09-28 11:28:45,308] A new study created in memory with name: no-name-804cde82-de3a-4f1a-8e43-9462e916c2ac
[I 2024-09-28 11:29:43,545] Trial 0 finished with value: 4905167360.0 and parameters: {'max_embed_dim': 24, 'n_layers': 1, 'dropout_frac': 0.3097148851992497, 'base_node_count': 24, 'batch_size': 239, 'epochs': 17}. Best is trial 0 with value: 4905167360.0.
[I 2024-09-28 11:32:14,886] Trial 1 finished with value: 4657692160.0 and parameters: {'max_embed_dim': 34, 'n_layers': 5, 'dropout_frac': 0.45444175044970586, 'base_node_count': 20, 'batch_size': 171, 'epochs': 10}. Best is trial 1 with value: 4657692160.0.
[I 2024-09-28 11:33:24,235] Trial 2 finished with value: 4691266560.0 and parameters: {'max_embed_dim': 132, 'n_layers': 3, 'dropout_frac': 0.41141399262050743, 'base_node_count': 19, 'batch_size': 253, 'epochs': 9}. Best is trial 1 with value: 4657692160.0.
[I 2024-09-28 11:34:10,074] Trial 3 finished with value: 4838325248.0 and parameters: {'max_embed_dim': 39, 'n

In [15]:
# Hyperparameters for the final model, hard-coded so this cell does not need
# the search to be re-run (presumably copied from a finished study — confirm).
best_params = {'max_embed_dim': 92, 'n_layers': 5, 'dropout_frac': 0.15612330572989397, 'base_node_count': 21, 'batch_size': 189, 'epochs': 5}

# Train on the whole training set (no hold-out split here). Take a copy:
# assigning `X_train = X` would make the scaling below overwrite `X` itself,
# so re-running this cell would standardise already-standardised data and
# `objective` would see scaled inputs if it ran afterwards.
X_train = X.copy()
y_train = y

# Normalize numerical features; `scaler` is reused on the test set later
numeric_cols = [col for col in X.columns if col not in cat_types]
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])

# Unpack the chosen hyperparameters into the names `build_model` reads
possible_activations = ['relu', 'linear', 'tanh']  # not referenced by the visible cells
max_embed_dim = best_params['max_embed_dim']
n_layers = best_params['n_layers']
inner_act = 'relu'
output_act = 'relu'
dropout_frac = best_params['dropout_frac']
base_node_count = best_params['base_node_count']
batch_size = best_params['batch_size']
epochs = best_params['epochs']

# Define the model
def build_model(input_dims):
    """Build and compile the embedding + dense regression network.

    Reads the module-level `cat_types`, `df`, `max_embed_dim`, `n_layers`,
    `inner_act`, `output_act`, `dropout_frac` and `base_node_count`.

    Args:
        input_dims: total number of feature columns (categorical + numerical).
            The numerical input width is `input_dims - len(cat_types)`.

    Returns:
        A compiled Keras `Model` (Adam optimiser, mean-squared-error loss)
        whose inputs are one single-integer input per entry of `cat_types`,
        in that order, followed by one input holding the numerical features.
    """
    inputs = []
    embeddings = []

    # Embedding layers for categorical features
    for col in cat_types:
        input_dim = df[col].nunique()
        # `input_dim // 2` is 0 for a column with a single value, which is not
        # a usable embedding width, so never go below 1.
        embed_dim = max(1, min(max_embed_dim, input_dim // 2))
        input_layer = Input(shape=(1,))
        embedding_layer = Embedding(input_dim=input_dim, output_dim=embed_dim, input_length=1)(input_layer)
        embedding_layer = Flatten()(embedding_layer)
        inputs.append(input_layer)
        embeddings.append(embedding_layer)

    # Concatenate embeddings with numerical features. The width comes from the
    # argument rather than from a global frame, so the function only depends
    # on what the caller passes in.
    numerical_inputs = Input(shape=(input_dims - len(cat_types),))
    inputs.append(numerical_inputs)
    concatenated = Concatenate()(embeddings + [numerical_inputs])

    # Dense stack: widths halve from base_node_count * 2**(n_layers-1) down to
    # base_node_count, with dropout before every layer except the first.
    x = Dense(base_node_count * (2 ** (n_layers - 1)), activation=inner_act)(concatenated)
    for i in range(1, n_layers):
        x = Dropout(dropout_frac)(x)
        x = Dense(base_node_count * (2 ** (n_layers - i - 1)), activation=inner_act)(x)
    output = Dense(1, activation=output_act)(x)

    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Instantiate the network for the full feature width
model = build_model(X_train.shape[1])

# Keras expects one array per categorical column (in `cat_types` order),
# followed by a single block holding all remaining columns.
categorical_arrays = [X_train[col] for col in cat_types]
numeric_block = X_train.drop(columns=cat_types)
train_inputs = categorical_arrays + [numeric_block]

# Fit silently; the returned History object is the cell's displayed value
model.fit(train_inputs, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)


<keras.src.callbacks.History at 0x7f6dbb9dab50>

In [16]:
# Build the model inputs in a separate frame: `drop` returns a new DataFrame,
# so `dt` keeps the loaded test data and this cell can be re-run safely.
dt_features = dt.drop(columns=['transmission', 'full_name', 'brand_model'])
ids = dt_features.pop('id')

# Apply the encoders fitted on the training set to the test categories
for col in cat_types:
    dt_features[col] = label_encoders[col].transform(dt_features[col])

# Scale exactly the columns the scaler was fitted on, in the fitted order, so
# the numerical block lines up with what the model saw during training.
numeric_cols = list(scaler.feature_names_in_)
dt_features[numeric_cols] = scaler.transform(dt_features[numeric_cols])

# One array per categorical column (in `cat_types` order), then the numerical block
pred_inputs = [dt_features[col] for col in cat_types] + [dt_features[numeric_cols]]
pred = model.predict(pred_inputs)





In [17]:
# Flatten the predictions to one value per row and pair them with the test ids
flat_pred = pred.reshape(len(pred))
submission_columns = {'id': ids, 'price': flat_pred}
submission_df = pd.DataFrame(submission_columns)

# Write the submission file without the DataFrame index
submission_df.to_csv('submission23.csv', index=False)