In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from sklearn import linear_model
from sklearn.preprocessing import LabelEncoder, StandardScaler
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten, Concatenate, Input, Dropout
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Model

  from .autonotebook import tqdm as notebook_tqdm
2024-09-29 20:32:56.078130: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-29 20:32:56.124089: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-09-29 20:32:56.124130: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-09-29 20:32:56.124162: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-09-29 20:32:56.132636: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2024-09-29 20:32:56.134941: I tensorflow

In [2]:
#We first build a linear regression model on the numeric features to also use in our blended model
cat_types = ["model", "brand", "ext_col", "int_col", "accident", 
             "clean_title", "body_style",
             'engine','fuel_type']
df = pd.read_csv('train_processed.csv')
df['miles_per_year'] = df['milage']
df['miles_per_year'] = df.apply(lambda x: x['miles_per_year'] / (x['age']), axis=1)
df = df.astype({col: "category" for col in cat_types})
df_0, df_1 = train_test_split(df, test_size = 0.2)

dt = pd.read_csv('test_processed.csv')
dt['miles_per_year'] = dt['milage']
dt['miles_per_year'] = dt.apply(lambda x: x['miles_per_year'] / (x['age']), axis=1)
dt = dt.astype({col: "category" for col in cat_types})
drop_cols_predict = ['transmission', 'full_name', 'brand_model']
dt.drop(drop_cols_predict, axis = 1, inplace = True)

nn_df = df.copy()
nn_df_0 = df_0.copy()
nn_df_1 = df_1.copy()
nn_dt = dt.copy().drop(['id'], axis = 1)
for cat in cat_types:
    value_counts = nn_df[cat].value_counts().to_dict()
    nn_df[cat] = nn_df[cat].apply(lambda x: x if (value_counts[x] > 50) else "unknown")
#    value_counts = nn_df_0[cat].value_counts().to_dict()
    nn_df_0[cat] = nn_df_0[cat].apply(lambda x: x if (value_counts[x] > 50) else "unknown")
    nn_df_1[cat] = nn_df_1[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > 50) else "unknown")
    nn_dt[cat] = nn_dt[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > 50) else "unknown")

level0_train_predictions = []
level0_test_predictions = []

# NN 1:

In [3]:
# Encode categorical features
label_encoders = {}
for col in cat_types:
    le = LabelEncoder()
    nn_df[col] = le.fit_transform(nn_df[col])
    nn_df_0[col] = le.transform(nn_df_0[col])
    label_encoders[col] = le
    nn_df_1[col] = le.transform(nn_df_1[col])
    nn_dt[col] = le.transform(nn_dt[col])


X_full = nn_df.drop(columns=['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model', 'price'])
y_full = nn_df['price']
X = nn_df_0.drop(columns=['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model', 'price'])
y = nn_df_0['price']

nn_df_1.drop(columns=['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model', 'price'], inplace = True)

# Normalize numerical features
scaler = StandardScaler()
X[[col for col in X.columns if col not in cat_types]] = scaler.fit_transform(X[[col for col in X.columns if col not in cat_types]])
X_full[[col for col in X_full.columns if col not in cat_types]] = scaler.transform(X_full[[col for col in X_full.columns if col not in cat_types]])
nn_df_1[[col for col in nn_df_1.columns if col not in cat_types]] = scaler.transform(nn_df_1[[col for col in nn_df_1.columns if col not in cat_types]])
nn_dt[[col for col in nn_dt.columns if col not in cat_types]] = scaler.transform(nn_dt[[col for col in nn_dt.columns if col not in cat_types]])

# Define the model
def build_model(input_dims):
    inputs = []
    embeddings = []
    
    # Embedding layers for categorical features
    for col in cat_types:
        input_dim = df[col].nunique()
        embed_dim = min(50, input_dim // 2)
        input_layer = Input(shape=(1,))
        embedding_layer = Embedding(input_dim=input_dim, output_dim=embed_dim, input_length=1)(input_layer)
        embedding_layer = Flatten()(embedding_layer)
        inputs.append(input_layer)
        embeddings.append(embedding_layer)
    
    # Concatenate embeddings with numerical features
    numerical_inputs = Input(shape=(X.shape[1] - len(cat_types),))
    inputs.append(numerical_inputs)
    concatenated = Concatenate()(embeddings + [numerical_inputs])
    
    # Dense layers
    x = Dense(128, activation='relu')(concatenated)
    x = Dropout(0.5)(x)
    x = Dense(64, activation='relu')(x)
    x = Dropout(0.5)(x)
    x = Dense(32, activation='relu')(x)
    output = Dense(1)(x)
    
    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Build and train the model
model = build_model(X.shape[1])
#plot_model(model, show_shapes=True, show_layer_names=True)

# Prepare inputs for the model
train_inputs = [X[col] for col in cat_types] + [X.drop(columns=cat_types)]

model.fit(train_inputs, y, epochs=10, batch_size=32, validation_split=0.2)


level0_train_pred_inputs = [nn_df_1[col] for col in cat_types] + [nn_df_1.drop(columns=cat_types)]
preds = model.predict(level0_train_pred_inputs)
level0_train_predictions.append(preds.reshape(len(preds),))

level0_test_pred_inputs = [nn_dt[col] for col in cat_types] + [nn_dt.drop(columns=cat_types)]
preds = model.predict(level0_test_pred_inputs)
level0_test_predictions.append(preds.reshape(len(preds),))




Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


# NN2:

In [4]:
best_params = {'max_embed_dim': 92, 'n_layers': 5, 'dropout_frac': 0.15612330572989397, 'base_node_count': 21, 'batch_size': 189, 'epochs': 5}

possible_activations = ['relu', 'linear', 'tanh']
max_embed_dim = best_params['max_embed_dim']
n_layers = best_params['n_layers']
inner_act = 'relu'
output_act = 'relu'
dropout_frac = best_params['dropout_frac']
base_node_count = best_params['base_node_count']
batch_size = best_params['batch_size']
epochs = best_params['epochs']

# Define the model
def build_model(input_dims):
    inputs = []
    embeddings = []
    
    # Embedding layers for categorical features
    for col in cat_types:
        input_dim = df[col].nunique()
        embed_dim = min(max_embed_dim, input_dim // 2)
        input_layer = Input(shape=(1,))
        embedding_layer = Embedding(input_dim=input_dim, output_dim=embed_dim, input_length=1)(input_layer)
        embedding_layer = Flatten()(embedding_layer)
        inputs.append(input_layer)
        embeddings.append(embedding_layer)
    
    # Concatenate embeddings with numerical features
    numerical_inputs = Input(shape=(input_dims - len(cat_types),))
    inputs.append(numerical_inputs)
    concatenated = Concatenate()(embeddings + [numerical_inputs])
    
    # Dense layers
    x = Dense(base_node_count * (2**(n_layers-1)), activation=inner_act)(concatenated)
    for i in range(1,n_layers):
        x = Dropout(dropout_frac)(x)
        x = Dense(base_node_count * (2**(n_layers-i-1)), activation=inner_act)(x)
    output = Dense(1, activation = output_act)(x)
    
    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Build and train the model
model = build_model(X.shape[1])
#plot_model(model, show_shapes=True, show_layer_names=True)

# Prepare inputs for the model
train_inputs = [X[col] for col in cat_types] + [X.drop(columns=cat_types)]

model.fit(train_inputs, y, epochs=epochs, batch_size=batch_size, validation_split=0.2)


level0_train_pred_inputs = [nn_df_1[col] for col in cat_types] + [nn_df_1.drop(columns=cat_types)]
preds = model.predict(level0_train_pred_inputs)
level0_train_predictions.append(preds.reshape(len(preds),))




# Build and train the model
model = build_model(X_full.shape[1])
#plot_model(model, show_shapes=True, show_layer_names=True)

# Prepare inputs for the model
train_inputs = [X_full[col] for col in cat_types] + [X_full.drop(columns=cat_types)]

model.fit(train_inputs, y_full, epochs=epochs, batch_size=batch_size, validation_split=0.2)

level0_test_pred_inputs = [nn_dt[col] for col in cat_types] + [nn_dt.drop(columns=cat_types)]
preds = model.predict(level0_test_pred_inputs)
level0_test_predictions.append(preds.reshape(len(preds),))


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


# Linear:

In [5]:
drop_cols_lin = cat_types+['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model', 'model_year']
y_lin = df_0['price']
y_lin_full = df['price']
X_lin = df_0.drop(drop_cols_lin+['price'], axis=1)
X_lin_p = df_1.drop(drop_cols_lin+['price'], axis=1)
X_lin_full = df.drop(drop_cols_lin+['price'], axis=1)
X_lin_test = dt.drop(cat_types+['id', 'model_year'], axis=1)

In [6]:
lin_model = linear_model.ElasticNetCV(cv = 5).fit(X_lin, y_lin)
level0_train_predictions.append(lin_model.predict(X_lin_p))
lin_model = linear_model.ElasticNetCV(cv = 5).fit(X_lin_full, y_lin_full)
level0_test_predictions.append(lin_model.predict(X_lin_test))

# XGB:

In [7]:
model_params = [{'price_threshold': 5000000, 'assume_accident': 0, 'n_estimators': 994, 'eta': 0.015373037895620294, 'max_depth': 5, 'min_child_weight': 0.11357559673815384, 'subsample': 0.9793735367721236, 'colsample_bytree': 0.3377000630669105, 'lambda': 43.146286704054816, 'alpha': 46.88655118854743, 'model_threshold': 573, 'ext_col_threshold': 454, 'accident_threshold': 94, 'clean_title_threshold': 1, 'body_style_threshold': 485, 'engine_threshold': 56, 'fuel_type_threshold': 741, 'int_col_threshold': 886, 'brand_threshold': 909, 'include_brand': 1, 'include_model': 1, 'include_model_year': 1, 'include_milage': 0, 'include_fuel_type': 1, 'include_engine': 1, 'include_ext_col': 1, 'include_int_col': 1, 'include_accident': 1, 'include_clean_title': 1, 'include_body_style': 1, 'include_msrp': 1, 'include_age': 1, 'include_reliability': 1, 'include_adjusted_msrp': 1, 'include_miles_per_year': 1},
               {'price_threshold': 5000000, 'assume_accident': 1, 'n_estimators': 370, 'eta': 0.01327575529437083, 'max_depth': 4, 'min_child_weight': 3.302657962693619, 'subsample': 0.6789548638913294, 'colsample_bytree': 0.5484656318518291, 'lambda': 0.8252729975590364, 'alpha': 0.2899140082776656, 'model_threshold': 1994, 'ext_col_threshold': 926, 'accident_threshold': 404, 'clean_title_threshold': 68, 'body_style_threshold': 1835, 'engine_threshold': 102, 'fuel_type_threshold': 750, 'int_col_threshold': 126, 'brand_threshold': 1412, 'include_brand': 0, 'include_model': 1, 'include_model_year': 1, 'include_milage': 1, 'include_fuel_type': 0, 'include_engine': 1, 'include_ext_col': 1, 'include_int_col': 0, 'include_accident': 0, 'include_clean_title': 0, 'include_body_style': 1, 'include_msrp': 1, 'include_age': 0, 'include_reliability': 1, 'include_adjusted_msrp': 1, 'include_miles_per_year': 1},
               {'price_threshold': 2275422, 'assume_accident': 1, 'n_estimators': 1100, 'eta': 0.005259986136288139, 'max_depth': 6, 'min_child_weight': 0.00037407860076230114, 'subsample': 0.7842825629675444, 'colsample_bytree': 0.9815915284682647, 'lambda': 6.971957139818525, 'alpha': 0.00014259423464146244, 'model_threshold': 1719, 'ext_col_threshold': 58, 'accident_threshold': 1585, 'clean_title_threshold': 279, 'body_style_threshold': 322, 'engine_threshold': 1186, 'fuel_type_threshold': 243, 'int_col_threshold': 241, 'brand_threshold': 408, 'include_brand': 1, 'include_model': 1, 'include_model_year': 0, 'include_milage': 1, 'include_fuel_type': 1, 'include_engine': 0, 'include_ext_col': 1, 'include_int_col': 0, 'include_accident': 1, 'include_clean_title': 1, 'include_body_style': 1, 'include_msrp': 0, 'include_age': 1, 'include_reliability': 0, 'include_adjusted_msrp': 1, 'include_miles_per_year': 0},
               {'price_threshold': 2119121, 'assume_accident': 1, 'n_estimators': 467, 'eta': 0.03399717470810654, 'max_depth': 5, 'min_child_weight': 37.8243422300728, 'subsample': 0.9081531213002817, 'colsample_bytree': 0.1449739942646256, 'lambda': 947.7072608084693, 'alpha': 0.0008681051222483094, 'model_threshold': 1980, 'ext_col_threshold': 443, 'accident_threshold': 1815, 'clean_title_threshold': 701, 'body_style_threshold': 1132, 'engine_threshold': 221, 'fuel_type_threshold': 1441, 'int_col_threshold': 326, 'brand_threshold': 1904, 'include_brand': 0, 'include_model': 0, 'include_model_year': 0, 'include_milage': 1, 'include_fuel_type': 0, 'include_engine': 1, 'include_ext_col': 0, 'include_int_col': 0, 'include_accident': 0, 'include_clean_title': 0, 'include_body_style': 1, 'include_msrp': 1, 'include_age': 0, 'include_reliability': 0, 'include_adjusted_msrp': 0, 'include_miles_per_year': 0},
               {'price_threshold': 1194471, 'assume_accident': 0, 'n_estimators': 951, 'eta': 0.013253672942340268, 'max_depth': 6, 'min_child_weight': 38.26739593627571, 'subsample': 0.20646752099818685, 'colsample_bytree': 0.787577422310926, 'lambda': 0.30415017474541467, 'alpha': 0.00030546530869858226, 'model_threshold': 627, 'ext_col_threshold': 1679, 'accident_threshold': 62, 'clean_title_threshold': 917, 'body_style_threshold': 542, 'engine_threshold': 49, 'fuel_type_threshold': 1492, 'int_col_threshold': 1832, 'brand_threshold': 113, 'include_brand': 1, 'include_model': 1, 'include_model_year': 0, 'include_milage': 0, 'include_fuel_type': 1, 'include_engine': 1, 'include_ext_col': 1, 'include_int_col': 1, 'include_accident': 1, 'include_clean_title': 0, 'include_body_style': 1, 'include_msrp': 1, 'include_age': 1, 'include_reliability': 0, 'include_adjusted_msrp': 0, 'include_miles_per_year': 1}]

drop_cols = ['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model']
y = df_0['price']
X = df_0.drop(['price']+drop_cols, axis=1)


In [8]:
for params in model_params:
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params['device'] = 'cpu'
    not_included = [col for col in X.columns if params[f'include_{col}']==0]
    
    y_t = y.copy()
    X_t = X.copy()
    X_p = df_1.drop(['price']+drop_cols, axis = 1)
    X_pred = dt.drop(['id'], axis = 1)
    

    y_t = y_t[y<params['price_threshold']]
    X_t = X_t[y<params['price_threshold']] 

    if params['assume_accident']==1: 
        X_t.loc[X['accident'] == 'unknown', 'accident'] = 'atleast1accidentordamagereported'
        X_p.loc[df_1['accident'] == 'unknown', 'accident'] = 'atleast1accidentordamagereported'
        X_pred.loc[dt['accident'] == 'unknown', 'accident'] = 'atleast1accidentordamagereported'
    
    for cat in cat_types:
        value_counts = X_t[cat].value_counts().to_dict()
        X_t[cat] = X_t[cat].apply(lambda x: x if value_counts[x] > params[f'{cat}_threshold'] else "unknown")
        X_p[cat] = X_p[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > params[f'{cat}_threshold']) else "unknown")
        X_pred[cat] = X_pred[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > params[f'{cat}_threshold']) else "unknown")
    
    X_t = X_t.astype({col: "category" for col in cat_types})
    X_p = X_p.astype({col: "category" for col in cat_types})
    X_pred = X_pred.astype({col: "category" for col in cat_types})
    
    X_t.drop(not_included, axis = 1, inplace = True)
    X_p.drop(not_included, axis = 1, inplace = True)
    X_pred.drop(not_included, axis = 1, inplace = True)
    

    dtrain = xgb.DMatrix(X_t, label=y_t, enable_categorical=True)
    model = xgb.train(params, dtrain, num_boost_round=params['n_estimators'])
    dvalid = xgb.DMatrix(X_p, enable_categorical=True)  
    dpred = xgb.DMatrix(X_pred, enable_categorical=True) 
    
    level0_train_predictions.append(model.predict(dvalid))
    #level0_test_predictions.append(model.predict(dpred))

Parameters: { "accident_threshold", "assume_accident", "body_style_threshold", "brand_threshold", "clean_title_threshold", "engine_threshold", "ext_col_threshold", "fuel_type_threshold", "include_accident", "include_adjusted_msrp", "include_age", "include_body_style", "include_brand", "include_clean_title", "include_engine", "include_ext_col", "include_fuel_type", "include_int_col", "include_milage", "include_miles_per_year", "include_model", "include_model_year", "include_msrp", "include_reliability", "int_col_threshold", "model_threshold", "n_estimators", "price_threshold" } are not used.

Parameters: { "accident_threshold", "assume_accident", "body_style_threshold", "brand_threshold", "clean_title_threshold", "engine_threshold", "ext_col_threshold", "fuel_type_threshold", "include_accident", "include_adjusted_msrp", "include_age", "include_body_style", "include_brand", "include_clean_title", "include_engine", "include_ext_col", "include_fuel_type", "include_int_col", "include_milage

In [9]:
drop_cols = ['id', 'price_diff', 'adjusted_price_diff', 'transmission', 'full_name', 'brand_model']
y = df['price']
X = df.drop(['price']+drop_cols, axis=1)

In [10]:
for params in model_params:
    params['objective'] = 'reg:squarederror'
    params['eval_metric'] = 'rmse'
    params['device'] = 'cpu'
    not_included = [col for col in X.columns if params[f'include_{col}']==0]
    
    y_t = y.copy()
    X_t = X.copy()
    X_p = dt.drop(['id'], axis = 1)

    y_t = y_t[y<params['price_threshold']]
    X_t = X_t[y<params['price_threshold']] 

    if params['assume_accident']==1: 
        X_t.loc[X['accident'] == 'unknown', 'accident'] = 'atleast1accidentordamagereported'
        X_p.loc[dt['accident'] == 'unknown', 'accident'] = 'atleast1accidentordamagereported'

    for cat in cat_types:
        value_counts = X_t[cat].value_counts().to_dict()
        X_t[cat] = X_t[cat].apply(lambda x: x if value_counts[x] > params[f'{cat}_threshold'] else "unknown")
        X_p[cat] = X_p[cat].apply(lambda x: x if (x in value_counts) and (value_counts[x] > params[f'{cat}_threshold']) else "unknown")

    X_t = X_t.astype({col: "category" for col in cat_types})
    X_p = X_p.astype({col: "category" for col in cat_types})

    X_t.drop(not_included, axis = 1, inplace = True)
    X_p.drop(not_included, axis = 1, inplace = True)

    dtrain = xgb.DMatrix(X_t, label=y_t, enable_categorical=True)
    model = xgb.train(params, dtrain, num_boost_round=params['n_estimators'])
    dpred = xgb.DMatrix(X_p, enable_categorical=True)  
    level0_test_predictions.append(model.predict(dpred))

Parameters: { "accident_threshold", "assume_accident", "body_style_threshold", "brand_threshold", "clean_title_threshold", "engine_threshold", "ext_col_threshold", "fuel_type_threshold", "include_accident", "include_adjusted_msrp", "include_age", "include_body_style", "include_brand", "include_clean_title", "include_engine", "include_ext_col", "include_fuel_type", "include_int_col", "include_milage", "include_miles_per_year", "include_model", "include_model_year", "include_msrp", "include_reliability", "int_col_threshold", "model_threshold", "n_estimators", "price_threshold" } are not used.

Parameters: { "accident_threshold", "assume_accident", "body_style_threshold", "brand_threshold", "clean_title_threshold", "engine_threshold", "ext_col_threshold", "fuel_type_threshold", "include_accident", "include_adjusted_msrp", "include_age", "include_body_style", "include_brand", "include_clean_title", "include_engine", "include_ext_col", "include_fuel_type", "include_int_col", "include_milage

In [11]:
level0_train_predictions

[array([51853.33 , 30851.68 , 36999.582, ..., 60154.21 , 49304.875,
        52738.5  ], dtype=float32),
 array([59836.99 , 34318.977, 41606.395, ..., 66815.89 , 52650.69 ,
        60566.715], dtype=float32),
 array([70488.88564067, 42871.41687713, 41959.45730106, ...,
        54803.3275174 , 63556.24666491, 62829.59271266]),
 array([59063.363, 30880.021, 39554.668, ..., 59275.562, 50894.84 ,
        48755.2  ], dtype=float32),
 array([54817.14 , 31721.338, 37856.047, ..., 60707.336, 49623.5  ,
        51149.42 ], dtype=float32),
 array([56589.93 , 32965.457, 36599.727, ..., 57415.504, 50487.69 ,
        54735.477], dtype=float32),
 array([55791.742, 39165.805, 40095.69 , ..., 59040.707, 53089.926,
        52456.67 ], dtype=float32),
 array([58548.76 , 29955.113, 37802.312, ..., 52855.56 , 49313.848,
        49810.734], dtype=float32)]

In [12]:
level1_train = np.hstack([y.reshape(len(y),1) for y in level0_train_predictions])

In [13]:
level1_test = np.hstack([y.reshape(len(y),1) for y in level0_test_predictions])

In [14]:
"""def objective(trial):
    X_train, X_test, y_train, y_test = train_test_split(level1_train, df_1['price'], test_size=0.2, random_state=42)

    # Define hyperparameters to be tuned
    possible_activations = ['relu', 'linear', 'tanh']
    n_layers = 6 #number of dense layers
    inner_act = 'relu'
    output_act = 'relu'
    dropout_frac = 0.1
    base_node_count = 8
    batch_size = 128#trial.suggest_int('batch_size', 8, 128, log = True)
    epochs = trial.suggest_int('epochs', 5, 15)

    def build_level1_model(input_dims):
        inputs = Input(shape=(input_dims,))
        
        # Dense layers
        x = Dense(base_node_count, activation=inner_act)(inputs)
        for i in range(1,n_layers):
            x = Dropout(dropout_frac)(x)
            x = Dense(base_node_count * (2**i), activation=inner_act)(x)
        output = Dense(1, activation = output_act)(x)
        
        model = Model(inputs=inputs, outputs=output)
        model.compile(optimizer='adam', loss='mean_squared_error')
        return model

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Build and train the model
    level1_model = build_level1_model(X_train.shape[1])

    level1_model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=0.2, verbose=0)

    return level1_model.evaluate(X_test, y_test)

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)


best_params = study.best_params
print(f"Best hyperparameters: {best_params}")

print("Parameter importance:\n", optuna.importance.get_param_importances(study))"""

'def objective(trial):\n    X_train, X_test, y_train, y_test = train_test_split(level1_train, df_1[\'price\'], test_size=0.2, random_state=42)\n\n    # Define hyperparameters to be tuned\n    possible_activations = [\'relu\', \'linear\', \'tanh\']\n    n_layers = 6 #number of dense layers\n    inner_act = \'relu\'\n    output_act = \'relu\'\n    dropout_frac = 0.1\n    base_node_count = 8\n    batch_size = 128#trial.suggest_int(\'batch_size\', 8, 128, log = True)\n    epochs = trial.suggest_int(\'epochs\', 5, 15)\n\n    def build_level1_model(input_dims):\n        inputs = Input(shape=(input_dims,))\n        \n        # Dense layers\n        x = Dense(base_node_count, activation=inner_act)(inputs)\n        for i in range(1,n_layers):\n            x = Dropout(dropout_frac)(x)\n            x = Dense(base_node_count * (2**i), activation=inner_act)(x)\n        output = Dense(1, activation = output_act)(x)\n        \n        model = Model(inputs=inputs, outputs=output)\n        model.compil

In [15]:
"""#best_params = {'n_layers': 4, 'dropout_frac': 0.03504995158025597, 'base_node_count': 9, 'batch_size': 35, 'epochs': 10}   # try 1
#best_params = {'n_layers': 5, 'dropout_frac': 0.1, 'base_node_count': 25, 'batch_size': 94, 'epochs': 11}  # optimized
best_params = {'n_layers': 5, 'dropout_frac': 0.1, 'base_node_count': 8, 'batch_size': 128, 'epochs': 10}  # optimized
n_layers = best_params['n_layers']
inner_act = 'relu'
output_act = 'relu'
dropout_frac = best_params['dropout_frac']
base_node_count = best_params['base_node_count']
batch_size = best_params['batch_size']
epochs = best_params['epochs']

def build_level1_model(input_dims):
    inputs = Input(shape=(input_dims,))
    
    # Dense layers
    x = Dense(base_node_count * (2**(n_layers-1)), activation=inner_act)(inputs)
    for i in range(1,n_layers):
        x = Dropout(dropout_frac)(x)
        x = Dense(base_node_count * (2**(n_layers-i-1)), activation=inner_act)(x)
    output = Dense(1, activation = output_act)(x)
    
    model = Model(inputs=inputs, outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error')
    return model

# Scale the data
scaler = StandardScaler()
level1_train = scaler.fit_transform(level1_train)
level1_test = scaler.transform(level1_test)

# Build and train the model
level1_model = build_level1_model(level1_train.shape[1])

level1_model.fit(level1_train, df_1['price'], epochs=epochs, batch_size=batch_size, validation_split=0.2)


"""

"#best_params = {'n_layers': 4, 'dropout_frac': 0.03504995158025597, 'base_node_count': 9, 'batch_size': 35, 'epochs': 10}   # try 1\n#best_params = {'n_layers': 5, 'dropout_frac': 0.1, 'base_node_count': 25, 'batch_size': 94, 'epochs': 11}  # optimized\nbest_params = {'n_layers': 5, 'dropout_frac': 0.1, 'base_node_count': 8, 'batch_size': 128, 'epochs': 10}  # optimized\nn_layers = best_params['n_layers']\ninner_act = 'relu'\noutput_act = 'relu'\ndropout_frac = best_params['dropout_frac']\nbase_node_count = best_params['base_node_count']\nbatch_size = best_params['batch_size']\nepochs = best_params['epochs']\n\ndef build_level1_model(input_dims):\n    inputs = Input(shape=(input_dims,))\n    \n    # Dense layers\n    x = Dense(base_node_count * (2**(n_layers-1)), activation=inner_act)(inputs)\n    for i in range(1,n_layers):\n        x = Dropout(dropout_frac)(x)\n        x = Dense(base_node_count * (2**(n_layers-i-1)), activation=inner_act)(x)\n    output = Dense(1, activation = outpu

In [16]:
level1_model = linear_model.LinearRegression().fit(level1_train, df_1['price'])

In [17]:
predictions = level1_model.predict(level1_test)

In [18]:
ids = dt.pop('id')

In [19]:

submission_df = pd.DataFrame({
    'id': ids,
    'price': predictions.reshape(len(predictions),)
})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submission32.csv', index=False)