In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('./train.csv')
test = pd.read_csv('./test.csv')

In [3]:
def miss_values_info(df):
    # Assuming your DataFrame is named 'train'
    # Calculate the percentage of missing values for each column
    missing_percentage = (df.isnull().sum() / len(df)) * 100

    # Create a DataFrame to store the missing value information
    missing_info = pd.DataFrame({
        'Column': df.columns,
        'Missing Values': df.isnull().sum(),
        'Percentage': missing_percentage
    })

    # Filter the DataFrame to include only columns with missing values
    missing_info = missing_info[missing_info['Missing Values'] > 0]
    
    # Sort the DataFrame by the percentage of missing values in descending order
    missing_info = missing_info.sort_values(by='Percentage', ascending=False)

    # Display the columns with the most missing values
    return missing_info

In [4]:
train_missing = miss_values_info(train)
print('train missing info: \n', train_missing)

train missing info: 
                     Column  Missing Values  Percentage
PoolQC              PoolQC            1453   99.520548
MiscFeature    MiscFeature            1406   96.301370
Alley                Alley            1369   93.767123
Fence                Fence            1179   80.753425
FireplaceQu    FireplaceQu             690   47.260274
LotFrontage    LotFrontage             259   17.739726
GarageType      GarageType              81    5.547945
GarageYrBlt    GarageYrBlt              81    5.547945
GarageFinish  GarageFinish              81    5.547945
GarageQual      GarageQual              81    5.547945
GarageCond      GarageCond              81    5.547945
BsmtExposure  BsmtExposure              38    2.602740
BsmtFinType2  BsmtFinType2              38    2.602740
BsmtFinType1  BsmtFinType1              37    2.534247
BsmtCond          BsmtCond              37    2.534247
BsmtQual          BsmtQual              37    2.534247
MasVnrArea      MasVnrArea               8 

In [5]:
train.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=0, inplace=True)
test.drop(columns=['PoolQC', 'MiscFeature', 'Alley', 'Fence', 'FireplaceQu'], axis=0, inplace=True)

In [6]:
from sklearn.impute import SimpleImputer

def impute_missing_values(df):
    # List of float type columns with missing values
    float_columns = df.select_dtypes(include=['float64']).columns.tolist()

    # List of object type columns with missing values
    object_columns = df.select_dtypes(include=['object']).columns.tolist()

    # Impute missing values for float type columns with the median
    float_imputer = SimpleImputer(strategy='median')
    df[float_columns] = float_imputer.fit_transform(df[float_columns])

    # Impute missing values for object type columns with the most frequent category ('mode')
    object_imputer = SimpleImputer(strategy='most_frequent')
    df[object_columns] = object_imputer.fit_transform(df[object_columns])

    return df

In [7]:
train = impute_missing_values(train)
test = impute_missing_values(test)

In [8]:
from sklearn.preprocessing import LabelEncoder

def encode_categorical_columns(df):
    # Identify categorical columns
    categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Initialize a LabelEncoder
    label_encoder = LabelEncoder()
    
    # List of object type columns with missing values
    object_columns = df.select_dtypes(include=['object']).columns.tolist()
    
    # Encode each object column
    for column in object_columns:
        df[column] = label_encoder.fit_transform(df[column])
    
    return df

In [9]:
encoded_train = encode_categorical_columns(train)
encoded_test = encode_categorical_columns(test)

In [10]:
# Specify the correlation threshold
correlation_threshold = -0.01

# Create a list of columns with high correlation to 'SalePrice' in the training set
high_corr_columns_train = encoded_train.columns[encoded_train.corr()['SalePrice'] > correlation_threshold]

# Filter the training set to include only the selected columns
filtered_encoded_train = encoded_train[high_corr_columns_train.drop('SalePrice', errors='ignore')]

# Filter the test set to include only the selected columns
filtered_encoded_test = encoded_test[high_corr_columns_train.drop('SalePrice', errors='ignore')]


In [11]:
encoded_trains = encoded_train.drop(columns=['SalePrice'], axis=0)

In [12]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()

# Fit and transform on the training data
train_scaled = scaler.fit_transform(encoded_trains)

# Transform the test data using the same scaler
test_scaled = scaler.transform(encoded_test)

In [13]:
y_trains = encoded_train['SalePrice']

In [14]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(train_scaled, y_trains, test_size=0.2, random_state=42)

# optuna

In [15]:
import optuna
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.model_selection import train_test_split

# Define a function to create and compile the deep learning model
def create_model(trial):
    # Define hyperparameters to optimize
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 1e-2, log=True)
    num_hidden_layers = trial.suggest_int("num_hidden_layers", 1, 5)
    num_units = trial.suggest_int("num_units", 32, 512, log=True)
    dropout_rate = trial.suggest_float("dropout_rate", 0.0, 0.5)

    # Create a Sequential model
    model = keras.Sequential()

    # Input layer
    model.add(keras.layers.Input(shape=(input_dim,)))

    # Add hidden layers
    for _ in range(num_hidden_layers):
        model.add(keras.layers.Dense(num_units, activation="relu"))
        model.add(keras.layers.Dropout(dropout_rate))

    # Output layer
    model.add(keras.layers.Dense(1, activation="linear"))

    # Compile the model
    model.compile(
        optimizer=keras.optimizers.legacy.Adam(learning_rate),
        loss="mean_squared_error",  # You can change the loss function as needed
        metrics=["mean_absolute_error"],
    )

    return model

In [16]:
# Define a function to optimize the model using Optuna
def objective(trial):
    # Create the model
    model = create_model(trial)

    # Train the model
    model.fit(
        X_train,
        y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        verbose=0,
        validation_data=(X_val, y_val),
    )

    # Evaluate the model on the validation set
    score = model.evaluate(X_val, y_val, verbose=0)

    # Return the negative mean absolute error as the objective to minimize
    return -score[1]


In [17]:
# Define the number of epochs and batch size
num_epochs = 50
batch_size = 32

In [18]:
# Get the input dimension
input_dim = X_train.shape[1]

In [19]:
# Create a study and optimize the model
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=100)

[I 2023-10-08 12:55:19,398] A new study created in memory with name: no-name-effe5512-cd91-45f2-a124-746c6f622d0a
2023-10-08 12:55:19.542284: W tensorflow/tsl/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz
[I 2023-10-08 12:55:30,725] Trial 0 finished with value: -178792.21875 and parameters: {'learning_rate': 0.000760894942184973, 'num_hidden_layers': 1, 'num_units': 150, 'dropout_rate': 0.17890882747417575}. Best is trial 0 with value: -178792.21875.
[I 2023-10-08 12:55:41,791] Trial 1 finished with value: -178839.4375 and parameters: {'learning_rate': 4.18887020922146e-05, 'num_hidden_layers': 1, 'num_units': 36, 'dropout_rate': 0.381878009841682}. Best is trial 1 with value: -178839.4375.
[I 2023-10-08 12:55:58,595] Trial 2 finished with value: -182202.328125 and parameters: {'learning_rate': 0.0008085752689945035, 'num_hidden_layers': 4, 'num_units': 216, 'dropout_rate': 0.4745177401391444}. Best is trial 2 with value: -182202.328125.
[I 2023-10-08 12:56

[I 2023-10-08 13:05:02,014] Trial 33 finished with value: -1420374.0 and parameters: {'learning_rate': 0.00748337781225453, 'num_hidden_layers': 5, 'num_units': 110, 'dropout_rate': 0.13321975299046535}. Best is trial 12 with value: -30379856.0.
[I 2023-10-08 13:05:19,527] Trial 34 finished with value: -3359887.0 and parameters: {'learning_rate': 0.002849529022095809, 'num_hidden_layers': 4, 'num_units': 91, 'dropout_rate': 0.09585588936930857}. Best is trial 12 with value: -30379856.0.
[I 2023-10-08 13:05:39,673] Trial 35 finished with value: -8156718.5 and parameters: {'learning_rate': 0.000996773072665463, 'num_hidden_layers': 5, 'num_units': 150, 'dropout_rate': 0.2100128631121636}. Best is trial 12 with value: -30379856.0.
[I 2023-10-08 13:06:00,301] Trial 36 finished with value: -3056856.0 and parameters: {'learning_rate': 0.00176888127287974, 'num_hidden_layers': 5, 'num_units': 73, 'dropout_rate': 0.2605044265731341}. Best is trial 12 with value: -30379856.0.
[I 2023-10-08 13:0

[I 2023-10-08 13:16:15,736] Trial 67 finished with value: -4886988.5 and parameters: {'learning_rate': 0.003986284924039692, 'num_hidden_layers': 4, 'num_units': 86, 'dropout_rate': 0.2684267102270502}. Best is trial 45 with value: -47572852.0.
[I 2023-10-08 13:16:38,564] Trial 68 finished with value: -7928075.5 and parameters: {'learning_rate': 0.005555190341507684, 'num_hidden_layers': 5, 'num_units': 176, 'dropout_rate': 0.22795706724055764}. Best is trial 45 with value: -47572852.0.
[I 2023-10-08 13:16:58,737] Trial 69 finished with value: -1573340.375 and parameters: {'learning_rate': 0.0015694172527248725, 'num_hidden_layers': 5, 'num_units': 120, 'dropout_rate': 0.27772741155097797}. Best is trial 45 with value: -47572852.0.
[I 2023-10-08 13:17:16,089] Trial 70 finished with value: -38044.28125 and parameters: {'learning_rate': 0.007613525062234885, 'num_hidden_layers': 3, 'num_units': 148, 'dropout_rate': 0.20570604038477502}. Best is trial 45 with value: -47572852.0.
[I 2023-1

In [20]:
# Print the best parameters and their value
best_params = study.best_params
best_value = study.best_value
print(f"Best Parameters: {best_params}")
print(f"Best Value: {best_value}")

Best Parameters: {'learning_rate': 0.0023382716942837627, 'num_hidden_layers': 5, 'num_units': 122, 'dropout_rate': 0.3469344226855262}
Best Value: -47572852.0


In [None]:
# Evaluate the final model on the test set
test_loss, test_accuracy = final_model.evaluate(X_test, y_test, verbose=1)
print(f"Test Loss: {test_loss}")
print(f"Test Mean Absolute Error: {test_accuracy}")

In [21]:
# Create a DataFrame for submission
submission_df = pd.DataFrame({'Id': test['Id'], 'SalePrice': y_pred})

# Save the submission DataFrame to a CSV file
submission_df.to_csv('sample_submission.csv', index=False)

NameError: name 'y_pred' is not defined