In [1]:
import os
import pickle
import numpy as np
import datetime
import json
from multiprocessing import Process
from multiprocess import Process

from tensorflow.keras.utils import plot_model
from tensorflow.keras.applications import VGG16, MobileNetV2, MobileNetV3Small, DenseNet201

from models import *
from utils import *

2024-05-08 15:13:46.638733: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-08 15:13:46.638821: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-08 15:13:46.640210: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-05-08 15:13:46.652582: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.

TensorFlow Addons (TFA) has ended development and in

In [2]:
IMAGE_WIDTH: int = 448
IMAGE_HEIGHT: int = 448


def _split_dataset(df):
    """Split one loaded dataset frame into ``(features, images, prices)``.

    * ``features`` -- ``df`` without the ``image_floorplan`` and ``price`` columns.
    * ``images``   -- whatever ``preprocess_images`` returns for the
      ``image_floorplan`` column, called with ``IMAGE_WIDTH`` / ``IMAGE_HEIGHT``.
    * ``prices``   -- the ``price`` column as a NumPy array (``.values``).
    """
    features = df.drop(columns=["image_floorplan", "price"])
    # NOTE(review): the meaning of the three positional flags (True, False, False)
    # is defined by utils.preprocess_images, which is not visible here -- confirm
    # before changing them.
    images = preprocess_images(
        df, "image_floorplan", IMAGE_WIDTH, IMAGE_HEIGHT, True, False, False
    )
    prices = df["price"].values
    return features, images, prices


# Load Data
train_1_path: str = "../nybolig-scrape/output/train/train_1"
train_2_path: str = "../nybolig-scrape/output/train/train_2"
valid_path: str = "../nybolig-scrape/output/valid"
test_path: str = "../nybolig-scrape/output/test"

train1_df, train2_df, valid_df, test_df = data_to_df(
    [train_1_path, train_2_path, valid_path, test_path], preprocess=True, rm_outliers=True
)

display(train1_df.head())
print("Length of datasets:")
print(f"Train 1: {len(train1_df)}")
print(f"Train 2: {len(train2_df)}")
print(f"Valid: {len(valid_df)}")
print(f"Test: {len(test_df)}")
print(f"Total: {len(train1_df) + len(train2_df) + len(valid_df) + len(test_df)}")


# Every split goes through the same helper so the four datasets cannot drift
# apart through copy-paste edits.
#### Train Set 1 ####
train1_features, train1_images, train1_prices = _split_dataset(train1_df)

#### Train Set 2 ####
train2_features, train2_images, train2_prices = _split_dataset(train2_df)

#### Validation Set ####
valid_features, valid_images, valid_prices = _split_dataset(valid_df)

#### Test Set ####
test_features, test_images, test_prices = _split_dataset(test_df)


# The feature column order is not guaranteed to be the same across the loaded
# datasets, so every split is re-indexed to the column order of train set 1.
train2_features = train2_features[train1_features.columns]
valid_features = valid_features[train1_features.columns]
test_features = test_features[train1_features.columns]


Processing ../nybolig-scrape/output/train/train_1: 100%|██████████| 311/311 [00:00<00:00, 1462363.84it/s]
Processing ../nybolig-scrape/output/train/train_2: 100%|██████████| 312/312 [00:00<00:00, 1641935.82it/s]
Processing ../nybolig-scrape/output/valid: 100%|██████████| 89/89 [00:00<00:00, 789203.08it/s]
Processing ../nybolig-scrape/output/test: 100%|██████████| 178/178 [00:00<00:00, 1287217.43it/s]
Preprocessing: 100%|██████████| 4/4 [00:00<00:00, 186.81it/s]


Removing outliers...
Datapoints before: 829


Removing outliers: 100%|██████████| 4/4 [00:00<00:00, 543.00it/s]


Datapoints after: 813


Unnamed: 0,postal_code,type,price,size,basement_size,rooms,year_built,year_rebuilt,energy_label,postal_avg_sqm_price,lat,lng,image_floorplan
0,46,0,1945000,70,0,2.0,1968.0,1968.0,5,33273.25,56.575356,8.205598,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
1,54,0,2725000,66,0,3.0,1934.0,1934.0,4,33507.5,55.696548,12.500334,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,39,0,3195000,51,0,2.0,1886.0,1886.0,5,51502.5,55.691847,12.559937,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,40,0,3550000,74,0,3.0,2018.0,2018.0,1,44946.75,55.658402,12.594399,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
5,40,0,5295000,120,0,4.0,2018.0,2018.0,1,44946.75,55.658402,12.594399,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."


Length of datasets:
Train 1: 284
Train 2: 286
Valid: 83
Test: 160
Total: 813


In [3]:
def save_model_and_evaluate(
    model: object,
    fit_history: object,
    test_images: np.ndarray,
    test_features: object,
    test_prices: np.ndarray,
    model_dir: str,
    model_type: str
):
    """Persist a trained model, score it on the test set and save the reports.

    Files written below ``model_dir`` (created if missing):

    * ``model`` -- ``model.save(...)`` for ``"CNN"``, a pickle otherwise.
    * ``history`` and ``architecture.png`` -- ``"CNN"`` only.
    * ``evaluation.json`` -- one entry is appended per call.
    * ``median_evaluation.json`` -- medians over all entries of ``evaluation.json``.
    * ``images/`` -- plots produced by the ``save_*`` helpers.

    Args:
        model: Trained model. It is called as ``predict(test_features)`` for
            ``"RF"``, ``predict(test_images)`` for ``"CNN"`` and
            ``predict(test_images, test_features)`` for ``"CNN_RF"`` /
            ``"CNN_AE_RF"``.
        fit_history: Object whose ``.history`` attribute is pickled; only read
            for ``"CNN"``.
        test_images: Test images passed to ``predict`` and the image reports.
        test_features: Test features; ``.columns`` is read for ``"RF"`` to name
            the feature importances.
        test_prices: Ground-truth targets for the test set.
        model_dir: Output directory for this model.
        model_type: One of ``"RF"``, ``"CNN"``, ``"CNN_RF"``, ``"CNN_AE_RF"``.

    Raises:
        ValueError: If ``model_type`` is not one of the supported values. The
            check happens before anything is written to disk.
    """
    supported_types = ("RF", "CNN", "CNN_RF", "CNN_AE_RF")
    if model_type not in supported_types:
        # Previously an unknown type fell through every branch and failed later
        # on the unbound ``test_predictions``.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_types}"
        )

    # Save Model
    print("Saving Model...")
    os.makedirs(model_dir, exist_ok=True)

    if model_type == "CNN":
        model.save(f"{model_dir}/model")
        # Save Training History
        with open(f"{model_dir}/history", "wb") as file_pi:
            pickle.dump(fit_history.history, file_pi)
        test_predictions = model.predict(test_images)
        # Save Model Architecture
        plot_model(
            model,
            to_file=f"{model_dir}/architecture.png",
            show_shapes=True,
            show_layer_names=True,
            show_dtype=True,
            rankdir="TB",
            expand_nested=False,
            dpi=96,
        )
    else:
        with open(f"{model_dir}/model", "wb") as file_pi:
            pickle.dump(model, file_pi)
        if model_type == "RF":
            test_predictions = model.predict(test_features)
        else:  # "CNN_RF" / "CNN_AE_RF" predict from images and features together
            test_predictions = model.predict(test_images, test_features)

    # Evaluate Model
    print("Evaluating Model...")
    r2, mae, percentage_error, mse = regression_stats(test_prices, test_predictions)

    try:
        feature_importance = model.feature_importances_
        if model_type == "RF":
            # For "RF" the importances are positional; pair them with column names.
            feature_importance = dict(zip(test_features.columns, feature_importance))
        print(f"Before sorting: {feature_importance}")
        feature_importance = dict(
            sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
        )
        print(f"After sorting: {feature_importance}")
    except AttributeError:
        # The model has no ``feature_importances_`` (or it is not a mapping).
        print("Cant find feature_importance")
        feature_importance = None

    # Load existing evaluation data
    evaluation_file_path = f"{model_dir}/evaluation.json"
    evaluation_data = {}
    if os.path.exists(evaluation_file_path):
        with open(evaluation_file_path, "r") as json_file:
            evaluation_data = json.load(json_file)

    # Add new evaluation data
    new_evaluation = {
        "Timestamp": str(datetime.datetime.now()),
        "R2": r2,
        "MAE": mae,
        "Percentage Error": percentage_error,
        "MSE": mse,
        "Feature Importances": feature_importance,
    }
    # Keys loaded from JSON are strings, so the new key is a string as well.
    evaluation_data[str(len(evaluation_data))] = new_evaluation

    # Save updated evaluation data
    with open(evaluation_file_path, "w") as json_file:
        json.dump(evaluation_data, json_file, indent=4)

    # Compute median evaluation values from all instances
    median_evaluation_data = {
        metric: float(np.median([entry[metric] for entry in evaluation_data.values()]))
        for metric in ("R2", "MAE", "Percentage Error", "MSE")
    }

    with open(f"{model_dir}/median_evaluation.json", "w") as json_file:
        json.dump(median_evaluation_data, json_file, indent=4)

    print("\nModel Evaluation:")
    print(new_evaluation)
    print("\nMedian Evaluation:")
    print(median_evaluation_data)
    print("Feauter Importance...")
    print(feature_importance)

    # Images (Create or open existing folder)
    img_dir = f"{model_dir}/images"
    os.makedirs(img_dir, exist_ok=True)

    save_expected_predicted(test_prices, test_predictions, img_dir)
    save_residuals(test_prices, test_predictions, img_dir)

    if model_type == "CNN":
        print("\nSaving Best and Worst Image Predictions")
        save_worst_best_predictions(model, test_predictions, test_prices, test_images, img_dir)
    elif feature_importance is not None:
        print("\nSaving Feature Importance")
        save_features_importance(feature_importance, img_dir)
    else:
        # Nothing to plot; avoid handing ``None`` to the plotting helper.
        print("\nSkipping Feature Importance (not available)")

    save_worst_best(test_predictions, test_prices, test_features, model_dir)

    if model_type == "CNN_AE_RF":
        print("\nSaving Reconstructions")
        save_reconstuctions(model, test_predictions, test_prices, test_images, img_dir)

    # Printed last so it really marks the end of all saving work.
    print("\nDone!")


def train_save_model(
    model_func: object,
    args: tuple,
    test_images: np.array,
    test_features: np.array,
    test_prices: np.array,
    model_dir: str,
    use_gpu: bool,
    model_type:str
):
    """Train one model and hand it to ``save_model_and_evaluate``.

    Args:
        model_func: Training callable, invoked as ``model_func(*args)``. For
            ``"CNN"`` it must return ``(model, fit_history)``; for every other
            supported type it returns the model only.
        args: Positional arguments forwarded to ``model_func``.
        test_images: Forwarded to ``save_model_and_evaluate``.
        test_features: Forwarded to ``save_model_and_evaluate``.
        test_prices: Forwarded to ``save_model_and_evaluate``.
        model_dir: Output directory for the trained model and its reports.
        use_gpu: ``True`` calls ``set_gpu()``, ``False`` calls ``set_cpu()``.
        model_type: One of ``"RF"``, ``"CNN"``, ``"CNN_RF"``, ``"CNN_AE_RF"``.

    Raises:
        ValueError: If ``model_type`` is not supported. Raised before the device
            is configured or any training starts.
    """
    supported_types = ("RF", "CNN", "CNN_RF", "CNN_AE_RF")
    if model_type not in supported_types:
        # Previously an unknown type skipped every branch and failed on the
        # unbound ``model`` only after training setup had already run.
        raise ValueError(
            f"Unknown model_type {model_type!r}; expected one of {supported_types}"
        )

    if use_gpu:
        set_gpu()
    else:
        set_cpu()

    trained = model_func(*args)
    if model_type == "CNN":
        # Only the CNN trainer also returns a training history.
        model, fit_history = trained
    else:
        model, fit_history = trained, None

    save_model_and_evaluate(
        model, fit_history, test_images, test_features, test_prices, model_dir, model_type
    )


def train_save_models(
    model_func: object,
    args: tuple,
    test_images: np.array,
    test_prices: np.array,
    model_dir: str,
    use_gpu: bool,
    test_features: object = None,
    model_type: str = "CNN",
):
    """Train several models at once and save/evaluate each one separately.

    ``model_func(*args)`` must return ``(models, fit_histories)``, two parallel
    iterables. Model ``i`` is saved under ``f"{model_dir}_{i}"``.

    Args:
        model_func: Training callable returning ``(models, fit_histories)``.
        args: Positional arguments forwarded to ``model_func``.
        test_images: Forwarded to ``save_model_and_evaluate``.
        test_prices: Forwarded to ``save_model_and_evaluate``.
        model_dir: Prefix of the per-model output directories.
        use_gpu: ``True`` calls ``set_gpu()``, ``False`` calls ``set_cpu()``.
        test_features: Forwarded to ``save_model_and_evaluate``. Defaults to
            ``None`` so existing six-argument calls keep working.
            NOTE(review): whether the downstream helpers accept ``None`` is not
            visible here -- pass the real features when available.
        model_type: Forwarded to ``save_model_and_evaluate``. Defaults to
            ``"CNN"`` because, among the trainers shown in this notebook, only
            the CNN one returns fit histories.
    """
    if use_gpu:
        set_gpu()
    else:
        set_cpu()

    models, fit_histories = model_func(*args)
    for model_idx, (model, fit_history) in enumerate(zip(models, fit_histories)):
        # save_model_and_evaluate takes seven positional arguments; the earlier
        # five-argument call raised TypeError on every invocation.
        save_model_and_evaluate(
            model,
            fit_history,
            test_images,
            test_features,
            test_prices,
            f"{model_dir}_{model_idx}",
            model_type,
        )

# Running on GPU

In [4]:
# Root directory for model artifacts; each run writes to f"{MODELS_PATH}/{MODEL_NAME}".
MODELS_PATH: str = "./models"
# Forwarded to train_save_model: True selects set_gpu(), False selects set_cpu().
USE_GPU: bool = True

In [7]:
# Experiment selection: exactly one of the configurations below should be
# active. Each one defines TYPE (the model_type passed to train_save_model),
# MODEL_NAME (sub-folder under MODELS_PATH), FUNCTION (the training callable)
# and ARGS (positional arguments for FUNCTION).

# Random Forest
# TYPE = "RF"
# MODEL_NAME: str = "RF"
# FUNCTION: object = RF
# ARGS: tuple = (
#     train2_features,
#     train2_prices,
# )


# CNN
# TYPE = "CNN"
# MODEL_NAME: str = "MobileNetV2"
# FUNCTION: object = CNN_model
# ARGS: tuple = (
#     MobileNetV2,
#     train1_images,
#     train1_prices,
#     valid_images,
#     valid_prices,
#     [
#         Flatten(),
#         Dense(512, activation="relu"),
#         BatchNormalization(),
#         Dropout(0.5),
#         Dense(256, activation="relu"),
#         BatchNormalization(),
#         Dropout(0.5),
#         Dense(128, activation="relu"),
#         BatchNormalization(),
#         Dropout(0.5),
#         Dense(1),
#     ],
# )


# CNN + Random Forest
# TYPE = "CNN_RF"
# MODEL_NAME: str = "MobileNetV2_RF"
# FUNCTION: object = CNN_RF_model
# ARGS: tuple = (
#     f"{MODELS_PATH}/MobileNetV2/model",
#     train2_images,
#     train2_features,
#     train2_prices,
# )


# CNN + Autoencoder + Random Forest
# NOTE(review): TYPE is 'CNN_RF' although FUNCTION is CNN_AE_RF_model. With
# 'CNN_RF' the 'CNN_AE_RF'-only step in save_model_and_evaluate (saving
# reconstructions) is skipped -- confirm whether 'CNN_AE_RF' was intended.
TYPE = 'CNN_RF'
MODEL_NAME: str = "VGG16_AE_RF"
# MODEL_NAME: str = "EfficientNetB3_AE_RF"
FUNCTION: object = CNN_AE_RF_model
# The meaning of each positional argument (including the None) is defined by
# CNN_AE_RF_model, which is not visible in this notebook.
ARGS: tuple = (
    f"{MODELS_PATH}/VGG16/model",
    # f"{MODELS_PATH}/EfficientNetB3/model",
    None,
    train2_images,
    train2_features,
    train2_prices,
)

In [8]:
from IPython.display import clear_output

# Number of independent train/evaluate repetitions; every repetition appends one
# entry to the model's evaluation.json.
N = 2

# (run number, exit code) of every child process that did not exit cleanly.
failed_runs = []

for i in range(N):
    clear_output(wait=True)
    print(f"Run {i + 1}/{N}")
    # Each repetition runs in its own child process.
    # NOTE(review): presumably so TensorFlow/GPU state is released between runs -- confirm.
    p = Process(
        target=train_save_model,
        args=(
            FUNCTION,
            ARGS,
            test_images,
            test_features,
            test_prices,
            f"{MODELS_PATH}/{MODEL_NAME}",
            USE_GPU,
            TYPE,
        ),
    )
    p.start()
    p.join()
    # A crashed child only shows up in its exit code; without this check the
    # loop would report success even though nothing was trained or saved.
    if p.exitcode != 0:
        failed_runs.append((i + 1, p.exitcode))

if failed_runs:
    for run_number, exit_code in failed_runs:
        print(f"Run {run_number}/{N} failed with exit code {exit_code}")
else:
    print("All Done!")

Run 2/2


2024-05-08 15:20:37.653181: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.692801: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.693047: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.699768: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.699969: I external/local_xla/xla/stream_executor

1 Physical GPUs, 1 Logical GPU


2024-05-08 15:20:37.926639: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.926866: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.926892: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2022] Could not identify NUMA node of platform GPU id 0, defaulting to 0.  Your kernel may not have been built with NUMA support.
2024-05-08 15:20:37.926967: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-08 15:20:37.927018: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created

Epoch 1/30


2024-05-08 15:20:47.434071: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8902
2024-05-08 15:20:47.685134: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-05-08 15:20:49.008994: I external/local_tsl/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory
2024-05-08 15:20:53.510857: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f1d2058e160 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-05-08 15:20:53.510908: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce GTX 1080, Compute Capability 6.1
2024-05-08 15:20:53.516127: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1715174453.589742  166737 device_compiler.

Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


2024-05-08 15:23:08.958657: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 688816128 exceeds 10% of free system memory.
2024-05-08 15:23:27.536744: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.62GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-05-08 15:23:28.355897: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 3.62GiB with freed_by_count=0. The caller indicates that this is not a failure, but this may mean that there could be performance gains if more memory were available.
2024-05-08 15:23:28.600507: W external/local_tsl/tsl/framework/bfc_allocator.cc:296] Allocator (GPU_0_bfc) ran out of memory trying to allocate 1.55GiB with freed_by_count=0. The caller indicates that this is not a failure, but th

Saving Model...




Evaluating Model...
Before sorting: {'image_predictions': 0.03689113823055403, 'reconstruction_error': 0.02507884655387852, 'postal_code': 0.06357794652775754, 'type': 0.0, 'size': 0.6342430786977415, 'basement_size': 0.0, 'rooms': 0.0069665290381536625, 'year_built': 0.029392211274597308, 'year_rebuilt': 0.045119314430167236, 'energy_label': 0.007194854445207026, 'postal_avg_sqm_price': 0.11540237621445673, 'lat': 0.016150496244513854, 'lng': 0.019983208342972646}
After sorting: {'size': 0.6342430786977415, 'postal_avg_sqm_price': 0.11540237621445673, 'postal_code': 0.06357794652775754, 'year_rebuilt': 0.045119314430167236, 'image_predictions': 0.03689113823055403, 'year_built': 0.029392211274597308, 'reconstruction_error': 0.02507884655387852, 'lng': 0.019983208342972646, 'lat': 0.016150496244513854, 'energy_label': 0.007194854445207026, 'rooms': 0.0069665290381536625, 'type': 0.0, 'basement_size': 0.0}

Model Evaluation:
{'Timestamp': '2024-05-08 15:24:29.402137', 'R2': 0.7886327593