In [2]:
import os
import pickle
import numpy as np
import datetime
import json
from multiprocessing import Process
from multiprocess import Process

import keras
from tensorflow.keras.utils import plot_model
from tensorflow.keras.applications import VGG16, MobileNetV2, MobileNetV3Small, DenseNet201

from models import *
from utils import *
import matplotlib.pyplot as plt

In [3]:
# Width and height handed to preprocess_images for every floorplan image.
IMAGE_WIDTH: int = 448
IMAGE_HEIGHT: int = 448


# Load Data
train_1_path: str = "../nybolig-scrape/output/train/train_1"
train_2_path: str = "../nybolig-scrape/output/train/train_2"
valid_path: str = "../nybolig-scrape/output/valid"
test_path: str = "../nybolig-scrape/output/test"

train1_df, train2_df, valid_df, test_df = data_to_df(
    [train_1_path, train_2_path, valid_path, test_path], preprocess=True, rm_outliers=True
)

display(train1_df.head())
# Print the lengths of the datasets
dataset_sizes = {
    "Train 1": len(train1_df),
    "Train 2": len(train2_df),
    "Valid": len(valid_df),
    "Test": len(test_df),
}
print("Length of datasets:")
for dataset_label, dataset_size in dataset_sizes.items():
    print(f"{dataset_label}: {dataset_size}")
print(f"Total: {sum(dataset_sizes.values())}")


def _split_dataset(df):
    """Split one dataset frame into model inputs and target.

    Args:
        df: One of the frames returned by ``data_to_df``; must contain the
            "image_floorplan" and "price" columns.

    Returns:
        A ``(features, images, prices)`` tuple: ``df`` without the
        "image_floorplan" and "price" columns, the result of
        ``preprocess_images`` on the "image_floorplan" column, and the
        "price" column as an array (``.values``).
    """
    features = df.drop(columns=["image_floorplan", "price"])
    # NOTE(review): the three trailing flags (True, False, False) are forwarded
    # unchanged; their meaning is defined by preprocess_images, which is not
    # visible in this notebook — confirm before changing them.
    images = preprocess_images(
        df, "image_floorplan", IMAGE_WIDTH, IMAGE_HEIGHT, True, False, False
    )
    prices = df["price"].values
    return features, images, prices


# One call per split replaces four copy-pasted stanzas, so every split is
# guaranteed to be processed with exactly the same settings.
train1_features, train1_images, train1_prices = _split_dataset(train1_df)
train2_features, train2_images, train2_prices = _split_dataset(train2_df)
valid_features, valid_images, valid_prices = _split_dataset(valid_df)
test_features, test_images, test_prices = _split_dataset(test_df)


# The feature column order is not consistent between the datasets, so every
# split is re-indexed to the column order of train set 1.
train2_features = train2_features[train1_features.columns]
valid_features = valid_features[train1_features.columns]
test_features = test_features[train1_features.columns]


Processing ../nybolig-scrape/output/train/train_1: 100%|██████████| 113/113 [00:00<00:00, 140890.71it/s]
Processing ../nybolig-scrape/output/train/train_2: 100%|██████████| 114/114 [00:00<00:00, 259864.49it/s]
Processing ../nybolig-scrape/output/valid: 100%|██████████| 33/33 [00:00<00:00, 121307.65it/s]
Processing ../nybolig-scrape/output/test: 100%|██████████| 64/64 [00:00<00:00, 791845.00it/s]
Preprocessing: 100%|██████████| 4/4 [00:00<00:00, 81.91it/s]


Removing outliers...
Datapoints before: 317


Removing outliers: 100%|██████████| 4/4 [00:00<00:00, 240.04it/s]


Datapoints after: 311


Unnamed: 0,lattitude,longitude,postal_code,type,price,postal_avg_sqm_price,size,basement_size,rooms,year_built,year_rebuilt,energy_label,image_floorplan
0,55.7369662412981,12.5131171327647,41,0,1750000,0.0,40,0,2.0,1944.0,1944.0,4,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
1,55.6980849054223,12.5944699683182,19,0,8500000,0.0,138,0,4.0,2005.0,2005.0,4,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
2,55.545454,12.234008,31,0,5750000,0.0,111,0,3.0,2020.0,2020.0,0,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
3,55.7282025922529,12.5763975962706,43,0,2975000,0.0,62,0,1.0,1995.0,1995.0,5,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."
4,55.6722758410032,12.5733404752615,10,0,5295000,0.0,87,0,2.0,1917.0,1917.0,3,"[[[255, 255, 255], [255, 255, 255], [255, 255,..."


Length of datasets:
Train 1: 108
Train 2: 110
Valid: 32
Test: 61
Total: 311


In [3]:
# def save_expected_predicted(test_prices, test_predictions, img_dir):
#         #Set X and Y axis to [0, 9.000.000]
#     #plt.xlim(0, 9999999)
#     #plt.ylim(0, 9999999)
#     plt.scatter(test_prices, test_predictions)
#     plt.xlabel("Expected Price")
#     plt.ylabel("Predicted Price")
#     plt.title("Expected vs Predicted Price")
#     try: 
#         plt.plot([min(test_prices), max(test_prices)], [min(test_prices), max(test_prices)], color='red')
#     except:
#         pass
#     plt.savefig(f"{img_dir}/expected_vs_predicted.png")
#     plt.close()

# def save_residuals(test_prices, test_predictions, img_dir):
#     residuals = test_prices - test_predictions.reshape(-1)
#     plt.scatter(test_predictions, residuals)
#     try:
#         plt.hlines(y=0, xmin=test_prices.min(), xmax=test_prices.max(), colors="r")
#     except:
#         pass
#     plt.xlabel("Expected Price")
#     plt.ylabel("Residuals")
#     plt.title("Residuals")
#     plt.savefig(f"{img_dir}/residuals.png")
#     plt.close()

# def get_saliency_map(model, image):
#     image = np.expand_dims(image, axis=0)
#     image = image / 255.0
#     image = image.astype(np.float32)
#     image = tf.convert_to_tensor(image)
#     with tf.GradientTape() as tape:
#         tape.watch(image)
#         prediction = model(image)
#     gradients = tape.gradient(prediction, image)
#     gradients = tf.squeeze(gradients)
#     gradients = tf.reduce_max(gradients, axis=-1)
#     gradients = gradients.numpy()
#     gradients = (gradients - np.min(gradients)) / (np.max(gradients) - np.min(gradients))
#     return gradients

# def save_worst_best_predictions(model, test_predictions, test_prices, test_images, img_dir):
#     residuals = test_prices - test_predictions.reshape(-1)
#     distances = np.abs(test_prices - test_predictions.reshape(-1))
#     worst_predictions = np.argsort(distances)[-8:]
#     best_predictions = np.argsort(distances)[:8]
#     test_images = np.array(test_images)
#     for i, idx in enumerate(worst_predictions):
#         image = test_images[idx]
#         price = test_prices[idx]
#         prediction = test_predictions[idx]
#         residual = residuals[idx]
#         plt.imshow(image)
#         textstr = '\n'.join((
#             f"Price: {price}",
#             f"Predicted Price: {prediction}",
#             f"Residual: {residual}"
#         ))
#         plt.text(0.01, 0.99, textstr, fontsize=10, transform=plt.gcf().transFigure, verticalalignment='top')
#         plt.axis("off")
#         plt.savefig(f"{img_dir}/worst_{i}.png")
#         plt.close()
        
#         saliency_map = get_saliency_map(model, image)
#         plt.imshow(saliency_map, cmap="hot")
#         plt.axis("off")
#         plt.savefig(f"{img_dir}/worst_saliency_map_{i}.png")
#         plt.close()
        
#     for i, idx in enumerate(best_predictions):
#         image = test_images[idx]
#         price = test_prices[idx]
#         prediction = test_predictions[idx]
#         residual = residuals[idx]
#         plt.imshow(image)
#         textstr = '\n'.join((
#             f"Price: {price}",
#             f"Predicted Price: {prediction}",
#             f"Residual: {residual}"
#         ))
#         plt.text(0.01, 0.99, textstr, fontsize=10, transform=plt.gcf().transFigure, verticalalignment='top')
#         plt.axis("off")
#         plt.savefig(f"{img_dir}/best_{i}.png")
#         plt.close()
#         saliency_map = get_saliency_map(model, image)
#         plt.imshow(saliency_map, cmap="hot")
#         plt.axis("off")
#         plt.savefig(f"{img_dir}/best_saliency_map_{i}.png")
#         plt.close()

# def save_features_importance(feature_importance, img_dir):
#     #sort the feature_importance dict by value
#     feature_importance = {k: v for k, v in sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)}
#     #add percentages to the bars
#     plt.bar(feature_importance.keys(), feature_importance.values())
#     #plt.bar_label = feature_importance.values()
#     plt.title('Feature Importance')
#     #Remove y-labels
#     plt.ylabel('')
#     plt.xticks(rotation=90)
#     #Zoom out so that text is visible 
#     plt.subplots_adjust(bottom=0.4)
#     plt.savefig(f"{img_dir}/feature_importance.png")
#     plt.close()

# def save_worst_best(test_predictions, test_prices, test_features, model_dir):
#     #Find the best predictions, and worst predictions. 
#     #Save them in two dataframes. Save a latex of the dataframe in a txt-file 
#     residuals = test_prices - test_predictions.reshape(-1)
#     distances = np.abs(test_prices - test_predictions.reshape(-1))
#     worst_predictions = np.argsort(distances)[-8:]
#     best_predictions = np.argsort(distances)[:8]
    
#     test_features_ = pd.DataFrame(test_features).copy()
#     test_features_["Price"] = test_prices
#     test_features_["Predicted Price"] = test_predictions
#     test_features_["Residual"] = residuals
#     test_features_ = test_features_.sort_values(by="Residual", ascending=False)
#     worst_df = test_features_.iloc[worst_predictions]
#     best_df = test_features_.iloc[best_predictions]
#     #save worst and best as latex in txt-file 
#     worst_df.to_latex(f"{model_dir}/worst_predictions.txt")
#     best_df.to_latex(f"{model_dir}/best_predictions.txt")

# def get_reconstructions(CNN_AE_RF_model, test_predictions, test_prices, test_images, img_dir):
#     residuals = test_prices - test_predictions.reshape(-1)
#     residuals = test_prices - test_predictions.reshape(-1)
#     distances = np.abs(test_prices - test_predictions.reshape(-1))
#     worst_predictions = np.argsort(distances)[-8:]
#     best_predictions = np.argsort(distances)[:8]
#     for i in range(8):
#         idx = best_predictions[i]
#         image = test_images[idx]
#         price = test_prices[idx]
#         prediction = test_predictions[idx]
#         residual = residuals[idx]
#         encoded_img, decoded_img = CNN_AE_RF_model.get_reconstruction(np.expand_dims(image, axis=0))
#         plt.imshow(tf.squeeze(decoded_img))
#         plt.show()
        

#     # for i in range(8):
#     #     idx = best_predictions[i]
#     #     image = test_images[idx]
#     #     price = test_prices[idx]
#     #     prediction = test_predictions[idx]
#     #     residual = residuals[idx]
#     #     encoded_img, decoded_img = CNN_AE_RF_model.get_reconstruction(np.expand_dims(image, axis=0))


# Model types that save_model_and_evaluate knows how to persist and score.
_SUPPORTED_MODEL_TYPES = ("RF", "CNN", "CNN_RF", "CNN_AE_RF")


def _save_model_and_predict(
    model, fit_history, test_images, test_features, model_dir, model_type
):
    """Persist ``model`` under ``model_dir`` and return its test predictions."""
    print("Saving Model...")
    os.makedirs(model_dir, exist_ok=True)

    if model_type == "CNN":
        # The model is saved through its own ``save`` method, and the training
        # history is pickled next to it.
        model.save(f"{model_dir}/model")
        with open(f"{model_dir}/history", "wb") as file_pi:
            pickle.dump(fit_history.history, file_pi)
        test_predictions = model.predict(test_images)
        # Save Model Architecture
        plot_model(
            model,
            to_file=f"{model_dir}/architecture.png",
            show_shapes=True,
            show_layer_names=True,
            show_dtype=True,
            rankdir="TB",
            expand_nested=False,
            dpi=96,
        )
        return test_predictions

    # Every non-CNN model type is pickled as a whole.
    with open(f"{model_dir}/model", "wb") as file_pi:
        pickle.dump(model, file_pi)
    if model_type == "RF":
        return model.predict(test_features)
    # "CNN_RF" / "CNN_AE_RF": predict takes both images and tabular features.
    return model.predict(test_images, test_features)


def save_model_and_evaluate(
    model: object,
    fit_history: object,
    test_images: np.array,
    test_features: np.array,
    test_prices: np.array,
    model_dir: str,
    model_type:str
):
    """Save a trained model, evaluate it on the test set and write reports.

    Files written under ``model_dir``: ``model`` (plus ``history`` and
    ``architecture.png`` for "CNN"), ``evaluation.json`` (one entry appended
    per call), ``median_evaluation.json`` (medians over all entries) and an
    ``images`` folder that is passed to the plotting helpers.

    Args:
        model: Trained model. Needs ``save``/``predict(images)`` for "CNN",
            ``predict(features)`` for "RF" and ``predict(images, features)``
            for "CNN_RF" / "CNN_AE_RF".
        fit_history: Object whose ``.history`` is pickled; only read for "CNN".
        test_images: Test images passed to ``predict`` and the image reports.
        test_features: Test features; for "RF" its ``.columns`` are used to
            label the feature importances.
        test_prices: Expected prices for the test set.
        model_dir: Output directory; created if missing.
        model_type: One of "RF", "CNN", "CNN_RF", "CNN_AE_RF".

    Raises:
        ValueError: If ``model_type`` is not one of the supported types.
            (Previously this surfaced later as an unbound ``test_predictions``.)
    """
    if model_type not in _SUPPORTED_MODEL_TYPES:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; "
            f"expected one of {_SUPPORTED_MODEL_TYPES}"
        )

    test_predictions = _save_model_and_predict(
        model, fit_history, test_images, test_features, model_dir, model_type
    )

    # Evaluate Model
    print("Evaluating Model...")
    r2, mae, percentage_error, mse = regression_stats(test_prices, test_predictions)

    # AttributeError covers both a model without ``feature_importances_`` and
    # an importance object that is not a mapping (no ``.items()``).
    try:
        feature_importance = model.feature_importances_
        if model_type == "RF":
            feature_importance = dict(zip(test_features.columns, feature_importance))
        print(f"Before sorting: {feature_importance}")
        feature_importance = dict(
            sorted(feature_importance.items(), key=lambda item: item[1], reverse=True)
        )
        print(f"After sorting: {feature_importance}")
    except AttributeError:
        print("Cant find feature_importance")
        feature_importance = None

    # Load existing evaluation data
    evaluation_file_path = f"{model_dir}/evaluation.json"
    evaluation_data = {}
    if os.path.exists(evaluation_file_path):
        with open(evaluation_file_path, "r") as json_file:
            evaluation_data = json.load(json_file)

    # Add new evaluation data
    new_evaluation = {
        "Timestamp": str(datetime.datetime.now()),
        "R2": r2,
        "MAE": mae,
        "Percentage Error": percentage_error,
        "MSE": mse,
        "Feature Importances": feature_importance,
    }
    # Keys loaded from JSON are strings, so the new key is a string as well
    # (json.dump would coerce an int key to the same text anyway).
    evaluation_data[str(len(evaluation_data))] = new_evaluation

    # Save updated evaluation data
    with open(evaluation_file_path, "w") as json_file:
        json.dump(evaluation_data, json_file, indent=4)

    # Compute median evaluation values from all instances
    all_runs = list(evaluation_data.values())
    median_evaluation_data = {
        metric: np.median([run[metric] for run in all_runs])
        for metric in ("R2", "MAE", "Percentage Error", "MSE")
    }

    with open(f"{model_dir}/median_evaluation.json", "w") as json_file:
        json.dump(median_evaluation_data, json_file, indent=4)

    print("\nModel Evaluation:")
    print(new_evaluation)
    print("\nMedian Evaluation:")
    print(median_evaluation_data)
    print("Feauter Importance...")
    print(feature_importance)

    # Images (Create or open existing folder)
    img_dir = f"{model_dir}/images"
    os.makedirs(img_dir, exist_ok=True)

    save_expected_predicted(test_prices, test_predictions, img_dir)
    save_residuals(test_prices, test_predictions, img_dir)

    if model_type == "CNN":
        print("\nSaving Best and Worst Image Predictions")
        save_worst_best_predictions(model, test_predictions, test_prices, test_images, img_dir)
    elif feature_importance is not None:
        print("\nSaving Feature Importance")
        save_features_importance(feature_importance, img_dir)
    else:
        # Nothing to plot; passing None to the plotting helper would fail.
        print("\nNo feature importance available, skipping feature importance plot")

    save_worst_best(test_predictions, test_prices, test_features, model_dir)

    if model_type == "CNN_AE_RF":
        print("\nSaving Reconstructions")
        # NOTE(review): the helper name is spelled "save_reconstuctions" and is
        # not defined in the visible part of this notebook — confirm it exists
        # (with this spelling) in the star-imported modules.
        save_reconstuctions(model, test_predictions, test_prices, test_images, img_dir)

    # Printed last so it is only shown once every report has been written.
    print("\nDone!")


def train_save_model(
    model_func: object,
    args: tuple,
    test_images: np.array,
    test_features: np.array,
    test_prices: np.array,
    model_dir: str,
    use_gpu: bool,
    model_type:str
):
    """Train one model with ``model_func(*args)`` and hand it to evaluation.

    Args:
        model_func: Callable that trains the model. For "CNN" it must return a
            ``(model, fit_history)`` pair; for every other type it returns the
            model only.
        args: Positional arguments unpacked into ``model_func``.
        test_images: Forwarded to ``save_model_and_evaluate``.
        test_features: Forwarded to ``save_model_and_evaluate``.
        test_prices: Forwarded to ``save_model_and_evaluate``.
        model_dir: Forwarded to ``save_model_and_evaluate``.
        use_gpu: Calls ``set_gpu()`` when true, ``set_cpu()`` otherwise.
        model_type: One of "CNN", "RF", "CNN_RF", "CNN_AE_RF".

    Raises:
        ValueError: If ``model_type`` is not supported. Checked before any
            device setup or training so a typo fails immediately instead of
            leaving ``model`` unbound after ``model_func`` was skipped.
    """
    supported_types = ("CNN", "RF", "CNN_RF", "CNN_AE_RF")
    if model_type not in supported_types:
        raise ValueError(
            f"Unsupported model_type {model_type!r}; expected one of {supported_types}"
        )

    if use_gpu:
        set_gpu()
    else:
        set_cpu()

    if model_type == "CNN":
        model, fit_history = model_func(*args)
    else:
        # "RF", "CNN_RF" and "CNN_AE_RF" trainers return only the model.
        model = model_func(*args)
        fit_history = None

    save_model_and_evaluate(
        model, fit_history, test_images, test_features, test_prices, model_dir, model_type
    )


def train_save_models(
    model_func: object,
    args: tuple,
    test_images: np.array,
    test_prices: np.array,
    model_dir: str,
    use_gpu: bool,
    test_features: object = None,
    model_type: str = "CNN",
):
    """Train several models at once and save/evaluate each one separately.

    ``model_func(*args)`` must return ``(models, fit_histories)``; the i-th
    model is saved and evaluated under ``f"{model_dir}_{i}"``.

    Args:
        model_func: Callable returning a ``(models, fit_histories)`` pair.
        args: Positional arguments unpacked into ``model_func``.
        test_images: Forwarded to ``save_model_and_evaluate``.
        test_prices: Forwarded to ``save_model_and_evaluate``.
        model_dir: Directory prefix; the model index is appended per model.
        use_gpu: Calls ``set_gpu()`` when true, ``set_cpu()`` otherwise.
        test_features: Forwarded to ``save_model_and_evaluate``. Optional so
            existing six-argument callers keep working.
            NOTE(review): with the default ``None`` the downstream report
            helpers receive ``None`` as features — confirm they accept it.
        model_type: Forwarded to ``save_model_and_evaluate``. Defaults to
            "CNN", the only type for which a fit history is read there.
            NOTE(review): presumably the models returned here are CNN-style
            models — confirm for each ``model_func`` used.
    """
    if use_gpu:
        set_gpu()
    else:
        set_cpu()

    models, fit_histories = model_func(*args)
    for model_idx, (model, fit_history) in enumerate(zip(models, fit_histories)):
        # Keyword arguments pin each value to the right parameter; the former
        # five-positional call put test_prices in the test_features slot and
        # omitted model_dir/model_type, so it always raised TypeError.
        save_model_and_evaluate(
            model,
            fit_history,
            test_images=test_images,
            test_features=test_features,
            test_prices=test_prices,
            model_dir=f"{model_dir}_{model_idx}",
            model_type=model_type,
        )

# Running on GPU

In [4]:
# Root directory for saved models; later cells build paths as f"{MODELS_PATH}/<model name>".
MODELS_PATH: str = "./models"
# Passed to train_save_model, which calls set_gpu() when True and set_cpu() otherwise.
USE_GPU: bool = True

In [8]:
# TYPE = "RF"
# MODEL_NAME: str = "RF"
# FUNCTION: object = RF
# ARGS: tuple = (
#     train2_features,
#     train2_prices,
# )


# TYPE = "CNN"
# MODEL_NAME: str = "DenseNet201"
# FUNCTION: object = CNN_model
# ARGS: tuple = (
#     DenseNet201,
#     train1_images,
#     train1_prices,
#     valid_images,
#     valid_prices,
#     [
#         Flatten(),
#         Dense(512, activation="relu"),
#         BatchNormalization(),
#         Dropout(0.5),
#         Dense(256, activation="relu"),
#         BatchNormalization(),
#         Dropout(0.5),
#         Dense(128, activation="relu"),
#         BatchNormalization(),
#         Dropout(0.5),
#         Dense(1),
#     ],
# )


# Active configuration. The run loop passes TYPE, FUNCTION and ARGS to
# train_save_model and writes results under f"{MODELS_PATH}/{MODEL_NAME}".
# TYPE selects the save/predict branch in save_model_and_evaluate.
TYPE = "CNN_RF"
MODEL_NAME: str = "DenseNet201_RF"
FUNCTION: object = CNN_RF_model
# ARGS is unpacked into FUNCTION as FUNCTION(*ARGS).
ARGS: tuple = (
    # NOTE(review): presumably the location of a previously saved "DenseNet201"
    # CNN that CNN_RF_model loads — confirm against CNN_RF_model.
    f"{MODELS_PATH}/DenseNet201/model",
    train2_images,
    train2_features,
    train2_prices,
)


# TYPE = 'CNN_RF'
# MODEL_NAME: str = "VGG16_AE_RF"
# # MODEL_NAME: str = "EfficientNetB3_AE_RF"
# FUNCTION: object = CNN_AE_RF_model
# ARGS: tuple = (
#     f"{MODELS_PATH}/VGG16/model",
#     # f"{MODELS_PATH}/EfficientNetB3/model",
#     None,
#     train2_images,
#     train2_features,
#     train2_prices,
# )


# TYPE = 'CNN_RF'
# MODEL_NAME: str = "N_CNN_MobileNetV2_RF"
# FUNCTION: object = N_CNN_RF_model
# ARGS: tuple = (
#     2,
#     MobileNetV3Small,
#     np.concatenate((train1_images, train2_images), axis=0),
#     pd.concat((train1_features, train2_features), axis=0),
#     np.concatenate((train1_prices, train2_prices), axis=0),
# )
# p = Process(
#     target=train_save_models,
#     args=(FUNCTION, ARGS, test_images, test_prices, f"{MODELS_PATH}/{MODEL_NAME}"),
# )


# TYPE = "N_CNN"
# MODEL_NAME: str = "N_CNN_MobileNetV2"
# FUNCTION: object = N_CNN_model
# ARGS: tuple = (
#     MobileNetV2,
#     train_images,
#     train_prices,
#     valid_images,
#     valid_prices,
#     3,
# )
# p = Process(
#     target=train_save_models,
#     args=(FUNCTION, ARGS, test_images, test_prices, f"{MODELS_PATH}/{MODEL_NAME}"),
# )

In [9]:
# import os
# os.environ['TF_GPU_ALLOCATOR'] = 'cuda_malloc_async'

In [10]:
from IPython.display import clear_output

# Number of independent train/evaluate repetitions of the active configuration.
N = 10

# (run number, exit code) of every child process that did not exit cleanly.
# Collected here because clear_output() wipes the failing run's output at the
# start of the next iteration, which would otherwise hide the failure.
failed_runs = []

for i in range(N):
    clear_output(wait=True)
    print(f"Run {i + 1}/{N}")
    # Each run executes in its own child process and is awaited before the
    # next one starts.
    p = Process(
        target=train_save_model,
        args=(
            FUNCTION,
            ARGS,
            test_images,
            test_features,
            test_prices,
            f"{MODELS_PATH}/{MODEL_NAME}",
            USE_GPU,
            TYPE,
        ),
    )
    p.start()
    p.join()
    # A non-zero exit code means the child raised or was killed; record it
    # instead of silently moving on to the next run.
    if p.exitcode != 0:
        failed_runs.append((i + 1, p.exitcode))

if failed_runs:
    print(f"Finished with {len(failed_runs)}/{N} failed run(s) (run, exit code): {failed_runs}")
else:
    print("All Done!")

Run 10/10


2024-05-07 12:46:03.839818: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:03.867716: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:03.867841: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:03.872080: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:03.872304: I external/local_xla/xla/stream_executor

1 Physical GPUs, 1 Logical GPU


2024-05-07 12:46:04.048852: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:04.049009: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:04.049029: I tensorflow/core/common_runtime/gpu/gpu_device.cc:2022] Could not identify NUMA node of platform GPU id 0, defaulting to 0.  Your kernel may not have been built with NUMA support.
2024-05-07 12:46:04.049097: I external/local_xla/xla/stream_executor/cuda/cuda_executor.cc:887] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2024-05-07 12:46:04.049118: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1929] Created

Saving Model...




Evaluating Model...
Before sorting: {'image_predictions': 0.028308380114582667, 'postal_code': 0.06400561764371611, 'type': 0.0, 'size': 0.644189406365415, 'basement_size': 0.0, 'rooms': 0.00811290042659113, 'year_built': 0.03130893665578053, 'year_rebuilt': 0.044231884923078224, 'energy_label': 0.009867339974762827, 'postal_avg_sqm_price': 0.12808142806033834, 'lat': 0.015996647905721282, 'lng': 0.025897457930013922}
After sorting: {'size': 0.644189406365415, 'postal_avg_sqm_price': 0.12808142806033834, 'postal_code': 0.06400561764371611, 'year_rebuilt': 0.044231884923078224, 'year_built': 0.03130893665578053, 'image_predictions': 0.028308380114582667, 'lng': 0.025897457930013922, 'lat': 0.015996647905721282, 'energy_label': 0.009867339974762827, 'rooms': 0.00811290042659113, 'type': 0.0, 'basement_size': 0.0}

Model Evaluation:
{'Timestamp': '2024-05-07 12:47:49.381209', 'R2': 0.8033712951495242, 'MAE': 651976.0, 'Percentage Error': 15.10911101046815, 'MSE': 948315000907.5, 'Feature 