In [None]:
import os 
import sys
import json
import cv2
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from keras.preprocessing.image import load_img, img_to_array
from PIL import Image 
import numpy as np


# Loading Data 

In [None]:
class House:
  """One scraped listing: its tabular attributes, its floor-plan image and a prediction slot."""

  def __init__(self, address, postal_code, type, real_price,
               size, basement_size, rooms, year_built,
               year_rebuilt, energy_label, image_floorplan):
    # Tabular attributes. `type` shadows the builtin only inside this
    # constructor; the parameter name is kept so keyword callers still work.
    self.address, self.postal_code, self.type = address, postal_code, type
    # The constructor argument is `real_price`, the attribute is `price`.
    self.price = real_price
    self.size, self.basement_size, self.rooms = size, basement_size, rooms
    self.year_built, self.year_rebuilt = year_built, year_rebuilt
    self.energy_label = energy_label

    # Floor-plan image, stored exactly as passed in.
    self.image_floorplan = image_floorplan

    # Starts empty; nothing in this class ever fills it in.
    self.predicted_price = None

def load_jpg_and_json(folder_path:str) -> tuple:
  """Load the listing metadata and floor-plan image stored in one folder.

  Args:
    folder_path: Directory expected to hold one ``.jpg`` and one ``.json`` file.

  Returns:
    A ``(json_data, image_data)`` tuple. ``json_data`` is the parsed JSON
    document, or ``None`` when the folder has no ``.json`` file.
    ``image_data`` is whatever ``cv2.imread`` returns for the ``.jpg``, or
    ``None`` when the folder has no ``.jpg`` file.
  """
  jpg_file = None
  json_file = None
  # Sorted so that, if a folder holds several candidates, the one that wins
  # does not depend on the file system's listing order.
  for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".jpg"):
      jpg_file = os.path.join(folder_path, filename)
    elif filename.endswith(".json"):
      json_file = os.path.join(folder_path, filename)

  image_data = cv2.imread(jpg_file) if jpg_file else None

  # Default to None so a folder without metadata yields (None, image)
  # instead of failing on an unbound local variable.
  json_data = None
  if json_file:
    # JSON is UTF-8 by specification; do not rely on the platform default.
    with open(json_file, "r", encoding="utf-8") as f:
      json_data = json.load(f)
  return json_data, image_data

def create_house_instance(json_data, jpg):
  """Build a House from one listing's parsed JSON and its floor-plan image.

  Every field except ``year_rebuilt`` is read with a plain key lookup, so a
  missing key raises ``KeyError``. When ``year_rebuilt`` is absent the value
  of ``year_built`` is used instead.
  """
  return House(
    address=json_data["address"],
    postal_code=json_data["postal_code"],
    type=json_data["type"],
    real_price=json_data["price"],
    size=json_data["size"],
    basement_size=json_data["basement_size"],
    rooms=json_data["rooms"],
    year_built=json_data["year_built"],
    # Only the key's presence is checked; a present-but-null value is kept.
    year_rebuilt=(json_data["year_rebuilt"] if "year_rebuilt" in json_data
                  else json_data["year_built"]),
    energy_label=json_data["energy_label"],
    image_floorplan=jpg,
  )

def load_houses(folder_path:str):
  """Load every listing found in the sub-folders of ``folder_path``.

  Args:
    folder_path: Directory whose sub-directories each hold one listing.

  Returns:
    A list of House objects, ordered by sub-folder name.
  """
  houses = []
  # Sorted so the row order (and therefore any seeded split performed on it)
  # does not depend on the file system's listing order.
  for folder in sorted(os.listdir(folder_path)):
    house_dir = os.path.join(folder_path, folder)
    # Stray files next to the listing folders (e.g. .DS_Store) are not listings.
    if not os.path.isdir(house_dir):
      continue
    json_data, jpg = load_jpg_and_json(house_dir)
    # Without metadata there is nothing to build a House from.
    if json_data is None:
      continue
    houses.append(create_house_instance(json_data, jpg))
  return houses

#If we want to work with a DF
def data_to_DF(houses: list[House])-> pd.DataFrame:
  """Tabulate House objects: one row per house, one column per attribute.

  The ``predicted_price`` attribute is not included. The floor-plan image is
  stored as-is in the ``image_floorplan`` column.
  """
  # Every column name is also the name of the House attribute it is read from.
  columns = ["address", "postal_code", "type", "price",
             "size", "basement_size", "rooms", "year_built",
             "year_rebuilt", "energy_label", "image_floorplan"]
  rows = [[getattr(house, column) for column in columns] for house in houses]
  return pd.DataFrame(rows, columns = columns)

In [None]:
# Read every scraped listing from disk and tabulate it.
path = "../nybolig-scrape/output"
houses = load_houses(path)
houses_df = data_to_DF(houses)

# Inclusive postal-code range and the listing types of interest.
postal_codes = (1000, 2900)
types = ["Villa", "Rækkehus", "Ejerlejlighed"]

# NOTE(review): `types` is never applied as a filter, and the filtered frame
# `data` is not used by the later cells visible in this notebook (they work
# on `houses_df`) - confirm whether that is intended.
data = houses_df.loc[houses_df['postal_code'].between(postal_codes[0], postal_codes[1])]
display(houses_df.head())

# Preprocessing

In [None]:
"""
Add labels-column to the data-points, based on prices
Creates labels based on a normal distribution around the data.
That is, we have more labels the closer we are to the mean price, and less the further away we are.
Return a data-Frame with the labels and the label codes. 
"""
def label_data(data: pd.DataFrame, num_labels:int)-> tuple[pd.DataFrame, list]:
  """Attach a price-bucket label to every row of ``data``.

  Candidate label values are spread over six consecutive price ranges
  (0..min, min..Q1, Q1..mean, mean..Q3, Q3..max, max..2*max). Each range
  receives a fixed share of ``num_labels`` (2.3%, 14%, 34%, 34%, 14%, 2.3%),
  so candidates are densest around the mean price. Neighbouring ranges share
  their boundary value, so boundary values occur twice among the candidates.

  Args:
    data: Frame with a numeric ``price`` column. It is not modified.
    num_labels: Approximate total number of candidate labels.

  Returns:
    ``(labelled, label_codes)``. ``labelled`` is a copy of ``data`` with two
    extra columns: ``label`` (index of the candidate closest to the price,
    first one on ties) and ``price_bracket`` (the candidates just before and
    just after it, clamped at both ends). ``label_codes`` is a list of
    ``(index, candidate_value)`` pairs.

  Raises:
    ValueError: ``data`` has rows but ``num_labels`` is too small to produce
      any candidate label.
  """
  prices = data['price']
  lowest = prices.min()
  first_quan = prices.quantile(0.25)
  mean = prices.mean()
  third_quan = prices.quantile(0.75)
  highest = prices.max()

  # Consecutive range boundaries and the share of labels each range gets.
  anchors = [0, lowest, first_quan, mean, third_quan, highest, highest*2]
  shares = [0.023, 0.14, 0.34, 0.34, 0.14, 0.023]
  potential_labels = np.concatenate([
    np.linspace(start, stop, round(num_labels*share))
    for start, stop, share in zip(anchors[:-1], anchors[1:], shares)
  ])

  label_codes = list(enumerate(potential_labels))

  price_values = prices.to_numpy(dtype=float)
  if price_values.size and not potential_labels.size:
    raise ValueError(
      f"num_labels={num_labels} produces no candidate labels; use a larger value")

  if price_values.size:
    # One row per price, one column per candidate; argmin returns the first
    # minimum, i.e. the lower index when a price sits on a duplicated boundary.
    distances = np.abs(price_values[:, None] - potential_labels[None, :])
    price_labels = distances.argmin(axis=1)
  else:
    price_labels = np.empty(0, dtype=int)

  # Neighbouring candidates, clamped so the ends refer to themselves.
  last_index = len(potential_labels) - 1
  left = potential_labels[np.maximum(price_labels - 1, 0)]
  right = potential_labels[np.minimum(price_labels + 1, last_index)]

  # Work on a copy: the input may be a filtered view of another frame.
  labelled = data.copy()
  labelled['label'] = price_labels
  labelled['price_bracket'] = list(zip(left, right))
  return labelled, label_codes

def convert_to_grayscale(image):
  """Convert a BGR image to grayscale, returned with shape (height, width, 1)."""
  gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
  height, width = gray.shape[0], gray.shape[1]
  # Restore an explicit single-channel axis.
  return gray.reshape(height, width, 1)

def resize_image(image, target_width, target_height):
  """Resize ``image`` to ``target_width`` x ``target_height`` using area interpolation."""
  # cv2.resize takes the output size as (width, height).
  target_size = (target_width, target_height)
  return cv2.resize(image, target_size, interpolation = cv2.INTER_AREA)

def normalize_image(image):
  """Divide every value of ``image`` by 255."""
  # Presumably the inputs are 8-bit images, giving a [0, 1] range - confirm.
  max_intensity = 255
  return image / max_intensity

def rescale_plan_by_size(image, size, basement_size):
  """Scale a floor-plan image by the ratio of ``size`` to the image height.

  Args:
    image: Image array; ``image.shape[0]`` is taken as the canvas size.
    size: Value the scale factor is derived from (``size / image.shape[0]``).
    basement_size: Accepted for interface compatibility; currently unused.

  Returns:
    The resized image. Previously the result was computed and then
    discarded, so the function always returned ``None``.

  TODO: the original comment says the result should "fit the canvas size",
  but no padding back to the original canvas is implemented - confirm the
  intended output shape.
  """
  canvas_size = image.shape[0]
  size_ratio = size/canvas_size
  # Clamp to one pixel so a tiny ratio cannot request a zero-sized output.
  new_width = max(1, int(image.shape[1]*size_ratio))
  new_height = max(1, int(image.shape[0]*size_ratio))
  return resize_image(image, new_width, new_height)
  
  

def preprocces_data(data: pd.DataFrame)-> pd.DataFrame:
  """Clean the listing table for modelling. The input frame is not modified.

  Steps:
    * drop the ``address`` column;
    * replace missing ``basement_size`` with 0;
    * replace missing ``year_rebuilt`` with ``year_built``;
    * encode ``type`` and ``energy_label`` as integer category codes
      (missing values become -1 and therefore survive the next step);
    * drop every row that still has a missing value;
    * cast ``year_rebuilt`` to int.

  The int cast runs after the rows with missing values are dropped. Casting
  first raised on any row where both ``year_built`` and ``year_rebuilt``
  were missing, instead of dropping that row.

  The row index is kept, so it has gaps wherever rows were dropped.

  TODO: the floor-plan images are not preprocessed here (grayscale
  conversion, scaling and augmentation are still open).
  """
  without_address = data.drop(columns=["address"])
  encoded = without_address.assign(
    basement_size=without_address["basement_size"].fillna(0),
    year_rebuilt=without_address["year_rebuilt"].where(
      without_address["year_rebuilt"].notna(), without_address["year_built"]),
    type=without_address["type"].astype('category').cat.codes,
    energy_label=without_address["energy_label"].astype('category').cat.codes,
  )
  complete = encoded.dropna()
  return complete.astype({"year_rebuilt": int})

# Clean the full listing table (`houses_df`, not the postal-code filtered
# `data`) and preview the first rows.
preprocessed_data = preprocces_data(houses_df)
display(preprocessed_data.head())



In [None]:
# Positional access: preprocessing drops rows without resetting the index,
# so the label 0 is not guaranteed to exist.
print(preprocessed_data['image_floorplan'].iloc[0].shape)

In [None]:
# Split the data into train, validation and test sets with a 60-20-20 ratio.
# The second split takes 25% of the remaining 80%, i.e. 20% of the full set
# (a second test_size of 0.20 would give 64-16-20 instead).
train_df, test_df = train_test_split(preprocessed_data, test_size=0.2, random_state=0)
train_df, valid_df = train_test_split(train_df, test_size=0.25, random_state=0)

# Reshape the images to the desired size
target_height, target_width = 224*2, 224*2

def _resize_floorplans(frame, width, height):
    """Resize every image in frame["image_floorplan"] and stack them into one array."""
    # cv2.resize takes the output size as (width, height).
    return np.array([cv2.resize(image, (width, height)) for image in frame["image_floorplan"]])

train_images = _resize_floorplans(train_df, target_width, target_height)
valid_images = _resize_floorplans(valid_df, target_width, target_height)
test_images = _resize_floorplans(test_df, target_width, target_height)

# Preview the first nine training floor plans with their prices.
plt.figure(figsize=(10, 10))
for i, (image, label) in enumerate(zip(train_images[:9], train_df["price"].iloc[:9])):
    plt.subplot(3, 3, i + 1)
    # Images loaded with cv2 are BGR; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(f"{label}")
    plt.axis("off")

print(train_images[0].shape)

# Price Estimation 

In [None]:
#https://python.plainenglish.io/judge-the-book-price-by-its-cover-with-image-regression-using-cnns-python-770707e4fe67
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPool2D, MaxPool1D, BatchNormalization
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [None]:
input_shape = train_images[0].shape

# Two conv-conv-pool-dropout stages, two dense layers, and a single linear
# output unit: the network regresses the price directly from the image.
model = Sequential([
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(strides=(2, 2)),
    Dropout(0.25),

    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(strides=(2, 2)),
    Dropout(0.25),

    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.25),

    Dense(1024, activation='relu'),
    Dropout(0.4),
    Dense(1, activation='linear'),
])

learning_rate = 0.01
model.compile(loss='mae', optimizer=Adam(learning_rate))

epochs = 1
batch_size = 32
model.fit(
    train_images,
    train_df['price'],
    validation_data=(valid_images, valid_df['price']),
    epochs=epochs,
    batch_size=batch_size,
)

In [None]:
# Score the fitted model on the held-out test set.
model.evaluate(test_images, test_df['price'])

# Predict a price per test image; flatten the (n, 1) output to shape (n,).
predictions = model.predict(test_images).flatten()

# Compare the first 20 predictions with the first 20 true prices.
print(predictions[:20])
print(test_df['price'].iloc[:20])

In [None]:
def plot_regression_results(model_name, y_test, y_pred):
    """Show a prediction-vs-truth scatter plot and a residual plot.

    Points are coloured by their absolute error, normalised to [0, 1].

    Args:
        model_name: Name shown in both plot titles.
        y_test: True target values (array-like or pandas Series).
        y_pred: Predicted values, in the same order as ``y_test``.
    """
    # Plain arrays: values are paired by position, never by pandas index.
    y_true = np.asarray(y_test, dtype=float).ravel()
    y_hat = np.asarray(y_pred, dtype=float).ravel()

    residuals = y_hat - y_true
    distances = np.abs(residuals)

    # Normalise to [0, 1]; when every prediction is exact there is nothing to
    # scale, so use a constant colour instead of dividing by zero.
    max_distance = np.max(distances)
    colors = distances / max_distance if max_distance > 0 else np.zeros_like(distances)

    # True values vs predictions. Labels are attached to the artists
    # themselves so the legend cannot pair a label with the wrong one.
    fig, ax = plt.subplots()
    points = ax.scatter(y_true, y_hat, c=colors, label='Test values')
    ax.plot([y_true.min(), y_true.max()], [y_true.min(), y_true.max()],
            c='r', label='Perfect fit')
    ax.set_xlabel('True values')
    ax.set_ylabel('Predictions')
    ax.set_title(f'True values vs Predictions ({model_name})')
    fig.colorbar(points, ax=ax, label='Distance from Diagonal')
    ax.legend()
    plt.show()

    # Residuals vs predictions. The zero line spans the predicted range,
    # because predictions are what the x-axis shows.
    fig, ax = plt.subplots()
    points = ax.scatter(y_hat, residuals, c=colors, label='Residuals')
    ax.hlines(y=0, xmin=y_hat.min(), xmax=y_hat.max(), colors='r', label='Perfect fit')
    ax.set_title(f'Residual plot ({model_name})')
    ax.set_xlabel('Predicted values')
    ax.set_ylabel('Residuals')
    fig.colorbar(points, ax=ax, label='Distance from Diagonal')
    ax.legend()
    plt.show()

# Compare the CNN's test-set predictions with the true prices.
plot_regression_results(model_name='CNN', y_test=test_df['price'], y_pred=predictions)