In [None]:
import os 
import json
import cv2
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from utils import *

# Loading Data 

In [None]:
class House:
  """
  Container for one scraped listing: its textual attributes, the floorplan
  image and a slot for a model prediction.
  """

  def __init__(
      self,
      address,
      postal_code,
      type,
      real_price,
      size,
      basement_size,
      rooms,
      year_built,
      year_rebuilt,
      energy_label,
      image_floorplan,
  ):
    # --- Textual data ---
    self.address = address
    self.postal_code = postal_code
    self.type = type
    # The constructor argument is `real_price`; it is stored as `price`.
    self.price = real_price
    self.size = size
    self.basement_size = basement_size
    self.rooms = rooms
    self.year_built = year_built
    self.year_rebuilt = year_rebuilt
    self.energy_label = energy_label

    # --- Image data ---
    self.image_floorplan = image_floorplan

    # --- Predictions (nothing predicted yet at construction time) ---
    self.predicted_price = None


def load_jpg_and_json(folder_path:str) -> (dict, np.ndarray):
  """
  Load the floorplan image and the metadata file stored in one house folder.

  The folder is scanned for a file ending in ".jpg" and one ending in ".json".
  If several candidates exist, the alphabetically last one of each kind is used.

  Args:
    folder_path: Directory that holds the two files.

  Returns:
    A tuple (json_data, image_data): the parsed JSON content and the image
    as returned by cv2.imread.

  Raises:
    FileNotFoundError: If the folder holds no ".jpg" or no ".json" file.
    Exception: If the image cannot be decoded or the JSON content is empty.
  """
  jpg_file_path = None
  json_file_path = None

  # Find the jpg and json file in the folder. Sorting makes the choice
  # deterministic; os.listdir returns entries in arbitrary order.
  for filename in sorted(os.listdir(folder_path)):
    if filename.endswith(".jpg"):
      jpg_file_path = os.path.join(folder_path, filename)
    elif filename.endswith(".json"):
      json_file_path = os.path.join(folder_path, filename)

  # Fail early with a clear message instead of handing None to cv2 / open().
  if jpg_file_path is None:
    raise FileNotFoundError(f"No .jpg file found in {folder_path}")
  if json_file_path is None:
    raise FileNotFoundError(f"No .json file found in {folder_path}")

  # Load the jpg (cv2.imread returns None instead of raising on failure)
  image_data = cv2.imread(jpg_file_path)
  if image_data is None:
    raise Exception(f"Error loading image {jpg_file_path}")

  # Load the json
  with open(json_file_path, "r", encoding="utf-8") as file:
    json_data = json.load(file)
  if json_data is None:
    raise Exception(f"Error loading json {json_file_path}")

  return json_data, image_data

def create_house_instance(json_data, jpg):
  """
  Build a House from one listing's parsed JSON and its floorplan image.

  Every field is read from `json_data` by key, so a missing key raises
  KeyError. A falsy "year_rebuilt" value is stored as None.
  """
  return House(
    address=json_data["address"],
    postal_code=json_data["postal_code"],
    type=json_data["type"],
    real_price=json_data["price"],
    size=json_data["size"],
    basement_size=json_data["basement_size"],
    rooms=json_data["rooms"],
    year_built=json_data["year_built"],
    # Falsy values (0, "", None, ...) all collapse to None
    year_rebuilt=json_data["year_rebuilt"] or None,
    energy_label=json_data["energy_label"],
    image_floorplan=jpg,
  )

def load_houses(folder_path: str, max_houses: int = None):
    """
    Load every house stored as a sub-folder of `folder_path`.

    Sub-folders are visited in sorted order so that `max_houses` always
    selects the same subset (os.listdir order is arbitrary). Entries that
    are not directories are skipped. A folder that fails to load is reported
    on stdout and skipped; it does not count towards `max_houses`.

    Args:
        folder_path: Directory containing one sub-folder per house.
        max_houses: Maximum number of houses to load, or None for no limit.

    Returns:
        List of successfully loaded House instances.
    """
    houses = []
    for folder in sorted(os.listdir(folder_path)):
        if max_houses is not None and len(houses) >= max_houses:
            break  # Stop loading houses if the maximum number is reached
        house_dir = os.path.join(folder_path, folder)
        if not os.path.isdir(house_dir):
            continue  # Stray files in the output folder are not houses
        try:
            json_data, jpg = load_jpg_and_json(house_dir)
            houses.append(create_house_instance(json_data, jpg))
        except Exception as e:
            # Best effort: one broken folder must not abort the whole load
            print(f"Error loading house {folder}: {e}")
    return houses

# Optional tabular view of the loaded houses
def data_to_DF(houses: list[House])-> pd.DataFrame:
  """
  Turn a list of House objects into a DataFrame with one row per house.

  Each column carries the name of the House attribute it is read from;
  the floorplan image is stored as-is in the "image_floorplan" column.
  """
  columns = [
    "address", "postal_code", "type", "price",
    "size", "basement_size", "rooms", "year_built",
    "year_rebuilt", "energy_label", "image_floorplan",
  ]
  rows = [[getattr(house, column) for column in columns] for house in houses]
  return pd.DataFrame(rows, columns=columns)


In [None]:
path = "../nybolig-scrape/output"
houses = load_houses(path, max_houses=1000)
houses_df = data_to_DF(houses)
# Keep only postal codes in the range 1000-2920 (both ends included).
# .copy() gives an independent frame: later cells assign new columns to it,
# which would otherwise trigger pandas' SettingWithCopyWarning on a filtered slice.
houses_df = houses_df[(houses_df['postal_code'] >= 1000) & (houses_df['postal_code'] <= 2920)].copy()
#Count type 
print(houses_df['type'].value_counts())
#houses_df = houses_df[houses_df['type'] == 'ejerlejlighed']

print("Number of datapoints: ", len(houses_df))
display(houses_df)

# Preprocessing

In [None]:
from sklearn.preprocessing import OneHotEncoder
"""
Add a label column to the data points, based on prices. Simple version.
"""
def binary_labels(df: pd.DataFrame) -> pd.DataFrame:
  """
  Add a binary "label" column to `df` in place and return the same frame.

  A row gets label 0 when its price is strictly above the mean price of
  the frame, and label 1 otherwise.
  """
  threshold = df["price"].mean()

  def to_label(price):
    if price > threshold:
      return 0
    return 1

  df["label"] = df["price"].map(to_label)
  return df

"""
Add a label column to the data points, based on prices.
Creates labels following a roughly normal distribution around the data.
That is, we have more labels the closer we are to the mean price, and fewer the further away we are.
Returns the DataFrame with the labels, together with the label codes.
"""
def normal_distribution_label(data: pd.DataFrame, num_labels:int)-> tuple[pd.DataFrame, list]:
  """
  Add price-based 'label' and 'price_bracket' columns to `data` (in place).

  Candidate label prices are spread over six segments (0..min, min..Q1,
  Q1..mean, mean..Q3, Q3..max, max..2*max). Each segment receives a fixed
  share of `num_labels`, so candidates are densest around the mean price.
  Every row is assigned the index of the candidate closest to its price,
  and a bracket made of the neighbouring candidates on either side.

  Candidates are de-duplicated and sorted. Adjacent segments share their
  boundary value, and without de-duplication the same price would appear
  under two label codes and produce brackets that do not surround the price.

  Args:
    data: Frame with a numeric 'price' column.
    num_labels: Approximate number of candidate labels to generate.

  Returns:
    (data, label_codes): the same frame with the two new columns, and a
    list of (label index, label price) pairs.

  Raises:
    ValueError: If `num_labels` is too small to produce any candidate.
  """
  prices = data['price']
  lowest = prices.min()
  first_quan = prices.quantile(0.25)
  mean = prices.mean()
  third_quan = prices.quantile(0.75)
  highest = prices.max()

  # (start, stop, share of num_labels) for each segment
  segments = [
    (0, lowest, 0.023),
    (lowest, first_quan, 0.14),
    (first_quan, mean, 0.34),
    (mean, third_quan, 0.34),
    (third_quan, highest, 0.14),
    (highest, highest * 2, 0.023),
  ]
  candidates = np.concatenate(
    [np.linspace(start, stop, round(num_labels * share)) for start, stop, share in segments]
  )
  # np.unique both sorts and removes the duplicated segment boundaries
  potential_labels = np.unique(candidates)
  if potential_labels.size == 0:
    raise ValueError(f"num_labels={num_labels} is too small to generate any candidate label")

  #Create the label codes
  label_codes = list(enumerate(potential_labels))

  #Create the labels: index of the closest candidate for every price
  nearest = np.array(
    [np.argmin(np.abs(potential_labels - price)) for price in prices], dtype=int
  )
  last = len(potential_labels) - 1
  # Neighbouring candidates, clamped at both ends of the candidate range
  lefts = potential_labels[np.maximum(nearest - 1, 0)]
  rights = potential_labels[np.minimum(nearest + 1, last)]

  data['label'] = nearest
  data['price_bracket'] = list(zip(lefts, rights))
  return data, label_codes

def label_low_med_high(df: pd.DataFrame, onehot:bool)-> pd.DataFrame:
  """
  Add a three-class 'label_price' column to `df` in place and return it.

  Classes are cut at the 33% and 66% price quantiles of the frame:
    0 -> price between 0 and the 33% quantile (both inclusive)
    1 -> price between the 33% and the 66% quantile (both inclusive)
    2 -> everything else

  `onehot` is accepted but not used by this function.
  """
  low_upper = df['price'].quantile(0.33)
  med_upper = df['price'].quantile(0.66)

  def to_class(price):
    if 0 <= price <= low_upper:
      return 0
    if low_upper <= price <= med_upper:
      return 1
    return 2

  df['label_price'] = df['price'].map(to_class)
  return df
  

def preprocces_data(df: pd.DataFrame)-> pd.DataFrame:
  """
  Preprocess the house data and attach the price-class labels.

  Steps:
    - missing 'basement_size' values become 0
    - missing 'year_rebuilt' values fall back to 'year_built'; the column is cast to int
    - 'energy_label' is replaced by its integer category code
    - 'label_price' is added via label_low_med_high

  The input frame is not modified; a processed copy is returned. Working on a
  copy also avoids pandas' SettingWithCopyWarning when `df` is a filtered slice.
  """
  df = df.copy()

  #Feature Columns
  df['basement_size'] = df["basement_size"].fillna(0)
  df['year_rebuilt'] = df['year_rebuilt'].where(df['year_rebuilt'].notna(), df['year_built']).astype(int)
  #df['type'] = df['type'].astype('category').cat.codes
  df['energy_label'] = df['energy_label'].astype('category').cat.codes

  #Image Columns
  #TODO: optionally convert the floorplans to grayscale / augment them with an ImageGenerator

  #Adding Labels
  return label_low_med_high(df, onehot=True)

In [None]:
# Run the preprocessing step and preview the first rows of the result.
# NOTE(review): houses_df is rebound to the processed frame, so re-running this
# cell feeds already-processed data back into preprocces_data.
houses_df = preprocces_data(houses_df)
display(houses_df.head())

# Setting up splits

In [None]:
from sklearn.model_selection import train_test_split

# Split the data into train, validation and test sets with a 60-20-20 ratio.
# The second split must take 25% of the remaining 80% to end up with 20% of
# the full data set for validation (0.8 * 0.25 = 0.2).
train_df, test_df = train_test_split(houses_df, test_size=0.2, random_state=0)
train_df, valid_df = train_test_split(train_df, test_size=0.25, random_state=0)

In [None]:
# Resize the images to the desired size so they can be stacked into one array
#Could set this up in preprocessing 
target_height = 500
target_width = 500
# cv2.resize takes the target size as (width, height)
resize_dims = (target_width, target_height)

train_images = np.array([cv2.resize(image, resize_dims) for image in train_df["image_floorplan"]])
valid_images = np.array([cv2.resize(image, resize_dims) for image in valid_df["image_floorplan"]])
test_images = np.array([cv2.resize(image, resize_dims) for image in test_df["image_floorplan"]])

In [None]:
# Show the first nine training floorplans with their price as title
plt.figure(figsize=(10, 10))
# .iloc makes the "first nine rows" selection explicitly positional
preview = zip(train_df["image_floorplan"].iloc[:9], train_df["price"].iloc[:9])
for i, (image, label) in enumerate(preview):
    plt.subplot(3, 3, i + 1)
    # cv2.imread returns BGR channel order, matplotlib expects RGB
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(f"{label}")
    plt.axis("off")
plt.show()

# Functionality for model training

In [None]:
import tensorflow as tf
def save_model(model, name):
    """Persist `model` by calling its own ``save`` method with `name`; returns nothing."""
    model.save(name)
  
def load_model(model_name):
    """
    Load a saved Keras model via ``tf.keras.models.load_model`` and return it.

    NOTE(review): a later cell runs ``from tensorflow.keras.models import load_model``,
    which rebinds this name to the Keras function.
    """
    return tf.keras.models.load_model(model_name)

# Pre-trained VGG16

In [None]:
import tensorflow as tf
# Restrict TensorFlow to the first GPU when at least one is present
gpus = tf.config.experimental.list_physical_devices('GPU')
if not gpus:
    print("No GPU available")
else:
    try:
        # Only the first physical GPU is made visible
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

## Model Fitting

In [None]:
from tensorflow.keras.applications import VGG16
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten
from tensorflow.keras.optimizers import Adam

# Pre-trained VGG16 convolutional base (ImageNet weights, no top layers),
# sized after the first training image
input_shape = train_images[0].shape
base_model = VGG16(weights='imagenet', include_top=False, input_shape=input_shape)

# Keep the pre-trained weights fixed during training
for layer in base_model.layers:
    layer.trainable = False

# Regression head on top of the frozen base: one linear output unit
model = Sequential()
model.add(base_model)
model.add(Flatten())
model.add(Dense(512, activation="relu"))
model.add(Dense(256, activation="relu"))
model.add(Dense(1, activation="linear"))

In [None]:
# Compile the regression model: Adam optimizer with learning rate 1e-4,
# mean absolute error as the loss; then print the layer summary.
model.compile(optimizer=Adam(learning_rate=0.0001), loss='mean_absolute_error')
model.summary()

In [None]:
# Train the model on the raw prices for 2 epochs, validating on the
# validation split after each epoch; the training history is kept in `history`.
history = model.fit(train_images, train_df["price"], validation_data=(valid_images, valid_df["price"]), epochs=2)

In [None]:
# Save the model
# NOTE(review): the target name is spelled "VVG16_model" (not "VGG16") — confirm this is intended.
save_model(model, "VVG16_model")

In [None]:
# Evaluate the compiled loss on the test split
model.evaluate(test_images, test_df["price"])

# Make predictions for the test images; `predictions` is reused by the evaluation cell
predictions = model.predict(test_images)

## Model Evaluation

In [None]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error as mae
from sklearn.metrics import mean_squared_error as mse

# Ground truth and predictions as flat arrays for the sklearn metrics
real_prices = test_df['price'].values
predicted_prices = predictions.flatten()

# Print the R2 score, MAE and MSE
print(f"R2 score: {r2_score(real_prices, predicted_prices):.2f}")
print(f"Mean Absolute Error: {mae(real_prices, predicted_prices):.2f}")
print(f"Mean Squared Error: {mse(real_prices, predicted_prices):.2f}")

# Plot the first nine test floorplans with real and predicted price
plt.figure(figsize=(10, 10))
for i, (image, label, prediction) in enumerate(zip(test_images[0:9], test_df["price"][0:9], predictions[0:9])):
    plt.subplot(3, 3, i + 1)
    # The images were loaded with cv2 (BGR order); matplotlib expects RGB
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.title(f"Real: {label}\nPredicted: {prediction[0]:.0f}")
    plt.axis("off")
plt.show()

# Plot the predictions vs real prices
plot_regression_results('VGG16', real_prices, predicted_prices)

# Binary Classification 

## Model Fitting

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, Conv2D, MaxPool2D, MaxPool1D, BatchNormalization
from tensorflow.keras.layers import Activation, Dropout, Flatten, Dense
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.optimizers import Adam, RMSprop
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.models import load_model

In [None]:
# The training cell resizes the floorplans to 224x224 before fitting, so the
# network has to be built for that size (3 colour channels as loaded by cv2),
# not for the 500x500 images used by the VGG16 section.
input_shape = (224, 224, 3)

binary_model = Sequential([
    # Block 1: two 16-filter convolutions, then down-sampling and dropout
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu', input_shape=input_shape),
    BatchNormalization(),
    Conv2D(filters=16, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(strides=(2, 2)),
    Dropout(0.25),

    # Block 2: two 32-filter convolutions, then down-sampling and dropout
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    Conv2D(filters=32, kernel_size=(3, 3), activation='relu'),
    MaxPool2D(strides=(2, 2)),
    Dropout(0.25),

    # Classifier head
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.25),
    Dense(1024, activation='relu'),
    Dropout(0.4),
    # A single output unit needs sigmoid: softmax over one unit is always 1.0,
    # so the model could never predict class 0.
    Dense(1, activation='sigmoid'),
])


In [None]:
# Compile the binary classifier: Adam optimizer, binary cross-entropy loss,
# accuracy reported as an additional metric.
binary_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Resize the floorplans to the input size used for the binary classifier
target_height, target_width = 224, 224
# cv2.resize takes the target size as (width, height)
resize_dims = (target_width, target_height)
train_images = np.array([cv2.resize(image, resize_dims) for image in train_df["image_floorplan"]])
valid_images = np.array([cv2.resize(image, resize_dims) for image in valid_df["image_floorplan"]])
test_images = np.array([cv2.resize(image, resize_dims) for image in test_df["image_floorplan"]])

# Class threshold. NOTE: this is the MEDIAN price of the whole data set; the
# name `mean` is kept because the evaluation cell reads this variable.
mean = houses_df["price"].median()
# Label 0 = price above the threshold, label 1 = at or below it
train_labels = train_df["price"].apply(lambda x: 0 if x > mean else 1)
valid_labels = valid_df["price"].apply(lambda x: 0 if x > mean else 1)
print(valid_labels[:10])

binary_model.fit(train_images, train_labels, validation_data=(valid_images, valid_labels), epochs=8, batch_size=32)

In [None]:
# Save the binary model under the name "binary_model"
binary_model.save("binary_model")

## Binary Model: Evaluation

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import seaborn as sns

# The model returns one score per image; flatten to 1-D and threshold at 0.5
# to obtain hard 0/1 labels (accuracy_score cannot compare continuous scores).
predicted_scores = binary_model.predict(test_images).flatten()
predicted_test_labels = (predicted_scores >= 0.5).astype(int)
# Same rule as for the training labels: 0 = price above the threshold, else 1
actual_test_labels = test_df['price'].apply(lambda x: 0 if x > mean else 1).to_numpy()
print(np.round(predicted_scores, 2))

# Print the accuracy
print(f"Accuracy: {accuracy_score(actual_test_labels, predicted_test_labels):.2f}")

