# Modules to be imported

In [None]:
import re
import overpy
import psycopg2
import statistics
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import csv

# Working with the extracted data

In [None]:
# Show every row and every column whenever a dataframe is displayed.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)

# Load the extracted immo entries: every CSV record is re-joined into one raw
# string, echoed as its comma-separated parts, and collected in `immos`.
immos = []
with open('immo.csv', newline='') as csvfile:
    for fields in csv.reader(csvfile, delimiter=' ', quotechar='|'):
        raw_entry = ' '.join(fields)
        print(raw_entry.split(','))
        immos.append(raw_entry)

In [None]:
# Parse every raw entry into rows of comma-separated fields and collect all of
# them in one dataframe.
entry_frames = []
for raw_entry in immos:
    # Remove quote characters, then break the string at parentheses; the
    # pieces that are empty or a lone comma are separators, not listings.
    cleaned = raw_entry.replace('"', '').replace('\'', '')
    groups = [part for part in re.split(r'[()]', cleaned) if part not in ('', ',')]
    if not groups:
        # Nothing left after cleaning (e.g. a blank line): skip the entry.
        continue
    # One row per group, one column per comma-separated field.
    entry_frames.append(pd.DataFrame(groups)[0].str.split(',', expand=True))

# Concatenate once at the end: growing the dataframe inside the loop copies
# all previously collected rows on every iteration.
df = pd.concat(entry_frames) if entry_frames else pd.DataFrame()
   

In [None]:
#initial dataframe length without filters
len(df)

In [None]:
# Keep only the listings whose second field mentions a living area in m²,
# then renumber the rows from 0.
has_area = df[1].str.contains('m²', na=False)
df = df[has_area].reset_index(drop=True)
df.head()

In [None]:
len(df)

In [None]:
# Replace NaN cells with None so that empty cells are easy to recognise.
df = df.where(pd.notnull(df), None)


def _lstripped(value):
    """Return *value* without leading whitespace; non-text cells are returned unchanged."""
    return value.lstrip() if isinstance(value, str) else value


def _is_kanton(value):
    """True if the cell is text consisting of a two-letter upper-case code."""
    return isinstance(value, str) and re.match("[A-Z]{2}$", value.lstrip()) is not None


def _is_zip(value):
    """True if the cell is text starting with four digits."""
    return isinstance(value, str) and re.match(r"^\d{4}", value.lstrip()) is not None


def _is_price_tail(value):
    """True if the cell is text containing the price suffix '.—'."""
    return isinstance(value, str) and re.search(r"\.\—", value) is not None


# Repair rows whose fields ended up in the wrong columns. Target layout:
# 2 = price, 3 = address, 4 = zip + city, 5 = kanton.
# Every cell is read and written through df.at[row, column]. Chained
# assignment (df[col][row] = ...) is not guaranteed to reach the dataframe,
# and reading through df.at always sees the writes made earlier in the same
# iteration.
for i in df.index:
    if not isinstance(df.at[i, 4], str):
        # No text in the zip column: nothing to realign for this row.
        continue

    # The kanton sits in the zip column: copy it to the kanton column.
    if _is_kanton(df.at[i, 4]):
        df.at[i, 5] = _lstripped(df.at[i, 4])

    # The zip sits in the address column: move it over and mark the address
    # as 'nicht vorhanden' (German for "not available").
    if _is_zip(df.at[i, 3]):
        df.at[i, 4] = _lstripped(df.at[i, 3])
        df.at[i, 3] = 'nicht vorhanden'

    # The address column holds the tail of a price that was split at its
    # comma: glue it back onto the price.
    if _is_price_tail(df.at[i, 3]):
        if _is_zip(df.at[i, 4]):
            df.at[i, 2] = df.at[i, 2] + _lstripped(df.at[i, 3])
            df.at[i, 3] = 'nicht vorhanden'
        elif _is_zip(df.at[i, 5]):
            # Everything behind the price is shifted right by one column.
            df.at[i, 2] = df.at[i, 2] + _lstripped(df.at[i, 3])
            df.at[i, 3] = _lstripped(df.at[i, 4])
            df.at[i, 4] = _lstripped(df.at[i, 5])
            df.at[i, 5] = _lstripped(df.at[i, 6])
            df.at[i, 6] = None

    # Surplus columns 6-8 still hold data: pull zip and kanton back into
    # columns 4 and 5 and clear the surplus cells.
    if isinstance(df.at[i, 6], str):
        if _is_kanton(df.at[i, 6]):
            df.at[i, 4] = _lstripped(df.at[i, 5])
            df.at[i, 5] = _lstripped(df.at[i, 6])
            df.at[i, 6] = None
        elif _is_zip(df.at[i, 6]):
            df.at[i, 4] = _lstripped(df.at[i, 6])
            df.at[i, 5] = _lstripped(df.at[i, 7])
            df.at[i, 6] = None
            df.at[i, 7] = None
        elif _is_zip(df.at[i, 7]):
            df.at[i, 4] = _lstripped(df.at[i, 7])
            df.at[i, 5] = _lstripped(df.at[i, 8])
            df.at[i, 6] = None
            df.at[i, 7] = None
            df.at[i, 8] = None
#   

In [None]:
df.head()

In [None]:
# Print the index label of every row that still has no zip code.
import math
for row_label, listing in df.iterrows():
    if listing[4] is not None:
        continue
    print(row_label)


In [None]:
# Manual data extension with the help of Google.
# Maps row label -> (zip + city, kanton); a kanton of None leaves the kanton
# column of that row untouched.
manual_fixes = {
    1269: ('4052 Basel', 'BS'),
    1829: ('4051 Basel', 'BS'),
    2261: ('1005 Lausanne', None),
    2281: ('1018 Lausanne', 'VD'),
    2385: ('3013 Bern', 'BE'),
    2515: ('3007 Bern', 'BE'),
    3397: ('6992 Lugano', None),
    3614: ('2503 Biel/Bienne', 'BE'),
}

# Writing to an unknown label through .loc would silently append a new row,
# so fail loudly instead.
unknown_labels = [label for label in manual_fixes if label not in df.index]
if unknown_labels:
    raise KeyError(f"rows to fix are not in the dataframe: {unknown_labels}")

# Address each cell as df.loc[row, column] in a single indexing step;
# df.loc[row][column] = ... may assign to a temporary copy of the row.
for row_label, (zip_and_city, kanton_code) in manual_fixes.items():
    df.loc[row_label, 4] = zip_and_city
    if kanton_code is not None:
        df.loc[row_label, 5] = kanton_code

# Row 1880 is removed from the dataset.
df = df.drop([1880])


In [None]:
df = df.drop(columns=[6,7,8])

In [None]:
# Keep only the listings whose first field states a room count.
mentions_rooms = df[0].str.contains('room', na=False)
df = df[mentions_rooms]
len(df)

In [None]:
# Keep only the listings whose price field is quoted in CHF.
priced_in_chf = df[2].str.contains('CHF', na=False)
df = df[priced_in_chf]
len(df)

In [None]:
# Split the rooms, area, price and zip/city cells on whitespace (each cell
# becomes a list of words) and copy the kanton into a new column 6.
for column in (0, 1, 2, 4):
    df[column] = df[column].str.split()
df[6] = df[5]

In [None]:
df.head()

In [None]:
# Reduce the tokenised columns to typed values:
#   0 -> number of rooms (float), first token
#   1 -> area in m² (int), first token
#   2 -> price (int), second token with the '.—' suffix removed
#   4 -> zip code (int), first token of the zip/city cell
#   5 -> city (str), second token of the zip/city cell
# Whole columns are assigned because writing to the row objects yielded by
# iterrows() is not guaranteed to change the dataframe itself.
df[0] = df[0].str[0].astype(float)
df[1] = df[1].str[0].astype(int)
df[2] = df[2].str[1].str.replace('.—', '', regex=False).astype(int)
# The city must be taken before column 4 is overwritten with the zip code.
df[5] = df[4].str[1].astype(str)
df[4] = df[4].str[0].astype(int)
    

In [None]:
df.head()

In [None]:
#create id
df.insert(0, 'ID', range(1, len(df)+1))

In [None]:
len(df)

# WEB API using Overpass Turbo

In [None]:
# Overpass query: all supermarkets (nodes and ways) inside the ten listed
# city areas; `out center` makes ways report a centre point.
query = '''
[out:json];
area[name="Zurich"] -> .searchArea0;
area[name="Geneva"] -> .searchArea1;
area[name="Basel"] -> .searchArea2;
area[name="Lausanne"] -> .searchArea3;
area[name="Bern"] -> .searchArea4;
area[name="Winterthur"] -> .searchArea5;
area[name="Luzern"] -> .searchArea6;
area[name="St. Gallen"] -> .searchArea7;
area[name="Lugano"] -> .searchArea8;
area[name="Biel"] -> .searchArea9;
( area.searchArea0; area.searchArea1; area.searchArea2; area.searchArea3; area.searchArea4; area.searchArea5; area.searchArea6; area.searchArea7; area.searchArea8; area.searchArea9;) -> .searchArea;
(node["shop"="supermarket"](area.searchArea);way["shop"="supermarket"](area.searchArea););
out center;
'''

# Create the Overpass API object and run the query.
api = overpy.Overpass()
response = api.query(query)

# Collect one record per supermarket that has both a postcode and a name:
# all of its tags plus a "type" column telling ways and nodes apart. Ways
# come first, then nodes. The dataframe is built once from the collected
# records instead of being re-concatenated for every element.
records = []
element_ids = []
for kind, elements in (('way', response.ways), ('node', response.nodes)):
    for element in elements:
        if "addr:postcode" in element.tags and "name" in element.tags:
            records.append({**element.tags, "type": kind})
            element_ids.append(element.id)

# Rows are indexed by the element id.
rdf = pd.DataFrame(records, index=element_ids)


In [None]:
rdf.head()

# Connection to AWS PostgreSQL and inserting Data

In [None]:
import os

# Connection settings are read from the environment so that no secret lives
# in the notebook. ADS_DB_PASSWORD is required; the other settings fall back
# to the project's database instance.
# NOTE(review): the password that used to be hard-coded here has been exposed
# in source control and should be rotated.
conn = psycopg2.connect(
        host=os.environ.get(
            "ADS_DB_HOST",
            "ads-database1.cbwqb3cep5ch.eu-central-1.rds.amazonaws.com",
        ),
        database=os.environ.get("ADS_DB_NAME", "adsdatabase"),
        user=os.environ.get("ADS_DB_USER", "adschief1"),
        password=os.environ["ADS_DB_PASSWORD"],
    )
cur = conn.cursor()

In [None]:
# Table for the ImmoScout24 listings; the statement does nothing if the table
# already exists.
create_immos_table = '''CREATE TABLE IF NOT EXISTS immos (
   "id" int PRIMARY KEY,
   "rooms" FLOAT NOT NULL,
   "size" INT NOT NULL,
   "price" INT NOT NULL,
   "address" VARCHAR ( 255 ),
   "zipcode" INT NOT NULL,
   "city"  VARCHAR ( 15 ) NOT NULL,
   "kanton" VARCHAR ( 2 ) NOT NULL
);'''
cur.execute(create_immos_table)

In [None]:
# Table for the supermarkets fetched from Overpass; the statement does nothing
# if the table already exists.
create_supermarkets_table = '''CREATE TABLE IF NOT EXISTS supermarkets (
   "id" BIGINT PRIMARY KEY,
   "type" VARCHAR ( 10 ) NOT NULL,
   "name" VARCHAR ( 255 ),
   "zipcode" INT NOT NULL,
   "city" VARCHAR ( 50 ) ,
   "address" VARCHAR ( 255 )
);'''
cur.execute(create_supermarkets_table)

In [None]:
# Insert the data from ImmoScout24 into the database

# Iterate over the cleaned listings, one dataframe row per apartment.
for i, rows in df.iterrows():

    # Extract the individual details. The listing id is kept in `immo_id` so
    # that the built-in id() is not shadowed for the rest of the notebook.
    immo_id = rows["ID"]
    rooms = rows[0]
    size = rows[1]
    price = rows[2]
    address = rows[3]
    zipcode = rows[4]
    city = rows[5]
    kanton = rows[6].lstrip()

    # Echo the record that is about to be written.
    print("Rooms:", rooms)
    print("Size:", size)
    print("Price:", price)
    print("Address:", address)
    print("Zip-Code", zipcode)
    print("city", city)
    print("Kanton", kanton)
    print("--------------------")

    # Parameterized insert; a row whose id already exists is skipped.
    cur.execute(
        "INSERT INTO immos (id, rooms, size, price, address, zipcode, city, kanton) VALUES (%s, %s, %s, %s, %s, %s, %s, %s) on conflict do nothing",
        (immo_id, rooms, size, price, address, zipcode, city, kanton)
    )


# Commit the changes to the database
conn.commit()

# The cursor and the connection stay open: later cells reuse both.


In [None]:
def _tag_or_none(record, tag):
    """Return the value of *tag* in *record*, or None when it is absent or NaN."""
    value = record.get(tag)
    return None if pd.isna(value) else value


# Insert the supermarkets from Overpass into the database, one row per element.
for i, rows in rdf.iterrows():

    # Extract the individual details. `osm_id` / `osm_type` are used instead
    # of `id` / `type` so the built-ins of the same name are not shadowed.
    osm_id = i
    osm_type = rows["type"]
    name = rows["name"]
    # Build the address from the parts that exist; a supermarket without any
    # address tag is stored with a NULL address rather than the text "nan nan".
    address_parts = [
        str(part)
        for part in (_tag_or_none(rows, "addr:street"), _tag_or_none(rows, "addr:housenumber"))
        if part is not None
    ]
    address = ' '.join(address_parts) if address_parts else None
    zipcode = rows["addr:postcode"]
    # A missing city tag becomes NULL instead of NaN.
    city = _tag_or_none(rows, "addr:city")

    # Echo the record that is about to be written.
    print("id:", osm_id)
    print("type:", osm_type)
    print("name:", name)
    print("Address:", address)
    print("Zip-Code", zipcode)
    print("city", city)
    print("--------------------")

    # Parameterized insert; a row whose id already exists is skipped.
    cur.execute(
        "INSERT INTO supermarkets (id, type, name, address, zipcode, city) VALUES (%s, %s, %s, %s, %s, %s) ON CONFLICT DO NOTHING",
        (osm_id, osm_type, name, address, zipcode, city)
    )
# Commit the changes to the database
conn.commit()

# The cursor and the connection stay open: later cells reuse both.

In [None]:
# Listings stored with the city "St." are renamed to "St.Gallen".
cur = conn.cursor()
cur.execute('''      
UPDATE immos
SET city = 'St.Gallen'
WHERE city = 'St.';
''')
# Without a commit the correction is only visible inside this connection's
# open transaction and is lost when the connection closes.
conn.commit()

#### Join the two tables "immos" and "supermarkets"

In [None]:
query = "SELECT * FROM immos"
immo_df = pd.read_sql(query, conn)
immo_df.head()

In [None]:
query = "SELECT * FROM supermarkets"
sm_df = pd.read_sql(query, conn)
sm_df.head()

In [None]:
immo_df.head()

In [None]:
# get amount of markets per zip code
dupli = sm_df.pivot_table(index = ['zipcode'], aggfunc ='size')

In [None]:
dupli.head()

In [None]:
# Attach the number of supermarkets per zip code to every listing.
final = immo_df.merge(dupli.rename('supermarkets'), on='zipcode', how='left')
# Zip codes without any supermarket get a count of 0. Only this column is
# filled, so values that are genuinely missing elsewhere (e.g. a NULL address)
# are not replaced by the number 0.
final['supermarkets'] = final['supermarkets'].fillna(0)

# EDA using our data from PostgreSQL

In [None]:
# Summary statistics
final.describe()

In [None]:
# Examine the data types
final.dtypes

In [None]:
# Count number of missing values
final.isnull().sum()

In [None]:
# Read every listing price straight from the database.
cur.execute("SELECT price FROM immos")
rows = cur.fetchall()

# Each fetched row is a sequence whose first element is the price.
prices = [record[0] for record in rows]

# Descriptive statistics of the prices.
mean = statistics.mean(prices)
median = statistics.median(prices)
mode = statistics.mode(prices)
variance = statistics.variance(prices)
std_dev = statistics.stdev(prices)

# Report the results, one statistic per line.
for label, value in (
    ("Mean:", mean),
    ("Median:", median),
    ("Mode:", mode),
    ("Variance:", variance),
    ("Standard Deviation:", std_dev),
):
    print(label, value)

In [None]:
# Histogram of apartment prices
plt.figure(figsize=(10, 6))
plt.hist(final['price'], bins=20, edgecolor='black')
plt.xlabel('Price')
plt.ylabel('Frequency')
plt.title('Distribution of Apartment Prices')
plt.show()

# Scatter plot of apartment price vs. living area. The `size` column is
# parsed from the m² field of each listing, so the axis is labelled in m²
# rather than as square footage.
plt.figure(figsize=(10, 6))
plt.scatter(final['size'], final['price'])
plt.xlabel('Living Area (m²)')
plt.ylabel('Price')
plt.title('Apartment Price vs. Living Area')
plt.show()

# Correlation Matrix 

### How do we separate the data into training and testing data?

In [None]:
# Split the data into training and test sets (80% training, 20% testing);
# the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(final, test_size=0.2, random_state=42)


# Correlation matrix of the numeric training columns. The text columns are
# excluded explicitly: recent pandas versions raise on them instead of
# dropping them silently.
correlation_matrix = train_df.select_dtypes(include='number').corr()

print(correlation_matrix)

### Correlation Matrix Plot

In [None]:
# Correlation Matrix Plot (corrplot)
sns.pairplot(train_df)

# normalization standardization

In [None]:
# Standardize the numeric columns to Z-scores. Text columns cannot be
# standardized and are left out.
numeric_columns = train_df.select_dtypes(include='number').columns

# Mean and standard deviation are estimated on the training set only.
train_df_mean = train_df[numeric_columns].mean()
train_df_std = train_df[numeric_columns].std()
train_df_stand = (train_df[numeric_columns] - train_df_mean) / train_df_std

# The test set is transformed with the TRAINING statistics: using its own
# mean and standard deviation would put train and test on different scales
# and let test-set information into the preprocessing.
test_df_stand = (test_df[numeric_columns] - train_df_mean) / train_df_std

# The test set's own statistics are kept for inspection only; they are not
# used for scaling.
test_df_mean = test_df[numeric_columns].mean()
test_df_std = test_df[numeric_columns].std()

print("The data points have been standardized.")

# Linear Regression

In [None]:
# Collects every feature column that is fed to the models.
feature_columns = []

# Width of one bucket in Z-score units (3/10 of a standard deviation).
resolution_in_Zs = 0.3


# Bucketize the standardized number of rooms: boundaries run from the
# truncated minimum to the truncated maximum in steps of resolution_in_Zs.
rooms_as_a_numeric_column = tf.feature_column.numeric_column("rooms")
rooms_low = int(min(train_df_stand['rooms']))
rooms_high = int(max(train_df_stand['rooms']))
rooms_boundaries = list(np.arange(rooms_low, rooms_high, resolution_in_Zs))
rooms = tf.feature_column.bucketized_column(rooms_as_a_numeric_column, rooms_boundaries)

# Bucketize the standardized apartment size in the same way.
size_as_a_numeric_column = tf.feature_column.numeric_column("size")
size_low = int(min(train_df_stand['size']))
size_high = int(max(train_df_stand['size']))
size_boundaries = list(np.arange(size_low, size_high, resolution_in_Zs))
size = tf.feature_column.bucketized_column(size_as_a_numeric_column, size_boundaries)

# Cross the rooms and size buckets into a single feature, hashed into 100
# buckets and wrapped in an indicator column.
rooms_x_size = tf.feature_column.crossed_column([rooms, size], hash_bucket_size=100)
crossed_feature = tf.feature_column.indicator_column(rooms_x_size)
feature_columns.append(crossed_feature)

# Turn the list of feature columns into the layer that is added to the models.
my_feature_layer = tf.keras.layers.DenseFeatures(feature_columns)


In [None]:
def create_model(my_learning_rate, feature_layer):
  """Create and compile a simple linear regression model.

  Args:
    my_learning_rate: learning rate passed to the RMSprop optimizer.
    feature_layer: Keras layer that turns the input feature dict into the
      model's input; it is added as the first layer.

  Returns:
    A compiled `tf.keras` Sequential model with a single one-unit Dense
    layer on top of `feature_layer`, trained on mean squared error.
  """
  # Most simple tf.keras models are sequential.
  model = tf.keras.models.Sequential()

  # Add the layer containing the feature columns to the model.
  model.add(feature_layer)

  # One linear unit yields a simple linear regressor. No input_shape is given:
  # this is not the first layer, so its input comes from the feature layer.
  model.add(tf.keras.layers.Dense(units=1))

  # Use the stable optimizer namespace, as the neural-network model in this
  # notebook does, instead of `tf.keras.optimizers.experimental`.
  model.compile(optimizer=tf.keras.optimizers.RMSprop(learning_rate=my_learning_rate),
                loss="mean_squared_error",
                metrics=[tf.keras.metrics.MeanSquaredError()])

  return model


def train_model(model, dataset, epochs, batch_size, target_name):
  """Fit `model` on `dataset` and return the per-epoch mean squared error.

  Every column of `dataset` is converted to a float32 array; the column named
  `target_name` is removed from the features and used as the label.

  Returns:
    A tuple `(epochs, mse)`: the list of epoch indices reported by the fit
    and the "mean_squared_error" metric for each of them.
  """
  # Build the feature dict column by column.
  inputs = {}
  for column_name, column_values in dataset.items():
    inputs[column_name] = np.asarray(column_values).astype(np.float32)

  # The target column is the label, not a feature.
  target = np.array(inputs.pop(target_name))

  training_run = model.fit(x=inputs, y=target, batch_size=batch_size,
                           epochs=epochs, shuffle=True)

  # Details needed for plotting the loss curve.
  metrics_per_epoch = pd.DataFrame(training_run.history)
  return training_run.epoch, metrics_per_epoch["mean_squared_error"]

print("The create_model and the train_model functions are set.")

# Loss function

In [None]:
def plot_the_loss_curve(epochs, mse):
  """Plot the mean squared error against the epoch number.

  The y-axis runs from 5% below the smallest to 5% above the largest value
  of `mse`.
  """
  y_low, y_high = mse.min() * 0.95, mse.max() * 1.05

  plt.figure()
  plt.plot(epochs, mse, label="Loss")
  plt.xlabel("Epoch")
  plt.ylabel("Mean Squared Error")
  plt.legend()
  plt.ylim([y_low, y_high])
  plt.show()

print("The plot_the_loss_curve function is correctly defined.")

In [None]:
print(train_df_stand.dtypes)


In [None]:
# Hyperparameters of the linear regression run.
learning_rate, epochs, batch_size = 0.01, 500, 300

# Column the model learns to predict.
target_name = "price"

# Build the linear regressor on top of the shared feature layer.
my_model = create_model(learning_rate, my_feature_layer)

# Fit on the standardized training set. Note that `epochs` is rebound to the
# list of epoch indices returned by train_model.
epochs, mse = train_model(my_model, train_df_stand, epochs, batch_size, target_name)
plot_the_loss_curve(epochs, mse)

# Turn the standardized test set into float32 feature arrays and take the
# target column out as the label.
test_features = {
    column: np.asarray(values).astype(np.float32)
    for column, values in test_df_stand.items()
}
test_label = np.array(test_features.pop(target_name))
print("\n Evaluate the linear regression model against the test set:")
my_model.evaluate(x=test_features, y=test_label, batch_size=batch_size)

# NN

In [None]:
def create_model(my_learning_rate, my_feature_layer):
  """Create and compile a small feed-forward network.

  The network consists of the given feature layer, two ReLU hidden layers
  ('Hidden1' with 20 units, 'Hidden2' with 10 units) and a one-unit output
  layer. It is compiled with Adam and mean squared error.
  """
  network = tf.keras.models.Sequential()

  # The feature layer comes first and feeds the hidden layers.
  network.add(my_feature_layer)

  # Hidden layers, listed as (name, number of units).
  for layer_name, width in (('Hidden1', 20), ('Hidden2', 10)):
    network.add(tf.keras.layers.Dense(units=width,
                                      activation='relu',
                                      name=layer_name))

  # A single linear unit produces the prediction.
  network.add(tf.keras.layers.Dense(units=1, name='Output'))

  network.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=my_learning_rate),
                  loss="mean_squared_error",
                  metrics=[tf.keras.metrics.MeanSquaredError()])

  return network

In [None]:
def train_model(model, dataset, epochs, target_name,
                batch_size=None):
  """Fit `model` on `dataset` and return the per-epoch mean squared error.

  Every column of `dataset` is converted to a float32 array; the column named
  `target_name` is removed from the features and used as the label.

  Returns:
    A tuple `(epochs, mse)`: the list of epoch indices reported by the fit
    and the "mean_squared_error" metric for each of them.
  """
  # Build the feature dict column by column.
  inputs = {}
  for column_name, column_values in dataset.items():
    inputs[column_name] = np.asarray(column_values).astype(np.float32)

  # The target column is the label, not a feature.
  target = np.array(inputs.pop(target_name))

  training_run = model.fit(x=inputs, y=target, batch_size=batch_size,
                           epochs=epochs, shuffle=True)

  # The epoch list is stored separately from the metric history; the history
  # holds one snapshot of the mean squared error per epoch.
  metrics_per_epoch = pd.DataFrame(training_run.history)
  return training_run.epoch, metrics_per_epoch["mean_squared_error"]

In [None]:
# Hyperparameters of the neural-network run.
learning_rate, epochs, batch_size = 0.01, 500, 300

# Column the model learns to predict.
target_name = "price"

# Build the network on top of the shared feature layer.
my_model = create_model(learning_rate, my_feature_layer)

# Fit on the standardized training set. Note that `epochs` is rebound to the
# list of epoch indices returned by train_model.
epochs, mse = train_model(my_model, train_df_stand, epochs,
                          target_name, batch_size)
plot_the_loss_curve(epochs, mse)

# Turn the standardized test set into float32 feature arrays, take the target
# column out, and evaluate the trained model on it.
test_features = {
    column: np.asarray(values).astype(np.float32)
    for column, values in test_df_stand.items()
}
test_target = np.array(test_features.pop(target_name))
print("\n Evaluate the new model against the test set:")
my_model.evaluate(x=test_features, y=test_target, batch_size=batch_size)

# Comparison

Assuming the convergence of both models, we can compare the test set loss for each. In our experiments, the loss of the linear regression was lower than that of the deep neural network (even if not dramatically lower), which suggests that the linear regression model will make better predictions than the deep neural network model.

However, we need to take model complexity into consideration, as well as other issues such as explainability and the fact that the loss and MSE of both models are high, possibly due to a lack of data or to the quality of the data. 

If the market were regulated, the linear regression would be preferred over the deep neural network.