# Modules to be imported

In [1]:
import os
import re
import statistics

import numpy as np
import overpy
import pandas as pd
import psycopg2
import seaborn as sns
import tensorflow as tf
from bs4 import BeautifulSoup
from matplotlib import pyplot as plt
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from sklearn.model_selection import train_test_split
from tensorflow import keras
from tensorflow.keras import layers

# Webscraping from www.immoscout24.ch

In [2]:
# Scrape rental listings from ImmoScout24 for ten Swiss cities.
#
# For every result page and every city the page is loaded in Chrome, parsed
# with BeautifulSoup, and each listing contributes one (heading, address)
# tuple to `properties`.

# City slugs exactly as they appear in the ImmoScout24 search URLs
CITY_SLUGS = [
    'zuerich',
    'geneve',
    'basel',
    'lausanne',
    'bern',
    'winterthur',
    'luzern',
    'st-gallen',
    'lugano',
    'biel-bienne',
]
SEARCH_URL = 'https://www.immoscout24.ch/en/real-estate/rent/city-{city}?pn={page}'

# Collected (heading text, address text) tuples
properties = []

# One browser session is reused for all pages instead of starting Chrome
# once per page.
driver = webdriver.Chrome()
# The implicit wait is a session-wide setting, so it is configured once.
driver.implicitly_wait(10)

try:
    for page in range(1, 3):
        print(page)

        # URLs to scrape for this result page
        urls = [SEARCH_URL.format(city=city, page=page) for city in CITY_SLUGS]

        # Loop through each URL
        for url in urls:
            driver.get(url)

            # Use BeautifulSoup to parse the HTML content of the page
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Find all the property listings on the page
            listings = soup.find_all("a", {"class": "Wrapper__A-kVOWTT"})

            # The listings must be processed inside the URL loop; otherwise
            # only the last city's results survive for each page.
            for listing in listings:
                price_tag = listing.find("h3", {"class": re.compile("^Box-cYFBPY hKJGPR Heading")})
                address_tag = listing.find("span", {"class": "AddressLine__TextStyled-eaUAMD"})

                # Some cards lack one of the two elements; skip them rather
                # than crash on `None.text`.
                if price_tag is None or address_tag is None:
                    continue

                # Add the property details to the properties list as a tuple
                properties.append((price_tag.text.strip(), address_tag.text.strip()))
finally:
    # quit() ends the whole browser session even if scraping failed part-way
    driver.quit()

# Show the number of scraped records and a small sample instead of the full list
print("Properties scraped:", len(properties))
properties[:5]

WebDriverException: Message: 'chromedriver' executable needs to be in PATH. Please see https://sites.google.com/a/chromium.org/chromedriver/home


# WEB API using Overpass Turbo

In [6]:
# Fetch all supermarkets (nodes and ways) inside the ten cities from the
# Overpass API and collect (name, zip code) pairs in `supermarkets`.

# `{{geocodeArea:...}}` is an Overpass Turbo shortcut that the plain Overpass
# API does not understand, so the areas are selected by their tags instead.
# NOTE(review): names are assumed to match the OSM `name` tag of the
# municipality boundary (admin_level 8) - verify each one on openstreetmap.org.
CITY_NAMES = [
    "Zürich",
    "Genève",
    "Basel",
    "Lausanne",
    "Bern",
    "Winterthur",
    "Luzern",
    "St. Gallen",
    "Lugano",
    "Biel/Bienne",
]

area_filters = "\n".join(
    f'  area["name"="{city}"]["boundary"="administrative"]["admin_level"="8"];'
    for city in CITY_NAMES
)

# Defining the Overpass query
query = f'''
[out:json];
(
{area_filters}
)->.searchArea;
(
  node["shop"="supermarket"](area.searchArea);
  way["shop"="supermarket"](area.searchArea);
);
out center;
'''

# Creating the Overpass API object
api = overpy.Overpass()

# Performing the query and retrieving the results
response = api.query(query)

# overpy returns a Result object (not a dict): matched elements are exposed
# as `response.nodes` and `response.ways`, each carrying a `tags` dict.
supermarkets = []
for element in [*response.nodes, *response.ways]:
    shop_name = element.tags.get("name")
    shop_zip = element.tags.get("addr:postcode")
    # Keep only supermarkets that have both a name and a zip code
    if shop_name and shop_zip:
        name, zip_code = shop_name, shop_zip
        data = (name, zip_code)
        supermarkets.append(data)

print("Supermarkets with name and zip code:", len(supermarkets))

AttributeError: 'Overpass' object has no attribute 'get'

#### Save the Data into PostgreSQL

# Connection to AWS PostgreSQL and inserting Data

In [None]:
# Connect to the AWS-hosted PostgreSQL database and store the scraped data.
#
# The password is read from the PGPASSWORD environment variable instead of
# being hardcoded in the notebook. The password that used to be written here
# should be considered leaked and be rotated.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "ads-database1.cbwqb3cep5ch.eu-central-1.rds.amazonaws.com"),
    database=os.environ.get("PGDATABASE", "adsdatabase"),
    user=os.environ.get("PGUSER", "adschief1"),
    password=os.environ["PGPASSWORD"],
)

# Creating a cursor object for executing SQL statements
cur = conn.cursor()

# Number of comma-separated fields needed for one database row:
# rooms, size, price, address, zip code, kanton
EXPECTED_FIELDS = 6

# Table and column names follow the join query used further down in this
# notebook (immoscout24_data / overpass_data, column zip_code).
# NOTE(review): confirm them against the actual database schema.
inserted = 0
try:
    # Iterate over the scraped listings
    for listing in properties:
        # Each record is a tuple of text fragments; join them and split on
        # ", " to obtain the individual fields.
        # NOTE(review): assumes the fragments together read
        # "rooms, size, price, address, zip code, kanton" - confirm.
        details = ", ".join(listing).split(", ")

        # Skip incomplete records instead of failing with an IndexError
        if len(details) < EXPECTED_FIELDS:
            print("Skipping incomplete listing:", listing)
            continue

        rooms, size, price, address, zipcode, kanton = details[:EXPECTED_FIELDS]

        # Insert the data from ImmoScout24 into the database
        cur.execute(
            "INSERT INTO immoscout24_data (rooms, size, price, address, zip_code, kanton) VALUES (%s, %s, %s, %s, %s, %s)",
            (rooms, size, price, address, zipcode, kanton)
        )
        inserted += 1

    # Insert the data from Overpass Turbo into the database. This happens
    # once, outside the listing loop, so the same supermarket row is not
    # written again for every listing.
    cur.execute(
        "INSERT INTO overpass_data (name, zip_code) VALUES (%s, %s)",
        (name, zip_code)
    )

    # Commit the changes to the database
    conn.commit()
except Exception:
    # Leave the database untouched if any insert fails
    conn.rollback()
    raise

print("Listings inserted:", inserted)

# The cursor and connection are deliberately left open: later cells query
# through `conn` and `cur`. Call cur.close() and conn.close() once the
# analysis is finished.


#### Join the two Tables "immoscout24_data" and "overpass_data"

In [None]:
# Combine each listing with the supermarkets that share its zip code and
# load the joined rows into a DataFrame.
query = '''
SELECT *
FROM immoscout24_data
JOIN overpass_data ON immoscout24_data.zip_code = overpass_data.zip_code
'''
data_df = pd.read_sql(sql=query, con=conn)


# EDA using our data from PostgreSQL

In [None]:
# Preview the first five rows of the joined data
data_df.head(n=5)

In [None]:
# Summary statistics of the joined data, using the default settings of
# pandas' describe()
data_df.describe()

In [None]:
# Examine the data type pandas assigned to each column of the joined data
data_df.dtypes

In [None]:
# Number of missing values in each column
data_df.isna().sum()

In [None]:
# Descriptive statistics for the rental prices:
# mean, median, mode, variance, standard deviation.
# Still to do: histograms, box plots, scatter plots, correlation analysis,
# hypothesis testing.

# Retrieve the prices from the listings table (the table name follows the
# join query used earlier in this notebook).
# NOTE(review): assumes the price column holds numeric values - confirm.
cur.execute("SELECT price FROM immoscout24_data")

# Fetch all the rows of the query result
rows = cur.fetchall()

# Extract the values from the rows, ignoring NULL prices
prices = [row[0] for row in rows if row[0] is not None]

# statistics.variance() and statistics.stdev() need at least two data points,
# and the other functions fail on an empty list.
if len(prices) < 2:
    print("Not enough price data for summary statistics:", len(prices), "value(s)")
else:
    # Calculate the mean
    mean = statistics.mean(prices)

    # Calculate the median
    median = statistics.median(prices)

    # Calculate the mode
    mode = statistics.mode(prices)

    # Calculate the (sample) variance
    variance = statistics.variance(prices)

    # Calculate the (sample) standard deviation
    std_dev = statistics.stdev(prices)

    # Print the results
    print("Mean:", mean)
    print("Median:", median)
    print("Mode:", mode)
    print("Variance:", variance)
    print("Standard Deviation:", std_dev)

# Correlation Matrix 

### Separating the data into training and testing sets (open question: is a random 70/30 split the right approach?)

In [None]:
# Load the listings, split them into training and test sets, and compute the
# correlation matrix of the training data.

# Query to fetch the data (the table name follows the join query used
# earlier in this notebook; confirm it against the actual schema)
query = "SELECT rooms, size, price, address, zip_code, kanton FROM immoscout24_data;"

# Fetch the data into a DataFrame
df = pd.read_sql(query, conn)

# Split the data into training and test sets (70% training, 30% testing);
# the fixed random_state makes the split reproducible.
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)

# Create the correlation matrix. Text columns such as address and kanton
# cannot be correlated, so only numeric columns are used; without
# numeric_only=True recent pandas versions raise an error on them.
correlation_matrix = train_df.corr(numeric_only=True)

# Last expression of the cell: rendered as a table by Jupyter
correlation_matrix

### Correlation Matrix Plot

In [None]:
# Visualise the relationships in the training data with seaborn's pairplot
sns.pairplot(data=train_df)