# This was only used once to extract the data from ImmoScout24
### Do not re-run: the EDA is fine-tuned to the data extracted on 25 May 2023

In [None]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup
import re
import overpy
import psycopg2
import statistics
#import tensorflow as tf
#from tensorflow import keras
#from tensorflow.keras import layers
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split

In [None]:
# Search-result pages to scrape: one rental search per city.
# Every URL shares the same prefix and differs only in the city slug.
_SEARCH_URL_PREFIX = 'https://www.immoscout24.ch/en/real-estate/rent/city-'
_CITY_SLUGS = (
    'winterthur',
    'zuerich',
    'geneve',
    'basel',
    'lausanne',
    'bern',
    'luzern',
    'st-gallen',
    'lugano',
    'biel-bienne',
)

urls = [_SEARCH_URL_PREFIX + slug for slug in _CITY_SLUGS]


### Looping through every URL and getting entries for every page

In [None]:
# Scrape every result page of a single search URL
def extract(link):
    """Scrape the price and address of every listing for one search URL.

    Starts a dedicated Chrome session, reads the number of result pages from
    the pagination bar of the first page, then loads each result page
    (``link + '?pn=<page>'``) and collects the listings found on it.

    Args:
        link: Search-result URL without a page query parameter.

    Returns:
        A list of ``(price, address)`` tuples of whitespace-stripped text,
        in the order the listings appear across the pages.

    Raises:
        ValueError: If the last entry of the pagination bar is not a number.
    """
    collected = []
    # Compiled once; it is matched against every listing on every page.
    price_class = re.compile("^Box-cYFBPY hKJGPR Heading")

    # Create a new instance of Chrome driver
    driver = webdriver.Chrome()
    try:
        # Navigate to the first search-result page
        driver.get(link)

        # NOTE: implicitly_wait only configures the timeout for element
        # lookups; it does not pause here. driver.get() itself blocks until
        # the page load has finished.
        driver.implicitly_wait(10)

        # Use BeautifulSoup to parse the HTML content of the page
        soup = BeautifulSoup(driver.page_source, "html.parser")

        # The last child of the pagination bar holds the last page number.
        pager = soup.find("div", {"class": "Box-cYFBPY Flex-feqWzG FwNOn dCDRxm"})
        if pager is None:
            # No pagination bar found: treat the result as a single page
            # instead of failing with AttributeError.
            pages = 1
        else:
            *_, last = pager.children
            pages = int(last.text)
        print("this is the amount of pages: ", pages)

        # Iterate through all the pages
        for page in range(1, pages + 1):
            driver.get(link + '?pn=' + str(page))
            soup = BeautifulSoup(driver.page_source, "html.parser")

            # Loop through the listings and extract the required information
            for listing in soup.find_all("a", {"class": "Wrapper__A-kVOWTT"}):
                price_tag = listing.find("h3", {"class": price_class})
                address_tag = listing.find(
                    "span", {"class": "AddressLine__TextStyled-eaUAMD"}
                )
                if price_tag is None or address_tag is None:
                    # A listing without price or address element is skipped
                    # so one odd entry does not abort the whole scrape.
                    continue

                collected.append(
                    (price_tag.text.strip(), address_tag.text.strip())
                )
    finally:
        # quit() ends the whole browser session even when scraping failed;
        # close() would only close the current window.
        driver.quit()

    return collected

### Multithreading to open multiple Chrome windows and extract data faster

In [None]:
import multiprocess as mp
import time
from multiprocessing import Process, Manager
from itertools import repeat
from functools import partial
import concurrent.futures

# Record the start time so the total scraping duration can be reported.
start = time.perf_counter()

# Each extract() call opens its own Chrome window; up to five of them run
# concurrently in worker threads. An earlier attempt used a multiprocess
# Pool (pool.map(extract, urls)) instead of threads.
# Leaving the `with` block waits until every submitted call has finished.
with concurrent.futures.ThreadPoolExecutor(max_workers=5) as pool:
    result = pool.map(extract, urls)

finish = time.perf_counter()

print(f'Finished urls in {round(finish-start,2)} second(s)')

In [None]:
# Materialise the lazy iterator returned by the thread pool's map():
# one list of (price, address) tuples per URL, in the order of `urls`.
r = [*result]

In [None]:
# Write the results into the immo.csv file.
# NOTE(review): r holds one list per URL, so each CSV row is one URL's whole
# result and each cell is the str() of a (price, address) tuple. The layout
# is kept as is because the downstream EDA was built on this file.
import csv

# newline='' is required by the csv module so the writer's own line endings
# are not translated again (otherwise every row ends in \r\r\n on Windows).
# The explicit encoding keeps non-ASCII place names independent of the
# platform's default encoding.
with open('immo.csv', 'w', newline='', encoding='utf-8') as f:
    # using csv.writer method from CSV package
    write = csv.writer(f)
    write.writerows(r)