# Packages

In [1]:
from geopy.geocoders import Nominatim
import undetected_chromedriver as uc
from bs4 import BeautifulSoup
import requests
import random
import time as t
import pandas as pd
import re
from deep_translator import GoogleTranslator
from tqdm.notebook import tqdm
from fake_useragent import UserAgent
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.common.keys import Keys

# Functions

In [2]:
def flatten(A):
    """Return a new flat list with every nested ``list`` in *A* expanded in order.

    Only ``list`` instances are expanded; other containers (tuples, strings,
    ...) are kept as single elements.
    """
    flat = []
    for item in A:
        if not isinstance(item, list):
            flat.append(item)
            continue
        # Nested list: splice its (recursively flattened) contents in place.
        flat += flatten(item)
    return flat

In [3]:
def has_number(input_string):
    """Return True when *input_string* contains at least one digit."""
    return re.search(r"\d+", input_string) is not None

In [4]:
def extract_number(input_string):
    """Return the first run of digits in *input_string* as an int, or None."""
    found = re.search(r"(\d+)", input_string)
    return int(found.group(1)) if found else None

In [5]:
def addressToCoord(address):
    """Geocode *address* with Nominatim and return ``(latitude, longitude)``.

    Raises:
        ValueError: if the geocoder returns no match for *address*.
        Any exception raised by ``geocode`` itself (network errors, timeouts)
        propagates unchanged.
    """
    loc = Nominatim(user_agent="Geopy Library")
    getLoc = loc.geocode(address)
    if getLoc is None:
        # geocode() yields None when nothing matches; fail with a clear
        # message instead of an AttributeError on ``None.latitude``.
        raise ValueError(f"no geocoding result for address: {address!r}")
    return (getLoc.latitude, getLoc.longitude)

In [6]:
def is_string_in_another_string_once(main_string, sub_string):
    """Return True when *sub_string* occurs exactly once in *main_string*.

    Occurrences are searched from one character past the first hit, so
    overlapping matches count as separate occurrences.
    """
    first = main_string.find(sub_string)
    if first == -1:
        return False
    # Exactly once means there is no further hit after the first one.
    return main_string.find(sub_string, first + 1) == -1

In [7]:
def uniquize(seq):
    """Return the elements of *seq* as a list with duplicates removed.

    The first occurrence of each element is kept and the original order is
    preserved. Elements must be hashable.
    """
    already_seen = set()
    unique = []
    for element in seq:
        if element in already_seen:
            continue
        already_seen.add(element)
        unique.append(element)
    return unique

In [8]:
# Shared fake_useragent generator; the scraping cells read ``ua.random``
# to build the User-Agent header of each request.
ua = UserAgent()

# Scrape

In [9]:
# Site root; prepended to the hrefs collected from the scraped pages.
mother_url = 'https://www.immonet.de'
# Landing pages the scrape starts from: one for apartments ("wohnung") and
# one for houses ("haus").
url_apartments = 'https://www.immonet.de/wohnung-mieten.html'
url_houses = 'https://www.immonet.de/haus-mieten.html'

## City Links

In [11]:
# Collect the city listing URLs linked from the apartment and house landing
# pages. ``all_cities`` ends up as one flat list of absolute URLs.
all_cities = []

for real_estate in tqdm((url_apartments, url_houses)):

    header = {'User-Agent': str(ua.random)}
    # Without a timeout a stalled connection would block the run forever.
    content = requests.get(real_estate, headers=header, timeout=30)
    soup = BeautifulSoup(content.content, 'html.parser')

    # First group: every anchor inside the form with class "clearfix".
    cities1_container = soup.find('form', {'class': 'clearfix'})
    cities1_raw = cities1_container.find_all('a', {'href': True}) if cities1_container is not None else []
    cities1 = [mother_url + i['href'] for i in cities1_raw]

    # Second group: one anchor per entry of the bulleted city list.
    cities2_container = soup.find('ul', {'class': 'row list-unstyled custom-bullet text-left'})
    cities2_narrow = (
        cities2_container.find_all('li', {'class': 'col-xs-6 col-md-3 padding-6'})
        if cities2_container is not None else []
    )
    cities2_raw = [i.find('a', {'href': True}) for i in cities2_narrow]
    # Entries without a link are skipped instead of raising on ``None['href']``.
    cities2 = [mother_url + i['href'] for i in cities2_raw if i is not None]

    # extend() keeps the list flat, so no flattening pass is needed afterwards.
    all_cities.extend(cities1 + cities2)

HBox(children=(FloatProgress(value=0.0, max=2.0), HTML(value='')))




## Pagination Generation

In [13]:
# Expand every city URL into the URLs of all of its result pages.
# ``all_pages`` ends up as one flat list of URLs.
all_pages = []

for link in tqdm(all_cities):
    header = {'User-Agent': str(ua.random)}
    # Without a timeout a stalled connection would block the run forever.
    content = requests.get(link, headers=header, timeout=30)
    soup = BeautifulSoup(content.content, 'html.parser')

    pagination_items = soup.find_all('li', {'class': 'pagination-item'})
    if pagination_items:
        # The text of the last pagination item is read as the number of the
        # final result page.
        last_page = int(pagination_items[-1].text)
        # range() excludes its stop value, so "+ 1" is required to include
        # the final page; pages 2..last_page are derived from the city URL.
        all_pages.extend(
            link.replace('mieten', 'mieten-seite' + str(i))
            for i in range(2, last_page + 1)
        )
    # The unmodified city URL is page 1 (and the only page when the listing
    # has no pagination).
    all_pages.append(link)

HBox(children=(FloatProgress(value=0.0, max=80.0), HTML(value='')))




## Product Links

In [14]:
# Collect ``(listing_url, kind)`` pairs from every result page, where kind is
# 'apartment' or 'house' depending on the page URL.
all_links = []

for page in tqdm(all_pages):
    # Classify first: pages that are neither apartment nor house pages were
    # discarded anyway, so there is no need to download them.
    if is_string_in_another_string_once(page, 'wohnung'):
        kind = 'apartment'
    elif is_string_in_another_string_once(page, 'haus'):
        kind = 'house'
    else:
        continue

    header = {'User-Agent': str(ua.random)}
    # Without a timeout a stalled connection would block the run forever.
    content = requests.get(page, headers=header, timeout=30)
    soup = BeautifulSoup(content.content, 'html.parser')

    product_container = soup.find('div', {'id': 'result-list-stage'})
    if product_container is None:
        # Page has no result list; nothing to collect.
        continue

    product_container_narrow = product_container.find_all('div', {'class': 'flex-grow-1 overflow-hidden box-25'})
    products_raw = [i.find('a', {'href': True}) for i in product_container_narrow]
    # Cards without a link are skipped instead of raising on ``None['href']``.
    all_links.extend((mother_url + i['href'], kind) for i in products_raw if i is not None)

# The same listing can appear on several pages; keep the first occurrence.
all_links = uniquize(all_links)

HBox(children=(FloatProgress(value=0.0, max=546.0), HTML(value='')))




## Data Collection

In [15]:
# Scrape every listing in ``all_links`` and store one value per listing in the
# parallel ``all_*`` lists below (suffix 1 / no suffix = apartments,
# suffix 2 = houses). Listings whose page has no "hardfacts" block are skipped.
#
# Every per-listing field starts from an explicit default, so a missing page
# section can no longer leak the previous listing's value into the current row.
all_sizes = []
all_links1 = []
all_links2 = []
all_rooms1 = []
all_rooms2 = []
all_final_costs1 = []
all_final_costs2 = []
all_zip_codes1 = []
all_latitudes1 = []
all_zip_codes2 = []
all_latitudes2 = []
all_longitudes1 = []
all_longitudes2 = []
all_floors1 = []
all_floors2 = []
all_balconies = []
all_storages = []
all_years1 = []
all_years2 = []
all_elevators = []
all_energies1 = []
all_energies2 = []
all_basic_neccessities_ratings1 = []
all_transportation_ratings1 = []
all_basic_neccessities_ratings2 = []
all_transportation_ratings2 = []
all_exterior_sizes = []
all_interior_sizes = []
all_house_types = []
all_terraces = []
all_parkings = []
all_swimming_pools = []

# Rating words shown on the listing page, mapped to a 0-5 score.
word_to_rating = {
    "super": 5,
    "gut": 4,
    "okay": 3,
    "mäßig": 2,
    "wenig": 1,
    "?": 0,
}

# One translator instance is reused for all listings; identical source texts
# are translated once and then served from the cache.
_translator = GoogleTranslator(source='de', target='en')
_translation_cache = {}


def _fetch_soup(url):
    """Download *url* with a random User-Agent and return the parsed page."""
    header = {'User-Agent': str(ua.random)}
    # Without a timeout a stalled connection would block the run forever.
    content = requests.get(url, headers=header, timeout=30)
    return BeautifulSoup(content.content, 'html.parser')


def _to_float(text):
    """Return ``float(text)``, or None when *text* is not a valid number."""
    try:
        return float(text)
    except ValueError:
        return None


def _translate(text):
    """Return the lower-cased English translation of German *text* (cached)."""
    if text not in _translation_cache:
        translated = _translator.translate(text)
        _translation_cache[text] = translated.lower() if translated else None
    return _translation_cache[text]


def _parse_price(hardfacts, price_class):
    """Return the price from the ``div`` with class *price_class*, or None."""
    node = hardfacts.find('div', {'class': price_class})
    node = node.find('div') if node is not None else None
    node = node.find('strong') if node is not None else None
    if node is None:
        return None
    # "1.234,50 €" -> "1234.50": drop the currency suffix and thousands dots,
    # then turn the decimal comma into a dot.
    text = node.text.strip().replace('\xa0€', '').replace('.', '').replace(',', '.')
    if 'auf Anfrage' in text:
        return None
    return _to_float(text)


def _parse_size(node):
    """Return the area (unit suffix " m²" removed) in *node* as float, or None."""
    if node is None:
        return None
    # NOTE(review): unlike the price, thousands dots are not removed here, so
    # a text such as "1.200 m²" would parse as 1.2 - confirm the site's format.
    text = node.text.strip().replace(' m²', '').replace(',', '.')
    if 'k.A.' in text:
        return None
    return _to_float(text)


def _parse_rooms(node):
    """Return the room count in *node* as float, or None when not given."""
    if node is None:
        return None
    text = node.text.strip().replace(',', '.')
    if 'k.A.' in text or not has_number(text):
        return None
    return _to_float(text)


def _parse_year(component):
    """Return the construction year from a lower-cased "baujahr" entry, or None."""
    if not has_number(component):
        return None
    text = component.replace('baujahr: ', '')
    try:
        return int(text)
    except ValueError:
        # Extra words around the year: fall back to the first run of digits.
        return extract_number(text)


def _li_texts(node):
    """Return the stripped, lower-cased text of each ``li.ng-star-inserted`` in *node*."""
    if node is None:
        return []
    return [li.text.strip().lower() for li in node.find_all('li', {'class': 'ng-star-inserted'})]


def _geocode_first(candidates):
    """Return coordinates for the first address in *candidates* that geocodes, else None."""
    for address in candidates:
        try:
            return addressToCoord(address)
        except Exception:
            # Deliberate best-effort: any geocoding failure (no match, network
            # error, rate limit) just moves on to the next, coarser address.
            continue
    return None


def _parse_location(soup):
    """Return ``(zip_code, latitude, longitude)`` for a listing page; missing parts are None."""
    if soup.find('div', {'class': 'mb-50 ng-star-inserted'}) is None:
        return None, None, None
    row = soup.find('sd-cell-row', {'class': 'cell-size-100 cell__row'})
    col = row.find('sd-cell-col', {'class': 'cell__col is-center-v'}) if row is not None else None
    if col is None:
        return None, None, None

    parts = [
        span.text.strip().replace('\xa0', '').replace('(', '').replace(')', '')
        for span in col.find_all('span')
    ]
    if not parts:
        return None, None, None

    # The zip code is read as the first token of the last span.
    try:
        zip_code = int(parts[-1].split()[0])
    except (IndexError, ValueError):
        zip_code = None

    if len(parts) < 2:
        return zip_code, None, None

    if 'Straße nicht freigegeben' not in parts[0]:
        # Street is published: try the full address, then the second span alone.
        candidates = (parts[0] + ', ' + parts[1], parts[1])
    else:
        # Street withheld: try the second span, then only its first two words.
        candidates = (parts[1], " ".join(parts[1].split()[:2]))

    latlong = _geocode_first(candidates)
    if latlong is None:
        return zip_code, None, None
    return zip_code, float(latlong[0]), float(latlong[1])


def _parse_energy(soup):
    """Return the translated text of the last paragraph of the energy section, or None."""
    section = soup.find('app-energy-equipment', {'class': 'ng-star-inserted'})
    col = section.find('sd-cell-col', {'class': 'cell__col'}) if section is not None else None
    paragraphs = col.find_all('p') if col is not None else []
    if not paragraphs:
        return None
    return _translate(paragraphs[-1].text.strip())


def _parse_ratings(soup):
    """Return ``(basic_neccessities_rating, transportation_rating)``; None where unavailable."""
    if soup.find('div', {'class': 'flex flex-wrap flex-col ng-star-inserted'}) is None:
        return None, None
    card = soup.find('app-local-rating-card', {'class': 'no-print link-active'})
    card = card.find('sd-card', {'class': 'mb-0 card card--is-grey ng-star-inserted'}) if card is not None else None
    meters = card.find('div', {'class': 'flex flex-wrap flex-col ng-star-inserted'}) if card is not None else None
    if meters is None:
        return None, None

    words = []
    for meter in meters.find_all('div', {'class': 'rating-meter__content'}):
        value = meter.find('div', {'class': 'rating-meter__value'})
        words.append(value.text.strip().lower() if value is not None else None)
    if len(words) < 2:
        return None, None
    # The first meter is stored as the basic-necessities rating, the second
    # as the transportation rating; unknown words map to None.
    return word_to_rating.get(words[0]), word_to_rating.get(words[1])


def _parse_apartment(soup, hardfacts):
    """Return the apartment-specific fields of a listing page as a dict."""
    facts = {
        'final_cost': _parse_price(hardfacts, 'hardfact ng-star-inserted'),
        'size': None,
        'rooms': None,
        'floor': None,
        'balcony': 0,
        'storage': 0,
        'year': None,
        'elevator': 0,
    }

    general = hardfacts.find('div', {'class': 'flex ng-star-inserted'})
    if general is not None:
        facts['size'] = _parse_size(general.find('span'))
        room_facts = general.find_all('div', {'class': 'hardfact ng-star-inserted'})
        if room_facts:
            facts['rooms'] = _parse_rooms(room_facts[-1].find('span'))

    features = soup.find('app-estate-object-informations', {'class': 'ng-star-inserted'})
    if features is None:
        return facts

    # Floor level: last paragraph of the first cell, e.g. "3. Geschoss".
    floor_col = features.find('sd-cell-col', {'class': 'cell__col'})
    paragraphs = floor_col.find_all('p') if floor_col is not None else []
    if paragraphs:
        try:
            facts['floor'] = int(paragraphs[-1].text.strip().replace('. Geschoss', ''))
        except ValueError:
            facts['floor'] = None

    cards = soup.find_all('div', {'class': 'textlist textlist--icon card-content ng-star-inserted'})
    if len(cards) in (1, 2):
        # The first card lists the extras (balcony, additional rooms).
        for component in _li_texts(cards[0].find('ul')):
            if 'balkon' in component:
                facts['balcony'] = 1
            elif 'weitere räume' in component:
                facts['storage'] = 1
    if len(cards) == 2:
        # With two cards, the last one carries construction year and elevator.
        for component in _li_texts(cards[-1].find('ul')):
            if 'baujahr' in component:
                facts['year'] = _parse_year(component)
            elif 'personenaufzug' in component:
                facts['elevator'] = 1
    return facts


def _parse_house(soup, hardfacts):
    """Return the house-specific fields of a listing page as a dict."""
    facts = {
        'final_cost': _parse_price(hardfacts, 'hardfact hardfact__price ng-star-inserted'),
        'interior_size': None,
        'exterior_size': None,
        'rooms': None,
        'year': None,
        'house_type': None,
        'floors': 1,
        'terrace': 0,
        'swimming_pool': 0,
        'parking': 0,
    }

    general = hardfacts.find('div', {'class': 'flex ng-star-inserted'})
    spans = general.find_all('span') if general is not None else []
    if spans:
        # First span: interior size, second: rooms, last: exterior size.
        facts['interior_size'] = _parse_size(spans[0])
        facts['exterior_size'] = _parse_size(spans[-1])
    if len(spans) > 1:
        facts['rooms'] = _parse_rooms(spans[1])

    features = soup.find('app-estate-object-informations', {'class': 'ng-star-inserted'})
    if features is None:
        return facts

    equipment = features.find('div', {'class': 'equipment card-content ng-star-inserted'})
    cells = equipment.find_all('sd-cell-col', {'class': 'cell__col'}) if equipment is not None else []
    for component in (cell.text.lower() for cell in cells):
        if 'baujahr' in component:
            facts['year'] = _parse_year(component)
        elif 'geschoss' in component:  # also matches "geschosse"
            text = component.replace('geschosse', '').replace('geschoss', '')
            try:
                facts['floors'] = int(text)
            except ValueError:
                # Extra characters around the count: use the first run of
                # digits, or keep the current value when there is none.
                number = extract_number(text)
                if number is not None:
                    facts['floors'] = number
        elif 'kategorie' in component:
            facts['house_type'] = _translate(component.replace('kategorie', ''))

    extras = features.find('div', {'class': 'textlist textlist--icon card-content ng-star-inserted'})
    for component in _li_texts(extras):
        if 'terrasse' in component:
            facts['terrace'] = 1
        elif 'swimming-pool' in component:
            facts['swimming_pool'] = 1
        # Both spellings must be tested against the text; a bare non-empty
        # string in an ``or`` is always true and would flag every other extra.
        elif 'stellplätze' in component or 'stellplatz' in component:
            facts['parking'] = 1
    return facts


for url, kind in tqdm(all_links):
    soup = _fetch_soup(url)

    hardfacts = soup.find('div', {'class': 'hardfacts ng-star-inserted'})
    if hardfacts is None:
        continue

    # Sections that are parsed the same way for apartments and houses.
    zip_code, latitude, longitude = _parse_location(soup)
    energy = _parse_energy(soup)
    basic_neccessities_rating, transportation_rating = _parse_ratings(soup)

    if kind == 'apartment':
        facts = _parse_apartment(soup, hardfacts)

        all_sizes.append(facts['size'])
        all_rooms1.append(facts['rooms'])
        all_final_costs1.append(facts['final_cost'])
        all_zip_codes1.append(zip_code)
        all_latitudes1.append(latitude)
        all_longitudes1.append(longitude)
        all_floors1.append(facts['floor'])
        all_balconies.append(facts['balcony'])
        all_storages.append(facts['storage'])
        all_years1.append(facts['year'])
        all_elevators.append(facts['elevator'])
        all_energies1.append(energy)
        all_basic_neccessities_ratings1.append(basic_neccessities_rating)
        all_transportation_ratings1.append(transportation_rating)
        all_links1.append(url)
    else:
        facts = _parse_house(soup, hardfacts)

        all_rooms2.append(facts['rooms'])
        all_final_costs2.append(facts['final_cost'])
        all_zip_codes2.append(zip_code)
        all_latitudes2.append(latitude)
        all_longitudes2.append(longitude)
        all_floors2.append(facts['floors'])
        all_years2.append(facts['year'])
        all_energies2.append(energy)
        all_basic_neccessities_ratings2.append(basic_neccessities_rating)
        all_transportation_ratings2.append(transportation_rating)
        all_exterior_sizes.append(facts['exterior_size'])
        all_interior_sizes.append(facts['interior_size'])
        all_house_types.append(facts['house_type'])
        all_terraces.append(facts['terrace'])
        all_parkings.append(facts['parking'])
        all_swimming_pools.append(facts['swimming_pool'])
        all_links2.append(url)

# Every list above only ever receives scalars (numbers, strings or None), so
# the lists are already flat and need no flattening pass.

HBox(children=(FloatProgress(value=0.0, max=13721.0), HTML(value='')))




# Storage

In [21]:
# Assemble one DataFrame per property type from the parallel ``all_*`` lists;
# each list supplies one column and each listing one row.
# NOTE(review): all_energies1 / all_energies2 and all_swimming_pools are
# collected during scraping but not stored in either frame - confirm whether
# that is intended.
data_apartments = pd.DataFrame({
    'Link': all_links1,
    'Year Constructed': all_years1,
    'Interior Size': all_sizes,
    'Room Count': all_rooms1,
    'Zip Code': all_zip_codes1,
    'Latitude': all_latitudes1,
    'Longitude': all_longitudes1,
    'Floor Level': all_floors1,
    'Balcony': all_balconies,
    'Storage': all_storages,
    'Elevator': all_elevators,
    'Basic Neccessities Rating': all_basic_neccessities_ratings1,
    'Transportation Availability Rating': all_transportation_ratings1,
    'Price': all_final_costs1,
})

data_houses = pd.DataFrame({
    'Link': all_links2,
    'House Type': all_house_types,
    'Year Constructed': all_years2,
    'Interior Size': all_interior_sizes,
    'Exterior Size': all_exterior_sizes,
    'Room Count': all_rooms2,
    'Zip Code': all_zip_codes2,
    'Latitude': all_latitudes2,
    'Longitude': all_longitudes2,
    'Floor Count': all_floors2,
    'Parking': all_parkings,
    'Terrace': all_terraces,
    'Basic Neccessities Rating': all_basic_neccessities_ratings2,
    'Transportation Availability Rating': all_transportation_ratings2,
    'Price': all_final_costs2,
})

# Treat a literal 0 as "missing". The dict form is used on purpose: depending
# on the pandas version, ``replace(0, None)`` forward-fills the previous value
# instead of inserting None.
data_houses['Year Constructed'] = data_houses['Year Constructed'].replace({0: None})
data_houses['House Type'] = data_houses['House Type'].replace({0: None})

In [85]:
def format_value(value):
    """Return *value* as a zero-padded, five-character zip code string.

    A column that contains a missing zip code is stored as float, so values
    can arrive as ``1234.0`` or ``NaN``; both are handled here.

    Returns:
        The zip code as ``str`` (e.g. ``1234`` -> ``'01234'``), or None when
        *value* is missing (None / NaN).
    """
    if pd.isna(value):
        return None
    # int() drops a float's ".0" before padding to the five-digit zip length.
    return f'{int(value):05d}'

# Convert the 'Zip Code' column of both frames with format_value.
for frame in (data_apartments, data_houses):
    frame['Zip Code'] = frame['Zip Code'].apply(format_value)

In [86]:
# Write each frame to its own workbook, leaving out the index column.
for frame, workbook in (
    (data_apartments, 'immonetApartments.xlsx'),
    (data_houses, 'immonetHouses.xlsx'),
):
    frame.to_excel(workbook, index=False, engine='openpyxl')

In [91]:
# Reload both workbooks, reading the 'Zip Code' column as str.
read_options = {'engine': 'openpyxl', 'dtype': {'Zip Code': str}}
data_apartments = pd.read_excel('immonetApartments.xlsx', **read_options)
data_houses = pd.read_excel('immonetHouses.xlsx', **read_options)