In [20]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup
import time

def scrape_apartments(url, max_pages=50, delay=3, output_path='immoscout24.csv'):
    """Scrape listing data from paginated search-result pages with headless Chrome.

    Every result page is loaded through Selenium, the rendered HTML is parsed
    with BeautifulSoup, and the address, title, description, rooms, living
    space and price of the listings are collected by CSS class name.

    Args:
        url: Base URL of the search results; the page number is appended as
            the ``pn`` query parameter.
        max_pages: Number of result pages to visit, starting at page 1.
        delay: Seconds to sleep after each page load before reading the HTML.
        output_path: Path of the ``;``-separated CSV the result is written to.
            Pass ``None`` to skip writing a file.

    Returns:
        pandas.DataFrame with the columns ``Address``, ``Title``,
        ``Description``, ``Rooms``, ``Living Space (sqm)`` and ``Price``.
        Fields that are missing in the HTML are stored as empty strings.
    """

    def _text(node, default=''):
        # find() returns None when nothing matches, so guard before reading .text
        return node.text.strip() if node is not None else default

    # Use '&' when the base URL already carries a query string
    page_sep = '&' if '?' in url else '?'

    # Setup ChromeDriver
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode (no GUI)

    # One list per output column
    addresses, titles, descriptions = [], [], []
    rooms, living_space, prices = [], [], []

    driver = webdriver.Chrome(service=service, options=options)
    try:
        for page_num in range(1, max_pages + 1):
            print(f"Scraping page {page_num}...")
            driver.get(f"{url}{page_sep}pn={page_num}")
            time.sleep(delay)  # Give the page time to load before reading the HTML

            # Parse the rendered page
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Addresses
            page_addresses = [
                _text(element.find('address'))
                for element in soup.find_all(class_='HgListingCard_address_JGiFv')
            ]

            # Titles and descriptions
            page_titles, page_descriptions = [], []
            for element in soup.find_all(class_='HgListingDescription_description_r5HCO'):
                page_titles.append(_text(element.find('span')))
                page_descriptions.append(
                    _text(element.find('p', class_='HgListingDescription_large_uKs3J'))
                )

            # Rooms, living space, and prices
            page_rooms, page_living_space, page_prices = [], [], []
            for element in soup.find_all(class_='HgListingRoomsLivingSpacePrice_roomsLivingSpacePrice_M6Ktp'):
                strong_tags = element.find_all('strong')
                page_rooms.append(strong_tags[0].text.strip() if strong_tags else '')
                page_living_space.append(strong_tags[1].text.strip() if len(strong_tags) > 1 else '')
                # The price is read from the first <span> of the element
                page_prices.append(_text(element.find('span')))

            # The three selectors are matched independently, so values are paired
            # into rows purely by position. Trimming per page keeps an uneven page
            # from shifting the rows of every page that follows it.
            # NOTE(review): rows inside an uneven page can still be mispaired;
            # iterating over one container element per listing would fix that,
            # but its class name is not visible in this notebook.
            page_columns = (page_addresses, page_titles, page_descriptions,
                            page_rooms, page_living_space, page_prices)
            page_rows = min(len(column) for column in page_columns)
            if any(len(column) != page_rows for column in page_columns):
                print(f"Warning: page {page_num} has uneven field counts; keeping the first {page_rows} rows.")

            addresses.extend(page_addresses[:page_rows])
            titles.extend(page_titles[:page_rows])
            descriptions.extend(page_descriptions[:page_rows])
            rooms.extend(page_rooms[:page_rows])
            living_space.extend(page_living_space[:page_rows])
            prices.extend(page_prices[:page_rows])
    finally:
        # Always shut the browser down, even if a page fails to load or parse
        driver.quit()

    # All lists have the same length because every page was trimmed above
    df = pd.DataFrame({'Address': addresses,
                       'Title': titles,
                       'Description': descriptions,
                       'Rooms': rooms,
                       'Living Space (sqm)': living_space,
                       'Price': prices})

    # Save to file
    if output_path is not None:
        df.to_csv(output_path, sep=";", index=False)

    return df

# Base URL of the search results to scrape; scrape_apartments() appends the
# page number to it as the `pn` query parameter.
url = 'https://www.immoscout24.ch/en/real-estate/buy/country-switzerland-fl'

# Scrape the result pages and print the collected table. This drives a headless
# Chrome session and sleeps after every page load, so the cell takes a while.
apartment_data = scrape_apartments(url)
print(apartment_data)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
         

In [16]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup
import time

def scrape_apartments(url, max_pages=50, delay=3, output_path='immoscout24.csv'):
    """Scrape listing data from paginated search-result pages with headless Chrome.

    Every result page is loaded through Selenium, the rendered HTML is parsed
    with BeautifulSoup, and the address, title, description, rooms, living
    space and price of the listings are collected by CSS class name.

    Args:
        url: Base URL of the search results; the page number is appended as
            the ``pn`` query parameter.
        max_pages: Number of result pages to visit, starting at page 1.
        delay: Seconds to sleep after each page load before reading the HTML.
        output_path: Path of the ``;``-separated CSV the result is written to.
            Pass ``None`` to skip writing a file.

    Returns:
        pandas.DataFrame with the columns ``Address``, ``Title``,
        ``Description``, ``Rooms``, ``Living Space (sqm)`` and ``Price``.
        Missing text fields are stored as empty strings; a missing price is
        stored as ``'Not available'``.
    """

    def _text(node, default=''):
        # find() returns None when nothing matches, so guard before reading .text
        return node.text.strip() if node is not None else default

    # Use '&' when the base URL already carries a query string
    page_sep = '&' if '?' in url else '?'

    # Setup ChromeDriver
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode (no GUI)

    # One list per output column
    addresses, titles, descriptions = [], [], []
    rooms, living_space, prices = [], [], []

    driver = webdriver.Chrome(service=service, options=options)
    try:
        for page_num in range(1, max_pages + 1):
            print(f"Scraping page {page_num}...")
            driver.get(f"{url}{page_sep}pn={page_num}")
            time.sleep(delay)  # Allow time for the page to load fully

            # Parse the rendered page
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract addresses
            page_addresses = [
                _text(element.find('address'))
                for element in soup.find_all(class_='HgListingCard_address_JGiFv')
            ]

            # Extract titles and descriptions
            page_titles, page_descriptions = [], []
            for element in soup.find_all(class_='HgListingDescription_description_r5HCO'):
                page_titles.append(_text(element.find('span')))
                page_descriptions.append(
                    _text(element.find('p', class_='HgListingDescription_large_uKs3J'))
                )

            # Extract rooms, living space, and prices
            page_rooms, page_living_space, page_prices = [], [], []
            for element in soup.find_all(class_='HgListingRoomsLivingSpacePrice_roomsLivingSpacePrice_M6Ktp'):
                strong_tags = element.find_all('strong')
                page_rooms.append(strong_tags[0].text.strip() if strong_tags else '')
                page_living_space.append(strong_tags[1].text.strip() if len(strong_tags) > 1 else '')
                # The price lives in a dedicated <span> identified by its class
                price_elem = element.find('span', class_='HgListingRoomsLivingSpacePrice_price_u9Vee')
                page_prices.append(_text(price_elem, default='Not available'))

            # The three selectors are matched independently, so values are paired
            # into rows purely by position. Trimming per page keeps an uneven page
            # from shifting the rows of every page that follows it.
            # NOTE(review): rows inside an uneven page can still be mispaired;
            # iterating over one container element per listing would fix that,
            # but its class name is not visible in this notebook.
            page_columns = (page_addresses, page_titles, page_descriptions,
                            page_rooms, page_living_space, page_prices)
            page_rows = min(len(column) for column in page_columns)
            if any(len(column) != page_rows for column in page_columns):
                print(f"Warning: page {page_num} has uneven field counts; keeping the first {page_rows} rows.")

            addresses.extend(page_addresses[:page_rows])
            titles.extend(page_titles[:page_rows])
            descriptions.extend(page_descriptions[:page_rows])
            rooms.extend(page_rooms[:page_rows])
            living_space.extend(page_living_space[:page_rows])
            prices.extend(page_prices[:page_rows])
    finally:
        # Always shut the browser down, even if a page fails to load or parse
        driver.quit()

    # All lists have the same length because every page was trimmed above
    df = pd.DataFrame({
        'Address': addresses,
        'Title': titles,
        'Description': descriptions,
        'Rooms': rooms,
        'Living Space (sqm)': living_space,
        'Price': prices
    })

    # Save data to a CSV file
    if output_path is not None:
        df.to_csv(output_path, sep=";", index=False)

    return df

# Base URL of the search results; scrape_apartments() appends the page number
# as the `pn` query parameter, writes the result to immoscout24.csv and
# returns it as a DataFrame, which is printed here.
url = 'https://www.immoscout24.ch/en/real-estate/buy/country-switzerland-fl'
apartment_data = scrape_apartments(url)
print(apartment_data)


Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scraping page 49...
Scraping page 50...
         

Web Scraping for each canton

In [11]:
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import pandas as pd
from bs4 import BeautifulSoup
import time

def scrape_apartments(url, max_pages=50, delay=3):
    """Scrape listing data from paginated search-result pages with headless Chrome.

    Every result page is loaded through Selenium, the rendered HTML is parsed
    with BeautifulSoup, and the address, title, description, rooms, living
    space and price of the listings are collected by CSS class name. Nothing
    is written to disk; the caller decides where to store the result.

    Args:
        url: Base URL of the search results; the page number is appended as
            the ``pn`` query parameter.
        max_pages: Number of result pages to visit, starting at page 1.
        delay: Seconds to sleep after each page load before reading the HTML.

    Returns:
        pandas.DataFrame with the columns ``Address``, ``Title``,
        ``Description``, ``Rooms``, ``Living Space (sqm)`` and ``Price``.
        Missing text fields are stored as empty strings; a missing price is
        stored as ``'Not available'``.
    """

    def _text(node, default=''):
        # find() returns None when nothing matches, so guard before reading .text
        return node.text.strip() if node is not None else default

    # Use '&' when the base URL already carries a query string
    page_sep = '&' if '?' in url else '?'

    # Setup ChromeDriver
    service = Service(ChromeDriverManager().install())
    options = webdriver.ChromeOptions()
    options.add_argument('--headless')  # Run Chrome in headless mode (no GUI)

    # One list per output column
    addresses, titles, descriptions = [], [], []
    rooms, living_space, prices = [], [], []

    driver = webdriver.Chrome(service=service, options=options)
    try:
        # Loop through the result pages (pages 1..max_pages)
        for page_num in range(1, max_pages + 1):
            print(f"Scraping page {page_num}...")
            driver.get(f"{url}{page_sep}pn={page_num}")
            time.sleep(delay)  # Allow time for the page to load fully

            # Parse the rendered page
            soup = BeautifulSoup(driver.page_source, 'html.parser')

            # Extract addresses
            page_addresses = [
                _text(element.find('address'))
                for element in soup.find_all(class_='HgListingCard_address_JGiFv')
            ]

            # Extract titles and descriptions
            page_titles, page_descriptions = [], []
            for element in soup.find_all(class_='HgListingDescription_description_r5HCO'):
                page_titles.append(_text(element.find('span')))
                page_descriptions.append(
                    _text(element.find('p', class_='HgListingDescription_large_uKs3J'))
                )

            # Extract rooms, living space, and prices
            page_rooms, page_living_space, page_prices = [], [], []
            for element in soup.find_all(class_='HgListingRoomsLivingSpacePrice_roomsLivingSpacePrice_M6Ktp'):
                strong_tags = element.find_all('strong')
                page_rooms.append(strong_tags[0].text.strip() if strong_tags else '')
                page_living_space.append(strong_tags[1].text.strip() if len(strong_tags) > 1 else '')
                # The price lives in a dedicated <span> identified by its class
                price_elem = element.find('span', class_='HgListingRoomsLivingSpacePrice_price_u9Vee')
                page_prices.append(_text(price_elem, default='Not available'))

            # The three selectors are matched independently, so values are paired
            # into rows purely by position. Trimming per page keeps an uneven page
            # from shifting the rows of every page that follows it.
            # NOTE(review): rows inside an uneven page can still be mispaired;
            # iterating over one container element per listing would fix that,
            # but its class name is not visible in this notebook.
            page_columns = (page_addresses, page_titles, page_descriptions,
                            page_rooms, page_living_space, page_prices)
            page_rows = min(len(column) for column in page_columns)
            if any(len(column) != page_rows for column in page_columns):
                print(f"Warning: page {page_num} has uneven field counts; keeping the first {page_rows} rows.")

            addresses.extend(page_addresses[:page_rows])
            titles.extend(page_titles[:page_rows])
            descriptions.extend(page_descriptions[:page_rows])
            rooms.extend(page_rooms[:page_rows])
            living_space.extend(page_living_space[:page_rows])
            prices.extend(page_prices[:page_rows])
    finally:
        # Always shut the browser down, even if a page fails to load or parse
        driver.quit()

    # All lists have the same length because every page was trimmed above
    df = pd.DataFrame({
        'Address': addresses,
        'Title': titles,
        'Description': descriptions,
        'Rooms': rooms,
        'Living Space (sqm)': living_space,
        'Price': prices
    })

    return df

import os

# Canton names as they appear in the search URL (`canton-<slug>`). The casing
# is mixed, but every entry is lower-cased before it is used.
cantons = [
    'Zurich', 'Bern', 'Lucerne', 'Uri', 'Schwyz', 'Obwalden', 'Nidwalden', 'Glarus', 'Zug', 'Fribourg', 
    'Solothurn', 'basel-landschaft', 'Basel-Stadt', 'Schaffhausen', 'appenzell-inner-rhoden', 'appenzell-ausser-rhoden', 
    'st-gallen', 'Graubuenden', 'Aargau', 'Thurgau', 'Ticino', 'Vaud', 'Valais', 'Neuchatel', 'Geneva', 'Jura'
]

# The merge cells of this notebook read the per-canton files from this folder,
# so write them there directly instead of into the working directory.
output_dir = 'Data Cantons'
os.makedirs(output_dir, exist_ok=True)

# Scrape apartments for each canton
for canton in cantons:
    # One slug for both the URL and the file name. Using '-' (not '_') for
    # spaces keeps exactly one underscore in the file name, which is what the
    # merge step splits on to recover the canton.
    canton_slug = canton.replace(' ', '-').lower()
    print(f"Scraping apartments in {canton}...")
    url = f"https://www.immoscout24.ch/en/real-estate/buy/canton-{canton_slug}"
    apartment_data = scrape_apartments(url)
    # Save data to a CSV file for each canton
    apartment_data.to_csv(os.path.join(output_dir, f'immoscout24_{canton_slug}.csv'), sep=";", index=False)
    print(f"Scraping for {canton} complete.")


Scraping apartments in basel-landschaft...
Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
Scraping page 22...
Scraping page 23...
Scraping page 24...
Scraping page 25...
Scraping page 26...
Scraping page 27...
Scraping page 28...
Scraping page 29...
Scraping page 30...
Scraping page 31...
Scraping page 32...
Scraping page 33...
Scraping page 34...
Scraping page 35...
Scraping page 36...
Scraping page 37...
Scraping page 38...
Scraping page 39...
Scraping page 40...
Scraping page 41...
Scraping page 42...
Scraping page 43...
Scraping page 44...
Scraping page 45...
Scraping page 46...
Scraping page 47...
Scraping page 48...
Scrapi

In [12]:
import os
import pandas as pd

# Directory where CSV files are located
directory = 'Data Cantons'

# Get list of all CSV files in the directory. Sorted, because os.listdir()
# returns entries in arbitrary order and the row order of the merged file
# should not depend on the file system.
csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not csv_files:
    raise ValueError(f"No CSV files found in {directory!r}; nothing to merge.")

# Initialize an empty list to store DataFrames
dfs = []

# Read each CSV file and append to the list of DataFrames
for file in csv_files:
    df = pd.read_csv(os.path.join(directory, file), sep=';')
    dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_immoscout24_data.csv', sep=';', index=False)

print("Merged CSV file saved successfully.")


Merged CSV file saved successfully.


In [2]:
import os
import pandas as pd

# Directory where CSV files are located
directory = 'Data Cantons'

# Get list of all CSV files in the directory. Sorted, because os.listdir()
# returns entries in arbitrary order and the row order of the merged file
# should not depend on the file system.
csv_files = sorted(file for file in os.listdir(directory) if file.endswith('.csv'))

# Initialize an empty list to store DataFrames
dfs = []

# Read each CSV file, add 'Canton' column and append to the list of DataFrames
for file in csv_files:
    # Files are named '<prefix>_<canton>.csv': everything after the first
    # underscore is the canton, and later underscores stand for spaces.
    stem = os.path.splitext(file)[0]
    prefix, separator, canton_part = stem.partition('_')
    if not separator:
        # Not a per-canton file; say so instead of failing with an IndexError
        print(f"Skipping {file!r}: file name does not match '<prefix>_<canton>.csv'.")
        continue
    canton_name = canton_part.replace('_', ' ').title()
    df = pd.read_csv(os.path.join(directory, file), sep=';')
    df['Canton'] = canton_name
    dfs.append(df)

# Fail with a clear message instead of pd.concat's "No objects to concatenate"
if not dfs:
    raise ValueError(f"No per-canton CSV files found in {directory!r}; nothing to merge.")

# Concatenate all DataFrames into a single DataFrame
merged_df = pd.concat(dfs, ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_df.to_csv('merged_immoscout24_data_with_canton.csv', sep=';', index=False)

print("Merged CSV file with canton names saved successfully.")


Merged CSV file with canton names saved successfully.
