In [1]:
import requests
from bs4 import BeautifulSoup
from collections import Counter
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist
from cachetools import TTLCache

# Define the 10 biggest cities in Switzerland
cities = ['Zurich', 'Genf', 'Basel', 'Lausanne', 'Bern', 'Winterthur', 'Lucerne', 'St-Gallen', 'Lugano', 'Biel-Bienne']

# Create a cache with a time-to-live (TTL) of 2 hour to speed up reiterations
cache = TTLCache(maxsize=100, ttl=7200)

# Function to fetch a URL with caching
def fetch_url(url):
    if url in cache:
        return cache[url]
    response = requests.get(url)
    cache[url] = response
    return response

# Function to scrape apartment descriptions from ImmobilienScout24.ch
def scrape_apartment_descriptions(city):
    url = f'https://www.immoscout24.ch/en/real-estate/city-{city.lower().replace(" ", "-")}'
    response = fetch_url(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    description_div = soup.find('div', class_='Text__TextStyled-fiIwWW')
    description_texts = [desc.get_text(strip=True) for desc in descriptions]
    return description_texts

# Scrape apartment descriptions for each city and page
all_descriptions = []
for city in cities:
    page = 1
    while True:
        url = f'https://www.immoscout24.ch/en/real-estate/rent/city-{city.lower().replace(" ", "-")}?pn={page}'
        #print("URL:", url)  # Print the generated URL for debugging
        response = fetch_url(url)
        soup = BeautifulSoup(response.content, 'html.parser')
        descriptions = soup.find_all('p', {'class': ['Box-cYFBPY', 'cVjoUC']})
        if not descriptions:
            break
        description_texts = [desc.get_text(strip=True) for desc in descriptions]
        all_descriptions.extend(description_texts)
        next_button = soup.find('div', class_='Box-cYFBPY jlxgvH')
        if not next_button:
            break
        page += 1

# Combine all descriptions into a single text
all_text = ' '.join(all_descriptions)

# Text Preprocessing
stop_words = set(stopwords.words('german'))  # Adjust the language as per your requirements
tokens = word_tokenize(all_text)
tokens = [word.lower() for word in tokens if word.isalpha() and word.lower() not in stop_words]

# Keyword summarization using Frequency Distribution
fdist = FreqDist(tokens)
keyword_counts = fdist.most_common(20)
keywords = [keyword for keyword, count in keyword_counts]

print("Keywords:", keywords)


[nltk_data] Downloading package punkt to /Users/taagaar8/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/taagaar8/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Keywords: ['de', 'wohnung', 'à', 'küche', 'et', 'vermieten', 'un', 'avec', 'a', 'une', 'per', 'la', 'le', 'nähe', 'chf', 'au', 'e', 'badezimmer', 'sowie', 'balkon']
