### Code written with GPT and by me to scrape Kamernet listing links for Amsterdam (Thu, July 30)

In [2]:
import requests
from bs4 import BeautifulSoup
import time

# Search-results URL that every request is sent to
base_url = 'https://kamernet.nl/huren/huurwoningen-amsterdam'

# Query-string arguments for the search; 'pageNo' selects the results page
params = dict(
    searchview='1',
    maxRent='0',
    minSize='2',
    radius='5',
    sort='1',
    pageNo=1,
)

# Function to scrape a single page
def scrape_page(page_number):
    """Fetch one search-results page and return the hrefs of its listing links.

    Args:
        page_number: Value sent as the ``pageNo`` query parameter.

    Returns:
        list: Non-empty ``href`` values of the anchors matching the CSS
        selector below. An empty list is returned (after printing a message)
        when the request fails or the response status is not 200.
    """
    # Build a per-request copy so the shared module-level `params` dict is
    # not mutated as a side effect of calling this function.
    query = {**params, 'pageNo': page_number}
    try:
        # Without a timeout a stalled connection would hang the whole run.
        response = requests.get(base_url, params=query, timeout=30)
    except requests.RequestException as exc:
        # Treat network errors like a bad status code: report and move on,
        # so one failed page does not discard links already collected.
        print(f"Failed to retrieve page {page_number}. Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Failed to retrieve page {page_number}. Status code: {response.status_code}")
        return []

    soup = BeautifulSoup(response.content, 'html.parser')
    links = soup.select('a.MuiTypography-root.MuiLink-root.MuiLink-underlineNone.MuiPaper-root')
    # Skip anchors whose href attribute is missing or empty.
    return [link.get('href') for link in links if link.get('href')]

# Function to scrape multiple pages
def scrape_multiple_pages(start_page, end_page, delay_seconds=2):
    """Scrape a range of result pages and return all hrefs in page order.

    Args:
        start_page: First page number to scrape (inclusive).
        end_page: Last page number to scrape (inclusive).
        delay_seconds: Pause between consecutive requests, in seconds.
            Defaults to 2, the delay that was previously hard-coded.

    Returns:
        list: The hrefs returned by ``scrape_page`` for each page,
        concatenated in page order. Duplicates are kept. The list is empty
        when ``start_page`` is greater than ``end_page``.
    """
    all_hrefs = []
    for page_number in range(start_page, end_page + 1):
        print(f"Scraping page {page_number}...")
        all_hrefs.extend(scrape_page(page_number))
        # Be polite between requests, but do not wait after the final page:
        # there is no following request to space out.
        if delay_seconds > 0 and page_number < end_page:
            time.sleep(delay_seconds)
    return all_hrefs

# Collect the links from result pages 1 through 21 (both inclusive)
first_page, last_page = 1, 21
all_hrefs = scrape_multiple_pages(first_page, last_page)

# Show every collected link on its own line
for link_path in all_hrefs:
    print(link_path)

Scraping page 1...
Scraping page 2...
Scraping page 3...
Scraping page 4...
Scraping page 5...
Scraping page 6...
Scraping page 7...
Scraping page 8...
Scraping page 9...
Scraping page 10...
Scraping page 11...
Scraping page 12...
Scraping page 13...
Scraping page 14...
Scraping page 15...
Scraping page 16...
Scraping page 17...
Scraping page 18...
Scraping page 19...
Scraping page 20...
Scraping page 21...
/huren/kamer-amsterdam/javastraat/kamer-2244992
/huren/kamer-kortenhoef/bernard-van-beeklaan/kamer-2244946
/huren/kamer-amsterdam/spakenburgstraat/kamer-2244949
/huren/kamer-amsterdam/wethouder-in-'t-veldstraat/kamer-2244647
/huren/kamer-amsterdam/johan-huizingalaan/kamer-2244926
/huren/studio-amsterdam/welnastraat/studio-2244909
/huren/kamer-amsterdam/eerste-van-swindenstraat/kamer-2241349
/huren/kamer-amsterdam/rapenburg/kamer-2244752
/huren/appartement-amsterdam/jan-evertsenstraat/appartement-2243760
/huren/kamer-almere/presidentstraat/kamer-2098922
/huren/kamer-amsterdam/groetst

In [8]:
# Collapse the scraped links into a set to count the distinct ones
unique_links = {*all_hrefs}
num_unique_links = len(unique_links)

print(f"Number of unique links: {num_unique_links}")

Number of unique links: 297


### Turn it into a dataframe with only the unique links

In [9]:
import pandas as pd

# Wrap the scraped links (duplicates included) in a one-column DataFrame
df = pd.DataFrame(data=all_hrefs, columns=["links"])

In [11]:
# Summarise the links column: row count, distinct values, most frequent link and its frequency
df.describe()

Unnamed: 0,links
count,434
unique,297
top,/huren/kamer-almere/presidentstraat/kamer-2098922
freq,21


In [12]:
# Keep the first occurrence of each link and drop the repeats
df_unique = df.drop_duplicates(subset=["links"], keep="first")

In [13]:
# Sanity check after de-duplication: count and unique should now be equal
df_unique.describe()

Unnamed: 0,links
count,297
unique,297
top,/huren/kamer-amsterdam/javastraat/kamer-2244992
freq,1


In [14]:
# Write the de-duplicated links to a CSV file, leaving out the index column
csv_file = 'links.csv'
df_unique.to_csv(path_or_buf=csv_file, index=False)

print(f"CSV file '{csv_file}' created successfully.")

CSV file 'links.csv' created successfully.


### What to use:
1. Undocumented API
2. requests + BS4
3. playwright

### If it has to be Playwright:
1. Playwright to access the page
   - use CSS selectors instead of XPath (page locators)
2. BeautifulSoup to scrape
3. Regex to clean