# Scrapping 

## Getting the links
First we get links for villa from Boligsiden:

In [7]:
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
import re
import numpy as np
import matplotlib.pyplot as plt
import json
import os
from tqdm import tqdm 


In [208]:
def scrape_data(page_number):
    #We sort for villa as this is what we want 
    base_url = 'https://www.boligsiden.dk/tilsalg/villa?page={page_number}'

    # Boligsiden is structured with pagenumbers that go up to 200:
    url = base_url.format(page_number=page_number)

    # We ensure that we tell them who we are 
    response = requests.get(url, headers={
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
        'name': 'Isabel Jacobsen',
        'email': 'qxh140@alumni.ku.dk',
        'Purpose': 'Research data collection as a student at the University of Copenhagen'
    })

    # Then check if the request was successful
    if response.status_code == 200:
        # Boligsiden contains a JSON file with the links that we want 
        soup = BeautifulSoup(response.content, 'lxml')
        script_tag = soup.find('script', id="__NEXT_DATA__")
        if script_tag:
            json_data = json.loads(script_tag.string)

            ads = json_data.get('props', {}).get('pageProps', {}).get('searchResults', {}).get('on_market', [])

            # We extract the URLs for the individual sale listings 
            url_list = []
            for ad in ads:
                address_data = ad.get('address', {})
                href = address_data.get('_links', {}).get('self', {}).get('href', '')
                if href:
                    url_list.append(f"https://www.boligsiden.dk{href}")

            # Return all found URLs
            return url_list
        else:
            print("Could not find the JSON data in the page.")
    else:
        print(f"Failed to retrieve page {page_number}, status code: {response.status_code}")

    return []

# As mentioned Boligsiden has a standard of 200 pages 
num_pages = 200

# Create an empty list to store all URLs
all_urls = []

# We look through the pages 
for page_number in tqdm(range(1, num_pages + 1), desc="Scraping URLs"):  # Use tqdm for progress bar
    urls = scrape_data(page_number)
    all_urls.extend(urls)
    
    # Sleep for 1 second between requests to avoid overwhelming the server
    time.sleep(1)

# Make final list of working URLs:
final_urls_temp = [url for url in all_urls]

# Remove duplicates in final_urls_temp:
final_urls = list(set(final_urls_temp))
num_duplicates_removed_url = len(final_urls_temp) - len(final_urls)

# Print number of duplicates removed:
print("Number of duplicates removed:", num_duplicates_removed_url)

# Save final URLs to a CSV file:
pd.DataFrame(final_urls).to_csv('Final_urls_villa.csv', index=False, header=False)

#We remove 7 dumplicates

Scraping URLs: 100%|██████████| 200/200 [07:51<00:00,  2.36s/it]

Number of duplicates removed: 7





In [210]:
#Checking to make sure the links look right and how many links we got 
final_urls
link_count = len(final_urls)

print(f"Total number of links: {link_count}")


Total number of links: 9993


## Getting information out of the links 
First testing the code then running on all the links

In [11]:
#We use selenium as the other methods did not work 
from selenium import webdriver 
from webdriver_manager.chrome import ChromeDriverManager 
from selenium.webdriver.chrome.options import Options 
from selenium.webdriver.chrome.service import Service 
from selenium.webdriver.common.by import By 

# Disable search engine choice for Google Chrome (needed for newer versions of Google Chrome) 
# We add headless and the user-agent to make scraping like this easier and faster 
chrome_options = Options() 
chrome_options.add_argument("--disable-search-engine-choice-screen")
chrome_options.add_argument("--headless")
chrome_options.add_argument('user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36')

# Initialize the Chrome driver with the options
driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=chrome_options) 

# URL to visit
url = 'https://www.boligsiden.dk' 
driver.get(url) 

# Wait for the cookie consent button to be available and click it
cookie_button = driver.find_element(By.ID, 'didomi-notice-agree-button')
cookie_button.click()




In [214]:

# We split the scraping into chunks so that we dont use up all the memory on the computer save data chunks
data_chunks_dir = "1_Data_Chunks_Location"

# We first create the directory 
if not os.path.exists(data_chunks_dir):
    os.makedirs(data_chunks_dir)


In [216]:
#We first test on a fraction so that we know that this should work on them all
# Initialize lists for storing data We end up not using them all but this was the orignal list based of the information in the API that we ended up
#Not being able to use 
adress_list = []
nr_rooms_list = []
property_type_list = []
elevator_list = []
balcony_or_terrace_list = []
energy_label_list = []
description_list = []
latitude_list = []
longitude_list = []
street_name_list = []
postal_code_list = []
city_list = []
price_list = []
provice_list = []
home_size_list = []
ground_size_list = []
basement_size_list = []
days_market_list = []
monthlyExpense_list = []
nr_bath_list = []
perAreaPrice_list = []
year_built_list = []
downPayment_list = []
grossMortgage_list = []
net_mortgage_list = []
realtor_name_list = []
off_vurdering_list = []

# Initialize a counter to keep track of the number of scraped URLs
scraped_count = 0

# Initialize lists to hold data for the current chunk
current_chunk = []

# Define the fraction of URLs to test on, we started with 5
test_fraction = 5
final_urls = final_urls[:test_fraction]


# We then create our loop 
for i in tqdm(range(len(final_urls)), desc="Scraping Data"):
    # List of URLs to scrape from above 
    url = final_urls[i]

    # Selenium scraping we add sleep to aviod breaking anything 
    driver.get(url)
    time.sleep(1)  
    html = driver.page_source
    soup = BeautifulSoup(html, 'lxml')

    # The page contains usefull json information that we get out
    script_tag = soup.find('script', {'type': 'application/ld+json'})
    if script_tag:
        json_text = script_tag.string.strip()
        try:
            data = json.loads(json_text)
        except json.JSONDecodeError as e:
            print("Error decoding JSON:", e)
            data = []
    else:
        data = []

    # Process JSON data, as it was very messy 
    if isinstance(data, list):
        for item in data:
            if isinstance(item, dict):
                type_ = item.get('@type', 'Unknown')

                if type_ == 'SingleFamilyResidence':
                    address = item.get('Address', {})
                    street_address = address.get('streetAddress', 'NaN')
                    locality = address.get('addressLocality', 'NaN')
                    postal_code = address.get('postalCode', 'NaN')
                    latitude = item.get('geo', {}).get('latitude', 'NaN')
                    longitude = item.get('geo', {}).get('longitude', 'NaN')
                    number_of_rooms = item.get('numberOfRooms', 'NaN')

                    street_name_list.append(street_address)
                    city_list.append(locality)
                    postal_code_list.append(postal_code)
                    latitude_list.append(latitude)
                    longitude_list.append(longitude)
                    nr_rooms_list.append(number_of_rooms)

                elif type_ == 'Product':
                    price = item.get('offers', {}).get('price', 'NaN')
                    price_list.append(price)

    # We then scrape additional fields using BeautifulSoup, this was the stuff visible on the page as a user 
    # House type
    type_element = soup.find('span', class_="text-black text-sm pr-2")
    property_type = type_element.get_text(strip=True) if type_element else 'N/A'
    property_type_list.append(property_type)

    # Elevator, Rooms, Year Built, Toilets, Balcony, Terrace were together as they all had the same class and no other defining features 
    elements = soup.find_all('span', class_="text-gray-800 whitespace-nowrap border-dotted text-sm")
    info = {
        'Elevator': 'NaN',
        'Rooms': 'NaN',
        'Year Built': 'NaN',
        'Toilets': 'NaN',
        'Balcony': 'NaN',
        'Terrace': 'NaN'
    }

    keywords = {
        'Elevator': 'Elevator',
        'Rooms': 'værelser',
        'Year Built': 'Opført',
        'Toilets': ['toilet', 'toiletter'],
        'Balcony': 'Altan',
        'Terrace': 'Terrasse'
    }

    for element in elements:
        text = element.get_text(strip=True)
        for key, keyword in keywords.items():
            if isinstance(keyword, list):
                if any(kw in text for kw in keyword):
                    info[key] = text.split(':', 1)[-1].strip() or 'NaN'
            else:
                if keyword in text:
                    info[key] = text.split(':', 1)[-1].strip() or 'NaN'

    elevator_list.append(info['Elevator'])
    nr_rooms_list.append(info['Rooms'])
    year_built_list.append(info['Year Built'])
    nr_bath_list.append(info['Toilets'])
    # Handle either/or for Balcony and Terrace (turned out to be useless as Boligsiden doesnt write if houses have a balcony, even when they clearly do in the photo.)
    # We kept this as we originally were going to scrape for more house types, but decided not to and focus on villas
    if info['Balcony'] == 'Ja' or info['Terrace'] == 'Ja':
        balcony_or_terrace_list.append('Ja')
    else:
        balcony_or_terrace_list.append('NaN')

    # Province
    provins_element = soup.find('a', {'data-testid': 'breadcrumb'})
    provins = provins_element.get_text(strip=True) if provins_element else 'N/A'
    provice_list.append(provins)

    # Off Vurdering
    off_vurdering_element = soup.find('span', string='Off. vurdering 2022:').find_next('span', class_="font-bold")
    off_vurdering_value = off_vurdering_element.text.replace(' kr.', '').strip() if off_vurdering_element else 'N/A'
    off_vurdering_list.append(off_vurdering_value)

    # Energy label
    energy_label_element = soup.find('svg', id='Lag_1')

    if energy_label_element:
        # Extract the title element within the SVG
        title_element = energy_label_element.find('title')

        if title_element:
            # Get the text content of the title element
            title_text = title_element.text

            # Strip "Energimærke " from the title to get just "G"
            label = title_text.replace("Energimærke ", "").strip()

            # Add the label to the energy_label_list
            energy_label_list.append(label)
        else:
            energy_label_list.append('N/A')
    else:
        energy_label_list.append('N/A')

    # Days on Market
    dage_element = soup.find('div', class_="pb-0.5 border-dashed border-b border-gray-300 cursor-pointer text-sm text-gray-800")
    dage = dage_element.get_text(strip=True) if dage_element else 'N/A'
    dage_without_tilsalg = dage.replace("Til salg i alt: ", "")
    days_market_list.append(dage_without_tilsalg)

    #The ejerudg, ground size and house size were also together so here they are split up
    span_elements = soup.find_all('span', class_='text-gray-800 whitespace-nowrap cursor-pointer border-b border-gray-400 pb-0.5 border-dotted text-sm')


    for element in span_elements:
       text = element.get_text(strip=True)
       if 'Ejerudg.:' in text:
        mu_value = text.replace("Ejerudg.:", "").replace(" kr/md", "").strip()
        monthlyExpense_list.append(mu_value)
       elif 'Grund:' in text:
        ground_size_value = text.replace("Grund:", "").replace(" m²", "").strip()
        ground_size_list.append(ground_size_value)
       elif 'm²' in text and 'Grund' not in text:
        home_size_value = text.replace(" m²", "").strip()
        home_size_list.append(home_size_value)

    
       # Append data to the current chunk
    current_chunk.append({
        'Adress': street_name_list[i],
        'City': city_list[i],
        'Street name': street_name_list[i],
        'Postal code': postal_code_list[i],
        'Latitude': latitude_list[i],
        'Longitude': longitude_list[i],
        'Rooms': nr_rooms_list[i],
        'Property type': property_type_list[i],
        'Elevator': elevator_list[i],
        'Balcony/Terrace': balcony_or_terrace_list[i],
        'Energy label': energy_label_list[i],
        'Price': price_list[i],
        'Province': provice_list[i],
        'Days on Market': days_market_list[i],
        'Link': final_urls[i],
        'Off. vurdering': off_vurdering_list[i],
        'Home size': home_size_list[i],
        'Monthly expense': monthlyExpense_list[i],
        'Ground size': ground_size_list[i]
    })

    # Increment the counter
    scraped_count += 1

    # Save data chunk
    if scraped_count % test_fraction == 0 or i == len(final_urls) - 1:
        chunk_number = (scraped_count - 1) // test_fraction + 1
        df_chunk = pd.DataFrame(current_chunk)
        df_chunk.to_csv(os.path.join("1_Data_Chunks_Location", f'data_chunk_{chunk_number}.csv'), index=False)
        current_chunk = []

    time.sleep(1)
    #More sleeping so that nothing should break or panic  

#  We connect and save chunks were not necessary when running our small test but good to have and undertand how they work 
all_chunks = []
num_chunks = (scraped_count // test_fraction) + (1 if scraped_count % test_fraction != 0 else 0)

for chunk_number in range(1, num_chunks + 1):
    chunk_filename = os.path.join("1_Data_Chunks_Location", f'data_chunk_{chunk_number}.csv')
    chunk_df = pd.read_csv(chunk_filename)
    all_chunks.append(chunk_df)

boligsiden_df = pd.concat(all_chunks, ignore_index=True)

# Remove duplicates and save final dataframe
initial_row_count = len(boligportalen_df)
boligsiden_df = boligsiden_df.drop_duplicates(subset=['Link'])
final_row_count = len(boligsiden_df)
duplicates_removed = initial_row_count - final_row_count

boligsiden_df.to_csv('1_boligsiden.csv', index=False)

duplicates_exist = boligsiden_df['Link'].duplicated().any()

print("Duplicates exist in the 'Link' column:", duplicates_exist)
print("Number of duplicates removed:", duplicates_removed)

boligsiden_df.head()
#We remove three duplicates 

Scraping Data: 100%|██████████| 2/2 [00:19<00:00,  9.63s/it]

Duplicates exist in the 'Link' column: False
Number of duplicates removed: 3





Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Rooms,Property type,Elevator,Balcony/Terrace,Energy label,Price,Province,Days on Market,Link,Off. vurdering,Home size,Monthly expense,Ground size
0,Nørregade 27,Agerskov,Nørregade 27,6534,55.13096,9.136025,4,Villa,,,E,495000,Sydjylland,Til salg i alt:22dage,https://www.boligsiden.dk/addresses/0a3f50b7-3...,,100,1.079,1015
1,Keldsvej 5,Store Fuglede,Keldsvej 5,4480,55.577656,11.180144,4 værelser,Villa,,,C,1695000,Vest- og Sydsjælland,Til salg i alt:50dage,https://www.boligsiden.dk/addresses/0a3f50ac-e...,1.488.000,223,2.079,924


In [177]:
#Making sure all the columns are right 
print(boligsiden_df.columns)
boligsiden_df.head(10)

Index(['Adress', 'City', 'Street name', 'Postal code', 'Latitude', 'Longitude',
       'Rooms', 'Property type', 'Elevator', 'Balcony/Terrace', 'Energy label',
       'Price', 'Province', 'Days on Market', 'Link', 'Off. vurdering',
       'Home size', 'Monthly expense', 'Ground size'],
      dtype='object')


Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Rooms,Property type,Elevator,Balcony/Terrace,Energy label,Price,Province,Days on Market,Link,Off. vurdering,Home size,Monthly expense,Ground size
0,Nørregade 27,Agerskov,Nørregade 27,6534,55.13096,9.136025,4,Villa,,,E,495000,Sydjylland,Til salg i alt:22dage,https://www.boligsiden.dk/addresses/0a3f50b7-3...,,100,1.079,1015
1,Keldsvej 5,Store Fuglede,Keldsvej 5,4480,55.577656,11.180144,4 værelser,Villa,,,C,1695000,Vest- og Sydsjælland,50 dage,https://www.boligsiden.dk/addresses/0a3f50ac-e...,1.488.000,223,2.079,924


### Now for the actual scraping of information

In [242]:
#Reloading all the packages as a backup 
import os
import json
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

# Function to save and load progress, as there were some problems trying to run the whole thing 
def save_progress(chunk_index):
    with open('progress.txt', 'w') as f:
        f.write(str(chunk_index))

def load_progress():
    try:
        with open('progress.txt', 'r') as f:
            return int(f.read().strip())
    except FileNotFoundError:
        return -1  

# Initialize lists for storing data again we dont use them all but i forgot to remove them so they are here 
data_lists = {
    'adress_list': [],
    'nr_rooms_list': [],
    'property_type_list': [],
    'elevator_list': [],
    'balcony_or_terrace_list': [],
    'energy_label_list': [],
    'description_list': [],
    'latitude_list': [],
    'longitude_list': [],
    'street_name_list': [],
    'postal_code_list': [],
    'city_list': [],
    'price_list': [],
    'provice_list': [],
    'home_size_list': [],
    'ground_size_list': [],
    'basement_size_list': [],
    'days_market_list': [],
    'monthlyExpense_list': [],
    'nr_bath_list': [],
    'perAreaPrice_list': [],
    'year_built_list': [],
    'downPayment_list': [],
    'grossMortgage_list': [],
    'net_mortgage_list': [],
    'realtor_name_list': [],
    'off_vurdering_list': []
}

# First we load URLs from CSV
df = pd.read_csv('Final_urls_villa.csv', header=None)
final_urls = df[0].tolist()

# Then split URLs into chunks, as we were wasting time with it failing before 
chunk_size = 500
url_chunks = [final_urls[i:i + chunk_size] for i in range(0, len(final_urls), chunk_size)]

# Initialize the driver again (as it failed once because it lost connectiong)
driver = webdriver.Chrome()

# Load the last processed chunk index
start_chunk_index = load_progress()

# Process each chunk starting from the last processed chunk
for chunk_index in range(start_chunk_index + 1, len(url_chunks)):
    print(f"Processing chunk {chunk_index + 1}/{len(url_chunks)}")
    urls = url_chunks[chunk_index]
    current_chunk = []
    scraped_count = 0

    for url in tqdm(urls, desc=f"Scraping Chunk {chunk_index + 1}", unit="url"):
        driver.get(url)
        time.sleep(2)  # Adjust as needed
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')

        # Extract JSON-LD data
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        if script_tag:
            json_text = script_tag.string.strip()
            try:
                data = json.loads(json_text)
            except json.JSONDecodeError as e:
                print("Error decoding JSON:", e)
                data = []
        else:
            data = []

        # Process JSON-LD data
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    type_ = item.get('@type', 'Unknown')

                    if type_ == 'SingleFamilyResidence':
                        address = item.get('Address', {})
                        street_address = address.get('streetAddress', 'NaN')
                        locality = address.get('addressLocality', 'NaN')
                        postal_code = address.get('postalCode', 'NaN')
                        latitude = item.get('geo', {}).get('latitude', 'NaN')
                        longitude = item.get('geo', {}).get('longitude', 'NaN')
                        number_of_rooms = item.get('numberOfRooms', 'NaN')

                        data_lists['street_name_list'].append(street_address)
                        data_lists['city_list'].append(locality)
                        data_lists['postal_code_list'].append(postal_code)
                        data_lists['latitude_list'].append(latitude)
                        data_lists['longitude_list'].append(longitude)
                        data_lists['nr_rooms_list'].append(number_of_rooms)

                    elif type_ == 'Product':
                        price = item.get('offers', {}).get('price', 'NaN')
                        data_lists['price_list'].append(price)

        # Scrape more stuff using BeautifulSoup
        type_element = soup.find('span', class_="text-black text-sm pr-2")
        property_type = type_element.get_text(strip=True) if type_element else 'N/A'
        data_lists['property_type_list'].append(property_type)

        elements = soup.find_all('span', class_="text-gray-800 whitespace-nowrap border-dotted text-sm")
        info = {
            'Elevator': 'NaN',
            'Rooms': 'NaN',
            'Year Built': 'NaN',
            'Toilets': 'NaN',
            'Balcony': 'NaN',
            'Terrace': 'NaN'
        }

        keywords = {
            'Elevator': 'Elevator',
            'Rooms': 'værelser',
            'Year Built': 'Opført',
            'Toilets': ['toilet', 'toiletter'],
            'Balcony': 'Altan',
            'Terrace': 'Terrasse'
        }

        for element in elements:
            text = element.get_text(strip=True)
            for key, keyword in keywords.items():
                if isinstance(keyword, list):
                    if any(kw in text for kw in keyword):
                        info[key] = text.split(':', 1)[-1].strip() or 'NaN'
                else:
                    if keyword in text:
                        info[key] = text.split(':', 1)[-1].strip() or 'NaN'

        data_lists['elevator_list'].append(info['Elevator'])
        data_lists['nr_rooms_list'].append(info['Rooms'])
        data_lists['year_built_list'].append(info['Year Built'])
        data_lists['nr_bath_list'].append(info['Toilets'])

        if info['Balcony'] == 'Ja' or info['Terrace'] == 'Ja':
            data_lists['balcony_or_terrace_list'].append('Ja')
        else:
            data_lists['balcony_or_terrace_list'].append('NaN')

        provins_element = soup.find('a', {'data-testid': 'breadcrumb'})
        provins = provins_element.get_text(strip=True) if provins_element else 'N/A'
        data_lists['provice_list'].append(provins)

        #This failed an hour into scrapping because i forgot the N/A option so that was fun 
        off_vurdering_element = soup.find('span', string='Off. vurdering 2022:')
        if off_vurdering_element:
            off_vurdering_value_element = off_vurdering_element.find_next('span', class_="font-bold")
            off_vurdering_value = off_vurdering_value_element.text.replace(' kr.', '').strip() if off_vurdering_value_element else 'N/A'
        else:
            off_vurdering_value = 'N/A'
        data_lists['off_vurdering_list'].append(off_vurdering_value)

        energy_label_element = soup.find('svg', id='Lag_1')
        if energy_label_element:
            title_element = energy_label_element.find('title')
            if title_element:
                title_text = title_element.text
                label = title_text.replace("Energimærke ", "").strip()
                data_lists['energy_label_list'].append(label)
            else:
                data_lists['energy_label_list'].append('N/A')
        else:
            data_lists['energy_label_list'].append('N/A')

        dage_element = soup.find('div', class_="pb-0.5 border-dashed border-b border-gray-300 cursor-pointer text-sm text-gray-800")
        dage = dage_element.get_text(strip=True) if dage_element else 'N/A'
        dage_without_tilsalg = dage.replace("Til salg i alt: ", "")
        data_lists['days_market_list'].append(dage_without_tilsalg)

        span_elements = soup.find_all('span', class_='text-gray-800 whitespace-nowrap cursor-pointer border-b border-gray-400 pb-0.5 border-dotted text-sm')
        for element in span_elements:
            text = element.get_text(strip=True)
            if 'Ejerudg.:' in text:
                mu_value = text.replace("Ejerudg.:", "").replace(" kr/md", "").strip()
                data_lists['monthlyExpense_list'].append(mu_value)
            elif 'Grund:' in text:
                ground_size_value = text.replace("Grund:", "").replace(" m²", "").strip()
                data_lists['ground_size_list'].append(ground_size_value)
            elif 'm²' in text and 'Grund' not in text:
                home_size_value = text.replace(" m²", "").strip()
                data_lists['home_size_list'].append(home_size_value)

        current_chunk.append({
            'Adress': data_lists['street_name_list'][-1],
            'City': data_lists['city_list'][-1],
            'Street name': data_lists['street_name_list'][-1],
            'Postal code': data_lists['postal_code_list'][-1],
            'Latitude': data_lists['latitude_list'][-1],
            'Longitude': data_lists['longitude_list'][-1],
            'Rooms': data_lists['nr_rooms_list'][-1],
            'Property type': data_lists['property_type_list'][-1],
            'Elevator': data_lists['elevator_list'][-1],
            'Balcony/Terrace': data_lists['balcony_or_terrace_list'][-1],
            'Energy label': data_lists['energy_label_list'][-1],
            'Price': data_lists['price_list'][-1],
            'Province': data_lists['provice_list'][-1],
            'Days on Market': data_lists['days_market_list'][-1],
            'Link': url,
            'Off. vurdering': data_lists['off_vurdering_list'][-1],
            'Home size': data_lists['home_size_list'][-1],
            'Monthly expense': data_lists['monthlyExpense_list'][-1],
            'Ground size': data_lists['ground_size_list'][-1]
        })
        #I realized i forgot to store the year built and number of bathrooms :(
        scraped_count += 1
        time.sleep(0.5) #reduced the sleeptime slightly

    # Save the current chunk to CSV
    chunk_filename = os.path.join("1_Data_Chunks_Location", f'data_chunk_{chunk_index + 1}.csv')
    pd.DataFrame(current_chunk).to_csv(chunk_filename, index=False)

    # Update progress
    save_progress(chunk_index)

# Final cleanup
driver.quit()
print("Processing complete.")


Processing chunk 1/20


Scraping Chunk 1: 100%|██████████| 500/500 [34:31<00:00,  4.14s/url]


Processing chunk 2/20


Scraping Chunk 2: 100%|██████████| 500/500 [34:55<00:00,  4.19s/url]


Processing chunk 3/20


Scraping Chunk 3: 100%|██████████| 500/500 [34:32<00:00,  4.15s/url]


Processing chunk 4/20


Scraping Chunk 4: 100%|██████████| 500/500 [34:22<00:00,  4.12s/url]


Processing chunk 5/20


Scraping Chunk 5: 100%|██████████| 500/500 [34:39<00:00,  4.16s/url]


Processing chunk 6/20


Scraping Chunk 6: 100%|██████████| 500/500 [34:29<00:00,  4.14s/url]


Processing chunk 7/20


Scraping Chunk 7: 100%|██████████| 500/500 [34:38<00:00,  4.16s/url]


Processing chunk 8/20


Scraping Chunk 8: 100%|██████████| 500/500 [34:24<00:00,  4.13s/url]


Processing chunk 9/20


Scraping Chunk 9: 100%|██████████| 500/500 [34:34<00:00,  4.15s/url]


Processing chunk 10/20


Scraping Chunk 10: 100%|██████████| 500/500 [34:13<00:00,  4.11s/url]


Processing chunk 11/20


Scraping Chunk 11: 100%|██████████| 500/500 [34:36<00:00,  4.15s/url]


Processing chunk 12/20


Scraping Chunk 12: 100%|██████████| 500/500 [34:09<00:00,  4.10s/url]


Processing chunk 13/20


Scraping Chunk 13: 100%|██████████| 500/500 [34:22<00:00,  4.13s/url]


Processing chunk 14/20


Scraping Chunk 14: 100%|██████████| 500/500 [34:52<00:00,  4.19s/url]


Processing chunk 15/20


Scraping Chunk 15: 100%|██████████| 500/500 [34:35<00:00,  4.15s/url]


Processing chunk 16/20


Scraping Chunk 16: 100%|██████████| 500/500 [34:27<00:00,  4.14s/url]


Processing chunk 17/20


Scraping Chunk 17: 100%|██████████| 500/500 [34:19<00:00,  4.12s/url]


Processing chunk 18/20


Scraping Chunk 18: 100%|██████████| 500/500 [34:45<00:00,  4.17s/url]


Processing chunk 19/20


Scraping Chunk 19: 100%|██████████| 500/500 [34:13<00:00,  4.11s/url]


Processing chunk 20/20


Scraping Chunk 20: 100%|██████████| 493/493 [33:52<00:00,  4.12s/url]


Processing complete.


In [248]:
#Then we collect all the chunks to a single dataframe 
#reload packages as i ran the scraping in the night and i just wanted to insure everything was remembered
import pandas as pd
import os

# Directory where chunk files are saved
chunks_dir = "1_Data_Chunks_Location"

# We get a list of all chunk files
chunk_files = [f for f in os.listdir(chunks_dir) if f.startswith('data_chunk_') and f.endswith('.csv')]

# Sort the files to ensure they are in the correct order
chunk_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))

# Initialize a list to store dataframes
chunk_dfs = []

# Read and collect all the chunks into the dataframe 
for file in chunk_files:
    chunk_path = os.path.join(chunks_dir, file)
    df_chunk = pd.read_csv(chunk_path)
    chunk_dfs.append(df_chunk)

final_df = pd.concat(chunk_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final_output_path = '1_boligsiden.csv'
final_df.to_csv(final_output_path, index=False)
#And now we look at it and feel joy, that it actually worked :D
final_df


Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Rooms,Property type,Elevator,Balcony/Terrace,Energy label,Price,Province,Days on Market,Link,Off. vurdering,Home size,Monthly expense,Ground size
0,Nørregade 27,Agerskov,Nørregade 27,6534,55.130960,9.136025,4 værelser,Villa,,,E,495000,Sydjylland,Til salg i alt:22dage,https://www.boligsiden.dk/addresses/0a3f50b7-3...,,100,1.079,1015
1,Keldsvej 5,Store Fuglede,Keldsvej 5,4480,55.577656,11.180144,7 værelser,Villa,,,C,1695000,Vest- og Sydsjælland,Til salg i alt:50dage,https://www.boligsiden.dk/addresses/0a3f50ac-e...,1.488.000,223,2.079,924
2,Tofteengen 8,Roskilde,Tofteengen 8,4000,55.693270,12.147028,7 værelser,Villa,,,,6495000,Østsjælland,Til salg i alt:23dage,https://www.boligsiden.dk/addresses/616d7375-2...,5.611.000,190,2.944,834
3,Længstedal 16,Skjern,Længstedal 16,6900,56.003277,8.696856,7 værelser,Villa/Landejendom,,,C,1495000,Vestjylland,42 dage,https://www.boligsiden.dk/addresses/19c2c328-b...,1.956.000,224,2.222,17768
4,Åskrænten 22,Egå,Åskrænten 22,8250,56.203278,10.264264,5 værelser,Villa,,,C,6998000,Østjylland,0 dag,https://www.boligsiden.dk/addresses/0a3f50c4-c...,5.934.000,163,4.825,810
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10152,Ligustervej 27,Silkeborg,Ligustervej 27,8600,56.151430,9.481631,4 værelser,Villa,,,C,1695000,Østjylland,Til salg i alt:15dage,https://www.boligsiden.dk/addresses/0a3f50c1-e...,1.478.000,94,2.162,830
10153,Elmsager 1,Risskov,Elmsager 1,8240,56.205704,10.211967,5,Villa,,,C,5248000,Østjylland,Til salg i alt:37dage,https://www.boligsiden.dk/addresses/0a3f50c2-c...,4.105.000,145,3.516,982
10154,Grønnemose Alle 140,Søborg,Grønnemose Alle 140,2860,55.723960,12.500110,5 værelser,Villa,,,E,7195000,Københavns omegn,Til salg i alt:14dage,https://www.boligsiden.dk/addresses/0a3f50a4-4...,4.160.000,112,3.909,975
10155,Lerbjergvej 42,Kolding,Lerbjergvej 42,6000,55.490196,9.439367,6,Villa,,,C,3195000,Sydjylland,Til salg i alt:28dage,https://www.boligsiden.dk/addresses/0a3f50bc-1...,2.157.000,144,2.323,792


In [9]:
#I realised i forgot to scrape for year built and toillets/save them after scraping the information so i do that now 
data_chunks_dir = "1_Data_Chunks_Location_test"

if not os.path.exists(data_chunks_dir):
    os.makedirs(data_chunks_dir)


In [23]:
#Reloading all the packages as a backup 
import os
import json
import time
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from tqdm import tqdm

# Function to save and load progress, as there were some problems trying to run the whole thing 
def save_progress(chunk_index):
    with open('progress2.txt', 'w') as f:
        f.write(str(chunk_index))

def load_progress():
    try:
        with open('progress2.txt', 'r') as f:
            return int(f.read().strip())
    except FileNotFoundError:
        return -1  

# Initialize lists for storing data again we dont use them all but i forgot to remove them so they are here 
data_lists = {
    'adress_list': [],
    'latitude_list': [],
    'longitude_list': [],
    'street_name_list': [],
    'postal_code_list': [],
    'city_list': [],
    'nr_bath_list': [],
    'year_built_list': [],

}

# First we load URLs from CSV
df = pd.read_csv('Final_urls_villa.csv', header=None)
final_urls = df[0].tolist()

# Then split URLs into chunks, as we were wasting time with it failing before 
chunk_size = 500
url_chunks = [final_urls[i:i + chunk_size] for i in range(0, len(final_urls), chunk_size)]

# Initialize the driver again (as it failed once because it lost connectiong)
driver = webdriver.Chrome()

# Load the last processed chunk index
start_chunk_index = load_progress()

# Process each chunk starting from the last processed chunk
for chunk_index in range(start_chunk_index + 1, len(url_chunks)):
    print(f"Processing chunk {chunk_index + 1}/{len(url_chunks)}")
    urls = url_chunks[chunk_index]
    current_chunk = []
    scraped_count = 0

    for url in tqdm(urls, desc=f"Scraping Chunk {chunk_index + 1}", unit="url"):
        driver.get(url)
        time.sleep(0.5)  # Adjust as needed
        html = driver.page_source
        soup = BeautifulSoup(html, 'lxml')

        # Extract JSON-LD data
        script_tag = soup.find('script', {'type': 'application/ld+json'})
        if script_tag:
            json_text = script_tag.string.strip()
            try:
                data = json.loads(json_text)
            except json.JSONDecodeError as e:
                print("Error decoding JSON:", e)
                data = []
        else:
            data = []

        # Process JSON-LD data
        if isinstance(data, list):
            for item in data:
                if isinstance(item, dict):
                    type_ = item.get('@type', 'Unknown')

                    if type_ == 'SingleFamilyResidence':
                        address = item.get('Address', {})
                        street_address = address.get('streetAddress', 'NaN')
                        locality = address.get('addressLocality', 'NaN')
                        postal_code = address.get('postalCode', 'NaN')
                        latitude = item.get('geo', {}).get('latitude', 'NaN')
                        longitude = item.get('geo', {}).get('longitude', 'NaN')
                       
                        data_lists['street_name_list'].append(street_address)
                        data_lists['city_list'].append(locality)
                        data_lists['postal_code_list'].append(postal_code)
                        data_lists['latitude_list'].append(latitude)
                        data_lists['longitude_list'].append(longitude)
            

        # Scrape more stuff using BeautifulSoup
        type_element = soup.find('span', class_="text-black text-sm pr-2")
        property_type = type_element.get_text(strip=True) if type_element else 'N/A'
        
        elements = soup.find_all('span', class_="text-gray-800 whitespace-nowrap border-dotted text-sm")
        info = {
            
            'Year Built': 'NaN',
            'Toilets': 'NaN',
            
        }

        keywords = {
            
            'Year Built': 'Opført',
            'Toilets': ['toilet', 'toiletter'],
           
        }

        for element in elements:
            text = element.get_text(strip=True)
            for key, keyword in keywords.items():
                if isinstance(keyword, list):
                    if any(kw in text for kw in keyword):
                        info[key] = text.split(':', 1)[-1].strip() or 'NaN'
                else:
                    if keyword in text:
                        info[key] = text.split(':', 1)[-1].strip() or 'NaN'

        data_lists['year_built_list'].append(info['Year Built'])
        data_lists['nr_bath_list'].append(info['Toilets'])


       
        current_chunk.append({
            'Adress': data_lists['street_name_list'][-1],
            'City': data_lists['city_list'][-1],
            'Street name': data_lists['street_name_list'][-1],
            'Postal code': data_lists['postal_code_list'][-1],
            'Latitude': data_lists['latitude_list'][-1],
            'Longitude': data_lists['longitude_list'][-1],
            'Year built': data_lists['year_built_list'][-1],
            'Bathrooms': data_lists['nr_bath_list'][-1]
        })

        scraped_count += 1
        time.sleep(0.5) #reduced the sleeptime slightly

    # Save the current chunk to CSV
    chunk_filename = os.path.join("1_Data_Chunks_Location_test", f'data_chunk_{chunk_index + 1}.csv')'
    0
    pd.DataFrame(current_chunk).to_csv(chunk_filename, index=False)

    # Update progress
    save_progress(chunk_index)

# Final cleanup
driver.quit()
print("Processing complete.")
#I get all the location data so that i can merge 

Processing chunk 1/20


Scraping Chunk 1: 100%|██████████| 500/500 [22:18<00:00,  2.68s/url]


Processing chunk 2/20


Scraping Chunk 2: 100%|██████████| 500/500 [22:16<00:00,  2.67s/url]


Processing chunk 3/20


Scraping Chunk 3: 100%|██████████| 500/500 [22:30<00:00,  2.70s/url]


Processing chunk 4/20


Scraping Chunk 4: 100%|██████████| 500/500 [22:26<00:00,  2.69s/url]


Processing chunk 5/20


Scraping Chunk 5: 100%|██████████| 500/500 [22:16<00:00,  2.67s/url]


Processing chunk 6/20


Scraping Chunk 6: 100%|██████████| 500/500 [21:49<00:00,  2.62s/url]


Processing chunk 7/20


Scraping Chunk 7: 100%|██████████| 500/500 [22:00<00:00,  2.64s/url]


Processing chunk 8/20


Scraping Chunk 8: 100%|██████████| 500/500 [22:04<00:00,  2.65s/url]


Processing chunk 9/20


Scraping Chunk 9: 100%|██████████| 500/500 [22:07<00:00,  2.66s/url]


Processing chunk 10/20


Scraping Chunk 10: 100%|██████████| 500/500 [21:56<00:00,  2.63s/url]


Processing chunk 11/20


Scraping Chunk 11: 100%|██████████| 500/500 [22:07<00:00,  2.65s/url]


Processing chunk 12/20


Scraping Chunk 12: 100%|██████████| 500/500 [22:13<00:00,  2.67s/url]


Processing chunk 13/20


Scraping Chunk 13: 100%|██████████| 500/500 [22:32<00:00,  2.71s/url]


Processing chunk 14/20


Scraping Chunk 14: 100%|██████████| 500/500 [22:18<00:00,  2.68s/url]


Processing chunk 15/20


Scraping Chunk 15: 100%|██████████| 500/500 [22:40<00:00,  2.72s/url]


Processing chunk 16/20


Scraping Chunk 16: 100%|██████████| 500/500 [22:35<00:00,  2.71s/url]


Processing chunk 17/20


Scraping Chunk 17: 100%|██████████| 500/500 [22:23<00:00,  2.69s/url]


Processing chunk 18/20


Scraping Chunk 18: 100%|██████████| 500/500 [22:05<00:00,  2.65s/url]


Processing chunk 19/20


Scraping Chunk 19: 100%|██████████| 500/500 [22:04<00:00,  2.65s/url]


Processing chunk 20/20


Scraping Chunk 20: 100%|██████████| 493/493 [21:44<00:00,  2.65s/url]


Processing complete.


In [27]:
#Then we collect all the chunks to a single dataframe 
#reload packages as i ran the scraping in the night and i just wanted to insure everything was remembered
import pandas as pd
import os

# Directory where chunk files are saved
chunks_dir = "1_Data_Chunks_Location_test"

# We get a list of all chunk files
chunk_files = [f for f in os.listdir(chunks_dir) if f.startswith('data_chunk_') and f.endswith('.csv')]

# Sort the files to ensure they are in the correct order
chunk_files.sort(key=lambda x: int(x.split('_')[-1].split('.')[0]))

# Initialize a list to store dataframes
chunk_dfs = []

# Read and collect all the chunks into the dataframe 
for file in chunk_files:
    chunk_path = os.path.join(chunks_dir, file)
    df_chunk = pd.read_csv(chunk_path)
    chunk_dfs.append(df_chunk)

final2_df = pd.concat(chunk_dfs, ignore_index=True)

# Save the final dataframe to a CSV file
final2_output_path = 'boligsiden_glemt.csv'
final2_df.to_csv(final_output_path, index=False)
#And now we look at it and feel joy, that it actually worked :D
final2_df


Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Year built,Bathrooms
0,Nørregade 27,Agerskov,Nørregade 27,6534,55.130960,9.136025,Opført 1947,1 toilet
1,Keldsvej 5,Store Fuglede,Keldsvej 5,4480,55.577656,11.180144,Opført 1900,2 toiletter
2,Tofteengen 8,Roskilde,Tofteengen 8,4000,55.693270,12.147028,Opført 2017,2 toiletter
3,Længstedal 16,Skjern,Længstedal 16,6900,56.003277,8.696856,Opført 1979,2 toiletter
4,Åskrænten 22,Egå,Åskrænten 22,8250,56.203278,10.264264,Opført 1962,2 toiletter
...,...,...,...,...,...,...,...,...
9988,Bødkerstræde 10,Karlslunde,Bødkerstræde 10,2690,55.570120,12.224564,Opført 1788,2 toiletter
9989,Odinsvej 2,Kirke Såby,Odinsvej 2,4060,55.651460,11.880332,Opført 1953,1 toilet
9990,Purupvej 14,Østbirk,Purupvej 14,8752,55.987175,9.734029,Opført 1910,1 toilet
9991,Vinkelvej 9,Ulfborg,Vinkelvej 9,6990,56.375076,8.121837,Opført 1953,1 toilet


In [29]:
#Now we fix all the little errors in the formating 
final2_df.loc[:, 'Year built'] = final2_df['Year built'].str.replace('Opført ', '').str.strip()
final2_df.loc[:, 'Bathrooms'] = final2_df['Bathrooms'].str.replace('toilet', '').str.strip()
final2_df.loc[:, 'Bathrooms'] = final2_df['Bathrooms'].str.replace('ter', '').str.strip()

final2_df

Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Year built,Bathrooms
0,Nørregade 27,Agerskov,Nørregade 27,6534,55.130960,9.136025,1947,1
1,Keldsvej 5,Store Fuglede,Keldsvej 5,4480,55.577656,11.180144,1900,2
2,Tofteengen 8,Roskilde,Tofteengen 8,4000,55.693270,12.147028,2017,2
3,Længstedal 16,Skjern,Længstedal 16,6900,56.003277,8.696856,1979,2
4,Åskrænten 22,Egå,Åskrænten 22,8250,56.203278,10.264264,1962,2
...,...,...,...,...,...,...,...,...
9988,Bødkerstræde 10,Karlslunde,Bødkerstræde 10,2690,55.570120,12.224564,1788,2
9989,Odinsvej 2,Kirke Såby,Odinsvej 2,4060,55.651460,11.880332,1953,1
9990,Purupvej 14,Østbirk,Purupvej 14,8752,55.987175,9.734029,1910,1
9991,Vinkelvej 9,Ulfborg,Vinkelvej 9,6990,56.375076,8.121837,1953,1


In [51]:
final2_df.loc[:, 'Adress'] = final2_df['Street name'] + ', ' + final2_df['City'] + ' ' + final2_df['Postal code'].astype(str)


KeyError: 'Street name'

In [53]:
final2_df = final2_df.drop(columns='City')
final2_df

Unnamed: 0,Adress,Year built,Bathrooms
0,"Nørregade 27, Agerskov 6534",1947,1
1,"Keldsvej 5, Store Fuglede 4480",1900,2
2,"Tofteengen 8, Roskilde 4000",2017,2
3,"Længstedal 16, Skjern 6900",1979,2
4,"Åskrænten 22, Egå 8250",1962,2
...,...,...,...
9988,"Bødkerstræde 10, Karlslunde 2690",1788,2
9989,"Odinsvej 2, Kirke Såby 4060",1953,1
9990,"Purupvej 14, Østbirk 8752",1910,1
9991,"Vinkelvej 9, Ulfborg 6990",1953,1


In [35]:
#Loading the csv file with the maps data and socio-econ data, the one we want to merge with 
df_last = pd.read_csv("Boligsiden_fix.csv")  
df_last


Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Rooms,Property type,Energy label,Price,...,PhD and research programs,Special Education expense,Healthcare expense,Green Spaces expense,Infrastructure expense,Social Services expense,Landejendom,Fritidsbolig,Elderly Care expense (mio.),Elementary school expense (mio.)
0,"Nørregade 27, Agerskov 6534",Agerskov,Nørregade,6534,55.130960,9.136025,4,Villa,E,495000,...,2.0,330000.0,7853000.0,0.0,35717000.0,29579000.0,0,0,215.948,216.587
1,"Keldsvej 5, Store Fuglede 4480",Store Fuglede,Keldsvej,4480,55.577656,11.180144,7,Villa,C,1695000,...,5.0,1666000.0,4171000.0,156000.0,20071000.0,46418000.0,0,0,222.362,280.686
2,"Tofteengen 8, Roskilde 4000",Roskilde,Tofteengen,4000,55.693270,12.147028,7,Villa,,6495000,...,92.0,2574000.0,17162000.0,500000.0,94707000.0,54150000.0,0,0,439.051,652.521
3,"Længstedal 16, Skjern 6900",Skjern,Længstedal,6900,56.003277,8.696856,7,Villa,C,1495000,...,6.0,3000000.0,5691000.0,1839000.0,47807000.0,36748000.0,1,0,271.355,370.434
4,"Åskrænten 22, Egå 8250",Egå,Åskrænten,8250,56.203278,10.264264,5,Villa,C,6998000,...,1470.0,4347000.0,68896000.0,5633000.0,75722000.0,113920000.0,0,0,1442.222,2226.985
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9910,"Bødkerstræde 10, Karlslunde 2690",Karlslunde,Bødkerstræde,2690,55.570120,12.224564,6,Villa,C,5995000,...,15.0,382000.0,5283000.0,0.0,12955000.0,26236000.0,0,0,202.342,370.660
9911,"Odinsvej 2, Kirke Såby 4060",Kirke Såby,Odinsvej,4060,55.651460,11.880332,3,Villa,C,1645000,...,23.0,1329000.0,5804000.0,0.0,10511000.0,22759000.0,0,0,151.829,197.365
9912,"Purupvej 14, Østbirk 8752",Østbirk,Purupvej,8752,55.987175,9.734029,5,Villa,E,2495000,...,33.0,2137000.0,22739000.0,1695000.0,52592000.0,48934000.0,0,0,460.442,631.385
9913,"Vinkelvej 9, Ulfborg 6990",Ulfborg,Vinkelvej,6990,56.375076,8.121837,1,Villa,D,695000,...,16.0,625000.0,13353000.0,2156000.0,47265000.0,54212000.0,0,0,473.272,352.497


In [43]:
#Fix the energy label in this one as i didn't realise that A came as A2015 and A2020
filtered_df = df_last[df_last['Energy label'].str.contains(r'A', na=False)]

# Display the filtered DataFrame
filtered_df.head(30)

df_last.loc[:, 'Energy label'] = df_last['Energy label'].str.replace('2015', '').str.strip()
df_last.loc[:, 'Energy label'] = df_last['Energy label'].str.replace('2020', '').str.strip()
filtered_df = df_last[df_last['Energy label'].str.contains(r'A', na=False)]
filtered_df

Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Rooms,Property type,Energy label,Price,...,PhD and research programs,Special Education expense,Healthcare expense,Green Spaces expense,Infrastructure expense,Social Services expense,Landejendom,Fritidsbolig,Elderly Care expense (mio.),Elementary school expense (mio.)
34,"Kildedalen 5, Svenstrup J 9230",Svenstrup J,Kildedalen,9230,56.949890,9.897466,5,Villa,A,3998000,...,582.0,5402000.0,58139000.0,1610000.0,132129000.0,134689000.0,0,0,1117.856,1620.359
64,"Rundforbivej 22, Vedbæk 2950",Vedbæk,Rundforbivej,2950,55.836227,12.545156,5,Villa,A,14800000,...,125.0,5509000.0,11262000.0,1112000.0,49340000.0,46992000.0,0,0,409.689,450.739
104,"Enggårdsparken 10, Fredericia 7000",Fredericia,Enggårdsparken,7000,55.621338,9.784511,5,Villa,A,3375000,...,19.0,5668000.0,3736000.0,1567000.0,19500000.0,32855000.0,0,0,282.291,334.054
167,"Hermelintoften 17, Ribe 6760",Ribe,Hermelintoften,6760,55.281640,8.727292,5,Villa,A,2298000,...,50.0,1738000.0,81131000.0,6424000.0,94017000.0,57399000.0,0,0,706.800,918.771
187,"Merianvej 16, Silkeborg 8600",Silkeborg,Merianvej,8600,56.201477,9.603418,5,Villa,A,3895000,...,61.0,1698000.0,14054000.0,5014000.0,43587000.0,52044000.0,0,0,392.819,728.249
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9833,"Fyrretoften 36, Vejle 7100",Vejle,Fyrretoften,7100,55.724390,9.548992,5,Villa,A,5585000,...,55.0,3247000.0,54427000.0,9614000.0,54670000.0,84324000.0,0,0,483.295,835.447
9838,"Strandhusevej 5, Frederikshavn 9900",Frederikshavn,Strandhusevej,9900,57.363750,10.511791,2,Villa,A,4200000,...,7.0,344000.0,18414000.0,996000.0,-905000.0,46091000.0,0,0,346.287,402.201
9875,"Falkevej 18, Frederikshavn 9900",Frederikshavn,Falkevej,9900,57.443630,10.532893,5,Villa,A,4295000,...,7.0,344000.0,18414000.0,996000.0,-905000.0,46091000.0,0,0,346.287,402.201
9886,"Peter Petersens Alle 21, Dragør 2791",Dragør,Peter Petersens Alle,2791,55.592490,12.662625,5,Villa,A,8600000,...,5.0,53000.0,1829000.0,0.0,5005000.0,7321000.0,0,0,81.968,123.485


In [57]:
#We then merge the dataframes 
merged_df = pd.merge(df_last, final2_df, on='Adress', how='inner')
merged_df

Unnamed: 0,Adress,City,Street name,Postal code,Latitude,Longitude,Rooms,Property type,Energy label,Price,...,Healthcare expense,Green Spaces expense,Infrastructure expense,Social Services expense,Landejendom,Fritidsbolig,Elderly Care expense (mio.),Elementary school expense (mio.),Year built,Bathrooms
0,"Nørregade 27, Agerskov 6534",Agerskov,Nørregade,6534,55.130960,9.136025,4,Villa,E,495000,...,7853000.0,0.0,35717000.0,29579000.0,0,0,215.948,216.587,1947,1
1,"Keldsvej 5, Store Fuglede 4480",Store Fuglede,Keldsvej,4480,55.577656,11.180144,7,Villa,C,1695000,...,4171000.0,156000.0,20071000.0,46418000.0,0,0,222.362,280.686,1900,2
2,"Tofteengen 8, Roskilde 4000",Roskilde,Tofteengen,4000,55.693270,12.147028,7,Villa,,6495000,...,17162000.0,500000.0,94707000.0,54150000.0,0,0,439.051,652.521,2017,2
3,"Længstedal 16, Skjern 6900",Skjern,Længstedal,6900,56.003277,8.696856,7,Villa,C,1495000,...,5691000.0,1839000.0,47807000.0,36748000.0,1,0,271.355,370.434,1979,2
4,"Åskrænten 22, Egå 8250",Egå,Åskrænten,8250,56.203278,10.264264,5,Villa,C,6998000,...,68896000.0,5633000.0,75722000.0,113920000.0,0,0,1442.222,2226.985,1962,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9910,"Bødkerstræde 10, Karlslunde 2690",Karlslunde,Bødkerstræde,2690,55.570120,12.224564,6,Villa,C,5995000,...,5283000.0,0.0,12955000.0,26236000.0,0,0,202.342,370.660,1788,2
9911,"Odinsvej 2, Kirke Såby 4060",Kirke Såby,Odinsvej,4060,55.651460,11.880332,3,Villa,C,1645000,...,5804000.0,0.0,10511000.0,22759000.0,0,0,151.829,197.365,1953,1
9912,"Purupvej 14, Østbirk 8752",Østbirk,Purupvej,8752,55.987175,9.734029,5,Villa,E,2495000,...,22739000.0,1695000.0,52592000.0,48934000.0,0,0,460.442,631.385,1910,1
9913,"Vinkelvej 9, Ulfborg 6990",Ulfborg,Vinkelvej,6990,56.375076,8.121837,1,Villa,D,695000,...,13353000.0,2156000.0,47265000.0,54212000.0,0,0,473.272,352.497,1953,1


In [62]:
#and save it to a csv

merged_df.loc[:, 'Energy label'] =merged_df['Energy label'].str.replace('2', '').str.strip()
merged_df.to_csv("Boligsiden_med_årnu.csv", index=False)
