## Setup connection

In [46]:
import requests
import time
# A single shared session, so every request in this notebook goes through the
# same requests session object (and any headers set on it).
session = requests.Session()
# Optional identification headers, currently disabled:
#session.headers['User-Agent'] = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:58.0) Gecko/20100101 Firefox/58.0' # imitate browser
#session.headers['email'] = ''
def ratelimit(delay=0.5):
    """Pause before a request so the scraper does not hammer the server.

    Args:
        delay: Number of seconds to sleep. Defaults to 0.5, the value that
            was previously hard-coded, so existing ``ratelimit()`` calls
            behave exactly as before.
    """
    time.sleep(delay)
def get(url,iterations=10,timeout=30):
    """Fetch ``url`` through the shared session, retrying on failure.

    Each attempt is preceded by ``ratelimit()``. An attempt counts as failed
    when the request raises a ``requests`` exception (connection error,
    timeout, ...) or when the response status is not OK.

    Args:
        url: Address to request.
        iterations: Maximum number of attempts.
        timeout: Seconds to wait for the server on each attempt. Without a
            timeout a stalled connection would block the notebook forever.

    Returns:
        The first successful response, or ``None`` when every attempt failed.
        Callers must check for ``None`` before using the result.
    """
    for _ in range(iterations):
        ratelimit()
        try:
            response = session.get(url, timeout=timeout)
        except requests.RequestException:
            # Only network-level errors are retried; KeyboardInterrupt and
            # programming errors are allowed to propagate.
            continue
        if response.ok:
            return response
    return None

# Collect data

## Roadmap of the site (located using the browser's "Network" tab)

In [9]:
# Fetch the list of all municipalities known to the site.
url = 'https://www.boligsiden.dk/area/getallmunicipalities'
muncipalities = get(url).json()
# The lower-cased encoded names ("slugs") are what the other endpoints
# expect in their `kommune` query parameter.
slugs = [entry['encodedName'].lower() for entry in muncipalities]
slugs[:5]

['albertslund', 'alleroed', 'assens', 'ballerup', 'billund']

## Define log

In [18]:
# Define log
# NOTE: mode 'w' truncates an existing log. Skip this cell when resuming a
# previous run and load the existing log instead.
import os

done = set()  # URLs that have already been downloaded and saved
# The log lives inside 'boligsiden'; create the folder here so this cell
# does not depend on a later cell having been run first.
os.makedirs('boligsiden', exist_ok=True)
# The context manager flushes and closes the file, so the header is on disk
# before any other cell reads the log or appends to it.
with open('boligsiden/boligsiden_log','w') as logfile:
    logfile.write('url,path,length,servertime,delta_t\n')

35

In [None]:
# When resuming an earlier run: rebuild the set of finished URLs from the
# log written so far, so they are skipped by the download loop.
import pandas as pd
log_df = pd.read_csv('boligsiden/boligsiden_log')
done = set(log_df['url'])

## Define path to raw data

In [10]:
# Create the folders for the raw BBR responses.
# os.makedirs(..., exist_ok=True) creates the parent 'boligsiden' folder as
# well, does nothing when the folders already exist (so the cell can be
# re-run), and does not depend on a shell `mkdir` being available.
import os
os.makedirs('boligsiden/raw_json', exist_ok=True)

## Collect data

### BBR

In [None]:
# Collect the number of results for each municipality
import json

muncipality2count = {}
for slug in slugs:
    url = 'https://www.boligsiden.dk/salespriceresult/getcountbyarchive?salgspristype=bbr&periode.from=2012-01-01&sortdescending=true&sort=vejnavn&kommune=%s&boligtype=alle&side=1'%slug
    response = get(url)
    if response is None:
        # get() returns None once all of its attempts have failed; stop with
        # a clear message instead of an AttributeError on None.
        raise RuntimeError('Could not fetch the result count for %s' % slug)
    muncipality2count[slug] = response.json()['count']
# Save the counts so they can be reloaded without querying the server again.
# The context manager guarantees the file is flushed and closed.
with open('slug2count','w') as count_file:
    json.dump(muncipality2count,count_file)

In [None]:
# Reload the saved counts (use this instead of re-collecting them).
# json is imported here because the collection cell, which otherwise imports
# it, is skipped when resuming.
import json
with open('slug2count','r') as count_file:
    muncipality2count = json.load(count_file)

In [None]:
# Download every BBR result page of every municipality, store the raw JSON
# and append one log line per downloaded page.
last_time = time.time()
# Append mode keeps the entries of earlier runs. The context manager closes
# the log even when the loop stops with an error.
with open('boligsiden/boligsiden_log','a') as logfile:
    for slug in slugs:
        print(slug)
        n_results = muncipality2count[slug]
        # Same page range as before: pages 1 .. int(n_results/30)+1.
        # Presumably the API serves 30 results per page -- TODO confirm.
        page_stop = int(n_results/30)+2
        for page_n in range(1,page_stop):
            url ='https://www.boligsiden.dk/salespriceresult/getdata?salgspristype=bbr&periode.from=2012-01-01&sortdescending=true&sort=vejnavn&kommune=%s&boligtype=alle&side=%d'%(slug,page_n)
            if url in done:
                continue
            # get() signals failure by returning None, so the back-off must
            # be triggered by a None result as well as by an exception.
            try:
                response = get(url)
            except Exception:
                response = None
            if response is None:
                time.sleep(60) # remote connection error.
                response = get(url)
            if response is None:
                # Not added to `done`, so re-running the cell retries this page.
                raise RuntimeError('No response for %s after waiting and retrying' % url)
            response_json = response.json()
            path = 'boligsiden/raw_json/bolig_results_%s_%d'%(slug,page_n)
            with open(path,'w') as json_file:
                json.dump(response_json,json_file)
            # log
            servertime = time.time()
            delta_t = servertime-last_time
            length  = len(response.text)
            logfile.write('%s,%s,%d,%r,%r\n'%(url,path,length,servertime,delta_t))
            # Flush right away so the log on disk matches the saved files
            # even if the kernel dies mid-run.
            logfile.flush()
            last_time = servertime
            done.add(url)
            if page_n%10==0:
                print(page_stop-page_n,end=' ')


### Sales prices

In [None]:
# Start with an empty slug -> result-count mapping for the sales archive.
# It lives in its own cell so the collection cell can be re-run without
# discarding the counts gathered so far.
muncipality2count_sales = dict()

In [None]:
import json
import tqdm
# Collect the number of sales results per municipality. Slugs that already
# have a count are skipped, so the cell can be re-run after a failure.
for slug in tqdm.tqdm(slugs):
    if slug in muncipality2count_sales:
        continue
    url = 'https://www.boligsiden.dk/salespriceresult/getcountbypropertySale?salgspristype=arkiv&periode.from=2012-01-01&sortdescending=true&sort=udbudt&kommune=%s&boligtype=alle&side=1'%slug
    response = get(url)
    if response is None:
        # get() returns None once all of its attempts have failed; stop with
        # a clear message instead of an AttributeError on None.
        raise RuntimeError('Could not fetch the sales result count for %s' % slug)
    muncipality2count_sales[slug] = response.json()['count']
# The context manager guarantees the file is flushed and closed.
with open('slug2count_sales','w') as count_file:
    json.dump(muncipality2count_sales,count_file)

In [50]:

# Reload the saved sales counts (use this instead of re-collecting them).
# json is imported here so the cell also works on a fresh kernel.
import json
with open('slug2count_sales','r') as count_file:
    muncipality2count_sales = json.load(count_file)

### Set log and path to data

In [47]:
# Create the folder for the raw sales responses. Safe to re-run, and it also
# creates the parent 'boligsiden' folder when that is missing.
import os
os.makedirs('boligsiden/raw_json_sales/', exist_ok=True)

In [51]:
# Start a fresh log for the sales download.
# NOTE: mode 'w' truncates an existing log -- skip this cell when resuming.
import os

done_sales = set()  # sales URLs that have already been downloaded and saved
os.makedirs('boligsiden', exist_ok=True)  # make sure the log folder exists
# The context manager flushes and closes the file, so the header is on disk
# before the download cell appends to the log.
with open('boligsiden/boligsiden_log2','w') as logfile:
    logfile.write('url,path,length,servertime,delta_t\n')

35

In [None]:
# Download every sales result page of every municipality, store the raw JSON
# and append one log line per downloaded page.
last_time = time.time()
# Append mode keeps the entries of earlier runs. The context manager closes
# the log even when the loop stops with an error.
with open('boligsiden/boligsiden_log2','a') as logfile:
    for slug in slugs:
        print(slug)
        n_results = muncipality2count_sales[slug]
        # Same page range as before: pages 1 .. int(n_results/30)+1.
        # Presumably the API serves 30 results per page -- TODO confirm.
        page_stop = int(n_results/30)+2
        for page_n in range(1,page_stop):
            url ='https://www.boligsiden.dk/salespriceresult/getdata?salgspristype=arkiv&periode.from=2012-01-01&sortdescending=true&sort=udbudt&kommune=%s&boligtype=alle&side=%d'%(slug,page_n)
            if url in done_sales:
                continue
            # get() signals failure by returning None, so the back-off must
            # be triggered by a None result as well as by an exception.
            try:
                response = get(url)
            except Exception:
                response = None
            if response is None:
                time.sleep(60) # remote connection error.
                response = get(url)
            if response is None:
                # Not added to `done_sales`, so re-running the cell retries this page.
                raise RuntimeError('No response for %s after waiting and retrying' % url)
            response_json = response.json()
            path = 'boligsiden/raw_json_sales/bolig_results_sales_%s_%d'%(slug,page_n)
            with open(path,'w') as json_file:
                json.dump(response_json,json_file)
            # log
            servertime = time.time()
            delta_t = servertime-last_time
            length  = len(response.text)
            logfile.write('%s,%s,%d,%r,%r\n'%(url,path,length,servertime,delta_t))
            # Flush right away so the log on disk matches the saved files
            # even if the kernel dies mid-run.
            logfile.flush()
            last_time = servertime
            done_sales.add(url)
            if page_n%10==0:
                print(page_stop-page_n,end=' ')


# Format data

In [74]:
from os import listdir
from os.path import isfile
import pandas as pd

base_path = 'boligsiden/raw_json/'
# listdir() returns entries in arbitrary order and also reports hidden
# entries and sub-directories. Sort for a reproducible row order, and keep
# only regular, non-hidden files so stray entries cannot break the JSON
# parsing.
files = [base_path+name for name in sorted(listdir(base_path))
         if not name.startswith('.') and isfile(base_path+name)]
data = []
def flatten_dict(d,depth=2):
    """Flatten nested dictionaries of ``d`` in place and return ``d``.

    For ``depth`` rounds, every value that is a plain ``dict`` has its items
    copied to the top level under the key ``'<parent>_<child>'``. Afterwards
    every key whose value is still a ``dict`` is removed, so anything nested
    deeper than ``depth`` levels is dropped.

    Args:
        d: Dictionary to flatten; it is modified in place.
        depth: Number of nesting levels to lift to the top level.

    Returns:
        The same dictionary object ``d``.
    """
    for _ in range(depth):
        # Iterate over a snapshot: keys added in this round are handled in
        # the next round.
        for parent in list(d):
            child = d[parent]
            if type(child) is not dict:
                continue
            for sub_key, sub_value in child.items():
                d['%s_%s'%(parent,sub_key)] = sub_value
    # Remove whatever is still a dictionary (the lifted parents and anything
    # nested too deep).
    for nested_key in [k for k in d if type(d[k]) is dict]:
        del d[nested_key]
    return d

import json  # imported here so the cell also works when the download cells were skipped

# Map each municipality's encoded name to its display name.
muncipality2name = {muncipality['encodedName']:muncipality['name'] for muncipality in muncipalities}

# Read every raw BBR response and collect its flattened items.
for filename in files:
    # Close each file as soon as it is parsed instead of leaking one handle
    # per raw file.
    with open(filename,'r') as json_file:
        results = json.load(json_file)
    for datapoint in results['searchResult']['result']['bbrItems']:
        data.append(flatten_dict(datapoint))

df = pd.DataFrame(data)

In [57]:
import json  # imported here so the cell also works when the download cells were skipped
from os import listdir
from os.path import isfile
import pandas as pd

base_path = 'boligsiden/raw_json_sales/'
# listdir() returns entries in arbitrary order and also reports hidden
# entries and sub-directories. Sort for a reproducible row order, and keep
# only regular, non-hidden files so stray entries cannot break the JSON
# parsing.
files = [base_path+name for name in sorted(listdir(base_path))
         if not name.startswith('.') and isfile(base_path+name)]
data = []
for filename in files:
    # Close each file as soon as it is parsed instead of leaking one handle
    # per raw file.
    with open(filename,'r') as json_file:
        results = json.load(json_file)
    for datapoint in results['searchResult']['result']['properties']:
        # Only one nesting level is lifted to the top here.
        data.append(flatten_dict(datapoint,depth=1))

df_sales = pd.DataFrame(data)

In [58]:
# Preview the first five rows of the sales table.
df_sales.head(n=5)

Unnamed: 0,address,agentChainName,agentsLogoLink,areaParcel,areaPaymentCash,areaResidential,areaWeighted,canShowSalesPeriodTotal,city,dateAnnounced,...,placeName,postal,priceDevelopment,propertyLink,rating_averageRating,rating_roundAverageRating,redirectLink,salesPeriod,salesPeriodTotal,uniqueNumber
0,Musvitvej 6,,https://pic.boligsiden.dk/agent/00000000000000...,964.0,348.0,0,-,True,Kruså,19-01-2004,...,Kollund,6340,0%,~/arkiv/156283111,,,https://www.boligsiden.dk/viderestilling/d5e7c...,4.244,-,156283111
1,Hærvejen 73,,https://pic.boligsiden.dk/agent/00000000000000...,5.0,3.399,387,381,True,Padborg,29-03-2007,...,Gejlå,6330,-56%,~/arkiv/970131189,,,https://www.boligsiden.dk/viderestilling/f4657...,2.458,-,970131189
2,Hærvejen 73,,https://pic.boligsiden.dk/agent/00000000000000...,5.0,3.399,387,381,True,Padborg,22-05-2007,...,Gejlå,6330,-56%,~/arkiv/922404547,,,https://www.boligsiden.dk/viderestilling/d05c1...,2.423,-,922404547
3,Gl. Sottrupvej 15,,https://pic.boligsiden.dk/agent/00000000000000...,997.0,115.0,0,-,True,Bylderup-Bov,24-08-2007,...,,6372,0%,~/arkiv/678814717,,,https://www.boligsiden.dk/viderestilling/2d1b0...,2.931,-,678814717
4,Flensborgvej 1 A,,https://pic.boligsiden.dk/agent/00000000000000...,2.206,4.125,120,120,True,Kruså,28-09-2007,...,,6340,-50%,~/arkiv/125373332,,,https://www.boligsiden.dk/viderestilling/c35a4...,2.131,-,125373332


## Merge

In [77]:
import os

# Join the sales records with the BBR records on postal code and address.
# merge() defaults to an inner join, so only rows present in both tables are
# kept.
# NOTE(review): an address that occurs several times in either table yields
# one output row per combination -- confirm that this is intended.
df_final = df_sales.merge(df,on=['postal','address'])
# Create the output folder without failing when it already exists, so the
# cell can be re-run.
os.makedirs('final_datasets', exist_ok=True)
df_final.to_csv('final_datasets/bolig_data.csv',index=False)
