# Web scraping notebook for extracting data from OLX

In [2]:
import numpy as np
import requests
import bs4
import json

In [3]:
# Download the first 20 result pages of the Lublin flats-for-sale listing and
# keep each successfully fetched page as a parsed BeautifulSoup document.
pages = []

for page_no in range(1, 21):
    # Without a timeout a single stalled connection would block this cell
    # forever; requests never times out on its own.
    r = requests.get(
        "https://www.olx.pl/d/nieruchomosci/mieszkania/sprzedaz/lublin/?page={}".format(page_no),
        timeout=30,
    )

    # Pages that do not answer with HTTP 200 are skipped on purpose, so
    # `pages` may end up with fewer than 20 entries.
    if r.status_code == 200:
        pages.append(bs4.BeautifulSoup(r.text, "html.parser"))

In [4]:
# Gather the href of every advert anchor (CSS class "css-1bbgabe") across all
# downloaded result pages, preserving page order and in-page order.
links = [
    anchor["href"]
    for page in pages
    for anchor in page.find_all("a", {"class": "css-1bbgabe"})
]


Split the collected links into Otodom and OLX listings

In [5]:
# Host used to turn site-relative OLX hrefs into absolute URLs. HTTPS matches
# the scheme used when the result pages were downloaded.
OLX_BASE_URL = "https://www.olx.pl"

olx_links = []
otodom_links = []

for link in links:
    if "otodom" in link:
        # Links mentioning otodom are kept untouched and handled by the
        # otodom extractor.
        otodom_links.append(link)
    elif link.startswith(("http://", "https://")):
        # Already absolute: prefixing the host would produce a broken URL.
        olx_links.append(link)
    else:
        olx_links.append(OLX_BASE_URL + link)

In [6]:
# Extracting flat data from olx.pl

def extract_data_olx(link):
    """Scrape a single olx.pl advert page and return its attributes as a flat dict.

    The page's ``olx-init-config`` script tag assigns a doubly JSON-encoded
    state string to ``window.__PRERENDERED_STATE__``; that state is decoded and
    the advert's fields are flattened into one dictionary.

    Parameters
    ----------
    link : str
        URL of the advert page.

    Returns
    -------
    dict
        Keys ``id``, ``createdTime``, ``lastRefresh``, ``price`` and
        ``location``, followed by one entry per advert parameter (parameter
        ``key`` -> ``normalizedValue``). Parameters are merged last, so a
        parameter whose key collides with a fixed key overrides it.

    Raises
    ------
    ValueError
        If the script tag or either state marker cannot be found in the page
        (also raised by ``json`` when the extracted payload is not valid JSON).
    KeyError
        If the decoded state lacks one of the expected advert fields.
    """
    html = requests.get(link, timeout=30).text  # timeout: never hang on a stalled socket
    soup = bs4.BeautifulSoup(html, "html.parser")

    script_tag = soup.find("script", {"id": "olx-init-config"})
    if script_tag is None:
        raise ValueError("olx-init-config script tag not found at {}".format(link))
    script_w_data_json = str(script_tag)

    start = script_w_data_json.find("window.__PRERENDERED_STATE__")
    stop = script_w_data_json.find("window.__TAURUS__")
    if start == -1 or stop == -1:
        # str.find() reports "not found" as -1, which would otherwise be used
        # as a real index and silently slice out garbage.
        raise ValueError("prerendered state markers not found at {}".format(link))

    # Offsets are tied to the page layout: +30 skips the 28-character start
    # marker plus two more characters (presumably "= "), and -10 drops the ten
    # characters preceding the stop marker (presumably the statement
    # terminator and indentation) -- TODO confirm if OLX changes its markup.
    json_extracted = script_w_data_json[start + 30:stop - 10]

    # The payload is a JSON string literal whose content is itself JSON,
    # hence the two decoding passes.
    ad = json.loads(json.loads(json_extracted))['ad']['ad']

    params_dic = {param['key']: param['normalizedValue'] for param in ad['params']}

    return {
        "id": ad['id'],
        "createdTime": ad['createdTime'],
        "lastRefresh": ad['lastRefreshTime'],
        "price": ad['price']['regularPrice']['value'],
        "location": ad['location']['cityName'],
        **params_dic,
    }

In [13]:
# Spot-check the OLX extractor on one collected advert link (index 15).
extract_data_olx(olx_links[15])

{'id': 763876389,
 'createdTime': '2022-06-21T18:02:33+02:00',
 'lastRefresh': '2022-07-21T18:20:42+02:00',
 'price': 506000,
 'location': 'Lublin',
 'price_per_m': '11500',
 'floor_select': 'floor_1',
 'furniture': 'yes',
 'market': 'secondary',
 'builttype': 'blok',
 'm': '44',
 'rooms': 'two'}

Extract flat data from Otodom

In [45]:
def _first_or_none(values):
    """Return the first element of ``values``, or None when it is missing or empty."""
    return values[0] if values else None


def extract_data_otodom(link):
    """Scrape a single otodom advert page and return its attributes as a flat dict.

    The advert data is read from the JSON held in the page's ``__NEXT_DATA__``
    script tag, under ``props -> pageProps -> ad``.

    Parameters
    ----------
    link : str
        URL of the advert page.

    Returns
    -------
    dict
        Keys ``m``, ``build_year``, ``location``, ``floor``, ``id``,
        ``market``, ``price``, ``price_per_m2``, ``date_created``,
        ``date_lastmod``, ``rooms`` and ``builttype``. ``floor``, ``rooms`` and
        ``builttype`` are the first element of the corresponding source
        sequence. An attribute that is absent from the advert data is returned
        as ``None`` so that one incomplete advert does not abort a whole
        scraping run.

    Raises
    ------
    ValueError
        If the ``__NEXT_DATA__`` script tag is missing (also raised by ``json``
        when its content is not valid JSON).
    KeyError
        If the decoded JSON lacks the ``props -> pageProps -> ad -> target``
        path.
    """
    html = requests.get(link, timeout=30).text  # timeout: never hang on a stalled socket
    soup = bs4.BeautifulSoup(html, "html.parser")

    next_data = soup.find("script", {"id": "__NEXT_DATA__"})
    if next_data is None:
        raise ValueError("__NEXT_DATA__ script tag not found at {}".format(link))

    ad = json.loads(next_data.text)['props']['pageProps']['ad']
    target = ad['target']

    return {
        "m": target.get('Area'),
        "build_year": target.get('Build_year'),
        "location": target.get('City'),
        "floor": _first_or_none(target.get('Floor_no')),
        "id": target.get('Id'),
        "market": target.get('MarketType'),
        "price": target.get('Price'),
        "price_per_m2": target.get('Price_per_m'),
        "date_created": ad.get('dateCreated'),
        "date_lastmod": ad.get('dateModified'),
        "rooms": _first_or_none(target.get('Rooms_num')),
        "builttype": _first_or_none(target.get('Building_type')),
    }


    


In [46]:
# Spot-check the Otodom extractor on one collected advert link (index 15).
extract_data_otodom(otodom_links[15])

{'m': '41.18',
 'build_year': '2021',
 'location': 'lublin',
 'floor': 'floor_3',
 'id': '62711539',
 'market': 'secondary',
 'price': 442000,
 'price_per_m2': 10733,
 'date_created': '2022-01-28 20:45:22',
 'date_lastmod': '2022-07-21 15:56:23',
 'rooms': '2',
 'builttype': 'block'}

In [1]:
from tqdm import tqdm

In [2]:
# Progress-bar check: wrap a 100-step loop in tqdm and print each counter.
for i in tqdm(range(100)):
    print(i)

100%|██████████| 100/100 [00:00<00:00, 38223.86it/s]

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99



