In [24]:
# pylint: disable=missing-module-docstring, missing-function-docstring, missing-class-docstring, unused-argument

import json
import sys
import time

from scrapy import signals  # type: ignore
from scrapy.crawler import CrawlerProcess  # type: ignore
from scrapy.exporters import JsonItemExporter  # type: ignore

import pandas as pd

from geopy.geocoders import Nominatim  # type: ignore
from geopy.distance import geodesic  # type: ignore
from database_wrapper import DatabaseWrapper

from bezrealitky_scraper.bezrealitky.spiders.search_flats import SearchFlatsSpider
from sreality_scraper.sreality.spiders.sreality_spider import SrealitySpider

from listing import Disposition, UserPreferences, Listing

# from sreality_scraper.sreality.spiders.sreality_spider import SrealitySpider

# Module-level accumulator: item_scraped() appends every scraped item here, and
# the non-crawl path of the script rebinds it to the list loaded from the JSON dump.
items = []


def get_coordinates(address):
    """Geocode ``address`` with Nominatim and return ``(latitude, longitude)``.

    A new Nominatim client (user agent ``"distance_calculator"``) is created on
    every call. Returns ``None`` when the geocoder yields no (truthy) result.
    """
    match = Nominatim(user_agent="distance_calculator").geocode(address)
    if not match:
        return None
    return (match.latitude, match.longitude)  # type: ignore


def calculate_distance(address1, address2):
    """Return the geodesic distance in kilometres between two addresses.

    Both addresses are always geocoded via ``get_coordinates`` (two lookups per
    call). Returns ``None`` if either address could not be resolved.
    """
    origin = get_coordinates(address1)
    target = get_coordinates(address2)

    if not (origin and target):
        return None
    return geodesic(origin, target).kilometers


def clean_listings(listings):
    """Deduplicate and normalise listings, returning the ones that survive.

    For every listing, in order:
      1. skip it if an earlier listing had the same ``str()`` representation;
      2. turn every attribute equal to ``""`` into ``None``;
      3. coerce ``area`` to ``int`` (the listing is dropped on ``ValueError``);
      4. lower-case and strip ``description``.

    The listing objects are modified in place; the returned list holds the same
    objects, not copies.
    """
    kept = []
    fingerprints = set()

    for entry in listings:
        # The duplicate key is taken before any field is modified below.
        fingerprint = str(entry)
        if fingerprint in fingerprints:
            continue
        fingerprints.add(fingerprint)

        # Blank strings count as missing values.
        fields = entry.__dict__
        blank_names = [name for name, current in fields.items() if current == ""]
        for name in blank_names:
            fields[name] = None

        # Only 'area' is type-checked; listings with an unparsable area are skipped.
        if entry.area is not None:
            try:
                entry.area = int(entry.area)
            except ValueError:
                continue

        text = entry.description
        if text is not None:
            entry.description = text.lower().strip()

        kept.append(entry)

    return kept


def balcony_filter(listings: list[Listing]):
    """Print how often a balcony shows up in the description vs. the ``balcony`` field.

    Four counts are printed, one per line:
      * listings whose description contains the substring "balk" (case-insensitive),
      * listings whose ``balcony`` attribute is truthy,
      * listings matching both,
      * listings matching the description test but with a falsy ``balcony``.

    A missing description (``None``) is treated as empty text instead of raising
    ``AttributeError``. Nothing is filtered or modified; the function returns ``None``.
    """
    in_description = 0
    in_table = 0
    in_both = 0
    description_only = 0

    for advert in listings:
        # Lower-case once per advert; `or ""` covers description being None.
        mentions_balcony = "balk" in (advert.description or "").lower()
        flagged = bool(advert.balcony)

        if mentions_balcony:
            in_description += 1
        if flagged:
            in_table += 1
        if mentions_balcony and flagged:
            in_both += 1
        if mentions_balcony and not flagged:
            description_only += 1

    print(f"{in_description} listings contain balk in description")
    print(f"{in_table} listings contain contain balk in the table")
    print(f"{in_both} listings contain contain balk in description and in table")
    print(f"{description_only} listings contain contain balk in description and not in table")
    return


def item_scraped(item):
    """Signal handler: print the item's ``url`` and append the item to ``items``."""
    url = item["url"]
    print(url)
    items.append(item)


if __name__ == "__main__":
    # Script entry point. With CRAWL=True both spiders are run and the scraped
    # items are dumped to FILE; otherwise a previous dump is loaded from FILE.
    # The items are then wrapped in Listing objects, cleaned, and the ones not
    # yet known to the database are inserted through DatabaseWrapper.

    CRAWL = False
    # FILE = "bezrealitky_items.json"
    # FILE = "sreality_items.json"
    FILE = "all_items.json"
    POI = "NTK Praha"
    # Crawl timestamps; both stay 0.0 when no crawl is performed.
    start = 0.0
    end = 0.0

    if CRAWL:
        process = CrawlerProcess(
            settings={
                "LOG_LEVEL": "INFO",
                "DEFAULT_REQUEST_HEADERS": {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
                },
            }
        )

        start = time.time()

        # Both crawlers report through item_scraped, which collects every item
        # in the module-level `items` list.
        crawler = process.create_crawler(SearchFlatsSpider)
        crawler.signals.connect(item_scraped, signal=signals.item_scraped)
        crawler2 = process.create_crawler(SrealitySpider)
        crawler2.signals.connect(item_scraped, signal=signals.item_scraped)
        process.crawl(crawler)
        process.crawl(crawler2)
        process.start()

        # Dump everything that was collected so later runs can skip the crawl.
        with open(file=FILE, mode="wb") as f:
            exporter = JsonItemExporter(f)
            exporter.start_exporting()
            for i in items:
                exporter.export_item(i)
            exporter.finish_exporting()

        end = time.time()
    else:
        with open(FILE, "r", encoding="utf-8") as f:
            items = json.load(f)

    listings = [Listing(i) for i in items]

    # Distance from the point of interest to the first item's address. Guarded
    # so that an empty crawl/dump does not raise IndexError.
    # NOTE(review): `dist` is not used anywhere else in the visible code.
    dist = calculate_distance(POI, items[0]["address"]) if items else None

    # NOTE(review): `preferences` is built but not used in the visible code.
    preferences = UserPreferences(
        dispositions=[
            Disposition.TWO_PLUS_ONE,
            Disposition.THREE_PLUS_KK,
            Disposition.THREE_PLUS_ONE,
        ],
        weight_area=0.5,
        weight_rent=0.4,
        weight_location=0.1,
        min_area=50,
        max_price=30000,
        balcony=True,
    )

    # Statistics are printed on the raw listings, before cleaning.
    balcony_filter(listings)

    listings = clean_listings(listings=listings)

    db = DatabaseWrapper("listings.db")
    try:
        db.create_table()
        # if not db.verify_table_columns():
        #     print("Table columns are not correct")
        #     sys.exit(1)
        for listing in listings:
            # Only listings whose id is not in the database yet are inserted.
            if db.get_listing(listing.id):
                continue
            db.insert_listing(listing)
            print(f"found a new listing: {listing.id}")

        df = db.get_df()
    finally:
        # Close the connection even if an insert or lookup above fails.
        db.close_conn()

    if start != 0.0 and end != 0.0:
        print(f"crawling finished in {end - start}s")

1258 listings contain balk in description
1473 listings contain contain balk in the table
1079 listings contain contain balk in description and in table
179 listings contain contain balk in description and not in table


In [26]:
# Show the DataFrame returned by db.get_df() in the script cell above.
df

Unnamed: 0,id,address,area,available_from,description,disposition,floor,furnished,rent,security_deposit,...,front_garden,terrace,elevator,parking,garage,pets,loggie,public_transport,gps_lat,gps_lon
0,4084540748,"Mukařovského, Praha 5 - Stodůlky",109.0,07.06.2024,luka living - jsme tu pro vás! nájem bez provi...,8,9. podlaží z celkem 15 včetně 3 podzemních,1,46900,,...,,1,1,0,0,,0,,50.0457891667,14.3221994444
1,1595331916,"Musílkova, Praha - Košíře",111.0,Ihned,hledáte-li pohodlné bydlení v klidné lokalitě ...,6,3. podlaží z celkem 6 včetně 1 podzemního,1,33000,,...,,1,1,0,1,,0,,50.0674952282425,14.3688191055274
2,1886266700,"Mukařovského, Praha 5 - Stodůlky",31.0,08.04.2024,luka living - jsme tu pro vás! nájem bez provi...,2,6. podlaží z celkem 15 včetně 3 podzemních,1,18900,,...,,0,1,0,0,,0,,50.0457891667,14.3221994444
3,253433164,"Drahňovická, Praha 4 - Chodov",47.0,Ihned,nájemné 19.000 kč/měs. + 2.000kč za garážové s...,4,5. podlaží,2,19000,,...,,0,0,0,0,,0,,50.0434757914014,14.5008434403595
4,3991414092,"Ječná, Praha 2 - Nové Město",130.0,15.03.2024,prostorný byt 4+1 po aktuální rekonstrukci s v...,9,3. podlaží,2,45000,,...,,0,1,0,0,,1,,50.0757179753,14.423165612
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5171,457491788,"Staropramenná, Praha 5 - Smíchov",62.0,Ihned,toužíte po prostorném a luxusním bytě v atrakt...,3,4. podlaží z celkem 5,1,23800,,...,,0,1,1,0,,0,,50.0707722216,14.4072667074
5172,494200140,"Seifertova, Praha 3 - Žižkov",32.0,Ihned,je ve vašem hledáčku útulný byt v centru města...,2,4. podlaží z celkem 5,2,15200,,...,,0,2,0,0,,0,,50.0848367051,14.4475672832
5173,794871116,"Nuselská, Praha 4 - Michle",43.0,Ihned,hledáte moderní designový byt s garážovým stán...,2,6. podlaží z celkem 7,2,14500,,...,,0,1,1,0,,0,,50.0551902581,14.4518020883
5174,4157597004,"Řeporyjská, Praha 5 - Jinonice",52.0,Ihned,"hledáte krásný, světlý a designový mezonetový ...",4,2. podlaží z celkem 3,2,29000,,...,,0,2,1,1,,0,,50.0529782118,14.3567999026


In [40]:
# Collect the distinct disposition values once, keep them for later cells, and show them.
dispositions = df["disposition"].unique()
print(dispositions)

['8' '6' '2' '4' '9' '7' '5' '3' '16' '47' '11' '2+kk' '3+kk' '1+kk' '1+1'
 '2+1' '3+1' '10' '4+1' 'Garsoniéra' '4+kk' '12' '5+1' '5+kk' 'Ostatní'
 None]


In [47]:
from sreality_scraper.sreality.spiders.sreality_spider import SrealityUrlBuilder

In [75]:
# Translate numeric disposition codes (stored as digit strings such as '8' or
# '47') into labels by passing each code to SrealityUrlBuilder.map_category_sub_cb.
for disposition in dispositions:
    # Only digit strings are codes. None, NaN and any non-string value are
    # skipped instead of failing on `.isnumeric()`.
    if not isinstance(disposition, str) or not disposition.isnumeric():
        continue
    df['disposition'] = df['disposition'].replace(
        disposition, SrealityUrlBuilder.map_category_sub_cb(int(disposition))
    )
# NOTE(review): a former `if disposition == 47:` branch was removed. The values
# are strings, so it could never run, and '47' is already handled by the loop.
# The recorded output shows '47' turning into the integer 47, so the mapper
# presumably has no label for that code; confirm in map_category_sub_cb.
        

In [76]:
# Re-check the distinct disposition values after the code-to-label replacement.
df['disposition'].unique()

array(['4+kk', '3+kk', '1+kk', '2+kk', '4+1', '3+1', '2+1', '1+1',
       'neobvykle', 47, '5+1', '5+kk', 'Garsoniéra', '6-a-vice',
       'Ostatní', None], dtype=object)