In [1]:
# pylint: disable=missing-module-docstring, missing-function-docstring, missing-class-docstring, unused-argument

import json
import sys
import time

from scrapy import signals  # type: ignore
from scrapy.crawler import CrawlerProcess  # type: ignore
from scrapy.exporters import JsonItemExporter  # type: ignore

import pandas as pd

from geopy.geocoders import Nominatim  # type: ignore
from geopy.distance import geodesic  # type: ignore
from database_wrapper import DatabaseWrapper

from bezrealitky_scraper.bezrealitky.spiders.search_flats import SearchFlatsSpider
from sreality_scraper.sreality.spiders.sreality_spider import SrealitySpider

from listing import Disposition, UserPreferences, Listing

# from sreality_scraper.sreality.spiders.sreality_spider import SrealitySpider

items = []


def get_coordinates(address):
    """Geocode ``address`` with Nominatim and return its coordinates.

    Args:
        address: Query handed to ``Nominatim.geocode``.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``None`` when ``address`` is
        missing/blank or the geocoder returns no match.
    """
    # Nothing to look up: do not bother the geocoder with an empty query.
    if not address or (isinstance(address, str) and not address.strip()):
        return None

    geolocator = Nominatim(user_agent="distance_calculator")
    location = geolocator.geocode(address)
    if not location:
        return None
    return (location.latitude, location.longitude)  # type: ignore


def calculate_distance(address1, address2):
    """Return the geodesic distance in kilometres between two addresses.

    Both addresses are resolved through ``get_coordinates``. ``None`` is
    returned when either of them could not be resolved to coordinates.
    """
    origin, destination = (
        get_coordinates(place) for place in (address1, address2)
    )

    # Guard clause: a distance only exists when both lookups succeeded.
    if not (origin and destination):
        return None
    return geodesic(origin, destination).kilometers


def _coerce_area(raw_area):
    """Convert a raw area value to ``int``, also accepting decimal strings like "54.5"."""
    try:
        return int(raw_area)
    except ValueError:
        # int() rejects decimal strings; go through float and truncate,
        # which matches what int() already does for float inputs.
        return int(float(raw_area))


def clean_listings(listings):
    """Deduplicate and normalise listings, returning the ones that survive.

    The listing objects are modified in place:

    * listings whose ``str()`` was already seen are dropped,
    * attributes equal to ``""`` are replaced with ``None``,
    * ``area`` is converted to ``int``; listings whose area cannot be
      interpreted as a number are dropped,
    * ``description`` is lower-cased and stripped.

    Args:
        listings: Iterable of objects exposing ``area`` and ``description``.

    Returns:
        A new list with the cleaned listings in their original order.
    """
    cleaned_listings = []
    seen_listings = set()

    for listing in listings:
        # Remove duplicates (keyed on the string form *before* normalisation).
        fingerprint = str(listing)
        if fingerprint in seen_listings:
            continue
        seen_listings.add(fingerprint)

        # Handle missing values: empty strings become None.
        attributes = vars(listing)
        for attr, value in attributes.items():
            if value == "":
                attributes[attr] = None

        # Validate data types (currently only the 'area' attribute).
        if listing.area is not None:
            try:
                listing.area = _coerce_area(listing.area)
            except (TypeError, ValueError, OverflowError):
                continue  # area is not a usable number: skip this listing

        # Normalize text
        if listing.description is not None:
            listing.description = listing.description.lower().strip()

        cleaned_listings.append(listing)

    return cleaned_listings


def balcony_filter(listings: "list[Listing]") -> None:
    """Print how often "balk" in the description agrees with the balcony flag.

    Four counts are printed: listings whose description contains "balk",
    listings with a truthy ``balcony`` attribute, listings matching both, and
    listings matching the description only. A missing (``None``) description
    is treated as not mentioning a balcony.

    Args:
        listings: Listings exposing ``description`` and ``balcony``.
    """
    in_description = 0
    in_table = 0
    in_both = 0
    description_only = 0

    for advert in listings:
        # Evaluate each condition once; tolerate listings without description.
        mentions_balcony = "balk" in (advert.description or "").lower()
        flagged = bool(advert.balcony)

        if mentions_balcony:
            in_description += 1
        if flagged:
            in_table += 1
        if mentions_balcony and flagged:
            in_both += 1
        if mentions_balcony and not flagged:
            description_only += 1

    print(f"{in_description} listings contain balk in description")
    print(f"{in_table} listings contain contain balk in the table")
    print(f"{in_both} listings contain contain balk in description and in table")
    print(f"{description_only} listings contain contain balk in description and not in table")


def item_scraped(item):
    """Signal handler: print the item's URL and add it to the global ``items`` list."""
    url = item["url"]
    print(url)
    items.append(item)


if __name__ == "__main__":
    # Pipeline: obtain raw items (fresh crawl or a previous JSON export),
    # wrap them in Listing objects, clean them, and store the ones that are
    # not in the database yet. The resulting DataFrame is left in `df`.

    CRAWL = False  # True: run both spiders and overwrite FILE; False: load FILE
    # FILE = "bezrealitky_items.json"
    # FILE = "sreality_items.json"
    FILE = "all_items.json"
    POI = "NTK Praha"
    start = 0.0
    end = 0.0

    if CRAWL:
        process = CrawlerProcess(
            settings={
                "LOG_LEVEL": "INFO",
                "DEFAULT_REQUEST_HEADERS": {
                    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.113 Safari/537.36",
                },
            }
        )

        start = time.time()

        # Both spiders report scraped items through the same handler, so the
        # module-level `items` list ends up holding the combined results.
        crawler = process.create_crawler(SearchFlatsSpider)
        crawler.signals.connect(item_scraped, signal=signals.item_scraped)
        crawler2 = process.create_crawler(SrealitySpider)
        crawler2.signals.connect(item_scraped, signal=signals.item_scraped)
        process.crawl(crawler)
        process.crawl(crawler2)
        process.start()

        with open(file=FILE, mode="wb") as f:
            exporter = JsonItemExporter(f)
            exporter.start_exporting()
            for i in items:
                exporter.export_item(i)
            exporter.finish_exporting()

        end = time.time()
    else:
        with open(FILE, "r", encoding="utf-8") as f:
            items = json.load(f)

    listings = [Listing(raw_item) for raw_item in items]

    # An empty crawl/export has no first item to measure against.
    dist = calculate_distance(POI, items[0]["address"]) if items else None

    preferences = UserPreferences(
        dispositions=[
            Disposition.TWO_PLUS_ONE,
            Disposition.THREE_PLUS_KK,
            Disposition.THREE_PLUS_ONE,
        ],
        weight_area=0.5,
        weight_rent=0.4,
        weight_location=0.1,
        min_area=50,
        max_price=30000,
        balcony=True,
    )

    balcony_filter(listings)

    listings = clean_listings(listings=listings)

    db = DatabaseWrapper("listings.db")
    try:
        db.create_table()
        # if not db.verify_table_columns():
        #     print("Table columns are not correct")
        #     sys.exit(1)
        for listing in listings:
            # Only listings whose id is not stored yet are inserted.
            if db.get_listing(listing.id):
                continue
            db.insert_listing(listing)
            print(f"found a new listing: {listing.id}")

        df = db.get_df()
    finally:
        # Release the connection even if a lookup or insert fails.
        db.close_conn()

    # Timings are only set when a crawl actually ran.
    if start != 0.0 and end != 0.0:
        print(f"crawling finished in {end - start}s")

1255 listings contain balk in description
1476 listings contain contain balk in the table
1079 listings contain contain balk in description and in table
176 listings contain contain balk in description and not in table


In [2]:
# Display the DataFrame obtained from db.get_df() in the previous cell.
df

Unnamed: 0,id,address,area,available_from,description,disposition,floor,furnished,rent,security_deposit,...,front_garden,terrace,elevator,parking,garage,pets,loggie,public_transport,gps_lat,gps_lon
0,2198021452,"Perštejnská, Praha 8 - Dolní Chabry",60.0,01.05.2024,"dolní chabry, perštejnská, byt 2+kk, v rodinné...",4,1. podlaží z celkem 3,2,14500,,...,,0,0,1,0,,0,,50.1458760467582,14.4495986678747
1,2915575116,"Ke Klimentce, Praha 5 - Smíchov",150.0,Ihned,prostorný byt ve vile v klidné čtvrti v blízko...,9,2. podlaží z celkem 4,2,52000,,...,,0,2,1,1,,1,,50.0710749,14.3685367
2,4242748748,"Krakovská, Praha 1 - Nové Město",25.0,Ihned,"nové město, krakovská, byt 1+kk, 25 m2, po rek...",2,2. podlaží z celkem 5,1,17000,,...,,0,0,0,0,,0,,50.0784286473904,14.4282364057947
3,2403345740,"Stodůlecká, Praha 5 - Jinonice",55.0,Ihned,"jinonice, stodůlecká, velmi hezký byt 3+kk, 55...",6,1. podlaží z celkem 2,1,22000,,...,,0,0,0,0,,0,,50.0485576732353,14.3635756132237
4,2377659724,"Lucemburská, Praha 3 - Vinohrady",50.0,15.04.2024,"kompletně zrekonstruovaný, 1-ložnicový, nezaří...",4,3. podlaží,2,25800,,...,,0,1,0,0,,0,,50.0791577417438,14.4560147518341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5177,2363000140,"Náchodská, Praha 9 - Horní Počernice",120.0,Ihned,k pronájmu nabízíme byt v novostavbě o rozloze...,8,1. podlaží z celkem 2,0,29000,,...,,0,2,0,1,,0,,50.1152917873775,14.6116369446454
5178,3524535628,"Cimburkova, Praha - Žižkov",54.0,01.04.2024,nabízíme k pronájmu atypickou jednotku (dispoz...,5,1. podlaží,3,15900,,...,,0,0,0,0,,0,,50.085710763185,14.4502951508619
5179,3254465868,"Psohlavců, Praha 4 - Braník",293.0,Ihned,k okamžitému pronájmu nabízíme prostorný klima...,12,2. podlaží z celkem 3,3,50000,,...,,0,2,1,1,,0,,50.0277757,14.4151185
5180,4124259660,"Na Kocínce, Praha 6 - Dejvice",56.0,01.04.2024,"světlý, 1-ložnicový, zařízený (nebo po dohodě ...",4,9. podlaží,1,24880,,...,,0,1,0,0,,1,,50.106007614155,14.3864145796067


In [3]:
# List every column name; the frame display above elides the middle columns.
df.columns

Index(['id', 'address', 'area', 'available_from', 'description', 'disposition',
       'floor', 'furnished', 'rent', 'security_deposit', 'service_fees',
       'status', 'type', 'url', 'balcony', 'cellar', 'front_garden', 'terrace',
       'elevator', 'parking', 'garage', 'pets', 'loggie', 'public_transport',
       'gps_lat', 'gps_lon'],
      dtype='object')

In [4]:
# Strip currency markers/spaces from the rent strings and convert them to float.
# Guarded so that re-running this cell after the conversion is a no-op instead
# of failing on the `.str` accessor of an already-numeric column.
if not pd.api.types.is_numeric_dtype(df['rent']):
    df['rent'] = (
        df['rent']
        .str.replace('€', '', regex=False)
        .str.replace('Kč', '', regex=False)
        .str.replace(' ', '', regex=False)
        .str.replace(',', '.', regex=False)
        .astype(float)
    )

In [5]:
# Display the frame again to check the `rent` column after the float conversion.
df

Unnamed: 0,id,address,area,available_from,description,disposition,floor,furnished,rent,security_deposit,...,front_garden,terrace,elevator,parking,garage,pets,loggie,public_transport,gps_lat,gps_lon
0,2198021452,"Perštejnská, Praha 8 - Dolní Chabry",60.0,01.05.2024,"dolní chabry, perštejnská, byt 2+kk, v rodinné...",4,1. podlaží z celkem 3,2,14500.0,,...,,0,0,1,0,,0,,50.1458760467582,14.4495986678747
1,2915575116,"Ke Klimentce, Praha 5 - Smíchov",150.0,Ihned,prostorný byt ve vile v klidné čtvrti v blízko...,9,2. podlaží z celkem 4,2,52000.0,,...,,0,2,1,1,,1,,50.0710749,14.3685367
2,4242748748,"Krakovská, Praha 1 - Nové Město",25.0,Ihned,"nové město, krakovská, byt 1+kk, 25 m2, po rek...",2,2. podlaží z celkem 5,1,17000.0,,...,,0,0,0,0,,0,,50.0784286473904,14.4282364057947
3,2403345740,"Stodůlecká, Praha 5 - Jinonice",55.0,Ihned,"jinonice, stodůlecká, velmi hezký byt 3+kk, 55...",6,1. podlaží z celkem 2,1,22000.0,,...,,0,0,0,0,,0,,50.0485576732353,14.3635756132237
4,2377659724,"Lucemburská, Praha 3 - Vinohrady",50.0,15.04.2024,"kompletně zrekonstruovaný, 1-ložnicový, nezaří...",4,3. podlaží,2,25800.0,,...,,0,1,0,0,,0,,50.0791577417438,14.4560147518341
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5177,2363000140,"Náchodská, Praha 9 - Horní Počernice",120.0,Ihned,k pronájmu nabízíme byt v novostavbě o rozloze...,8,1. podlaží z celkem 2,0,29000.0,,...,,0,2,0,1,,0,,50.1152917873775,14.6116369446454
5178,3524535628,"Cimburkova, Praha - Žižkov",54.0,01.04.2024,nabízíme k pronájmu atypickou jednotku (dispoz...,5,1. podlaží,3,15900.0,,...,,0,0,0,0,,0,,50.085710763185,14.4502951508619
5179,3254465868,"Psohlavců, Praha 4 - Braník",293.0,Ihned,k okamžitému pronájmu nabízíme prostorný klima...,12,2. podlaží z celkem 3,3,50000.0,,...,,0,2,1,1,,0,,50.0277757,14.4151185
5180,4124259660,"Na Kocínce, Praha 6 - Dejvice",56.0,01.04.2024,"světlý, 1-ložnicový, zařízený (nebo po dohodě ...",4,9. podlaží,1,24880.0,,...,,0,1,0,0,,1,,50.106007614155,14.3864145796067
