In [None]:
import sqlite3

import numpy as np
import pandas as pd

from database_wrapper import DatabaseWrapper
from sreality_scraper.sreality.spiders.sreality_spider import SrealityUrlBuilder

# Read the contents of listings.db into a DataFrame through the project's
# DatabaseWrapper. The connection is released in a ``finally`` clause so that a
# failing ``get_df()`` call does not leave the database handle open.
db = DatabaseWrapper("listings.db")
try:
    df = db.get_df()
finally:
    db.close_conn()
df

In [None]:
# Use the "id" column as the row index from here on.
df = df.set_index("id")

In [None]:
# Collect the distinct disposition values once, then show them.
dispositions = df["disposition"].unique()
print(dispositions)

In [None]:
# Some rows hold the disposition as an integer code (presumably sreality's
# category_sub_cb value - TODO confirm) while others already hold a text label.
# Translate every integer code through SrealityUrlBuilder.map_category_sub_cb
# in a single pass; falsy values (None, 0, "") and strings are left untouched.
#
# numpy integer scalars are accepted as well: ``unique()`` yields them when the
# column has an integer dtype, and ``isinstance(x, int)`` is False for those.
# The type check runs before the truthiness check so values whose truth value
# is undefined (e.g. pd.NA) are skipped instead of raising.
code_to_label = {
    code: SrealityUrlBuilder.map_category_sub_cb(int(code))
    for code in dispositions
    if isinstance(code, (int, np.integer)) and code
}
if code_to_label:
    # One dict-based replace instead of rewriting the column once per code.
    df["disposition"] = df["disposition"].replace(code_to_label)

In [None]:
# Show the distinct disposition values that remain after the code translation.
df["disposition"].unique()

In [None]:
# Fold portal-specific labels into one shared disposition vocabulary.
disposition_aliases = {
    "Garsoniéra": "1+kk",  # bezrealitky specific
    "Ostatní": "other",  # bezrealitky specific
    "atypicky": "other",  # sreality specific
    "pokoj": "other",  # sreality specific
    # every 6+ / 7+ layout shares a single bucket
    "6+kk": "6-a-více",
    "6+1": "6-a-více",
    "7+kk": "6-a-více",
    "7+1": "6-a-více",
}
df["disposition"] = df["disposition"].replace(disposition_aliases)

In [None]:
# Show the distinct disposition values in sorted order.
df["disposition"].sort_values().unique()

In [None]:
# Display the frame in its current state.
df

In [None]:
# Keep only listings whose area is below 1000. Rows with a missing area fail
# the comparison and are dropped as well.
# ``.copy()`` turns the filtered slice into an independent frame, so the column
# assignments in later cells do not trigger SettingWithCopyWarning.
df = df[df["area"] < 1000].copy()
df["area"].sort_values()

In [None]:
# Remove every space from the available_from strings before parsing them.
compact_dates = df["available_from"].str.replace(" ", "")
df.loc[:, "available_from"] = compact_dates

In [None]:
# Parse available_from as day.month.year dates.
#
# The column is parsed exactly once with the explicit "%d.%m.%Y" format and the
# same result drives both the validity mask and the assignment. Deriving the
# mask from a format-free parse would mark values as valid that the explicit
# format then rejects, silently turning them into NaT.
parsed_dates = pd.to_datetime(
    df["available_from"], format="%d.%m.%Y", errors="coerce"
)
# True where the value is not a day.month.year date (or is missing).
invalid_indices = parsed_dates.isnull()

df.loc[~invalid_indices, "available_from"] = parsed_dates[~invalid_indices]
# Show what is left unparsed so it can be handled explicitly.
df.loc[invalid_indices, "available_from"].unique()

In [None]:
# Rows marked "Ihned" take the parsed timestamp of their "updated" column
# as the availability date.
is_immediate = df["available_from"] == "Ihned"
updated_at = pd.to_datetime(
    df.loc[is_immediate, "updated"],
    format="%Y-%m-%d %H:%M:%S.%f",
    errors="coerce",
)
df.loc[is_immediate, "available_from"] = updated_at

In [None]:
# Coerce the whole column to datetime; anything that still cannot be parsed
# becomes NaT.
coerced_dates = pd.to_datetime(df["available_from"], errors="coerce")
df["available_from"] = coerced_dates
df["available_from"]

In [None]:
# Display the frame after the available_from conversion.
df

In [None]:
# Show the distinct raw balcony values before they are normalised.
df["balcony"].unique()

In [None]:
# Turn balcony into a flag: strings matching "Balk.*" become 1, missing
# values become 0.
# TODO: fix this, the replacement must be ran only on strings
balcony_flags = df["balcony"].replace("Balk.*", 1, regex=True)
df["balcony"] = balcony_flags.replace(np.nan, 0)
df["balcony"].unique()

In [None]:
# Display the frame after the balcony normalisation.
df

In [None]:
# Turn cellar into a flag: strings matching "Sklep.*" become 1, missing
# values become 0.
cellar_flags = df["cellar"].replace("Sklep.*", 1, regex=True)
df["cellar"] = cellar_flags.replace(np.nan, 0)
df

In [None]:
# Show the distinct raw elevator values before they are normalised.
df["elevator"].unique()

In [None]:
# Inspect the rows whose elevator value equals 2.
df[df["elevator"] == 2]

In [None]:
# Inspect the rows whose elevator value is exactly the string "Výtah".
df[df["elevator"] == "Výtah"]

In [None]:
# Collapse the elevator column to a 0/1 flag, one replacement per statement.
# Strings matching "Výtah.*" become 1.
df.loc[:, "elevator"] = df["elevator"].replace("Výtah.*", 1, regex=True)
# Missing values become 0.
df.loc[:, "elevator"] = df["elevator"].replace(np.nan, 0)
# The value 2 is folded into 0 - presumably a portal code meaning
# "no elevator"; TODO confirm against the scraper.
df.loc[:, "elevator"] = df["elevator"].replace(2, 0)
# Show the values that remain after normalisation.
df.loc[:, "elevator"].unique()

In [None]:
# Reduce free-text floor descriptions to a bare floor number.
#
# The cell used to start with a sort_values().unique() expression whose result
# was discarded (only the last expression of a cell is displayed); it has been
# removed.
#
# The dot in ". podlaží" is escaped so that only a literal "." is consumed.
# Unescaped it matches any character and can swallow the last digit of a
# number that is followed directly by " podlaží".
df.loc[:, "floor"] = df["floor"].replace(r"\. podlaží.*", "", regex=True)
df.loc[:, "floor"] = df["floor"].replace(" z celkem.*", "", regex=True)
# "přízemí" (ground floor) is stored as "0". The integer cast is best-effort:
# errors="ignore" hands the column back unchanged when any value cannot be cast.
df.loc[:, "floor"] = (
    df["floor"].replace("přízemí", "0", regex=True).astype(int, errors="ignore")
)
df.loc[:, "floor"].sort_values().unique()

In [None]:
# Show the distinct raw front_garden values before they are normalised.
df["front_garden"].unique()

In [None]:
# Strings matching "Předzahrádka .*" are stored as the flag value 1; other
# values are left as they are.
front_garden_flags = df["front_garden"].replace("Předzahrádka .*", 1, regex=True)
df.loc[:, "front_garden"] = front_garden_flags
df["front_garden"].unique()

In [None]:
# Show the distinct raw furnished values before they are normalised.
df["furnished"].unique()

In [None]:
# Step 1: rewrite the textual furnishing labels as lowercase ASCII slugs.
furnished_slugs = {
    "Nevybaveno": "nevybaveny",
    "Částečně": "castecne",
    "Vybaveno": "vybaveny",
}
df.loc[:, "furnished"] = df["furnished"].replace(furnished_slugs)


def _furnished_label(value):
    """Translate an int furnishing code via SrealityUrlBuilder; pass other values through."""
    if isinstance(value, int):
        return SrealityUrlBuilder.map_furnished_category(value)
    return value


# Step 2: translate the remaining integer codes.
df.loc[:, "furnished"] = df["furnished"].apply(_furnished_label)

In [None]:
# Show the distinct furnished values after normalisation.
df["furnished"].unique()

In [None]:
# Show the distinct raw garage values before they are normalised.
df.garage.unique()

In [None]:
# Strings matching "Garáž.*" are stored as the flag value 1; other values are
# left as they are.
garage_flags = df["garage"].replace("Garáž.*", 1, regex=True)
df.loc[:, "garage"] = garage_flags
df["garage"].unique()

In [None]:
# Strings matching "Lodžie.*" are stored as the flag value 1; other values are
# left as they are.
# The source series must be the loggie column itself: reading from "garage"
# (a copy-paste slip) overwrote the loggia data with the garage flags.
df.loc[:, "loggie"] = df["loggie"].replace("Lodžie.*", 1, regex=True)
df.loggie.unique()

In [None]:
# List the columns currently present in the frame.
df.columns

In [None]:
# Strings matching "Parkování.*" are stored as the flag value 1; other values
# are left as they are.
# TODO: fix this
parking_flags = df["parking"].replace("Parkování.*", 1, regex=True)
df.loc[:, "parking"] = parking_flags
df["parking"].unique()

In [None]:
# Strings matching "Domácí mazlíčci vítáni" are stored as the flag value 1;
# other values are left as they are.
pets_flags = df["pets"].replace("Domácí mazlíčci vítáni", 1, regex=True)
df.loc[:, "pets"] = pets_flags
df["pets"].unique()

In [None]:
# The public_transport column is not used any further; remove it.
df = df.drop(columns=["public_transport"])

In [None]:
# List the columns that remain after dropping public_transport.
df.columns

In [None]:
# Show the distinct raw rent values before they are parsed.
df.rent.unique()

In [None]:
def _parse_rent(raw):
    """Convert a textual rent to int; non-string values are returned unchanged.

    Spaces and the "Kč" / "€" currency markers are removed before the
    conversion. NOTE(review): both currencies end up as plain numbers on the
    same scale - confirm that no conversion is needed.
    """
    if not isinstance(raw, str):
        return raw
    digits = raw.replace(" ", "").replace("Kč", "").replace("€", "")
    return int(digits)


df.loc[:, "rent"] = df["rent"].apply(_parse_rent)

df["rent"].sort_values().unique()

In [None]:
# 'security_deposit'

# Distinct security_deposit values for the rows whose url contains
# "bezrealitky".
# NOTE(review): the comment here used to say "sreality" while the filter has
# always been "bezrealitky" - confirm which portal was meant.
from_bezrealitky = df["url"].str.contains("bezrealitky")
df.loc[from_bezrealitky, "security_deposit"].unique()

In [None]:
# The security_deposit column is not used any further; remove it.
df = df.drop(columns=["security_deposit"])

In [None]:
# Columns still to review: 'service_fees', 'status', 'terrace', 'type',
# 'updated', 'url'

# Show the service_fees column as it is.
df["service_fees"]

In [None]:
# The service_fees column is not used any further; remove it.
df = df.drop(columns=["service_fees"])

In [None]:
# 'status'

# Distinct values of the status column.
df["status"].unique()

In [None]:
# 'terrace'
def _terrace_flag(value):
    """Return 1 for strings containing "Terasa"; return any other value unchanged."""
    if isinstance(value, str) and "Terasa" in value:
        return 1
    return value


df.loc[:, "terrace"] = df["terrace"].apply(_terrace_flag)
df["terrace"].unique()

In [None]:
# 'type'
# Ordered (substring, slug) pairs, applied one after another to every textual
# building type.
_TYPE_ALIASES = (
    ("Cihla", "cihlova"),
    ("Panel", "panelova"),
    ("Smíšená", "ostatni"),
    ("Skeletová", "ostatni"),
    ("Nízkoenergetická", "ostatni"),
    ("Montovaná", "ostatni"),
    ("Dřevostavba", "ostatni"),
    ("Kamenná", "ostatni"),
    ("Ostatní", "ostatni"),
)


def _type_slug(value):
    """Apply every alias substitution to a string; return non-strings unchanged."""
    if not isinstance(value, str):
        return value
    for needle, slug in _TYPE_ALIASES:
        value = value.replace(needle, slug)
    return value


def _type_label(value):
    """Translate an int building-type code via SrealityUrlBuilder; pass other values through."""
    if isinstance(value, int):
        return SrealityUrlBuilder.map_building_type(value)
    return value


# Evaluated but not displayed: only the last expression of a cell is rendered.
df["type"].unique()
df.loc[:, "type"] = df["type"].apply(_type_slug)
df.loc[:, "type"] = df["type"].apply(_type_label)
df["type"].unique()

In [None]:
def _floor_as_int(value):
    """Convert a string floor to int; return non-string values unchanged."""
    if isinstance(value, str):
        return int(value)
    return value


df["floor"] = df["floor"].apply(_floor_as_int)
df["floor"]

In [None]:
# Persist the cleaned frame as table "listings" in filtered_listings.db,
# replacing any existing table of that name.
# ``sqlite3`` is imported in the notebook's import cell. The connection is
# closed in a ``finally`` clause so a failing write does not leak the handle.
conn = sqlite3.connect("filtered_listings.db")
try:
    df.to_sql("listings", conn, if_exists="replace")
finally:
    conn.close()

In [None]:
# Columns that are dropped before the numeric scoring.
excluded_columns = [
    "url",
    "address",
    "description",
    "created",
    "updated",
    "last_seen",
    "gps_lat",
    "gps_lon",
    "status",
    "type",
    "available_from",
    "front_garden",
]

# Ordinal encoding of the disposition labels.
# The disposition clean-up earlier in the notebook rewrites "Garsoniéra" to
# "1+kk", "Ostatní" / "atypicky" / "pokoj" to "other", and every 6+ / 7+ layout
# to "6-a-více". Those two buckets therefore need entries of their own; without
# them ``map`` turns the affected rows into NaN.
# The "Garsoniéra" and "Ostatní" keys are kept so the mapping also works on
# labels that were not normalised.
# NOTE(review): 11 for "6-a-více" simply continues the existing scale - adjust
# if a different ordering is wanted.
mapping = {
    "1+1": 1,
    "1+kk": 2,
    "2+1": 3,
    "2+kk": 4,
    "3+1": 5,
    "3+kk": 6,
    "4+1": 7,
    "4+kk": 8,
    "5+kk": 9,
    "5+1": 10,
    "6-a-více": 11,
    "Garsoniéra": 0,
    None: 0,
    "Ostatní": 0,
    "other": 0,
}
df.disposition = df.disposition.map(mapping)


# Ordinal encoding of the furnishing slugs; labels not listed become NaN.
furnished_mapping = {"nevybaveny": 0, "castecne": 1, "vybaveny": 2}

df.furnished = df.furnished.map(furnished_mapping)

In [None]:
# Build the scoring frame: a new frame without the columns listed in
# excluded_columns. ``drop`` raises KeyError if any listed column is missing.
simplified_df = df.drop(columns=excluded_columns)
simplified_df

In [None]:
# Min-max scale every column of simplified_df and print a 10-bin histogram of
# the scaled values.
for column in simplified_df.columns:
    print(column)
    values = simplified_df[column]
    lowest = values.min()
    span = values.max() - lowest
    if span == 0:
        # Constant column: a tiny epsilon avoids dividing by zero.
        span = 1e-10
    simplified_df[column] = (values - lowest) / span

    print(simplified_df[column].value_counts(bins=10, sort=False))

# arbitrary weights, change this
weights = {
    "area": 0.2,
    "balcony": 0.1,
    "cellar": 0.1,
    "disposition": 0.3,
    "elevator": 0.1,
    "floor": 0.2,
    "furnished": 0.1,
    "garage": 0.1,
    "loggie": 0.1,
    "parking": 0.1,
    "pets": 0.1,
    "rent": 0.2,
    "terrace": 0.1,
}

# Weighted sum per row. The weights are aligned with the columns by name;
# columns without a weight and missing cells contribute nothing because
# ``sum`` skips NaN.
weighted = simplified_df.mul(pd.Series(weights), axis="columns")
df["score"] = weighted.sum(axis=1)

df

In [None]:
# Persist the scored frame as table "listings" in scored_listings.db,
# replacing any existing table of that name. The connection is closed in a
# ``finally`` clause so a failing write does not leak the handle.
conn = sqlite3.connect("scored_listings.db")
try:
    df.to_sql("listings", conn, if_exists="replace")
finally:
    conn.close()