In [3]:
%load_ext autoreload
%autoreload 2
%xmode Plain

Exception reporting mode: Plain


In [36]:
import pandas as pd
import plotly
import plotly.express as px 
import dtale
import numpy as np
import random
from bazaraki import utils
from tqdm import tqdm
import swifter
from pathlib import Path
import datacompy
from datetime import datetime, date
from parse import parse
from glob import glob 


In [37]:
pd.set_option('display.max_rows', 50)  # Disable row limit
pd.set_option('display.max_columns', 40)  # Disable column limit
pd.set_option('display.width', 20)  # Disable line width limit
pd.set_option('display.max_colwidth', 100)  # Disable column width limit
# Set the display precision to 2 decimal places  
pd.set_option('display.precision', 2)  
  
# Or set the float format to always show 2 decimal places  
pd.set_option('display.float_format', '{:.2f}'.format)  


In [38]:
pd.options.plotting.backend = "plotly"
plotly.io.renderers.default = "notebook_connected"


In [96]:
DT_LEN = len("2024-12-12 18:34:25")
merged = pd.DataFrame()
for file_name in sorted(glob("output/*.jsonl")):
    print(f"Reading {file_name}")
    result = parse("output/{date} {time} {postfix}", file_name)
    if result is None:
        print(f"Failed to parse {file_name}")
        continue
    if result["postfix"].startswith("fast"):
        print(f"Skipping fast run")
        continue
    dt = date.fromisoformat(result["date"])
    
    newdf = pd.read_json(file_name, lines=True)
    assert newdf.ad_id.duplicated().sum() == 0, "Expected no duplicates"
    newdf.set_index("ad_id", inplace=True)
    if merged.empty:
        merged = newdf
        merged["delete_date"] = np.nan
        continue
    
    condition = ~merged.index.isin(newdf.index) & merged['delete_date'].isna()
    merged.loc[condition, 'delete_date'] = dt
    new_deleted_count = condition.sum()

    new = newdf.index.difference(merged.index)
    merged = pd.concat([merged, newdf.loc[new]])
    print(f"Total: {len(merged)} read: {len(newdf)} new: {len(new)} deleted: {new_deleted_count}")
df = merged

Reading output/2024-12-12 18:34:25 real-estate-to-rent_real-estate-for-sale.jsonl


Reading output/2024-12-14 11:44:22 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 35559 read: 34669 new: 745 deleted: 890
Reading output/2024-12-15 18:00:14 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 35734 read: 34398 new: 175 deleted: 490
Reading output/2024-12-16 23:13:52 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 36281 read: 34510 new: 547 deleted: 647
Reading output/2024-12-17 21:31:02 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 37085 read: 34681 new: 804 deleted: 800
Reading output/2024-12-18 23:01:10 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 37577 read: 34809 new: 492 deleted: 464
Reading output/2024-12-19 22:09:26 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 38047 read: 33723 new: 470 deleted: 1661
Reading output/2024-12-22 21:49:27 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 39161 read: 33255 new: 1114 deleted: 1833
Reading output/2024-12-23 19:08:45 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 39512

In [101]:
def add_city_disctrict_cols(df):
    df[["city", "district"]] = df.location.str.split(",", expand=True)
    return df
df = add_city_disctrict_cols(df)
df.iloc[0].T

url                                                 https://www.bazaraki.com/adv/5415213_4-bedroom-detached-house-to-rent/
title                                                                                     4-bedroom detached house to rent
price                                                                                                              1650.00
original_price                                                                                                         NaN
price_per_sqm                                                                                                          NaN
location                                                                                               Larnaca, Dromolaxia
posted                                                                                                     Yesterday 20:57
reference_number                                                                                                       NaN
views           

In [102]:
# remove invalid properties
df = utils.filter_in(df, "`Property area` > 10 and `Property area` < 300")

removing 11414/43667 rows


In [103]:
df.price_per_sqm = df.price / (df["Property area"]).round(2)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [105]:
# selected_df = saledf[saledf.price < 5000]
# print(f"removed {len(saledf) - len(selected_df)} rows")
# df = selected_df

In [106]:
# selected_df = df.query("`Bedrooms` in ('3', '2', '1', 'Studio')")
# print(f"removed {len(df) - len(selected_df)} rows")
# df = selected_df

In [115]:
def stats(saledf, rentdf):
    salestatdf = saledf.query("Condition == 'Resale'").groupby(["city", "Bedrooms"]).price_per_sqm.describe()
    rentstatdf = rentdf.groupby(["city", "Bedrooms"]).price_per_sqm.describe()
    common_index = salestatdf.index.intersection(rentstatdf.index)
    apr = rentstatdf.loc[common_index] / salestatdf.loc[common_index] * 12 * 100
    return pd.concat([apr, rentstatdf, salestatdf], axis=1, keys=['apr', 'rent', 'sale']).sort_values(by=('apr', '50%'), ascending=False).dropna()
    
print("apparts")
display(stats(df.query("cat1 == 'Apartments, flats for sale'"), df.query("cat1 == 'Apartments, flats to rent'")))
print("houses")
display(stats(df.query("cat1 == 'Houses for sale'"), df.query("cat1 == 'Houses to rent'")))

apparts


Unnamed: 0_level_0,Unnamed: 1_level_0,apr,apr,apr,apr,apr,apr,apr,apr,rent,rent,rent,rent,rent,rent,rent,rent,sale,sale,sale,sale,sale,sale,sale,sale
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
city,Bedrooms,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
Limassol,6 and more,1200.0,32.78,185.96,2.13,17.83,29.35,44.03,58.71,3.0,152.67,146.02,8.0,79.0,150.0,225.0,300.0,3.0,5588.05,942.28,4500.0,5316.04,6132.08,6132.08,6132.08
Limassol,5,2160.0,6.54,3.12,8.14,6.35,13.68,6.91,3.59,9.0,35.98,13.21,17.2,20.0,44.78,44.78,44.78,5.0,6600.21,5080.65,2536.23,3777.17,3928.57,7777.78,14981.27
Larnaca,Studio,1623.53,10.75,10.57,9.61,10.06,11.5,11.29,12.04,23.0,17.82,5.53,11.11,13.64,16.43,19.1,35.0,17.0,1989.97,628.48,1387.76,1627.91,1714.29,2029.41,3488.37
Larnaca,4,694.74,8.52,8.8,8.89,9.66,8.36,7.18,9.71,11.0,12.44,4.0,7.41,10.61,11.54,13.57,22.07,19.0,1752.3,545.25,1000.0,1318.24,1655.63,2267.64,2727.27
Paphos,Studio,4440.0,8.77,10.96,8.89,8.48,7.76,8.3,14.88,37.0,18.03,6.13,10.0,13.75,16.67,20.83,40.0,10.0,2466.07,671.76,1350.0,1945.93,2575.76,3012.67,3225.0
Larnaca,3,828.45,6.44,4.11,7.43,6.28,7.19,6.74,3.12,165.0,12.1,5.24,5.59,8.46,10.83,13.68,41.18,239.0,2252.55,1528.16,902.26,1617.98,1808.51,2436.2,15833.33
Limassol,Studio,15428.57,7.85,10.54,3.61,6.66,6.96,8.4,13.75,90.0,26.02,11.31,7.14,18.12,24.02,30.32,70.0,7.0,3975.69,1287.39,2375.0,3267.08,4142.86,4333.33,6111.11
Larnaca,2,1797.17,8.17,86.51,4.94,7.02,6.75,6.66,266.33,635.0,16.85,98.79,2.6,10.0,12.0,15.38,2500.0,424.0,2474.97,1370.35,632.0,1709.37,2133.33,2771.37,11264.37
Larnaca,1,1959.63,6.51,5.2,6.8,6.51,6.65,6.94,4.8,178.0,14.28,5.61,4.93,10.77,12.93,16.21,42.55,109.0,2630.6,1294.27,869.57,1984.13,2333.33,2803.03,10638.3
Limassol,1,5424.0,5.24,3.71,1.21,5.77,6.12,6.0,5.28,565.0,24.35,10.65,2.5,18.32,23.08,28.57,108.47,125.0,5571.04,3445.77,2474.23,3809.52,4528.3,5714.29,24672.73


houses


Unnamed: 0_level_0,Unnamed: 1_level_0,apr,apr,apr,apr,apr,apr,apr,apr,rent,rent,rent,rent,rent,rent,rent,rent,sale,sale,sale,sale,sale,sale,sale,sale
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
city,Bedrooms,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2,Unnamed: 23_level_2,Unnamed: 24_level_2,Unnamed: 25_level_2
Limassol,1,7440.0,9.19,5.85,8.82,12.62,10.74,7.74,10.37,62.0,13.96,6.15,3.0,10.0,12.93,17.25,37.14,10.0,1823.23,1262.18,408.33,950.54,1445.0,2674.63,4300.0
Nicosia,5,276.92,6.34,3.55,11.75,6.79,7.71,6.51,2.97,6.0,10.67,3.4,6.5,8.27,10.58,12.59,15.62,26.0,2021.39,1150.21,663.72,1462.36,1647.73,2321.74,6315.79
Larnaca,5,292.68,5.89,3.66,10.34,6.51,7.31,5.98,3.84,10.0,14.27,5.91,9.09,9.17,13.08,17.53,26.79,41.0,2908.09,1938.13,1054.85,1689.36,2148.72,3517.86,8362.99
Nicosia,1,2100.0,5.99,7.6,4.18,5.36,7.07,5.89,6.99,7.0,9.13,3.42,4.38,6.38,10.91,11.06,13.75,4.0,1829.76,540.35,1257.14,1429.15,1850.39,2250.99,2361.11
Paphos,1,2800.0,4.63,1.72,7.73,6.46,6.55,4.14,3.55,7.0,11.17,3.32,6.84,8.58,11.6,13.17,16.25,3.0,2895.99,2317.14,1061.54,1593.99,2126.44,3813.22,5500.0
Larnaca,4,463.92,5.25,3.02,6.49,6.03,6.52,6.47,2.21,75.0,12.59,5.18,4.55,8.18,11.5,17.65,22.73,194.0,2877.69,2058.12,840.0,1627.27,2115.52,3272.73,12352.94
Larnaca,3,822.41,6.38,7.67,9.09,5.89,6.42,6.46,8.84,159.0,12.48,9.85,3.75,7.65,10.56,14.31,109.09,232.0,2345.9,1540.45,494.79,1557.12,1973.68,2660.98,14814.81
Paphos,5,553.85,5.67,5.76,14.4,5.16,6.39,4.99,5.94,12.0,23.04,12.76,8.0,12.67,21.6,26.38,52.17,26.0,4877.13,2656.83,666.67,2945.88,4055.56,6337.73,10535.71
Larnaca,2,1098.8,5.95,2.58,10.71,7.41,6.17,6.58,2.31,76.0,11.76,3.29,5.0,9.0,11.07,14.94,18.95,83.0,2373.73,1530.13,560.0,1458.04,2152.78,2726.1,9844.56
Limassol,2,1578.38,5.53,4.55,5.36,5.44,5.83,5.8,3.7,146.0,14.74,6.01,3.0,10.0,14.17,18.51,34.71,111.0,3198.53,1586.35,671.43,2205.23,2916.67,3832.0,11244.02


In [129]:
rentdf = df.query("cat1 == 'Apartments, flats for sale' and price < 1000000")
px.histogram(rentdf, x="price", barmode='overlay', facet_col="city", color="Bedrooms", title="Apts for sale per city/bedrooms").show()


In [130]:
rentdf = df.query("cat1 == 'Apartments, flats to rent' and price < 3000")
px.histogram(rentdf, x="price", barmode='overlay', facet_col="city", color="Bedrooms", title="Apts for rent per city/bedrooms").show()


In [111]:
df.sort_values("views", ascending=False)


Unnamed: 0_level_0,url,title,price,original_price,price_per_sqm,location,posted,reference_number,views,lat,lng,sold,cat0,cat1,Property area,Pets,Type,Parking,Plot area,Furnishing,...,Postal code,Construction year,Reference number,Condition,Square meter price,Minimum stay,Land type,Plot Type,Parcel number,Planning zone,Registration number,Share,Density,Coverage,Registration block,Area,Pick a point,delete_date,city,district
ad_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1
4025924,https://www.bazaraki.com/adv/4025924_1-bedroom-apartment-to-rent/,Studio apartment to rent,600.00,,17.14,"Limassol, Trimiklini",18.11.2024 13:34,,27310,34.85,32.91,False,Cyprus real estate to rent,"Apartments, flats to rent",35.00,Not allowed,Apartment,Covered,,Fully Furnished,...,4730.00,2023,,,,,,,,,,,,,,,,,Limassol,Trimiklini
1241192,https://www.bazaraki.com/adv/1241192_serviced-offices-for-rent/,Serviced offices in limassol,650.00,,59.09,"Limassol, Limassol - Agia Zoni",05.12.2024 15:10,,22598,34.69,33.04,False,Cyprus real estate to rent,Commercial property,11.00,,Offices,,,,...,3027.00,,,,€59 /,,,,,,,,,,,,,,Limassol,Limassol - Agia Zoni
4095853,https://www.bazaraki.com/adv/4095853_1-bedroom-detached-house-for-sale/,2-bedroom detached house fоr sаle,70000.00,,1272.73,"Paphos, Filousa Kelokedaron",29.11.2024 18:41,,18964,,,False,Cyprus real estate for sale,Houses for sale,55.00,,Detached house,,156.00,Fully Furnished,...,,,,Resale,€1.273 /,,,,,,,,,,,,,2025-01-02,Paphos,Filousa Kelokedaron
2291746,https://www.bazaraki.com/adv/2291746_2-preserved-houses-2-diateretees-oikies/,3-bedroom detached house fоr sаle,75000.00,80000.00,294.12,"Nicosia, Moni Kato",12.11.2024 07:25,,18630,35.07,33.10,False,Cyprus real estate for sale,Houses for sale,255.00,,Detached house,No,,Unfurnished,...,2776.00,,,Resale,€294 /,,,,,,,,,,,,,2024-12-14,Nicosia,Moni Kato
4256623,https://www.bazaraki.com/adv/4256623_2-bedroom-detached-house-to-rent/,2-bedroom detached house to rent,700.00,,5.83,"Limassol, Kaminaria",02.12.2024 06:19,,16077,,,False,Cyprus real estate to rent,Houses to rent,120.00,Not allowed,Detached house,No,,Fully Furnished,...,,2012,,,,,,,,,,,,,,,,2025-01-02,Limassol,Kaminaria
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5599290,https://www.bazaraki.com/adv/5599290_2-bedroom-apartment-for-sale/,2-bedroom apartment fоr sаle,460000.00,,4466.02,"Limassol, Germasogeia",14:16,,0,34.71,33.08,False,Cyprus real estate for sale,"Apartments, flats for sale",103.00,,Apartment,Covered,,,...,,,551733,Under construction,€4.466 /,,,,,,,,,,,,,,Limassol,Germasogeia
5599297,https://www.bazaraki.com/adv/5599297_3-bedroom-apartment-for-sale/,3-bedroom apartment fоr sаle,385000.00,,3318.97,"Limassol, Ypsonas",14:16,,0,34.68,32.97,False,Cyprus real estate for sale,"Apartments, flats for sale",116.00,,Apartment,Covered,,,...,,,639614,Under construction,€3.319 /,,,,,,,,,,,,,,Limassol,Ypsonas
5599299,https://www.bazaraki.com/adv/5599299_2-bedroom-apartment-for-sale/,2-bedroom apartment fоr sаle,1189500.00,,11548.54,"Limassol, Germasogeia",14:16,,0,34.70,33.10,False,Cyprus real estate for sale,"Apartments, flats for sale",103.00,,Apartment,Covered,,,...,,,693389,Under construction,€11.549 /,,,,,,,,,,,,,,Limassol,Germasogeia
5599300,https://www.bazaraki.com/adv/5599300_4-bedroom-apartment-for-sale/,4-bedroom apartment fоr sаle,3505600.00,,16614.22,"Limassol, Germasogeia",14:16,,0,34.70,33.10,False,Cyprus real estate for sale,"Apartments, flats for sale",211.00,,Apartment,Covered,,,...,,,693393,Under construction,€16.614 /,,,,,,,,,,,,,,Limassol,Germasogeia


In [19]:
saledf = pd.read_json("output/_real-estate-for-sale_apartments-flats__20241207_133456.jsonl", lines=True)
saledf

ValueError: Expected object or value

In [20]:
saledf.iloc[0].T

url                   https://www.bazaraki.com/adv/5311477_6-bedroom-detached-house-for-sale/
title                                                       6-bedroom detached house fоr sаle
price                                                                                 5900000
original_price                                                                            NaN
price_per_sqm                                                                         7338.31
                                                       ...                                   
Registration block                                                                        NaN
Area                                                                                      NaN
Pick a point                                                                              NaN
city                                                                                 Limassol
district                                                    

In [21]:
saledf[["city", "district"]] = saledf.location.str.split(",", expand=True)



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

