In [45]:
%load_ext autoreload
%autoreload 2
%xmode Plain

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Exception reporting mode: Plain


In [46]:
import pandas as pd
import plotly
import plotly.express as px 
import dtale
import numpy as np
import random
from bazaraki import utils
from tqdm import tqdm
import swifter
from pathlib import Path
import datacompy
from datetime import datetime, date
from parse import parse
from glob import glob 


In [47]:
tqdm.pandas()
pd.set_option('display.max_rows', 100)  # Disable row limit
pd.set_option('display.max_columns', 60)  # Disable column limit
pd.set_option('display.width', 20)  # Disable line width limit
pd.set_option('display.max_colwidth', 100)  # Disable column width limit
pd.set_option('display.precision', 2)  
pd.set_option('display.float_format', '{:.4f}'.format)  
np.random.seed(42)
random.seed(42)
pd.options.plotting.backend = "plotly"
plotly.io.renderers.default = "notebook_connected"
from pandarallel import pandarallel

pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


In [78]:
DT_LEN = len("2024-12-12 18:34:25")
merged = pd.DataFrame()
for file_name in sorted(glob("output/*.jsonl")):
    print(f"Reading {file_name}")
    result = parse("output/{date} {time} {postfix}", file_name)
    if result is None:
        print(f"Failed to parse {file_name}")
        continue
    if result["postfix"].startswith("fast"):
        print(f"Skipping fast run")
        continue
    dt = date.fromisoformat(result["date"])
    
    newdf = pd.read_json(file_name, lines=True)
    assert newdf.ad_id.duplicated().sum() == 0, "Expected no duplicates"
    newdf.set_index("ad_id", inplace=True)
    if merged.empty:
        merged = newdf
        merged["delete_date"] = np.nan
        continue
    
    condition = ~merged.index.isin(newdf.index) & merged['delete_date'].isna()
    merged.loc[condition, 'delete_date'] = dt
    new_deleted_count = condition.sum()

    new = newdf.index.difference(merged.index)
    merged = pd.concat([merged, newdf.loc[new]])
    print(f"Total: {len(merged)} read: {len(newdf)} new: {len(new)} deleted: {new_deleted_count}")
df = merged

Reading output/2024-12-12 18:34:25 real-estate-to-rent_real-estate-for-sale.jsonl
Reading output/2024-12-14 11:44:22 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 35559 read: 34669 new: 745 deleted: 890
Reading output/2024-12-15 18:00:14 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 35734 read: 34398 new: 175 deleted: 490
Reading output/2024-12-16 23:13:52 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 36281 read: 34510 new: 547 deleted: 647
Reading output/2024-12-17 21:31:02 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 37085 read: 34681 new: 804 deleted: 800
Reading output/2024-12-18 23:01:10 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 37577 read: 34809 new: 492 deleted: 464
Reading output/2024-12-19 22:09:26 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 38047 read: 33723 new: 470 deleted: 1661
Reading output/2024-12-22 21:49:27 real-estate-to-rent_real-estate-for-sale.jsonl
Total: 39161 read: 33255 new: 1114 deleted: 1833
Reading outp

In [79]:
df.price_per_sqm = df.price / (df["Property area"]).round(2)

In [80]:
def add_city_disctrict_cols(df):
    df[["city", "district"]] = df.location.str.split(",", expand=True)
    return df
merged = add_city_disctrict_cols(merged)


In [81]:
saledf = df[df.cat1.isin(['Apartments, flats for sale'])]
rentdf = df[df.cat1.isin(['Apartments, flats to rent'])]


In [85]:
# rent speed total
n_total = rentdf.groupby(["city", "Bedrooms"]).size()
n_deleted = rentdf.query("not delete_date.isna()").groupby(["city", "Bedrooms"]).size()
speed = n_deleted / n_total 
pd.concat([speed, n_total, n_deleted], axis=1, keys=["speed", "n_total", "n_deleted"]).sort_values("speed", ascending=False).dropna()

Unnamed: 0_level_0,Unnamed: 1_level_0,speed,n_total,n_deleted
city,Bedrooms,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Famagusta,Studio,0.75,8,6.0
Larnaca,Studio,0.75,24,18.0
Limassol,6 and more,0.75,4,3.0
Nicosia,4,0.6364,33,21.0
Nicosia,1,0.6257,505,316.0
Famagusta,1,0.625,24,15.0
Paphos,1,0.6187,139,86.0
Limassol,1,0.6119,572,350.0
Paphos,3,0.6111,108,66.0
Paphos,Studio,0.5946,37,22.0


In [7]:
df1 = pd.read_json("output/2024-12-12 18:34:25 real-estate-to-rent_real-estate-for-sale.jsonl", lines=True)

In [8]:
df2 = pd.read_json("output/2024-12-14 11:44:22 real-estate-to-rent_real-estate-for-sale.jsonl", lines=True)

In [None]:
new_indices = df2.index.difference(df1.index)
print(f"{len(new_indices)} new records")
largedf = pd.concat([df1, df2.loc[new_indices]])
len(largedf)

745 new records


35559

In [46]:
for i, r1 in df1.iterrows():
    try:
        r2 = df2.loc[i]
    except KeyError:
        #print(f"KeyError: {i}")
        continue
    if r1.title != r2.title:
        print(f"df1 {i} {r1.url} {r1.title} {r1.posted}")
        print(f"df2 {i} {r2.url} {r2.title} {r2.posted}")


df1 5527956 https://www.bazaraki.com/adv/5527956_4-bedroom-detached-house-for-sale/ 4-bedroom detached house fоr sаle 13.11.2024 18:12
df2 5527956 https://www.bazaraki.com/adv/5527956_4-bedroom-detached-house-for-sale/ 3-bedroom detached house fоr sаle 12.12.2024 20:52
df1 5185306 https://www.bazaraki.com/adv/5185306_residential-land-5342-m2/ Residential land 5342 m² 25.11.2024 02:38
df2 5185306 https://www.bazaraki.com/adv/5185306_residential-land-5342-m2/ Residential land 627 m² 25.11.2024 02:38
df1 5100938 https://www.bazaraki.com/adv/5100938_1-bedroom-apartment-for-sale/ 1-bedroom apartment fоr sаle 23.11.2024 17:45
df2 5100938 https://www.bazaraki.com/adv/5100938_1-bedroom-apartment-for-sale/ 2-bedroom apartment fоr sаle 23.11.2024 17:45
df1 5548384 https://www.bazaraki.com/adv/5548384_2-bedroom-apartment-for-sale/ 2- Bedroom apartment in Laiki Lefkothea with 2 storage rooms and 2 ... 27.11.2024 14:23
df2 5548384 https://www.bazaraki.com/adv/5548384_2-bedroom-apartment-for-sale/ 2

In [34]:
df1.loc[5564097]

url                                                      https://www.bazaraki.com/adv/5564097_1-bedroom-apartment-to-rent/
title                                                                                          1-bedroom apartment to rent
price                                                                                                                 1250
original_price                                                                                                         NaN
price_per_sqm                                                                                                          NaN
location                                                                                          Limassol, Polemidia Kato
posted                                                                                                    09.12.2024 07:41
reference_number                                                                                                       NaN
views           

In [35]:
df2.loc[5564097]

KeyError: 5564097

In [36]:
import pandas as pd

# Example DataFrames
df1 = pd.DataFrame({'A': [1, 2]}, index=[1, 2])
df2 = pd.DataFrame({'B': [3, 4]}, index=[2, 3])

# Union of indexes
union_indexes = df1.index.union(df2.index)

print("Union of indexes:", union_indexes)

Union of indexes: Index([1, 2, 3], dtype='int64')


In [18]:
print(datacompy.Compare(df1, df2, on_index=True, df1_name='df1', df2_name='df2').report())

DataComPy Comparison
--------------------

DataFrame Summary
-----------------

  DataFrame  Columns   Rows
0       df1       46  34814
1       df2       46  34669

Column Summary
--------------

Number of columns in common: 46
Number of columns in df1 but not in df2: 0 []
Number of columns in df2 but not in df1: 0 []

Row Summary
-----------

Matched on: index
Any duplicates on match values: No
Absolute Tolerance: 0
Relative Tolerance: 0
Number of rows in common: 33,924
Number of rows in df1 but not in df2: 890
Number of rows in df2 but not in df1: 745

Number of rows with some compared columns unequal: 33,924
Number of rows with all compared columns equal: 0

Column Comparison
-----------------

Number of columns compared with some values unequal: 36
Number of columns compared with all values equal: 10
Total number of values which compare unequal: 72,707

Columns with Unequal Values or Types
------------------------------------

                Column df1 dtype df2 dtype  # Unequal  