In [1]:
import os
import csv
import pandas as pd
import altair as alt
import geopandas as gpd

import crawl_realestate_data

In [2]:
# Fetch dataset
map_data_filename = "mapdata_005.csv"
fetch_dataset = False

if fetch_dataset:
    crawl_realestate_data.main(
        dst_name=map_data_filename,
        resolution_lat=0.005,
        resolution_lon=0.005,
        sleep_time_secs=0.01,
        verbose=True
    )

In [3]:
# Load dataset
with open(map_data_filename) as map_data_f:
    map_data = [row for row in csv.DictReader(map_data_f)]

map_df = pd.DataFrame(data=map_data)
map_df.head(5)

Unnamed: 0,TopLeftLat,TopLeftLon,BottomRightLat,BottomRightLon,WeightPrice,WeightSqr,Price,PriceSqr,Hits,PointsForSqrPriceInterval,PointsForPriceInterval,PointsForZoom,PointsForDays,PointsForHits,PointsForDistance,SumPointsPrice,SumPointsSqr
0,59.415335,17.868951,59.410335,17.873950999999998,4,4,2250000,46000,12,15,30,0,10,20,20,80,65
1,59.415335,17.873950999999998,59.410335,17.878950999999997,0,0,0,0,0,0,0,0,0,0,0,0,0
2,59.415335,17.878950999999997,59.410335,17.883950999999996,0,0,0,0,0,0,0,0,0,0,0,0,0
3,59.415335,17.883950999999996,59.410335,17.888950999999995,0,0,0,0,0,0,0,0,0,0,0,0,0
4,59.415335,17.888950999999995,59.410335,17.893950999999994,0,0,0,0,0,0,0,0,0,0,0,0,0


In [4]:
# Basic dataset transformations
map_df = map_df.apply(pd.to_numeric)
map_df["CenterLat"] = (map_df["TopLeftLat"] + map_df["BottomRightLat"]) / 2
map_df["CenterLon"] = (map_df["TopLeftLon"] + map_df["BottomRightLon"]) / 2
map_df["PriceVariance"] = map_df["WeightSqr"].max() - map_df["WeightSqr"] + 1
map_df.head(5)

Unnamed: 0,TopLeftLat,TopLeftLon,BottomRightLat,BottomRightLon,WeightPrice,WeightSqr,Price,PriceSqr,Hits,PointsForSqrPriceInterval,PointsForPriceInterval,PointsForZoom,PointsForDays,PointsForHits,PointsForDistance,SumPointsPrice,SumPointsSqr,CenterLat,CenterLon,PriceVariance
0,59.415335,17.868951,59.410335,17.873951,4,4,2250000,46000,12,15,30,0,10,20,20,80,65,59.412835,17.871451,1
1,59.415335,17.873951,59.410335,17.878951,0,0,0,0,0,0,0,0,0,0,0,0,0,59.412835,17.876451,5
2,59.415335,17.878951,59.410335,17.883951,0,0,0,0,0,0,0,0,0,0,0,0,0,59.412835,17.881451,5
3,59.415335,17.883951,59.410335,17.888951,0,0,0,0,0,0,0,0,0,0,0,0,0,59.412835,17.886451,5
4,59.415335,17.888951,59.410335,17.893951,0,0,0,0,0,0,0,0,0,0,0,0,0,59.412835,17.891451,5


In [5]:
# Let's have a look at price/sqm distribution
alt.Chart(map_df).mark_bar().encode(
    alt.X("PriceSqr:Q", bin=alt.Bin(extent=[0, 150000], step=5000)),
    y='count()',
)

In [6]:
# Remove rows with unknown prices
map_df = map_df[map_df.PriceSqr != 0]

# Clip the prices to 1-95 percentile
upper_percentile = map_df["PriceSqr"].quantile(0.95)
lower_percentile = map_df["PriceSqr"].quantile(0.01)
print("Quantiles 1%: ", lower_percentile, ", 95%: ", upper_percentile)

map_df.loc[map_df["PriceSqr"] > upper_percentile, "PriceSqr"] = upper_percentile
map_df.loc[map_df["PriceSqr"] < lower_percentile, "PriceSqr"] = lower_percentile

# Let's have a look at price/sqm distribution again
alt.Chart(map_df).mark_bar().encode(
    alt.X("PriceSqr:Q", bin=alt.Bin(extent=[lower_percentile, upper_percentile], step=2000)),
    y='count()',
)

Quantiles 1%:  32000.0 , 95%:  94000.0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [7]:
# Plot the prices on a map projection
prices = alt.Chart(map_df).mark_circle().encode(
    latitude="CenterLat",
    longitude="CenterLon",
    color=alt.Color("PriceSqr", scale=alt.Scale(scheme="plasma")),
    size="PriceVariance",
    tooltip=["CenterLat", "CenterLon", "PriceSqr", "PriceVariance"]
).project()
prices

In [8]:
# Plot prices on a map with districts
# Load a chart per district
rootDir = "./geojson"
charts = []
for _, _, geojson_filenames in os.walk(rootDir):
    for geojson_name in geojson_filenames:
        gdf = gpd.read_file(f"./geojson/{geojson_name}", driver="GeoJSON")
        chart = alt.Chart(gdf).mark_geoshape(
            stroke="gray",
            fill=None
        )
        charts.append(chart)

# Add prices on top
prices_legend = alt.Legend(orient="left", labelColor="white", titleColor="white", symbolFillColor="gray", symbolStrokeColor="gray")
prices = alt.Chart(map_df).mark_circle().encode(
    latitude="CenterLat",
    longitude="CenterLon",
    color=alt.Color("PriceSqr", scale=alt.Scale(scheme="plasma"), legend=prices_legend),
    size=alt.Size("PriceVariance", legend=prices_legend),
    tooltip=["CenterLat", "CenterLon", "PriceSqr", "PriceVariance"]
).project()

final_chart = prices
for chart in charts:
    final_chart += chart

final_chart.properties(width=1500, height=1200).configure(background="black")