In [15]:
import pandas as pd
import numpy as np
import re

# Load the TSV file
df = pd.read_csv("/Users/ducan/Documents/test/cleaning/output/visualization_data.tsv", sep="\t")

# # --- Clean Area ---
# def clean_area(area_str):
#     if pd.isna(area_str): return np.nan
#     area_str = area_str.replace("m²", "").replace("m2", "").strip()
#     try:
#         return float(area_str.replace(",", "").strip())
#     except:
#         return np.nan

# df["area_m2"] = df["area"].apply(clean_area)

# # --- Clean Price ---
# def clean_price(price_str):
#     if pd.isna(price_str): return np.nan
#     price_str = price_str.lower().replace(",", "").strip()
#     match = re.search(r"([\d\.]+)\s*(tỷ|triệu)", price_str)
#     if not match:
#         return np.nan
#     value, unit = match.groups()
#     try:
#         value = float(value.replace(".", ""))
#         if unit == "tỷ":
#             return value * 1_000  # Convert to million VND
#         elif unit == "triệu":
#             return value
#     except:
#         return np.nan
#     return np.nan

# df["price_million_vnd"] = df["price"].apply(clean_price)

# # --- Extract Bedrooms ---
# def extract_number(text):
#     if pd.isna(text): return np.nan
#     match = re.search(r"(\d+)", text)
#     return int(match.group(1)) if match else np.nan

# df["bedrooms"] = df["number_of_bedrooms"].apply(extract_number)
# df["toilets"] = df["number_of_toilets"].apply(extract_number)

# --- Infer Property Type ---
def infer_type(title):
    title = title.lower()
    if "chung cư" in title or "căn hộ" in title: return "Chung cư"
    if "biệt thự" in title or "liền kề" in title: return "Biệt thự"
    if "nhà riêng" in title or "nhà mặt phố" in title: return "Nhà riêng"
    if "đất" in title: return "Đất"
    return "Khác"

df["property_type"] = df["title"].apply(infer_type)

# --- Result Preview ---
print(df[["area", "price", "number_of_bedrooms", "number_of_toilets", "property_type"]].head())



import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load preprocessed data (or use existing df)
# df = pd.read_csv("hn_batdongsan.tsv", sep="\t")

# Re-run preprocessing (or assume it's already done)
# -- Include here the preprocessing code from previous step if needed --

# Only keep cleaned rows
df_clean = df.dropna(subset=["area", "price", "lat", "lon", "property_type", "district"])

# 1. Price Distribution
fig_price = px.histogram(df_clean, x="price", nbins=50, title="Distribution of Property Prices (in billion VND)")
fig_price.update_layout(xaxis_title="Price (tỷ VND)", yaxis_title="Count")
fig_price.show()

# 2. Area Distribution
fig_area = px.histogram(df_clean, x="area", nbins=50, title="Distribution of Property Areas (m²)")
fig_area.update_layout(xaxis_title="Area (m²)", yaxis_title="Count")
fig_area.show()

# 3. Property Type Pie Chart
fig_type = px.pie(df_clean, names="property_type", title="Property Type Distribution")
fig_type.show()


# 4. Number of Listings per District
district_counts = df_clean["district"].value_counts().reset_index()
district_counts.columns = ["district", "count"]

fig_district = px.bar(district_counts,
                      x="district", y="count", title="Number of Listings per District")
fig_district.update_layout(xaxis_title="District", yaxis_title="Number of Listings")
fig_district.show()

# fig_district = px.bar(df_clean["district"].value_counts().reset_index(),
#                       x="index", y="district", title="Number of Listings per District")
# fig_district.update_layout(xaxis_title="District", yaxis_title="Number of Listings")
# fig_district.show()

# 5. Price vs Area Scatter Plot
fig_scatter = px.scatter(df_clean, x="area", y="price", color="property_type",
                         title="Price vs Area by Property Type", hover_data=["district", "title"])
fig_scatter.update_layout(xaxis_title="Area (m²)", yaxis_title="Price (tỷ VND)")
fig_scatter.show()

# 6. Map of Listings by District
fig_map = px.scatter_mapbox(
    df_clean,
    lat="lat",
    lon="lon",
    color="property_type",
    size="price",
    hover_name="title",
    hover_data=["district", "price", "area"],
    zoom=10,
    height=600,
    title="Property Listings Map"
)

fig_map.update_layout(mapbox_style="open-street-map")
fig_map.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig_map.show()



import pandas as pd
import json
import plotly.express as px
import re

def get_centroid(feature):
    geometry = feature["geometry"]
    coords = geometry["coordinates"]

    if geometry["type"] == "Polygon":
        ring = coords[0]
    elif geometry["type"] == "MultiPolygon":
        ring = coords[0][0]
    else:
        return None, None

    lon = [pt[0] for pt in ring]
    lat = [pt[1] for pt in ring]
    return sum(lon) / len(lon), sum(lat) / len(lat)


# List of Hà Nội districts (to match geojson)
hanoi_districts = [
    'Ba Đình', 'Ba Vì', 'Cầu Giấy', 'Chương Mỹ', 'Đan Phượng', 'Đông Anh', 'Đống Đa',
    'Gia Lâm', 'Hà Đông', 'Hai Bà Trưng', 'Hoài Đức', 'Hoàn Kiếm', 'Hoàng Mai',
    'Long Biên', 'Mê Linh', 'Mỹ Đức', 'Phú Xuyên', 'Phúc Thọ', 'Quốc Oai', 'Sóc Sơn',
    'Sơn Tây', 'Tây Hồ', 'Thạch Thất', 'Thanh Oai', 'Thanh Trì', 'Thanh Xuân',
    'Thường Tín', 'Từ Liêm', 'Ứng Hòa'
]

# Mapping GeoJSON English district names to Vietnamese
geojson_name_map = {
    "Ba Dinh": "Ba Đình",
    "Ba Vi": "Ba Vì",
    "Cau Giay": "Cầu Giấy",
    "Chuong My": "Chương Mỹ",
    "Dan Phuong": "Đan Phượng",
    "Dong Anh": "Đông Anh",
    "Dong Da": "Đống Đa",
    "Gia Lam": "Gia Lâm",
    "Ha Dong": "Hà Đông",
    "Hai Ba Trung": "Hai Bà Trưng",
    "Hoai Duc": "Hoài Đức",
    "Hoan Kiem": "Hoàn Kiếm",
    "Hoang Mai": "Hoàng Mai",
    "Long Bien": "Long Biên",
    "Me Linh": "Mê Linh",
    "My Duc": "Mỹ Đức",
    "Phu Xuyen": "Phú Xuyên",
    "Phuc Tho": "Phúc Thọ",
    "Quoc Oai": "Quốc Oai",
    "Soc Son": "Sóc Sơn",
    "Son Tay": "Sơn Tây",
    "Tay Ho": "Tây Hồ",
    "Thach That": "Thạch Thất",
    "Thanh Oai": "Thanh Oai",
    "Thanh Tri": "Thanh Trì",
    "Thanh Xuan": "Thanh Xuân",
    "Thuong Tin": "Thường Tín",
    "Tu Liem": "Từ Liêm",
    "Ung Hoa": "Ứng Hòa"
}

# Combine Bắc Từ Liêm and Nam Từ Liêm into Từ Liêm
df["district"] = df["district"].replace({
    "Bắc Từ Liêm": "Từ Liêm",
    "Nam Từ Liêm": "Từ Liêm"
})


# Filter and group
avg_price = df[df["district"].isin(hanoi_districts)] \
    .groupby("district")["price"] \
    .mean().reset_index()
avg_price.columns = ["district", "avg_price"]

# --- Step 2: Load GeoJSON and extract Hà Nội districts ---
with open("diaphanhuyen.geojson", "r", encoding="utf-8") as f:
    geojson_data = json.load(f)

# Filter to Hà Nội
hanoi_features = [f for f in geojson_data["features"] if f["properties"].get("Ten_Tinh") == "Hà Nội"]
hanoi_geojson = {
    "type": "FeatureCollection",
    "features": hanoi_features
}

# Rename GeoJSON key to match district names
for f in hanoi_geojson["features"]:
    eng_name = f["properties"]["Ten_Huyen"].title()
    f["id"] = geojson_name_map.get(eng_name)

# --- Step 3: Plot Choropleth ---
fig = px.choropleth_mapbox(
    avg_price,
    geojson=hanoi_geojson,
    locations="district",
    color="avg_price",
    featureidkey="id",
    mapbox_style="open-street-map",
    zoom=9,
    center={"lat": 21.0285, "lon": 105.8542},
    color_continuous_scale="YlOrRd",
    title="Average Real Estate Price by District (tỷ VND)",
    labels={"avg_price": "Avg Price (tỷ VND)"},
    height=600
)


import plotly.graph_objects as go

# Step 1: Prepare centroid lists
lat_list = []
lon_list = []
text_list = []

for feature in hanoi_geojson["features"]:
    district_vn = feature["id"]
    centroid_lon, centroid_lat = get_centroid(feature)
    if centroid_lon is not None and centroid_lat is not None:
        lat_list.append(centroid_lat)
        lon_list.append(centroid_lon)
        text_list.append(district_vn)

# Step 2: Create the label layer
text_layer = go.Scattermapbox(
    lat=lat_list,
    lon=lon_list,
    mode="text",
    text=text_list,
    textfont=dict(size=11, color="black"),
    hoverinfo="skip"
)

# Step 3: Add to the existing choropleth
fig.add_trace(text_layer)
# fig.show()


fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()






   area  price  number_of_bedrooms  number_of_toilets property_type
0  96.0   16.8                   5                  3      Biệt thự
1  80.0   13.2                   3                  3     Nhà riêng
2  35.0    8.0                   5                  4     Nhà riêng
3  65.0   16.5                   4                  4     Nhà riêng
4  80.0   11.0                   6                  6      Biệt thự



*scatter_mapbox* is deprecated! Use *scatter_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/




*choropleth_mapbox* is deprecated! Use *choropleth_map* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/


*scattermapbox* is deprecated! Use *scattermap* instead. Learn more at: https://plotly.com/python/mapbox-to-maplibre/



In [1]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Load preprocessed data (or use existing df)
# df = pd.read_csv("hn_batdongsan.tsv", sep="\t")

# Re-run preprocessing (or assume it's already done)
# -- Include here the preprocessing code from previous step if needed --

# Only keep cleaned rows
df_clean = df.dropna(subset=["area", "price", "lat", "lon", "property_type", "district"])

# 1. Price Distribution
fig_price = px.histogram(df_clean, x="price", nbins=50, title="Distribution of Property Prices (in billion VND)")
fig_price.update_layout(xaxis_title="Price (tỷ VND)", yaxis_title="Count")
fig_price.show()

# 2. Area Distribution
fig_area = px.histogram(df_clean, x="area", nbins=50, title="Distribution of Property Areas (m²)")
fig_area.update_layout(xaxis_title="Area (m²)", yaxis_title="Count")
fig_area.show()

# 3. Property Type Pie Chart
fig_type = px.pie(df_clean, names="property_type", title="Property Type Distribution")
fig_type.show()


# 4. Number of Listings per District
district_counts = df_clean["district"].value_counts().reset_index()
district_counts.columns = ["district", "count"]

fig_district = px.bar(district_counts,
                      x="district", y="count", title="Number of Listings per District")
fig_district.update_layout(xaxis_title="District", yaxis_title="Number of Listings")
fig_district.show()

# fig_district = px.bar(df_clean["district"].value_counts().reset_index(),
#                       x="index", y="district", title="Number of Listings per District")
# fig_district.update_layout(xaxis_title="District", yaxis_title="Number of Listings")
# fig_district.show()

# 5. Price vs Area Scatter Plot
fig_scatter = px.scatter(df_clean, x="area", y="price", color="property_type",
                         title="Price vs Area by Property Type", hover_data=["district", "title"])
fig_scatter.update_layout(xaxis_title="Area (m²)", yaxis_title="Price (tỷ VND)")
fig_scatter.show()

# 6. Map of Listings by District
fig_map = px.scatter_mapbox(
    df_clean,
    lat="lat",
    lon="lon",
    color="property_type",
    size="price",
    hover_name="title",
    hover_data=["district", "price", "area"],
    zoom=10,
    height=600,
    title="Property Listings Map"
)

fig_map.update_layout(mapbox_style="open-street-map")
fig_map.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig_map.show()




import pandas as pd
import json
import plotly.express as px
import re

def get_centroid(feature):
    geometry = feature["geometry"]
    coords = geometry["coordinates"]

    if geometry["type"] == "Polygon":
        ring = coords[0]
    elif geometry["type"] == "MultiPolygon":
        ring = coords[0][0]
    else:
        return None, None

    lon = [pt[0] for pt in ring]
    lat = [pt[1] for pt in ring]
    return sum(lon) / len(lon), sum(lat) / len(lat)


# List of Hà Nội districts (to match geojson)
hanoi_districts = [
    'Ba Đình', 'Ba Vì', 'Cầu Giấy', 'Chương Mỹ', 'Đan Phượng', 'Đông Anh', 'Đống Đa',
    'Gia Lâm', 'Hà Đông', 'Hai Bà Trưng', 'Hoài Đức', 'Hoàn Kiếm', 'Hoàng Mai',
    'Long Biên', 'Mê Linh', 'Mỹ Đức', 'Phú Xuyên', 'Phúc Thọ', 'Quốc Oai', 'Sóc Sơn',
    'Sơn Tây', 'Tây Hồ', 'Thạch Thất', 'Thanh Oai', 'Thanh Trì', 'Thanh Xuân',
    'Thường Tín', 'Từ Liêm', 'Ứng Hòa'
]

# Mapping GeoJSON English district names to Vietnamese
geojson_name_map = {
    "Ba Dinh": "Ba Đình",
    "Ba Vi": "Ba Vì",
    "Cau Giay": "Cầu Giấy",
    "Chuong My": "Chương Mỹ",
    "Dan Phuong": "Đan Phượng",
    "Dong Anh": "Đông Anh",
    "Dong Da": "Đống Đa",
    "Gia Lam": "Gia Lâm",
    "Ha Dong": "Hà Đông",
    "Hai Ba Trung": "Hai Bà Trưng",
    "Hoai Duc": "Hoài Đức",
    "Hoan Kiem": "Hoàn Kiếm",
    "Hoang Mai": "Hoàng Mai",
    "Long Bien": "Long Biên",
    "Me Linh": "Mê Linh",
    "My Duc": "Mỹ Đức",
    "Phu Xuyen": "Phú Xuyên",
    "Phuc Tho": "Phúc Thọ",
    "Quoc Oai": "Quốc Oai",
    "Soc Son": "Sóc Sơn",
    "Son Tay": "Sơn Tây",
    "Tay Ho": "Tây Hồ",
    "Thach That": "Thạch Thất",
    "Thanh Oai": "Thanh Oai",
    "Thanh Tri": "Thanh Trì",
    "Thanh Xuan": "Thanh Xuân",
    "Thuong Tin": "Thường Tín",
    "Tu Liem": "Từ Liêm",
    "Ung Hoa": "Ứng Hòa"
}

# Combine Bắc Từ Liêm and Nam Từ Liêm into Từ Liêm
df["district"] = df["district"].replace({
    "Bắc Từ Liêm": "Từ Liêm",
    "Nam Từ Liêm": "Từ Liêm"
})


# Filter and group
avg_price = df[df["district"].isin(hanoi_districts)] \
    .groupby("district")["price"] \
    .mean().reset_index()
avg_price.columns = ["district", "avg_price"]

# --- Step 2: Load GeoJSON and extract Hà Nội districts ---
with open("diaphanhuyen.geojson", "r", encoding="utf-8") as f:
    geojson_data = json.load(f)

# Filter to Hà Nội
hanoi_features = [f for f in geojson_data["features"] if f["properties"].get("Ten_Tinh") == "Hà Nội"]
hanoi_geojson = {
    "type": "FeatureCollection",
    "features": hanoi_features
}

# Rename GeoJSON key to match district names
for f in hanoi_geojson["features"]:
    eng_name = f["properties"]["Ten_Huyen"].title()
    f["id"] = geojson_name_map.get(eng_name)

# --- Step 3: Plot Choropleth ---
fig = px.choropleth_mapbox(
    avg_price,
    geojson=hanoi_geojson,
    locations="district",
    color="avg_price",
    featureidkey="id",
    mapbox_style="open-street-map",
    zoom=9,
    center={"lat": 21.0285, "lon": 105.8542},
    color_continuous_scale="YlOrRd",
    title="Average Real Estate Price by District (tỷ VND)",
    labels={"avg_price": "Avg Price (tỷ VND)"},
    height=600
)


import plotly.graph_objects as go

# Step 1: Prepare centroid lists
lat_list = []
lon_list = []
text_list = []

for feature in hanoi_geojson["features"]:
    district_vn = feature["id"]
    centroid_lon, centroid_lat = get_centroid(feature)
    if centroid_lon is not None and centroid_lat is not None:
        lat_list.append(centroid_lat)
        lon_list.append(centroid_lon)
        text_list.append(district_vn)

# Step 2: Create the label layer
text_layer = go.Scattermapbox(
    lat=lat_list,
    lon=lon_list,
    mode="text",
    text=text_list,
    textfont=dict(size=11, color="black"),
    hoverinfo="skip"
)

# Step 3: Add to the existing choropleth
fig.add_trace(text_layer)
# fig.show()


fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()


NameError: name 'df' is not defined

In [2]:
import pandas as pd
import json
import plotly.express as px
import re

def get_centroid(feature):
    geometry = feature["geometry"]
    coords = geometry["coordinates"]

    if geometry["type"] == "Polygon":
        ring = coords[0]
    elif geometry["type"] == "MultiPolygon":
        ring = coords[0][0]
    else:
        return None, None

    lon = [pt[0] for pt in ring]
    lat = [pt[1] for pt in ring]
    return sum(lon) / len(lon), sum(lat) / len(lat)


# List of Hà Nội districts (to match geojson)
hanoi_districts = [
    'Ba Đình', 'Ba Vì', 'Cầu Giấy', 'Chương Mỹ', 'Đan Phượng', 'Đông Anh', 'Đống Đa',
    'Gia Lâm', 'Hà Đông', 'Hai Bà Trưng', 'Hoài Đức', 'Hoàn Kiếm', 'Hoàng Mai',
    'Long Biên', 'Mê Linh', 'Mỹ Đức', 'Phú Xuyên', 'Phúc Thọ', 'Quốc Oai', 'Sóc Sơn',
    'Sơn Tây', 'Tây Hồ', 'Thạch Thất', 'Thanh Oai', 'Thanh Trì', 'Thanh Xuân',
    'Thường Tín', 'Từ Liêm', 'Ứng Hòa'
]

# Mapping GeoJSON English district names to Vietnamese
geojson_name_map = {
    "Ba Dinh": "Ba Đình",
    "Ba Vi": "Ba Vì",
    "Cau Giay": "Cầu Giấy",
    "Chuong My": "Chương Mỹ",
    "Dan Phuong": "Đan Phượng",
    "Dong Anh": "Đông Anh",
    "Dong Da": "Đống Đa",
    "Gia Lam": "Gia Lâm",
    "Ha Dong": "Hà Đông",
    "Hai Ba Trung": "Hai Bà Trưng",
    "Hoai Duc": "Hoài Đức",
    "Hoan Kiem": "Hoàn Kiếm",
    "Hoang Mai": "Hoàng Mai",
    "Long Bien": "Long Biên",
    "Me Linh": "Mê Linh",
    "My Duc": "Mỹ Đức",
    "Phu Xuyen": "Phú Xuyên",
    "Phuc Tho": "Phúc Thọ",
    "Quoc Oai": "Quốc Oai",
    "Soc Son": "Sóc Sơn",
    "Son Tay": "Sơn Tây",
    "Tay Ho": "Tây Hồ",
    "Thach That": "Thạch Thất",
    "Thanh Oai": "Thanh Oai",
    "Thanh Tri": "Thanh Trì",
    "Thanh Xuan": "Thanh Xuân",
    "Thuong Tin": "Thường Tín",
    "Tu Liem": "Từ Liêm",
    "Ung Hoa": "Ứng Hòa"
}

# Combine Bắc Từ Liêm and Nam Từ Liêm into Từ Liêm
df["district"] = df["district"].replace({
    "Bắc Từ Liêm": "Từ Liêm",
    "Nam Từ Liêm": "Từ Liêm"
})


# Filter and group
avg_price = df[df["district"].isin(hanoi_districts)] \
    .groupby("district")["price"] \
    .mean().reset_index()
avg_price.columns = ["district", "avg_price"]

# --- Step 2: Load GeoJSON and extract Hà Nội districts ---
with open("diaphanhuyen.geojson", "r", encoding="utf-8") as f:
    geojson_data = json.load(f)

# Filter to Hà Nội
hanoi_features = [f for f in geojson_data["features"] if f["properties"].get("Ten_Tinh") == "Hà Nội"]
hanoi_geojson = {
    "type": "FeatureCollection",
    "features": hanoi_features
}

# Rename GeoJSON key to match district names
for f in hanoi_geojson["features"]:
    eng_name = f["properties"]["Ten_Huyen"].title()
    f["id"] = geojson_name_map.get(eng_name)

# --- Step 3: Plot Choropleth ---
fig = px.choropleth_mapbox(
    avg_price,
    geojson=hanoi_geojson,
    locations="district",
    color="avg_price",
    featureidkey="id",
    mapbox_style="open-street-map",
    zoom=9,
    center={"lat": 21.0285, "lon": 105.8542},
    color_continuous_scale="YlOrRd",
    title="Average Real Estate Price by District (tỷ VND)",
    labels={"avg_price": "Avg Price (tỷ VND)"},
    height=600
)


import plotly.graph_objects as go

# Step 1: Prepare centroid lists
lat_list = []
lon_list = []
text_list = []

for feature in hanoi_geojson["features"]:
    district_vn = feature["id"]
    centroid_lon, centroid_lat = get_centroid(feature)
    if centroid_lon is not None and centroid_lat is not None:
        lat_list.append(centroid_lat)
        lon_list.append(centroid_lon)
        text_list.append(district_vn)

# Step 2: Create the label layer
text_layer = go.Scattermapbox(
    lat=lat_list,
    lon=lon_list,
    mode="text",
    text=text_list,
    textfont=dict(size=11, color="black"),
    hoverinfo="skip"
)

# Step 3: Add to the existing choropleth
fig.add_trace(text_layer)
# fig.show()


fig.update_layout(margin={"r":0,"t":40,"l":0,"b":0})
fig.show()

NameError: name 'df' is not defined