In [1]:
import pandas as pd
import pyarrow
from dotenv import load_dotenv
import os

# Pull settings from a local .env file (if any) into the process environment
# so the data directory can be configured outside the notebook.
load_dotenv()

file_path = os.getenv('FILE_PATH')
if file_path is None:
    # Fail early with an actionable message instead of the opaque
    # "NoneType + str" TypeError the path construction below would raise.
    raise RuntimeError(
        "FILE_PATH is not set; define it in the environment or in a .env file"
    )

# Base names of the available sample datasets. Only
# 'detailed_woning_type_sample' is loaded in this cell.
filenames = [
    'bag_ids_no_funda',
    'random_online_sample',
    'special_house_types_class_sample',
    'detailed_woning_type_sample',
    'not_online_listings', # Download from slack
]

# os.path.join yields the same path as plain concatenation when FILE_PATH ends
# with a separator, and also works when the trailing separator is missing.
df_sample = pd.read_parquet(os.path.join(file_path, "detailed_woning_type_sample.parquet"))

# Read the image summary once, with every column as text so identifiers are
# not parsed as numbers.
df = pd.read_csv(os.path.join(file_path, "bag_image_summary.csv"), dtype="string")

# Left join: keep every sample row and attach image-summary columns where the
# ids match.
# NOTE(review): assumes 'bag_nummeraanduidingid' in the parquet file is
# string-typed like 'bag_id' - confirm.
df_joined = pd.merge(df_sample, df, how="left", right_on="bag_id", left_on="bag_nummeraanduidingid")

# Keep only rows with a non-null 'frontview_exists'; sample rows without a
# match in the image summary are null there and are dropped.
df_sample_with_urls = df_joined[df_joined["frontview_exists"].notna()]

print(df_sample_with_urls.columns)

Index(['bag_nummeraanduidingid', 'source_data_result_id', 'special_house_type',
       'woningtype', 'straatnaam', 'postcode', 'huisnr', 'huisnr_bag_letter',
       'huisnr_bag_toevoeging', 'plaatsnaam', 'opp_pand', 'oppervlakte',
       'build_year', 'build_type', 'is_monument', 'is_protected',
       'source_data_timestamp', 'geometry', 'random_rank', 'bag_id',
       'num_funda_images', 'frontview_exists', 'frontview_funda_url',
       'frontview_google_url', 'frontview_funda_in_business_url'],
      dtype='object')


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Frequency table per categorical column. dropna=False keeps missing values
# as their own bucket instead of discarding them.
special_house_type_counts, woningtype_counts, build_type_counts = (
    df_sample_with_urls[column].value_counts(dropna=False)
    for column in ('special_house_type', 'woningtype', 'build_type')
)

In [None]:
# Heading and table are passed as separate arguments joined by a newline.
# Embedding "\n" in the heading with the default sep=" " would indent the
# first row of the table by one space.
print("Special House Type Counts:", special_house_type_counts, sep="\n")

In [None]:
# Heading and table are passed as separate arguments joined by a newline.
# Embedding "\n" in the heading with the default sep=" " would indent the
# first row of the table by one space.
print("Woningtype Counts:", woningtype_counts, sep="\n")

In [None]:
# Heading and table are passed as separate arguments joined by a newline.
# Embedding "\n" in the heading with the default sep=" " would indent the
# first row of the table by one space.
print("Build Type Counts:", build_type_counts, sep="\n")

In [None]:
def _plot_count_bars(counts, title, ylabel, top_n=None):
    """Draw a horizontal bar chart from a value-counts Series.

    Parameters
    ----------
    counts : pandas.Series
        Category -> frequency, in the order the bars should appear.
    title : str
        Figure title.
    ylabel : str
        Label for the category axis.
    top_n : int or None
        If given, plot only the first ``top_n`` entries of ``counts``.
    """
    if top_n is not None:
        counts = counts.head(top_n)

    # The counts are built with dropna=False, so the index can hold a null
    # entry. seaborn omits null categories, which would hide the missing-value
    # bucket from the chart; give it an explicit text label instead.
    labels = ["(missing)" if pd.isna(value) else str(value) for value in counts.index]

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x=counts.to_numpy(), y=labels, ax=ax)
    ax.set_title(title)
    ax.set_xlabel("Count")
    ax.set_ylabel(ylabel)
    fig.tight_layout()
    plt.show()


# Bar chart for special_house_type (top 15 only, for readability)
_plot_count_bars(
    special_house_type_counts,
    "Top 15 Special House Types",
    "Special House Type",
    top_n=15,
)

# Bar chart for woningtype
_plot_count_bars(woningtype_counts, "Woningtype Counts", "Woningtype")

# Bar chart for build_type
_plot_count_bars(build_type_counts, "Build Type Counts", "Build Type")

In [None]:
def _plot_stacked_counts(counts, title, xlabel, legend_title):
    """Draw a stacked bar chart from a two-way count table.

    Parameters
    ----------
    counts : pandas.DataFrame
        One bar per index entry, one stacked segment per column.
    title : str
        Figure title.
    xlabel : str
        Label for the x axis (the table's index dimension).
    legend_title : str
        Legend heading (the table's column dimension).
    """
    ax = counts.plot(kind='bar', stacked=True, figsize=(12, 7))
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("Count")
    # Anchor the legend outside the axes so it does not cover the bars.
    ax.legend(title=legend_title, bbox_to_anchor=(1.05, 1), loc='upper left')
    ax.figure.tight_layout()
    plt.show()


# Two-way frequency table: rows = build_type, columns = woningtype.
# Combinations that never occur are filled with 0.
# NOTE(review): groupby drops rows with a missing key by default, whereas the
# single-column counts earlier use dropna=False - confirm this is intended.
build_woning_counts = df_sample_with_urls.groupby(['build_type', 'woningtype']).size().unstack(fill_value=0)

# The same table with the axes swapped (rows = woningtype, columns =
# build_type). Transposing avoids running a second, identical groupby.
woning_build_counts = build_woning_counts.T

# Plot stacked bar chart (build_type as index)
_plot_stacked_counts(
    build_woning_counts,
    "Woningtype Counts by Build Type",
    "Build Type",
    "Woningtype",
)

# Plot stacked bar chart (woningtype as index)
_plot_stacked_counts(
    woning_build_counts,
    "Build Type Counts by Woningtype",
    "Woningtype",
    "Build Type",
)

In [None]:
# Numeric representation.
# Heading and table are passed as separate arguments joined by a newline.
# Embedding "\n" in the heading with the default sep=" " would shift the
# table's header row by one space relative to its columns.
print("build_type subdivided into woningtype (build_type x woningtype):", build_woning_counts, sep="\n")

In [None]:
# Heading and table are passed as separate arguments joined by a newline.
# Embedding "\n" in the heading with the default sep=" " would shift the
# table's header row by one space relative to its columns.
print("woningtype subdivided into buildtype (woningtype x build_type):", woning_build_counts, sep="\n")