In [139]:
import pandas as pd
import pyarrow
from dotenv import load_dotenv
import os
import re
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.preprocessing import OneHotEncoder

load_dotenv()

file_path = '../Desktop/Matrix/DATA/'

df_sample = pd.read_parquet(file_path + "detailed_woning_type_sample.parquet")
df = pd.read_csv(file_path + "bag_image_summary.csv", dtype="string")
df_joined = pd.merge(df_sample, df, how="left", right_on="bag_id", left_on="bag_nummeraanduidingid")
df_sample_with_urls = df_joined[df_joined["frontview_exists"].notna()]

# If you want to add the file path to the URLs, set this to True
add_file_path_to_urls = False

# Currently a funda sourced Url goes from: 
# frontview/0797/2000/0002/3888/0797200000023888.jpg
# to: img_dataset/07/079720000002-funda.jpg
def extract_path(url, source):
    if pd.isna(url) or url == '' or url is None:
        return ''
    id = url.rstrip('/').split('/')[-1]
    id, *_ = id.split('.')
    m = re.match(r'(\d{2})', id)
    first_two_digits = m.group(1) if m else ''
    return f"img_dataset/{first_two_digits}/{id}-{source}.jpg"

link_cols = ['frontview_funda_url', 'frontview_google_url', 'frontview_funda_in_business_url']
link_sources = ['funda', 'google', 'funda-in-business'] # Sources are in file name, so need to be added to filename for correct name

for col, source in zip(link_cols, link_sources):
    df_sample_with_urls[f'{col}_split'] = df_sample_with_urls[col].map(lambda url: extract_path(url, source))

# If you want to add the file path to the URLs, set add_file_path_to_urls to True
if add_file_path_to_urls:
    df_sample_with_urls[[f'{col}_split' for col in link_cols]] = df_sample_with_urls[[f'{col}_split' for col in link_cols]].map(lambda x: file_path + x if x else '')
    add_file_path_to_urls = False

df_sample_with_urls.to_csv(
    file_path + "Full_preprocessed_detailed_house.csv",
    index=False,
    encoding='utf-8',
)

df = pd.read_csv(file_path + "Full_preprocessed_detailed_house.csv", dtype="string")

# Verschillend is a special case, so we remove it from the dataset
df = df[df['build_type'] != 'Verschillend']

def pick_first_url(row):
    for col in [f"{c}_split" for c in link_cols]:
        val = row[col]
        if pd.notna(val) and val != '':
            return val
    return ''

df['frontview_url'] = df.apply(pick_first_url, axis=1)
df = df[df['frontview_url'] != '']

# Ensure 'opp_pand' and 'oppervlakte' are numeric before division
df['procent_ingenomen'] = pd.to_numeric(df['opp_pand'], errors='coerce') / pd.to_numeric(df['oppervlakte'], errors='coerce')


df['huisnr_bag_letter'] = df['huisnr_bag_letter'].notna().astype(int)
df['huisnr_bag_toevoeging'] = df['huisnr_bag_toevoeging'].notna().astype(int)

df['is_monument'] = df['is_monument'].fillna(0).astype(int)
df['is_protected'] = df['is_protected'].fillna(0).astype(int)


df = df.drop(columns=['bag_nummeraanduidingid', 'frontview_exists', 'random_rank', 'num_funda_images',
                      'frontview_funda_url', 'frontview_google_url', 'frontview_funda_in_business_url', 
                      'frontview_funda_url_split', 'frontview_google_url_split', 'frontview_funda_in_business_url_split',
                      'special_house_type', 'source_data_result_id',
                      'straatnaam', 'postcode', 'plaatsnaam', 'source_data_timestamp', 'frontview_url', 'bag_id'
                      ])

# CURRENTLY DELETING GEOMETRY COLUMN, CHANGE IF NEEDED
df = df.drop(columns=['geometry'])

# Oversample corriderflat door random te dupliceren
df_majority = df[df['woningtype'] != 'Corridorflat']  
df_minority = df[df['woningtype'] == 'Corridorflat'] 

df_minority_oversampled = resample(
    df_minority,
    replace=True,                
    n_samples=500,               
    random_state=42            
)

# Voeg ze weer samen en shuffle, change random state for different shuffle
df_balanced = pd.concat([df_majority, df_minority_oversampled])
df = df_balanced.sample(frac=1, random_state=42).reset_index(drop=True)

# Full preprocessed dataset with URLS, can be loaded into pipeline.
df.to_csv(
    file_path + "Full_preprocessed_detailed_house.csv",
    index=False,
    encoding='utf-8',
)

In [140]:
# Normalizing and feature engineering are applied to the dataset separately for training, validation, and testing.
# This is to ensure that the model does not learn from the validation and test sets during training.

# adjust random_state for reproducibility
train_df, temp_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Check if standardization is needed for huisnr and pocent_ingenomen, large numbers can be encoded as categorical?
for dataframe in [train_df, val_df, test_df]:
    for col in ['opp_pand', 'oppervlakte', 'build_year', 'huisnr']:
        scaler = StandardScaler()
        dataframe[col] = scaler.fit_transform(dataframe[[col]])

    if dataframe is train_df:
        build_type_train = train_df[['build_type']]
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoder.fit(build_type_train)
        raw_feature_names = encoder.get_feature_names_out(['build_type'])
        clean_feature_names = [name.replace(' ', '_') for name in raw_feature_names]

    build_type = dataframe[['build_type']]
    encoded_array = encoder.transform(build_type)
    encoded_df = pd.DataFrame(encoded_array, columns=clean_feature_names, index=dataframe.index)
    dataframe.drop('build_type', axis=1, inplace=True)
    dataframe[encoded_df.columns] = encoded_df

In [126]:
features = [
    'huisnr',                                   # 0 - inf
    'huisnr_bag_letter',                        # 0 - 1
    'huisnr_bag_toevoeging',                    # 0 - 1
    'opp_pand',                                 # StandardScaler
    'oppervlakte',                              # StandardScaler
    'build_year',                               # StandardScaler
    'build_type_Appartement',                   # OneHotEncoder
    'build_type_Hoekwoning',                    # OneHotEncoder
    'build_type_Tussen_of_geschakelde_woning',   # OneHotEncoder
    'build_type_Tweeonder1kap',                 # OneHotEncoder
    'build_type_Verschillend',                  # OneHotEncoder
    'is_monument',                              # 0 - 1
    'is_protected',                             # 0 - 1
    'procent_ingenomen'                         # 0 - 1
]

target = 'woningtype'

In [127]:
X_train = train_df.drop(columns=["woningtype"])
print(X_train.dtypes)


huisnr                                     float64
huisnr_bag_letter                            int64
huisnr_bag_toevoeging                        int64
opp_pand                                   float64
oppervlakte                                float64
build_year                                 float64
is_monument                                  int64
is_protected                                 int64
bag_id                                       Int64
procent_ingenomen                          float64
build_type_Appartement                     float64
build_type_Hoekwoning                      float64
build_type_Tussen_of_geschakelde_woning    float64
build_type_Tweeonder1kap                   float64
build_type_Vrijstaande_woning              float64
dtype: object


In [141]:
#classical naive bayas probability model for baseline

from sklearn.naive_bayes import GaussianNB, CategoricalNB
# Suppose 'col_name' is your column with number-strings



X_train = train_df.drop(columns=["woningtype"])
y_train = train_df["woningtype"]

X_val = val_df.drop(columns=["woningtype"])
y_val = val_df["woningtype"]

X_test = test_df.drop(columns=["woningtype"])
y_test = test_df["woningtype"]

#wow moeilijke code (niet)
#guassian NB ipv categorical omdat onze data niet alleen categorical is (WOW)
NB = GaussianNB()
NB.fit(X_train, y_train)

In [142]:
#discriminitive classical model (probabky xgboost or random forest)
#random forest is denk beter als deze modellen alleen als baseline worden gebruikt
#xg is meer accurate tho
#heb de categorical features niet geencode want dacht dat koen dat al gedaan had? anders ga ik er nog achteraan

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb

X_train = train_df.drop(columns=["woningtype"])
y_train = train_df["woningtype"]

X_val = val_df.drop(columns=["woningtype"])
y_val = val_df["woningtype"]

X_test = test_df.drop(columns=["woningtype"])
y_test = test_df["woningtype"]

#random forest is minder accuraat maar meer interpeteerbaar en minder complex
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

#accurater maar moet ff meer hyperparamter tuningg doen





In [143]:

from sklearn.metrics import (
        accuracy_score, precision_score, recall_score, f1_score,
        log_loss, cohen_kappa_score, 
    )

y_pred = NB.predict(X_val)
y_proba = NB.predict_proba(X_val)
    
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='macro', zero_division=0)
recall = recall_score(y_val, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_val, y_pred, average='macro', zero_division=0)
kappa = cohen_kappa_score(y_val, y_pred)
logloss = log_loss(y_val, y_proba)
    
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1 Score (macro): {f1:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Log Loss: {logloss:.4f}")

Accuracy: 0.2556
Precision (macro): 0.2647
Recall (macro): 0.2515
F1 Score (macro): 0.1534
Cohen's Kappa: 0.2019
Log Loss: 11.4448


In [148]:
y_pred = rf.predict(X_val)
y_proba = rf.predict_proba(X_val)
    
accuracy = accuracy_score(y_val, y_pred)
precision = precision_score(y_val, y_pred, average='macro', zero_division=0)
recall = recall_score(y_val, y_pred, average='macro', zero_division=0)
f1 = f1_score(y_val, y_pred, average='macro', zero_division=0)
kappa = cohen_kappa_score(y_val, y_pred)
logloss = log_loss(y_val, y_proba)
    
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision (macro): {precision:.4f}")
print(f"Recall (macro): {recall:.4f}")
print(f"F1 Score (macro): {f1:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Log Loss: {logloss:.4f}")

Accuracy: 0.4603
Precision (macro): 0.4701
Recall (macro): 0.4604
F1 Score (macro): 0.4531
Cohen's Kappa: 0.4218
Log Loss: 2.0873
