In [None]:
import ast
import json
import os
import threading
import traceback
import warnings
from datetime import datetime

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch as torch
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from srai.neighbourhoods import H3Neighbourhood
from tqdm import tqdm

This code is a sped-up version, written with Claude 3.5 Sonnet, which combines POI, road network, GTFS, aerial and (at resolution 9) street-view embeddings to create a new embedding for each region. The new embedding is a weighted average of the region's own embedding and the mean embedding of each of its k surrounding H3 rings; the ring weights can be exponential, logarithmic, linear or flat. The computation uses PyTorch tensors (as written they stay on the CPU; nothing is moved to a GPU) and the H3 hexagonal grid to find the neighbours of each region.

In [None]:
class EmbeddingProcessor:
    """Load per-region embeddings and blend each region with its H3 ring neighbours.

    Embedding tables are read from CSV files in the working directory and aligned
    to the index of the buffered region set. For every region of interest the
    processor then builds a weighted average of the region's own vector and the
    mean vector of each of its k surrounding rings.

    Args:
        resolution: H3 resolution used in all input file names; street-view
            embeddings are only loaded when it equals 9.
        poi_embedding: Name fragment of the POI embedding file to load.
        use_finetuned_aerial: Load the ``_finetune`` aerial embedding file.
        use_finetuned_streetview: Load the ``_finetune`` street-view file.
        image_pca_dim: If not None, image embeddings are reduced to this many
            PCA components when they are loaded.
    """

    # Fixed seed for PCA so that reruns give identical embeddings even when
    # scikit-learn selects its randomized SVD solver.
    PCA_RANDOM_STATE = 42

    def __init__(self, resolution, poi_embedding='geovex', use_finetuned_aerial=False, use_finetuned_streetview=False, image_pca_dim=None):
        self.resolution = resolution
        self.embeddings = {}
        self.regions_buffered_gdf = None
        self.regions_gdf = None
        # Position of each region inside regions_gdf (set by prepare_data).
        # These positions do NOT index the embedding tensor, whose rows follow
        # regions_buffered_gdf; use _buffered_rows() for that.
        self.region_id_to_index = None
        self.neighborhood = None
        self.all_neighbors = None
        self.poi_embedding = poi_embedding
        self.use_finetuned_aerial = use_finetuned_aerial
        self.use_finetuned_streetview = use_finetuned_streetview
        self.image_pca_dim = image_pca_dim
        # (index object, {region_id: row}) cache, built lazily by _buffered_rows().
        self._buffered_row_cache = None

    def load_data(self):
        """Read all embedding CSVs and both region GeoJSON files, then align them.

        Duplicate region ids are dropped (first occurrence kept) and every
        embedding table is reindexed to the buffered regions, filling regions
        without an embedding with zeros.
        """
        print(f"Loading data for resolution {self.resolution}...")

        # Load POI embeddings
        poi_file = f'embeddings_POI_{self.poi_embedding}_{self.resolution}.csv'
        self.embeddings['POI'] = pd.read_csv(poi_file, index_col='region_id')

        # Common embeddings for both resolutions
        common_embeddings = ['roadnetwork', 'GTFS']
        for emb in common_embeddings:
            self.embeddings[emb] = pd.read_csv(f'embeddings_{emb}_{self.resolution}.csv', index_col='region_id')

        # Load and reduce aerial embeddings
        aerial_file = f'embeddings_aerial_{self.resolution}'
        if self.use_finetuned_aerial:
            aerial_file += '_finetune'
        aerial_file += '.csv'
        self.embeddings['aerial'] = self._load_and_reduce_image_embedding(aerial_file)

        # Resolution-specific embeddings (using ConvNext & only mean pooling of panoid per region_id)
        if self.resolution == 9:
            streetview_suffix = '_finetune' if self.use_finetuned_streetview else ''
            #self.embeddings['streetview_max'] = self._load_and_reduce_image_embedding(f'embeddings_streetview_max_{self.resolution}{streetview_suffix}.csv')
            self.embeddings['streetview_mean'] = self._load_and_reduce_image_embedding(f'embeddings_streetview_mean_{self.resolution}{streetview_suffix}.csv')

        self.regions_buffered_gdf = gpd.read_file(f'selected_regions_buffered_{self.resolution}.geojson').set_index('region_id')
        self.regions_gdf = gpd.read_file(f'selected_regions_{self.resolution}.geojson').set_index('region_id')

        print(f"Original shapes: regions_gdf: {self.regions_gdf.shape}, regions_buffered_gdf: {self.regions_buffered_gdf.shape}")

        # Handle duplicate indices
        self.regions_buffered_gdf = self.regions_buffered_gdf[~self.regions_buffered_gdf.index.duplicated(keep='first')]
        self.regions_gdf = self.regions_gdf[~self.regions_gdf.index.duplicated(keep='first')]

        # Ensure all embeddings have representations for every region in the buffered area
        for key in self.embeddings:
            # Remove duplicate indices in embeddings
            self.embeddings[key] = self.embeddings[key][~self.embeddings[key].index.duplicated(keep='first')]
            # Reindex with the unique indices from regions_buffered_gdf
            self.embeddings[key] = self.embeddings[key].reindex(self.regions_buffered_gdf.index, fill_value=0)

        # The buffered index changed, so any cached row lookup is stale.
        self._buffered_row_cache = None

        print("Data loaded and aligned successfully.")
        print(f"Final shapes: regions_gdf: {self.regions_gdf.shape}, regions_buffered_gdf: {self.regions_buffered_gdf.shape}")
        for key, embedding in self.embeddings.items():
            print(f"Embedding {key} shape: {embedding.shape}")

    def _load_and_reduce_image_embedding(self, file_path):
        """Read an image-embedding CSV and optionally reduce it with PCA.

        Args:
            file_path: CSV whose first column holds the region ids.

        Returns:
            DataFrame indexed by ``region_id``; it has ``image_pca_dim`` columns
            when that option is set, otherwise the columns of the file.
        """
        # Read the CSV file, using the first column as the index
        df = pd.read_csv(file_path, index_col=0)

        # Rename the index to 'region_id'
        df.index.name = 'region_id'

        if self.image_pca_dim is not None:
            pca = PCA(n_components=self.image_pca_dim, random_state=self.PCA_RANDOM_STATE)
            reduced_data = pca.fit_transform(df)
            return pd.DataFrame(reduced_data, index=df.index)
        return df

    def prepare_data(self):
        """Build the regions_gdf position lookup and the H3 neighbourhood helper."""
        print("Preparing data...")
        self.region_id_to_index = {region_id: idx for idx, region_id in enumerate(self.regions_gdf.index)}
        self.neighborhood = H3Neighbourhood(self.regions_buffered_gdf)
        print("Data prepared.")

    def get_all_neighbors(self, k):
        """Cache, for every region in regions_gdf, its neighbour ids for rings 1..k.

        The result is stored in ``self.all_neighbors`` as
        ``{region_id: [ring_1_ids, ..., ring_k_ids]}``.
        """
        print(f"Calculating neighbors for k={k}...")
        self.all_neighbors = {}
        for region_id in self.regions_gdf.index:
            self.all_neighbors[region_id] = [list(self.neighborhood.get_neighbours_at_distance(str(region_id), i)) for i in range(1, k+1)]
        print("Neighbors calculated.")

    def _buffered_rows(self):
        """Return ``{region_id: row}`` for the buffered regions, rebuilding if the index changed."""
        index = self.regions_buffered_gdf.index
        cached = getattr(self, '_buffered_row_cache', None)
        if cached is None or cached[0] is not index:
            cached = (index, {region_id: row for row, region_id in enumerate(index)})
            self._buffered_row_cache = cached
        return cached[1]

    def get_embeddings_for_region(self, region_id, k, embeddings_tensor):
        """Collect the region's own vector plus one mean vector per ring.

        Args:
            region_id: Region to process; must be a key of ``self.all_neighbors``.
            k: Number of rings to include.
            embeddings_tensor: 2-D tensor whose rows follow
                ``regions_buffered_gdf.index``.

        Returns:
            Tensor with ``k + 1`` rows: row 0 is the region itself and row i is
            the mean of the ring-i neighbours found in the buffered regions
            (zeros when none of them is present).

        Raises:
            KeyError: If ``region_id`` is not part of the buffered regions.
        """
        row_of = self._buffered_rows()
        if region_id not in row_of:
            raise KeyError(f"Region {region_id!r} is not present in the buffered regions, so it has no embedding row.")

        # The tensor rows follow the buffered index, so the centre region must
        # be looked up there as well (not by its position in regions_gdf).
        region_embeddings = [embeddings_tensor[row_of[region_id]]]

        for i in range(1, k + 1):
            neighbor_ids = self.all_neighbors[region_id][i-1]
            neighbor_rows = [row_of[neighbor_id] for neighbor_id in neighbor_ids if neighbor_id in row_of]
            if neighbor_rows:
                ring_embedding = embeddings_tensor[neighbor_rows].mean(dim=0)
            else:
                # No known neighbour in this ring: contribute a zero vector that
                # matches the dtype and device of the input tensor.
                ring_embedding = embeddings_tensor.new_zeros(embeddings_tensor.shape[1])
            region_embeddings.append(ring_embedding)

        return torch.stack(region_embeddings)

    @staticmethod
    def calculate_weighted_average(embeddings, weight_type):
        """Average the stacked ring vectors with weights that favour ring 0.

        Args:
            embeddings: 2-D tensor; row 0 is the region itself, row i is ring i.
            weight_type: ``'exponential_e'`` (e**-i), ``'logarithm'``
                (1 / log2(i + 2)), ``'linear'`` (evenly spaced from 1 down to 0,
                so the outermost ring gets weight 0 when there are several rows)
                or ``'flat'`` (all ones).

        Returns:
            1-D tensor: the weighted sum of the rows divided by the weight sum.

        Raises:
            ValueError: If ``weight_type`` is not one of the supported names.
        """
        num_embeddings = len(embeddings)

        if weight_type == 'exponential_e':
            weights = torch.exp(torch.arange(num_embeddings, dtype=torch.float32) * -1)
        elif weight_type == 'logarithm':
            weights = 1 / torch.log2(torch.arange(num_embeddings, dtype=torch.float32) + 2)
        elif weight_type == 'linear':
            weights = torch.linspace(1, 0, num_embeddings)
        elif weight_type == 'flat':
            weights = torch.ones(num_embeddings, dtype=torch.float32)
        else:
            raise ValueError(f"Unsupported weight_type: {weight_type}")

        # Every scheme above already yields non-increasing weights, so index 0
        # (the region itself) carries the largest weight. They must not be
        # reversed: that would give ring 0 the smallest weight (zero for 'linear').

        # Calculate the weighted sum of embeddings
        weighted_sum = torch.sum(embeddings * weights.unsqueeze(1), dim=0)

        # Return the normalized weighted sum
        return weighted_sum / weights.sum()

    def process_embeddings(self, k=15, weight_type='exponential_e', data_source='all'):
        """Compute the neighbourhood-weighted embedding of every region in regions_gdf.

        Args:
            k: Number of H3 rings to blend in.
            weight_type: Weighting scheme, see ``calculate_weighted_average``.
            data_source: ``'all'`` to concatenate every loaded embedding
                column-wise, or the key of a single loaded embedding.

        Returns:
            DataFrame indexed like ``regions_gdf`` with one row per region.

        Raises:
            ValueError: If ``data_source`` is neither ``'all'`` nor a loaded key.
        """
        # Recompute the neighbour cache when it is missing, empty, or was built
        # for a different k.
        cached_rings = None if self.all_neighbors is None else next(iter(self.all_neighbors.values()), None)
        if cached_rings is None or len(cached_rings) != k:
            self.get_all_neighbors(k)

        if data_source == 'all':
            embeddings_concatenated = pd.concat(list(self.embeddings.values()), axis=1)
        elif data_source in self.embeddings:
            embeddings_concatenated = self.embeddings[data_source]
        else:
            raise ValueError(f"Unsupported data_source: {data_source}")

        embeddings_tensor = torch.tensor(embeddings_concatenated.values, dtype=torch.float32)

        all_region_embeddings = []

        for region_id in self.regions_gdf.index:
            region_embeddings = self.get_embeddings_for_region(region_id, k, embeddings_tensor)
            weighted_embedding = self.calculate_weighted_average(region_embeddings, weight_type)
            all_region_embeddings.append(weighted_embedding.numpy())

        output_embeddings_df = pd.DataFrame(all_region_embeddings, index=self.regions_gdf.index)
        return output_embeddings_df

In [None]:
class EmbeddingExperiment:
    """Run, score, persist and plot a grid of neighbourhood-embedding experiments.

    Constructing an instance loads all data through an ``EmbeddingProcessor``
    and creates a new run directory under ``experiments/``.

    Args:
        resolution: H3 resolution passed to the processor and used in the
            run-directory name.
        poi_embedding, use_finetuned_aerial, use_finetuned_streetview,
        image_pca_dim: Forwarded unchanged to ``EmbeddingProcessor``.
    """

    # Fixed seed for the evaluation PCA so repeated runs give identical scores
    # even when scikit-learn selects its randomized SVD solver.
    PCA_RANDOM_STATE = 42

    def __init__(self, resolution, poi_embedding='geovex', use_finetuned_aerial=False, use_finetuned_streetview=False, image_pca_dim=None):
        self.resolution = resolution
        self.processor = EmbeddingProcessor(resolution,
                                            poi_embedding=poi_embedding,
                                            use_finetuned_aerial=use_finetuned_aerial,
                                            use_finetuned_streetview=use_finetuned_streetview,
                                            image_pca_dim=image_pca_dim)
        self.processor.load_data()
        self.processor.prepare_data()
        self.experiment_dir = self.create_experiment_directory()

        # Target columns read from regions_gdf, with display names and plot colours.
        self.target_columns = ['afw', 'vrz', 'fys', 'soc', 'onv', 'won']
        self.target_names = {
            'afw': 'Liveability',
            'vrz': 'Amenities',
            'fys': 'Physical Environment',
            'soc': 'Social Cohesion',
            'onv': 'Safety',
            'won': 'Housing Stock'
        }
        self.colors = {
            'afw': '#808080',  # Dark Grey for Liveability
            'vrz': '#FF4500',  # Orange Red for Amenities
            'fys': '#32CD32',  # Lime Green for Physical Environment
            'soc': '#8A2BE2',  # Blue Violet for Social Cohesion
            'onv': '#1E90FF',  # Dodger Blue for Safety
            'won': '#FFA500'   # Orange for Housing Stock
        }
        self.markers = {
            'exponential_e': 'o',
            'linear': 's',
            'flat': '^',
            'logarithm': 'D'
        }

    def create_experiment_directory(self):
        """Create and return the first unused ``experiments/<date>_runNN_res<R>`` directory."""
        date_str = datetime.now().strftime("%Y%m%d")
        base_dir = "experiments"
        os.makedirs(base_dir, exist_ok=True)
        run_number = 1
        while True:
            dir_name = f"{date_str}_run{run_number:02d}_res{self.resolution}"
            full_path = os.path.join(base_dir, dir_name)
            if not os.path.exists(full_path):
                os.makedirs(full_path)
                return full_path
            run_number += 1

    def run_experiment(self, k_values, weight_types, data_sources):
        """Run every (k, weight_type, data_source) combination and save the outcome.

        A combination that raises is reported with its traceback and skipped, so
        it is absent from both returned dictionaries.

        Returns:
            ``(all_results, all_embeddings)``, both keyed by
            ``(k, weight_type, data_source)``.
        """
        params_list = [(k, wt, ds) for k in k_values for wt in weight_types for ds in data_sources]
        total_experiments = len(params_list)

        print(f"Starting experiments with {total_experiments} parameter combinations...")

        all_embeddings = {}
        all_results = {}

        # Silence warnings only for the duration of the runs. catch_warnings
        # restores the previous filters on exit, whereas resetwarnings() would
        # also discard filters that were configured before this call.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")

            # Use a single progress bar for all experiments
            with tqdm(total=total_experiments, desc="Experiments Progress", ncols=100) as pbar:
                for params in params_list:
                    k, weight_type, data_source = params

                    try:
                        result = self.run_single_experiment(params)
                        all_embeddings[params] = result[1]
                        all_results[params] = result[2]
                    except Exception as exc:
                        # Best effort: one failing combination must not abort the grid.
                        print(f"\nExperiment {params} generated an exception: {exc}")
                        traceback.print_exc()

                    pbar.update(1)
                    pbar.set_description(f"Completed: k={k}, weight={weight_type}, source={data_source}")

        self.save_results(all_results, all_embeddings)
        return all_results, all_embeddings

    def run_single_experiment(self, params):
        """Process and score one ``(k, weight_type, data_source)`` combination.

        Returns:
            ``(params, embeddings DataFrame, {target column: R²})``.
        """
        k, weight_type, data_source = params
        embeddings = self.processor.process_embeddings(k=k, weight_type=weight_type, data_source=data_source)
        r2_scores = self.evaluate_embeddings(embeddings)
        return (k, weight_type, data_source), embeddings, r2_scores

    def evaluate_embeddings(self, embeddings):
        """Score embeddings by linear regression on each target column.

        The embeddings are first projected with PCA onto as many components as
        the narrowest loaded data source has columns, so every data source is
        compared at the same dimensionality. Rows are matched to
        ``regions_gdf`` by position.

        Returns:
            ``{target column: R² on a 30 % hold-out split}``; NaN for a target
            with no valid rows.
        """
        r2_scores = {}

        # Determine the number of components to use (minimum dimensionality of all data sources)
        min_dim = min(emb.shape[1] for emb in self.processor.embeddings.values())
        # PCA cannot return more components than the data has rows or columns.
        n_components = min(min_dim, embeddings.shape[0], embeddings.shape[1])

        pca = PCA(n_components=n_components, random_state=self.PCA_RANDOM_STATE)
        X_pca = pca.fit_transform(embeddings)

        for column in self.target_columns:
            y = self.processor.regions_gdf[column]

            # Plain boolean array so it indexes the ndarray and the Series alike.
            mask = np.asarray(~(np.isnan(y) | np.isnan(X_pca).any(axis=1)))
            X_valid = X_pca[mask]
            y_valid = y[mask]

            if len(y_valid) == 0:
                print(f"Warning: No valid data for {column} after removing NaN values.")
                r2_scores[column] = np.nan
                continue

            X_train, X_test, y_train, y_test = train_test_split(X_valid, y_valid, test_size=0.3, random_state=42)
            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            r2_scores[column] = r2_score(y_test, y_pred)
        return r2_scores

    def save_results(self, results, all_embeddings):
        """Write ``results.json`` (tuple keys stringified) and one CSV per embedding."""
        with open(os.path.join(self.experiment_dir, 'results.json'), 'w') as f:
            json.dump({str(k): v for k, v in results.items()}, f)
        for key, embeddings in all_embeddings.items():
            filename = f"embeddings_k{key[0]}_{key[1]}_{key[2]}.csv"
            embeddings.to_csv(os.path.join(self.experiment_dir, filename))

    def load_results(self):
        """Read ``results.json`` back and turn its string keys into tuples.

        Raises:
            ValueError or SyntaxError: If a key is not a plain Python literal.
        """
        with open(os.path.join(self.experiment_dir, 'results.json'), 'r') as f:
            results = json.load(f)
        # literal_eval only accepts literals, so a tampered results file cannot
        # execute code the way eval() would.
        return {ast.literal_eval(k): v for k, v in results.items()}

    @staticmethod
    def _best_score(scores):
        """Return the largest non-NaN score, or NaN when there is none."""
        usable = [s for s in scores if s is not None and not np.isnan(s)]
        return max(usable) if usable else np.nan

    def plot_results(self, results):
        """Write all three summary figures into the experiment directory."""
        self.plot_k_rings_performance(results)
        self.plot_weight_type_performance(results)
        self.plot_data_source_performance(results)

    def plot_k_rings_performance(self, results):
        """Plot, per target and weight type, the best R² over data sources against k."""
        plt.figure(figsize=(16, 10))

        k_values = sorted(set(key[0] for key in results.keys()))
        weight_types = sorted(set(key[1] for key in results.keys()))
        data_sources = set(key[2] for key in results.keys())

        for target in self.target_columns:
            for weight_type in weight_types:
                # Missing combinations become NaN, which leaves a gap in the line.
                r2_scores = [
                    self._best_score(results.get((k, weight_type, ds), {}).get(target, np.nan)
                                     for ds in data_sources)
                    for k in k_values
                ]

                plt.plot(k_values, r2_scores,
                         label=f"{self.target_names[target]} ({weight_type})",
                         color=self.colors[target],
                         marker=self.markers[weight_type],
                         linestyle='-',
                         linewidth=2,
                         markersize=8)

        plt.xlabel('Number of k-rings', fontsize=12)
        plt.ylabel('Best R² Score', fontsize=12)
        plt.title(f'Best Performance vs Number of k-rings (Resolution {self.resolution})', fontsize=14)

        # Create a custom legend for target variables
        legend_elements = []
        for target in self.target_columns:
            legend_elements.append(plt.Line2D([0], [0], color=self.colors[target], lw=4, label=self.target_names[target]))
        target_legend = plt.legend(handles=legend_elements, loc='center left', bbox_to_anchor=(1, 0.5), fontsize=10)
        # A second plt.legend() call replaces the axes legend, so pin the first
        # one as a plain artist before creating the weight-type legend.
        plt.gca().add_artist(target_legend)

        # Add a secondary legend for weight types
        plt.legend([plt.Line2D([0], [0], marker=self.markers[wt], color='grey', linestyle='None', markersize=8)
                    for wt in weight_types],
                   weight_types,
                   loc='lower right',
                   title='Weight Types',
                   fontsize=8)

        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(self.experiment_dir, 'k_rings_performance.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def plot_weight_type_performance(self, results):
        """Bar chart of the best R² per target for every weight type."""
        plt.style.use('default')
        plt.figure(figsize=(16, 10), facecolor='white')
        plt.gca().set_facecolor('white')

        weight_types = sorted(set(key[1] for key in results.keys()))
        k_values = set(key[0] for key in results.keys())
        data_sources = set(key[2] for key in results.keys())
        x = np.arange(len(weight_types))
        width = 0.1

        for i, target in enumerate(self.target_columns):
            best_r2 = [self._best_score(results[k, wt, ds][target]
                                        for k in k_values
                                        for ds in data_sources
                                        if (k, wt, ds) in results)
                       for wt in weight_types]

            plt.bar(x + i*width, best_r2, width, label=self.target_names[target],
                    color=self.colors[target], edgecolor='none')

        plt.xlabel('Weighted Average Type', fontsize=12)
        plt.ylabel('Best R² Score', fontsize=12)
        plt.title(f'Best Performance vs Weighted Average Type (Resolution {self.resolution})', fontsize=14)
        plt.xticks(x + width * (len(self.target_columns) - 1) / 2, weight_types, rotation=45, ha='right')
        plt.legend(loc='upper right', fontsize=10)
        plt.grid(axis='y', linestyle=':', color='gray', alpha=0.3)
        plt.ylim(0, 1)  # Set y-axis limits from 0 to 1 for R² scores
        plt.tight_layout()
        plt.savefig(os.path.join(self.experiment_dir, 'weight_type_performance.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def plot_data_source_performance(self, results):
        """Bar chart of the best R² per target for every data source."""
        plt.style.use('default')
        plt.figure(figsize=(16, 10), facecolor='white')
        plt.gca().set_facecolor('white')

        data_sources = sorted(set(key[2] for key in results.keys()))
        k_values = set(key[0] for key in results.keys())
        weight_types = set(key[1] for key in results.keys())
        x = np.arange(len(data_sources))
        width = 0.1

        for i, target in enumerate(self.target_columns):
            best_r2 = [self._best_score(results[k, wt, ds][target]
                                        for k in k_values
                                        for wt in weight_types
                                        if (k, wt, ds) in results)
                       for ds in data_sources]

            plt.bar(x + i*width, best_r2, width, label=self.target_names[target],
                    color=self.colors[target], edgecolor='none')

        plt.xlabel('Data Source', fontsize=12)
        plt.ylabel('Best R² Score', fontsize=12)
        plt.title(f'Best Performance vs Data Source (Resolution {self.resolution})', fontsize=14)
        plt.xticks(x + width * (len(self.target_columns) - 1) / 2, data_sources, rotation=45, ha='right')
        plt.legend(loc='best', fontsize=10)
        plt.grid(axis='y', linestyle=':', color='gray', alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(self.experiment_dir, 'data_source_performance.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def save_top_bottom_embeddings(self, results, all_embeddings, top_n=3):
        """Save the embeddings of the best and worst configurations by mean R².

        At most ``top_n`` of each are written to ``best_worst_embeddings/``, and
        never more than half of the available configurations, so the two groups
        cannot overlap.
        """
        print("\nSaving top and bottom embeddings...")
        avg_scores = {key: np.mean(list(scores.values())) for key, scores in results.items()}
        sorted_configs = sorted(avg_scores.items(), key=lambda x: x[1], reverse=True)
        total_embeddings = len(sorted_configs)
        save_n = min(top_n, total_embeddings // 2)
        top_configs = sorted_configs[:save_n]
        # sorted_configs[-0:] would be the whole list, so handle save_n == 0 explicitly.
        bottom_configs = sorted_configs[-save_n:] if save_n > 0 else []
        best_worst_dir = os.path.join(self.experiment_dir, 'best_worst_embeddings')
        os.makedirs(best_worst_dir, exist_ok=True)
        for i, (config, score) in enumerate(top_configs, 1):
            filename = os.path.join(best_worst_dir, f'top_{i}_embedding_k{config[0]}_{config[1]}_{config[2]}.csv')
            all_embeddings[config].to_csv(filename)
            print(f"Saved top {i} embedding to {filename} (Avg R² = {score:.4f})")
        for i, (config, score) in enumerate(bottom_configs, 1):
            filename = os.path.join(best_worst_dir, f'bottom_{i}_embedding_k{config[0]}_{config[1]}_{config[2]}.csv')
            all_embeddings[config].to_csv(filename)
            print(f"Saved bottom {i} embedding to {filename} (Avg R² = {score:.4f})")
        print(f"Finished saving {save_n} top and {save_n} bottom embeddings.")

    def save_experiment_info(self, k_values, weight_types, data_sources):
        """Write the run configuration to ``experiment_info.txt`` as ``key: value`` lines."""
        info = {
            'Date': datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'Resolution': self.resolution,
            'POI embedding type': self.processor.poi_embedding,
            'Use finetuned aerial': self.processor.use_finetuned_aerial,
            'Use finetuned streetview': self.processor.use_finetuned_streetview,
            'Number of k-ring values': len(k_values),
            'Number of weight types': len(weight_types),
            'Number of data sources': len(data_sources),
            'k-ring values': k_values,
            'Weight types': weight_types,
            'Data sources': data_sources
        }
        with open(os.path.join(self.experiment_dir, 'experiment_info.txt'), 'w') as f:
            for key, value in info.items():
                f.write(f"{key}: {value}\n")

In [None]:
if __name__ == "__main__":
    # Configuration
    resolution = 9  # Supported values: 9 or 10
    poi_embedding = 'hex2vec'  # Choose between 'geovex' and 'hex2vec'
    use_finetuned_aerial = False  # Set to True to use finetuned aerial embeddings
    use_finetuned_streetview = False  # Set to True to use finetuned streetview embeddings (only for resolution 9)
    image_pca_dim = 100  # Set to None to use original dimensionality

    # Resolution-dependent experiment parameters. Street-view embeddings are
    # only loaded for resolution 9, so they are only a data source there.
    k_values_by_resolution = {
        9: [1, 2, 3, 4, 5],
        10: [1, 3, 5, 7, 10, 15],
    }
    data_sources_by_resolution = {
        9: ['all', 'POI', 'roadnetwork', 'GTFS', 'aerial', 'streetview_mean'],
        10: ['all', 'POI', 'roadnetwork', 'GTFS', 'aerial'],
    }

    # Reject an unsupported resolution before any data is loaded or any run
    # directory is created.
    if resolution not in k_values_by_resolution:
        raise ValueError(f"Unsupported resolution: {resolution}")

    k_values = k_values_by_resolution[resolution]
    data_sources = data_sources_by_resolution[resolution]
    weight_types = ['exponential_e', 'logarithm', 'linear', 'flat']

    experiment = EmbeddingExperiment(resolution,
                                     poi_embedding=poi_embedding,
                                     use_finetuned_aerial=use_finetuned_aerial,
                                     use_finetuned_streetview=use_finetuned_streetview,
                                     image_pca_dim=image_pca_dim)

    # Save experiment info
    experiment.save_experiment_info(k_values, weight_types, data_sources)

    print(f"Starting sequential experiments for resolution {resolution}...")

    # Run the sequential experiments
    results, all_embeddings = experiment.run_experiment(k_values, weight_types, data_sources)

    # run_experiment skips combinations that raise, so the result can be empty;
    # fail with a clear message instead of an opaque max() error further down.
    if not results:
        raise RuntimeError("No experiment completed successfully; see the tracebacks printed above.")

    print("All experiments completed. Processing results...")

    # Plot results
    experiment.plot_results(results)

    # Print detailed results (explicit UTF-8 because the text contains "R²")
    with open(os.path.join(experiment.experiment_dir, 'detailed_results.txt'), 'w', encoding='utf-8') as f:
        for key, scores in results.items():
            f.write(f"k={key[0]}, weight_type={key[1]}, data_source={key[2]}\n")
            for column, score in scores.items():
                f.write(f"  {experiment.target_names[column]}: R² = {score:.4f}\n")
            f.write("\n")

    # Get best performing configuration (highest mean R² over all targets)
    best_key = max(results, key=lambda k: np.mean(list(results[k].values())))
    best_score = np.mean(list(results[best_key].values()))

    with open(os.path.join(experiment.experiment_dir, 'best_configuration.txt'), 'w', encoding='utf-8') as f:
        f.write(f"Best configuration: k={best_key[0]}, weight_type={best_key[1]}, data_source={best_key[2]}\n")
        f.write(f"Average R² score: {best_score:.4f}\n")

    # Save best embeddings
    best_embeddings = all_embeddings[best_key]
    best_embeddings.to_csv(os.path.join(experiment.experiment_dir, 'best_embeddings.csv'))

    # Save top and bottom embeddings
    experiment.save_top_bottom_embeddings(results, all_embeddings, top_n=3)

    # Print dimensionality information
    with open(os.path.join(experiment.experiment_dir, 'dimensionality_info.txt'), 'w') as f:
        f.write(f"Final embedding dimensionality: {best_embeddings.shape[1]}\n")
        f.write("Original dimensionalities:\n")
        for key, embeddings in all_embeddings.items():
            f.write(f"  k={key[0]}, weight_type={key[1]}, data_source={key[2]}: {embeddings.shape[1]}\n")

    print(f"Experiment results saved in: {experiment.experiment_dir}")
    print("Experiment completed successfully.")

Recalculate R squared values with updated evaluation method (PCA)

In [None]:
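# NOTE: this entire cell is commented out and does not run. It re-scores the saved
# embedding CSVs of an earlier run with the PCA-based evaluation. Before re-enabling
# it, update the hard-coded experiment_dir below, and be aware that eval() is used to
# parse the keys of results.json, which executes whatever expressions that file contains.
#
# # disable warnings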
# warnings.filterwarnings("ignore")
# 
# # Load existing results
# experiment_dir = r"D:\tu delft\Afstuderen\Phase 5 learning strategy comparison\experiments\20240701_run07"
# with open(os.path.join(experiment_dir, 'results.json'), 'r') as f:
#     old_results = json.load(f)
# 
# # Convert string keys back to tuples
# old_results = {eval(k): v for k, v in old_results.items()}
# 
# # Create EmbeddingExperiment instance
# experiment = EmbeddingExperiment(resolution=10)     # DO NOT FORGET TO CHANGE RESOLUTION IF NEEDED
# 
# # Override the experiment directory with the existing one
# experiment.experiment_dir = experiment_dir
# 
# # Recalculate R-squared scores with PCA
# new_results = {}
# 
# # Determine the number of components to use (minimum dimensionality of all data sources)
# min_dim = min(emb.shape[1] for emb in experiment.processor.embeddings.values())
# 
# for params, _ in tqdm(old_results.items(), desc="Recalculating R-squared scores"):
#     k, weight_type, data_source = params
# 
#     # Load the corresponding embedding file
#     embedding_file = os.path.join(experiment_dir, f"embeddings_k{k}_{weight_type}_{data_source}.csv")
#     embeddings = pd.read_csv(embedding_file, index_col=0)
# 
#     # Apply PCA
#     pca = PCA(n_components=min_dim)
#     X_pca = pca.fit_transform(embeddings)
# 
#     r2_scores = {}
#     for column in experiment.target_columns:
#         y = experiment.processor.regions_gdf[column]
# 
#         mask = ~(np.isnan(y) | np.isnan(X_pca).any(axis=1))
#         X_valid = X_pca[mask]
#         y_valid = y[mask]
# 
#         if len(y_valid) == 0:
#             print(f"Warning: No valid data for {column} after removing NaN values.")
#             r2_scores[column] = np.nan
#             continue
# 
#         X_train, X_test, y_train, y_test = train_test_split(X_valid, y_valid, test_size=0.3, random_state=42)
#         model = LinearRegression()
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         r2_scores[column] = r2_score(y_test, y_pred)
# 
#     new_results[params] = r2_scores
# 
# # Save updated results
# with open(os.path.join(experiment_dir, 'results_with_pca.json'), 'w') as f:
#     json.dump({str(k): v for k, v in new_results.items()}, f)
# 
# # Plot updated results
# experiment.plot_results(new_results)
# 
# print(f"Updated results and plots saved in: {experiment_dir}")
# 
# # Print best configuration
# best_key = max(new_results, key=lambda k: np.mean(list(new_results[k].values())))
# best_score = np.mean(list(new_results[best_key].values()))
# 
# print(f"\nBest configuration: k={best_key[0]}, weight_type={best_key[1]}, data_source={best_key[2]}")
# print(f"Average R² score: {best_score:.4f}")
# 
# # Print detailed results for best configuration
# print("\nDetailed R² scores for best configuration:")
# for column, score in new_results[best_key].items():
#     print(f"  {experiment.target_names[column]}: R² = {score:.4f}")

Recalculate plots using refactored code which takes best results for streamlined comparison

In [None]:
# Load existing results
# NOTE(review): absolute, machine-specific path; adjust it before running elsewhere.
experiment_dir = r"D:\tu delft\Afstuderen\Phase 5 learning strategy comparison\experiments\20240701_run01_res9"
with open(os.path.join(experiment_dir, 'results.json'), 'r') as f:
    results = json.load(f)

# Convert string keys back to tuples. literal_eval accepts only Python literals,
# so the contents of results.json cannot execute code the way eval() would.
results = {ast.literal_eval(k): v for k, v in results.items()}

# Create EmbeddingExperiment instance (this loads all data and creates a new,
# empty run directory as a side effect of the constructor)
experiment = EmbeddingExperiment(resolution=9)

# Override the experiment directory with the existing one
fresh_run_dir = experiment.experiment_dir
experiment.experiment_dir = experiment_dir

# Remove the run directory the constructor just created. os.rmdir only deletes
# empty directories, and failure to clean up is harmless, so errors are ignored.
try:
    os.rmdir(fresh_run_dir)
except OSError:
    pass

# Plot results
experiment.plot_results(results)

print(f"Updated plots saved in: {experiment_dir}")