In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import geopandas as gpd
import numpy as np
from tqdm import tqdm
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from srai.neighbourhoods import H3Neighbourhood
from scipy.spatial.distance import pdist, squareform
import os
import datetime
import json
import matplotlib.pyplot as plt
import warnings
import concurrent.futures
warnings.filterwarnings("ignore")

This code is a sped-up version, made by Claude Sonnet 3.5, which uses POI, road-network and GTFS embeddings to create a new embedding for each region. The new embedding is created by taking the exponentially weighted average of the region's embedding and its neighbours' embeddings. The code is optimized to run on a GPU and uses the H3 hexagonal grid to find the neighbours of each region.

It runs in parallel over both the k-ring sizes and the data sources. The execution loop also offers multiple options for choosing specific data sources.

In [None]:
class WithinRingNN(nn.Module):
    """Encoder applied to every embedding slot inside a ring.

    Pipeline: BatchNorm1d -> Linear -> GELU -> Linear.

    Args:
        full_input_dim: Number of input features per embedding.
        hidden_dim: Width of both linear layers' outputs.
    """

    def __init__(self, full_input_dim, hidden_dim):
        super().__init__()
        # Layers are created in the same order as before so that seeded
        # initialisation and state-dict keys stay the same.
        self.bn_input = nn.BatchNorm1d(full_input_dim)
        self.fc1 = nn.Linear(full_input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim)

    def forward(self, x):
        """Return the encoded features for ``x`` (last dim becomes ``hidden_dim``)."""
        normalised = self.bn_input(x)
        activated = F.gelu(self.fc1(normalised))
        return self.fc2(activated)

class BetweenRingNN(nn.Module):
    """Projection applied to each ring's pooled representation.

    Pipeline: Linear -> GELU -> Linear.

    Args:
        hidden_dim: Size of the incoming pooled features.
        output_dim: Size of the produced embedding.
    """

    def __init__(self, hidden_dim, output_dim):
        super().__init__()
        self.fc1 = nn.Linear(hidden_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        """Return the projected features for ``x`` (last dim becomes ``output_dim``)."""
        activated = F.gelu(self.fc1(x))
        return self.fc2(activated)

class RingAggregationNN(nn.Module):
    """Aggregate ring-structured neighbourhood embeddings into one vector.

    Every ring is encoded slot-by-slot with ``WithinRingNN``, mean-pooled over
    its slots, projected with ``BetweenRingNN`` and finally combined across
    rings with fixed (non-learnt) normalised weights.

    Args:
        full_input_dim: Number of features per embedding slot.
        hidden_dim: Hidden width shared by both sub-networks.
        output_dim: Size of the aggregated embedding.
        k: Stored on the instance; ``forward`` takes the ring count from its input.
        weight_type: One of ``'exponential_e'``, ``'logarithm'``, ``'linear'``
            or ``'flat'``.
    """

    def __init__(self, full_input_dim, hidden_dim, output_dim, k, weight_type='exponential_e'):
        super().__init__()
        self.k = k
        self.weight_type = weight_type
        self.within_ring_nn = WithinRingNN(full_input_dim, hidden_dim)
        self.between_ring_nn = BetweenRingNN(hidden_dim, output_dim)
        # Kept for backward compatibility only: forward() follows the device of
        # its input so the module keeps working after .cpu() / .to(...).
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    def _ring_weights(self, num_rings, device):
        """Return the normalised per-ring weights as a 1-D float32 tensor.

        Raises:
            ValueError: If ``self.weight_type`` is not supported.
        """
        ring_index = torch.arange(num_rings, dtype=torch.float32, device=device)
        if self.weight_type == 'exponential_e':
            weights = torch.exp(-ring_index)
        elif self.weight_type == 'logarithm':
            weights = 1 / torch.log2(ring_index + 2)
        elif self.weight_type == 'linear':
            if num_rings > 1:
                weights = 1 - ring_index / (num_rings - 1)
            else:
                # A single ring would otherwise divide by zero; it simply
                # receives the full weight.
                weights = torch.ones_like(ring_index)
        elif self.weight_type == 'flat':
            weights = torch.ones_like(ring_index)
        else:
            raise ValueError(f"Unsupported weight_type: {self.weight_type}")
        return weights / weights.sum()

    def forward(self, embeddings):
        """Aggregate ``embeddings`` of shape (batch, rings, max_neighbors, features).

        Returns:
            Tensor of shape (batch, output_dim).
        """
        batch_size, num_rings, max_neighbors, _ = embeddings.size()

        ring_outputs = []
        for ring in range(num_rings):
            # Rings are encoded one at a time so that, in training mode, the
            # batch-norm statistics are computed per ring as before.
            flat = embeddings[:, ring, :, :].reshape(batch_size * max_neighbors, -1)
            encoded = self.within_ring_nn(flat)
            pooled = encoded.reshape(batch_size, max_neighbors, -1).mean(dim=1)
            ring_outputs.append(self.between_ring_nn(pooled))

        weights = self._ring_weights(num_rings, embeddings.device)

        weighted_sum = torch.zeros_like(ring_outputs[0])
        for weight, ring_output in zip(weights, ring_outputs):
            weighted_sum = weighted_sum + weight * ring_output

        return weighted_sum

class CircleLoss(nn.Module):
    """Circle loss over positive-pair and negative-pair similarity scores.

    Args:
        m: Relaxation margin. Author's note carried over from the original
            comment: per the Circle Loss paper 0.25 was best but had a hard
            drop-off, so a smaller margin is used to play safe.
        gamma: Scale factor applied to both logits.
    """

    def __init__(self, m=0.15, gamma=256):
        super().__init__()
        self.m = m
        self.gamma = gamma
        self.soft_plus = nn.Softplus()

    def forward(self, sp, sn):
        """Return the scalar loss for positive scores ``sp`` and negative scores ``sn``."""
        # The adaptive weighting factors are detached, so gradients only flow
        # through the (score - margin) terms.
        alpha_p = (-sp.detach() + 1 + self.m).clamp_min(0.)
        alpha_n = (sn.detach() + self.m).clamp_min(0.)

        margin_p = 1 - self.m
        margin_n = self.m

        positive_logits = -alpha_p * (sp - margin_p) * self.gamma
        negative_logits = alpha_n * (sn - margin_n) * self.gamma

        combined = torch.logsumexp(negative_logits, dim=0) + torch.logsumexp(positive_logits, dim=0)
        return self.soft_plus(combined)

class TripletDataset(Dataset):
    """Map-style dataset yielding (anchor, positive, negative) ring tensors.

    Args:
        region_ids: Sequence of anchor region identifiers.
        positive_lookup_table: DataFrame indexed by region id whose entries are
            1 for positive partners and 0 for negative partners.
        k: Number of neighbour rings gathered around every region.
        embeddings_concatenated: DataFrame of per-region features indexed by
            region id; also handed to ``H3Neighbourhood`` for neighbour lookup.
    """

    def __init__(self, region_ids, positive_lookup_table, k, embeddings_concatenated):
        self.region_ids = region_ids
        self.positive_lookup_table = positive_lookup_table
        self.k = k
        self.embeddings_concatenated = embeddings_concatenated
        self.neighborhood = H3Neighbourhood(embeddings_concatenated)

    def __len__(self):
        return len(self.region_ids)

    def __getitem__(self, idx):
        """Return the triplet for anchor ``idx``.

        If the anchor has no positive or no negative candidate, the following
        anchors are tried in order (wrapping around) until one qualifies.

        Raises:
            IndexError: If ``idx`` is out of range.
            RuntimeError: If no anchor has both a positive and a negative.
        """
        anchor_id = self.region_ids[idx]
        total = len(self.region_ids)

        # Bounded, iterative scan: the previous recursive fallback could exceed
        # the interpreter's recursion limit on long runs of unusable anchors.
        for _ in range(total):
            # Fetch the anchor's row once instead of four times.
            row = self.positive_lookup_table.loc[anchor_id]
            possible_positives = row.index[row == 1].tolist()
            possible_negatives = row.index[row == 0].tolist()
            if possible_positives and possible_negatives:
                break
            idx = (idx + 1) % total
            anchor_id = self.region_ids[idx]
        else:
            raise RuntimeError(
                "No region has both a positive and a negative candidate in the lookup table."
            )

        positive_id = np.random.choice(possible_positives)
        negative_id = np.random.choice(possible_negatives)

        anchor_tensor = self.get_embeddings_for_region_tensor(anchor_id)
        positive_tensor = self.get_embeddings_for_region_tensor(positive_id)
        negative_tensor = self.get_embeddings_for_region_tensor(negative_id)

        return anchor_tensor, positive_tensor, negative_tensor

    def get_embeddings_for_region_tensor(self, region_id):
        """Return a (k + 1, 6 * k, features) tensor of ring embeddings.

        Ring 0 holds the region itself, ring ``i`` the neighbours returned by
        ``get_neighbors(region_id, i)``. Every ring is zero-padded along the
        slot axis to ``6 * k`` rows.
        """
        frame = self.embeddings_concatenated
        max_neighbors = 6 * self.k
        rings = []

        for distance in range(self.k + 1):
            ring_ids = [region_id] if distance == 0 else self.get_neighbors(region_id, distance)
            if len(ring_ids) > 0:
                values = frame.loc[ring_ids].values
            else:
                # Empty slice of the same frame: keeps width and dtype
                # consistent with the populated rings.
                values = frame.iloc[:0].values
            ring = torch.tensor(values)
            # One pad brings the ring to exactly max_neighbors rows (a negative
            # amount crops), matching the former two-step padding.
            rings.append(F.pad(ring, (0, 0, 0, max_neighbors - ring.shape[0])))

        return torch.stack(rings)

    def get_neighbors(self, region_id, k):
        """Return the ids of the regions at ring distance ``k`` as a list."""
        return list(self.neighborhood.get_neighbours_at_distance(str(region_id), k))

class RegionEmbeddingsDataset(Dataset):
    """Map-style dataset yielding one ring tensor per region.

    Args:
        region_ids: Sequence of region identifiers to serve.
        k: Number of neighbour rings gathered around every region.
        embeddings_concatenated: DataFrame of per-region features indexed by
            region id; also handed to ``H3Neighbourhood`` for neighbour lookup.
        padding_size: Number of zero feature columns appended on the right of
            every embedding row.
    """

    def __init__(self, region_ids, k, embeddings_concatenated, padding_size=0):
        self.region_ids = region_ids
        self.k = k
        self.embeddings_concatenated = embeddings_concatenated
        self.padding_size = padding_size
        self.neighborhood = H3Neighbourhood(embeddings_concatenated)

    def __len__(self):
        return len(self.region_ids)

    def __getitem__(self, idx):
        return self.get_embeddings_for_region_tensor(self.region_ids[idx])

    def get_embeddings_for_region_tensor(self, region_id):
        """Return a (k + 1, 6 * k, features + padding_size) tensor for ``region_id``."""
        frame = self.embeddings_concatenated
        slots_per_ring = 6 * self.k
        rings = []

        for distance in range(self.k + 1):
            ring_ids = [region_id] if distance == 0 else self.get_neighbors(region_id, distance)
            if len(ring_ids) > 0:
                ring = torch.tensor(frame.loc[ring_ids].values)
            else:
                ring = torch.zeros((0, frame.shape[1]))

            # Widen the feature axis with zero columns when requested.
            if self.padding_size > 0:
                filler = torch.zeros(ring.shape[0], self.padding_size)
                ring = torch.cat([ring, filler], dim=1)

            # Bring the slot axis to exactly slots_per_ring rows: zero rows are
            # appended, or surplus rows cropped by a negative pad amount.
            rings.append(F.pad(ring, (0, 0, 0, slots_per_ring - ring.shape[0])))

        return torch.stack(rings)

    def get_neighbors(self, region_id, k):
        """Return the ids of the regions at ring distance ``k`` as a list."""
        return list(self.neighborhood.get_neighbours_at_distance(str(region_id), k))

In [None]:
class LearntAggregationExperiment:
    def __init__(self, resolution=9, use_euclidean=False, poi_embedding='geovex',
                 use_finetuned_aerial=False, use_finetuned_streetview=False,
                 image_pca_dim=None, k_values=[1,3,5]):
        self.resolution = resolution
        self.use_euclidean = use_euclidean
        self.k_values = k_values
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.experiment_dir = self.create_experiment_directory()
        self.image_pca_dim = image_pca_dim
        self.poi_embedding = poi_embedding
        self.verbose = True
        self.use_finetuned_aerial = use_finetuned_aerial
        self.use_finetuned_streetview = use_finetuned_streetview
        self.load_data()
        self.prepare_data()

        self.target_columns = ['afw', 'vrz', 'fys', 'soc', 'onv', 'won']
        self.target_names = {
            'afw': 'Liveability',
            'vrz': 'Amenities',
            'fys': 'Physical Environment',
            'soc': 'Social Cohesion',
            'onv': 'Safety',
            'won': 'Housing Stock'
        }
        self.colors = {
            'afw': '#808080',  # Dark Grey for Liveability
            'vrz': '#FF4500',  # Orange Red for Amenities
            'fys': '#32CD32',  # Lime Green for Physical Environment
            'soc': '#8A2BE2',  # Blue Violet for Social Cohesion
            'onv': '#1E90FF',  # Dodger Blue for Safety
            'won': '#FFA500'   # Orange for Housing Stock
        }
        self.markers = {
            'exponential_e': 'o',
            'linear': 's',
            'flat': '^',
            'logarithm': 'D'
        }

    def create_experiment_directory(self):
        date_str = datetime.datetime.now().strftime("%Y%m%d")
        base_dir = "experiments"
        if not os.path.exists(base_dir):
            os.makedirs(base_dir)
        run_number = 1
        while True:
            dir_name = f"{date_str}_run{run_number:02d}_res{self.resolution}_learnt"
            full_path = os.path.join(base_dir, dir_name)
            if not os.path.exists(full_path):
                os.makedirs(full_path)
                return full_path
            run_number += 1

    def load_data(self):
        """Read the OD matrix, all embedding tables and both region files.

        Populates ``od_matrix_accessibilities``, ``embeddings`` (a dict keyed
        by source name, in load order), ``regions_buffered_gdf`` and
        ``regions_gdf``. All paths are relative to the working directory.
        """
        def read_indexed(csv_path):
            # Index by 'region_id' when that column exists; otherwise use the
            # first column and give the index that name.
            frame = pd.read_csv(csv_path)
            if 'region_id' in frame.columns:
                return frame.set_index('region_id')
            frame = frame.set_index(frame.columns[0])
            frame.index.name = 'region_id'
            return frame

        res = self.resolution
        if self.verbose:
            print(f"Loading data for resolution {self.resolution}...")

        self.od_matrix_accessibilities = pd.read_csv(
            'od_matrix_accessibilities_neighborhooddensity.csv', index_col=0
        )

        # Insertion order matters: it fixes the column order of the
        # concatenated embedding table built later.
        self.embeddings = {}
        self.embeddings['POI'] = read_indexed(f'embeddings_POI_{self.poi_embedding}_{res}.csv')

        for source in ('roadnetwork', 'GTFS'):
            self.embeddings[source] = read_indexed(f'embeddings_{source}_{res}.csv')

        aerial_suffix = '_finetune' if self.use_finetuned_aerial else ''
        self.embeddings['aerial'] = read_indexed(f'embeddings_aerial_{res}{aerial_suffix}.csv')

        # Street-view embeddings are only loaded for resolution 9.
        if res == 9:
            streetview_suffix = '_finetune' if self.use_finetuned_streetview else ''
            self.embeddings['streetview'] = read_indexed(
                f'embeddings_streetview_mean_{res}{streetview_suffix}.csv'
            )

        buffered = gpd.read_file(f'selected_regions_buffered_{res}.geojson')
        self.regions_buffered_gdf = buffered.set_index('region_id')
        plain = gpd.read_file(f'selected_regions_{res}.geojson')
        self.regions_gdf = plain.set_index('region_id')

        if self.verbose:
            print("Data loaded successfully.")
            print(f"Loaded embeddings: {list(self.embeddings.keys())}")

    def apply_pca(self, embedding_df, name, n_components):
        """Project ``embedding_df`` onto its first ``n_components`` principal components.

        Args:
            embedding_df: DataFrame of embeddings, one row per region.
            name: Label used in log messages and in the output column names.
            n_components: Number of components to keep.

        Returns:
            DataFrame with the same index and columns ``{name}_pca_{i}``.
        """
        verbose = self.verbose
        if verbose:
            print(f"Applying PCA to {name} embeddings...")

        reducer = PCA(n_components=n_components)
        projected = reducer.fit_transform(embedding_df)
        column_names = [f"{name}_pca_{i}" for i in range(n_components)]
        pca_df = pd.DataFrame(projected, index=embedding_df.index, columns=column_names)

        explained_variance_ratio = reducer.explained_variance_ratio_.sum()
        if verbose:
            print(f"PCA on {name} embeddings: {explained_variance_ratio:.2%} of variance explained")

        return pca_df

    def prepare_data(self):
        if self.verbose:
            print("Preparing data...")

        image_embeddings = ['aerial', 'streetview']
    
        for key in self.embeddings:
            if key in image_embeddings and self.image_pca_dim is not None:
                self.embeddings[key] = self.apply_pca(self.embeddings[key], f'{key}_pca', self.image_pca_dim)
            if key == 'streetview':
                self.embeddings[key] = self.embeddings[key][~self.embeddings[key].index.duplicated(keep='first')]
            self.embeddings[key] = self.embeddings[key].reindex(index=self.regions_buffered_gdf.index, fill_value=0)
    
        self.embeddings_concatenated = pd.concat(list(self.embeddings.values()), axis=1)
        self.embeddings_concatenated = self.embeddings_concatenated.loc[self.regions_buffered_gdf.index]
        self.embeddings_concatenated.fillna(0, inplace=True)
    
        self.input_dim = self.embeddings_concatenated.shape[1]
        self.output_dim = min(emb.shape[1] for emb in self.embeddings.values())
    
        self.max_neighbors = 6 * max(self.k_values)
    
        self.input_dim = self.input_dim * (self.max_neighbors + 1)

        if self.use_euclidean:
            self.distance_matrix = self.calculate_euclidean_distances()
            threshold = np.percentile(self.distance_matrix.values, 2)
            # For Euclidean, we want distances BELOW the threshold
            self.positive_lookup_table = (self.distance_matrix < threshold).astype(int)
        else:
            accessibility_stacked = self.od_matrix_accessibilities.stack()
            threshold = accessibility_stacked.quantile(0.98)
            # For accessibility, we want values ABOVE the threshold
            self.positive_lookup_table = (self.od_matrix_accessibilities > threshold).astype(int)

        if self.verbose:
            print("Data prepared.")

    def calculate_euclidean_distances(self):
        if self.verbose:
            print("Calculating Euclidean distances...")
        centroids = self.regions_gdf.geometry.centroid
        coords = np.column_stack((centroids.x, centroids.y))
        distances = pdist(coords, metric='euclidean')
        distance_matrix = pd.DataFrame(squareform(distances), index=self.regions_gdf.index, columns=self.regions_gdf.index)
        return distance_matrix

    def select_embeddings(self, data_sources):
        if data_sources == 'all':
            return self.embeddings_concatenated
        elif isinstance(data_sources, str):
            return self.embeddings[data_sources]
        else:
            return pd.concat([self.embeddings[source] for source in data_sources], axis=1)
        
    def train_multiple_models(self, k_values, weight_type, hidden_dim=96, num_epochs=1, batch_size=256, lr=0.0001):
        """Train one ``RingAggregationNN`` per ring size on shared triplet batches.

        Triplets are sampled once for the largest ring size; every model sees
        the first ``k + 1`` rings of each batch.

        Args:
            k_values: Ring sizes; one model is trained per value.
            weight_type: Ring weighting scheme passed to every model.
            hidden_dim: Hidden width of the models.
            num_epochs: Number of passes over the triplet dataset.
            batch_size: Triplets per batch.
            lr: Adam learning rate.

        Returns:
            ``(models, losses)``: dicts keyed by ring size holding the trained
            model and the list of its per-batch loss values.
        """
        largest_k = max(k_values)
        feature_dim = self.embeddings_concatenated.shape[1]
        embedding_dim = min(emb.shape[1] for emb in self.embeddings.values())

        # Models are created in k_values order, as before.
        models = {
            k: RingAggregationNN(feature_dim, hidden_dim, embedding_dim, k, weight_type=weight_type).to(self.device)
            for k in k_values
        }
        losses = {k: [] for k in k_values}

        dataset_train = TripletDataset(
            self.regions_gdf.index.values, self.positive_lookup_table, largest_k, self.embeddings_concatenated
        )
        dataloader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, pin_memory=True)

        optimizers = {k: torch.optim.Adam(models[k].parameters(), lr=lr, weight_decay=1e-5) for k in k_values}
        circle_loss = CircleLoss().to(self.device)

        for epoch in range(num_epochs):
            for model in models.values():
                model.train()

            epoch_losses = {k: [] for k in k_values}

            for anchor_batch, positive_batch, negative_batch in dataloader_train:
                anchor_batch = anchor_batch.float().to(self.device)
                positive_batch = positive_batch.float().to(self.device)
                negative_batch = negative_batch.float().to(self.device)

                for k in k_values:
                    model = models[k]
                    optimizer = optimizers[k]
                    optimizer.zero_grad()

                    # Only the first k + 1 rings belong to this model.
                    ring_count = k + 1
                    anchor_output = model(anchor_batch[:, :ring_count])
                    positive_output = model(positive_batch[:, :ring_count])
                    negative_output = model(negative_batch[:, :ring_count])

                    # Cosine similarities feed the circle loss.
                    pos_sim = F.cosine_similarity(anchor_output, positive_output)
                    neg_sim = F.cosine_similarity(anchor_output, negative_output)

                    loss = circle_loss(pos_sim, neg_sim)
                    loss.backward()
                    optimizer.step()

                    loss_value = loss.item()
                    epoch_losses[k].append(loss_value)
                    losses[k].append(loss_value)

            if self.verbose:
                avg_losses = {k: sum(epoch_losses[k]) / len(epoch_losses[k]) for k in k_values}
                print(f"Epoch {epoch + 1}/{num_epochs}, Average Losses: {avg_losses}")

        return models, losses

    def generate_embeddings(self, model, k, embeddings_to_use, batch_size=64):
        """Run ``model`` over every region and return its outputs as a DataFrame.

        Args:
            model: Trained aggregation model; it is switched to eval mode.
            k: Number of neighbour rings fed to the model.
            embeddings_to_use: Input embedding table. If it is narrower than
                the full concatenated table, zero columns are appended on the
                right to make up the difference.
            batch_size: Regions per inference batch.

        Returns:
            DataFrame indexed by region id with columns ``dim_0 .. dim_{n-1}``.
        """
        missing_width = self.embeddings_concatenated.shape[1] - embeddings_to_use.shape[1]
        dataset = RegionEmbeddingsDataset(self.regions_gdf.index.values, k, embeddings_to_use, missing_width)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        model.eval()
        with torch.no_grad():
            batch_outputs = [model(batch.float().to(self.device)).cpu() for batch in dataloader]

        stacked = torch.cat(batch_outputs, dim=0)
        row_count, column_count = stacked.size(0), stacked.size(1)
        return pd.DataFrame(
            stacked.numpy(),
            index=self.regions_gdf.index[:row_count],
            columns=[f"dim_{i}" for i in range(column_count)],
        )


    def evaluate_embeddings(self, embeddings):
        r2_scores = {}
        target_columns = ['afw', 'vrz', 'fys', 'soc', 'onv', 'won']

        for column in target_columns:
            y = self.regions_gdf[column]
            mask = ~(np.isnan(y) | np.isnan(embeddings).any(axis=1))
            X_valid = embeddings[mask]
            y_valid = y[mask]

            if len(y_valid) == 0:
                if self.verbose:
                    print(f"Warning: No valid data for {column} after removing NaN values.")
                r2_scores[column] = np.nan
                continue

            X_train, X_test, y_train, y_test = train_test_split(X_valid, y_valid, test_size=0.3, random_state=42)
            model = LinearRegression()
            model.fit(X_train, y_train)
            y_pred = model.predict(X_test)
            r2_scores[column] = r2_score(y_test, y_pred)

        return r2_scores

    def run_inference(self, models, k_values, data_sources):
        results = {}
        embeddings = {}
        selected_embeddings = self.select_embeddings(data_sources)
        for k in k_values:
            embeddings[k] = self.generate_embeddings(models[k], k, selected_embeddings)
            results[k] = self.evaluate_embeddings(embeddings[k])
        return results, embeddings

    def run_experiment(self, k_values, weight_types, data_sources, num_epochs=2):
        """Train per weight type, run inference per data source, save everything.

        For every weight type one model per ring size is trained; inference
        for the individual data sources is then submitted to a thread pool.
        A data source whose inference raises is reported on stdout and skipped.

        Returns:
            ``(results, all_embeddings)``: dicts keyed by
            ``(k, weight_type, data_source)``.
        """
        results, all_embeddings = {}, {}
        planned_runs = len(weight_types) * len(data_sources)

        with tqdm(total=planned_runs, desc='Experiment Progress', unit='experiment') as progress_bar:
            for weight_type in weight_types:
                models, _ = self.train_multiple_models(k_values, weight_type, num_epochs=num_epochs)

                with concurrent.futures.ThreadPoolExecutor(max_workers=os.cpu_count()) as executor:
                    pending = {}
                    for ds in data_sources:
                        pending[executor.submit(self.run_inference, models, k_values, ds)] = ds

                    for future in concurrent.futures.as_completed(pending):
                        ds = pending[future]
                        try:
                            inference_results, inference_embeddings = future.result()
                            for k in k_values:
                                key = (k, weight_type, ds)
                                results[key] = inference_results[k]
                                all_embeddings[key] = inference_embeddings[k]
                            progress_bar.update(1)
                            progress_bar.set_description(f"Completed: weight_type={weight_type}, data_sources={ds}")
                        except Exception as exc:
                            print(f"Data source {ds} generated an exception: {exc}")

                # Release the models of this weight type before training the next.
                for model in models.values():
                    model.cpu()
                del models
                torch.cuda.empty_cache()

        self.save_results(results, all_embeddings)
        return results, all_embeddings

    def save_results(self, results, all_embeddings):
        with open(os.path.join(self.experiment_dir, 'results.json'), 'w') as f:
            json.dump({str(k): v for k, v in results.items()}, f)
        for key, embeddings in all_embeddings.items():
            # Change this line
            filename = f"embeddings_k{key[0]}_{key[1]}_{key[2]}.csv"
            embeddings.to_csv(os.path.join(self.experiment_dir, filename))

    def plot_results(self, results):
        """Render all three performance figures for ``results``."""
        renderers = (
            self.plot_k_rings_performance,
            self.plot_weight_type_performance,
            self.plot_data_source_performance,
        )
        for render in renderers:
            render(results)

    def plot_k_rings_performance(self, results):
        """Plot the best R² per target and weight type against the ring size.

        "Best" is the maximum over all data sources present in ``results``.
        The figure is saved as ``k_rings_performance.png`` in the experiment
        directory.

        Args:
            results: Dict keyed by ``(k, weight_type, data_source)`` mapping to
                per-target R² dicts.
        """
        plt.figure(figsize=(16, 10))

        k_values = sorted(set(key[0] for key in results.keys()))
        weight_types = sorted(set(key[1] for key in results.keys()))
        data_sources = set(key[2] for key in results.keys())

        for target in self.target_columns:
            for weight_type in weight_types:
                r2_scores = [
                    max(results.get((k, weight_type, ds), {}).get(target, -np.inf)
                        for ds in data_sources)
                    for k in k_values
                ]

                plt.plot(k_values, r2_scores,
                         label=f"{self.target_names[target]} ({weight_type})",
                         color=self.colors[target],
                         marker=self.markers[weight_type],
                         linestyle='-',
                         linewidth=2,
                         markersize=8)

        plt.xlabel('Number of k-rings', fontsize=12)
        plt.ylabel('Best R² Score', fontsize=12)
        plt.title(f'Best Performance vs Number of k-rings (Resolution {self.resolution})', fontsize=14)

        axes = plt.gca()

        # Legend 1: colour -> target variable.
        target_handles = [
            plt.Line2D([0], [0], color=self.colors[target], lw=4, label=self.target_names[target])
            for target in self.target_columns
        ]
        target_legend = axes.legend(handles=target_handles, loc='center left',
                                    bbox_to_anchor=(1, 0.5), fontsize=10)
        # Creating another legend replaces the axes' current one, so the first
        # legend has to be re-added as a plain artist to stay visible.
        axes.add_artist(target_legend)

        # Legend 2: marker -> weight type.
        weight_handles = [
            plt.Line2D([0], [0], marker=self.markers[wt], color='grey', linestyle='None', markersize=8)
            for wt in weight_types
        ]
        axes.legend(weight_handles, weight_types, loc='lower right',
                    title='Weight Types', fontsize=8)

        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        plt.savefig(os.path.join(self.experiment_dir, 'k_rings_performance.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def plot_weight_type_performance(self, results):
        """Draw grouped bars of the best R² per target for every weight type.

        "Best" is the maximum over all ring sizes and data sources present in
        ``results``. The figure is saved as ``weight_type_performance.png``.
        """
        plt.style.use('default')
        plt.figure(figsize=(16, 10), facecolor='white')
        plt.gca().set_facecolor('white')

        weight_types = sorted(set(key[1] for key in results.keys()))
        ring_sizes = set(key[0] for key in results.keys())
        sources = set(key[2] for key in results.keys())
        positions = np.arange(len(weight_types))
        bar_width = 0.1

        for slot, target in enumerate(self.target_columns):
            best_r2 = []
            for wt in weight_types:
                candidates = [
                    results[k, wt, ds][target]
                    for k in ring_sizes
                    for ds in sources
                    if (k, wt, ds) in results
                ]
                best_r2.append(max(candidates))

            plt.bar(positions + slot * bar_width, best_r2, bar_width,
                    label=self.target_names[target],
                    color=self.colors[target], edgecolor='none')

        plt.xlabel('Weighted Average Type', fontsize=12)
        plt.ylabel('Best R² Score', fontsize=12)
        plt.title(f'Best Performance vs Weighted Average Type (Resolution {self.resolution})', fontsize=14)
        # Centre each tick label under its group of bars.
        group_centre = bar_width * (len(self.target_columns) - 1) / 2
        plt.xticks(positions + group_centre, weight_types, rotation=45, ha='right')
        plt.legend(loc='upper right', fontsize=10)
        plt.grid(axis='y', linestyle=':', color='gray', alpha=0.3)
        plt.ylim(0, 1)  # R² axis fixed to the 0..1 range
        plt.tight_layout()
        plt.savefig(os.path.join(self.experiment_dir, 'weight_type_performance.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def plot_data_source_performance(self, results):
        """Draw grouped bars of the best R² per target for every data source.

        "Best" is the maximum over all ring sizes and weight types present in
        ``results``. The figure is saved as ``data_source_performance.png``.
        """
        plt.style.use('default')
        plt.figure(figsize=(16, 10), facecolor='white')
        plt.gca().set_facecolor('white')

        data_sources = sorted(set(key[2] for key in results.keys()))
        ring_sizes = set(key[0] for key in results.keys())
        weightings = set(key[1] for key in results.keys())
        positions = np.arange(len(data_sources))
        bar_width = 0.1

        for slot, target in enumerate(self.target_columns):
            best_r2 = []
            for ds in data_sources:
                candidates = [
                    results[k, wt, ds][target]
                    for k in ring_sizes
                    for wt in weightings
                    if (k, wt, ds) in results
                ]
                best_r2.append(max(candidates))

            plt.bar(positions + slot * bar_width, best_r2, bar_width,
                    label=self.target_names[target],
                    color=self.colors[target], edgecolor='none')

        plt.xlabel('Data Source', fontsize=12)
        plt.ylabel('Best R² Score', fontsize=12)
        plt.title(f'Best Performance vs Data Source (Resolution {self.resolution})', fontsize=14)
        # Centre each tick label under its group of bars.
        group_centre = bar_width * (len(self.target_columns) - 1) / 2
        plt.xticks(positions + group_centre, data_sources, rotation=45, ha='right')
        plt.legend(loc='best', fontsize=10)
        plt.grid(axis='y', linestyle=':', color='gray', alpha=0.3)
        plt.tight_layout()
        plt.savefig(os.path.join(self.experiment_dir, 'data_source_performance.png'), dpi=300, bbox_inches='tight')
        plt.close()

    def save_experiment_info(self, k_values, weight_types, data_source_combinations):
        info = {
            'Date': datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            'Resolution': self.resolution,
            'Use Euclidean': self.use_euclidean,
            'Number of k-ring values': len(k_values),
            'Number of weight types': len(weight_types),
            'Number of data source combinations': len(data_source_combinations),
            'k-ring values': k_values,
            'Weight types': weight_types,
            'Data source combinations': [list(combo) if isinstance(combo, tuple) else combo for combo in data_source_combinations],
            'Use finetuned aerial': self.use_finetuned_aerial,
            'Use finetuned streetview': self.use_finetuned_streetview,
            'Image PCA dimensions': self.image_pca_dim,
            'POI embedding': self.poi_embedding,
            'Threshold': 'bottom 2nd percentile if Euclidean, top 98th percentile if accessibility'
        }

        aggregation_nn_description = """
        Aggregation Neural Network Description:
        - The aggregation neural network (RingAggregationNN) processes concatenated embeddings of various data sources.
        - It consists of two sub-networks: WithinRingNN and BetweenRingNN.
        - WithinRingNN architecture:
          - BatchNorm -> Linear -> GELU -> Linear
        - BetweenRingNN architecture:
          - Linear -> GELU -> Linear
        - WithinRingNN transforms and aggregates embeddings within a ring of neighbors.
        - BetweenRingNN processes the pooled embeddings of different rings to generate the final output.
        - The network utilizes batch normalization only in the WithinRingNN to normalize the input embeddings.
        - GELU (Gaussian Error Linear Unit) activation is used instead of ReLU for improved performance and regularization.
        """
    
        with open(os.path.join(self.experiment_dir, 'experiment_info.txt'), 'w') as f:
            for key, value in info.items():
                f.write(f"{key}: {value}\n")
            f.write("\n")  # Add a blank line for better readability
            f.write(aggregation_nn_description)

    def save_top_bottom_embeddings(self, results, all_embeddings, top_n=3):
        print(f"\nSaving top and bottom embeddings...")
        avg_scores = {key: np.mean(list(scores.values())) for key, scores in results.items()}
        sorted_configs = sorted(avg_scores.items(), key=lambda x: x[1], reverse=True)
        total_embeddings = len(sorted_configs)
        save_n = min(top_n, total_embeddings // 2)
        top_configs = sorted_configs[:save_n]
        bottom_configs = sorted_configs[-save_n:]
        best_worst_dir = os.path.join(self.experiment_dir, 'best_worst_embeddings')
        os.makedirs(best_worst_dir, exist_ok=True)
        for i, (config, score) in enumerate(top_configs, 1):
            filename = os.path.join(best_worst_dir, f'top_{i}_embedding_k{config[0]}_{config[1]}_{"_".join(config[2])}.csv')
            all_embeddings[config].to_csv(filename)
            print(f"Saved top {i} embedding to {filename} (Avg R² = {score:.4f})")
        for i, (config, score) in enumerate(bottom_configs, 1):
            filename = os.path.join(best_worst_dir, f'bottom_{i}_embedding_k{config[0]}_{config[1]}_{"_".join(config[2])}.csv')
            all_embeddings[config].to_csv(filename)
            print(f"Saved bottom {i} embedding to {filename} (Avg R² = {score:.4f})")
        print(f"Finished saving {save_n} top and {save_n} bottom embeddings.")

In [None]:
# Main execution: configure, run, and report one learnt-aggregation experiment.
if __name__ == "__main__":
    # ---- Configuration ----
    resolution = 9
    poi_embedding = 'hex2vec'
    use_finetuned_aerial = False
    use_finetuned_streetview = False
    image_pca_dim = 100
    use_euclidean = True  # True: Euclidean distance; False: location-based accessibility

    # ---- Experiment parameters, keyed by supported resolution ----
    # Each entry is (k values, available data sources).
    resolution_settings = {
        9: ([1, 3, 5], ['POI', 'roadnetwork', 'GTFS', 'aerial', 'streetview']),
        10: ([1, 5, 10, 15], ['POI', 'roadnetwork', 'GTFS', 'aerial']),
    }
    if resolution not in resolution_settings:
        raise ValueError(f"Unsupported resolution: {resolution}")
    k_values, all_data_sources = resolution_settings[resolution]

    experiment = LearntAggregationExperiment(
        resolution=resolution,
        use_euclidean=use_euclidean,
        poi_embedding=poi_embedding,
        use_finetuned_aerial=use_finetuned_aerial,
        use_finetuned_streetview=use_finetuned_streetview,
        image_pca_dim=image_pca_dim,
        k_values=k_values,
    )

    def _output_path(name):
        # Resolve a report file inside the experiment's output directory.
        return os.path.join(experiment.experiment_dir, name)

    weight_types = ['exponential_e', 'logarithm', 'linear', 'flat']

    # 'all' is listed alongside each individual source.
    data_sources = ['all'] + all_data_sources

    # Record the experiment setup before running anything.
    experiment.save_experiment_info(k_values, weight_types, data_sources)

    print(f"Starting experiments for resolution {resolution}...")

    # Run every requested combination.
    results, all_embeddings = experiment.run_experiment(
        k_values, weight_types, data_sources, num_epochs=1
    )

    print("All experiments completed. Processing results...")

    # Plots
    experiment.plot_results(results)

    # Per-configuration, per-target R² report
    with open(_output_path('detailed_results.txt'), 'w') as report:
        for config, column_scores in results.items():
            print(
                f"k={config[0]}, weight_type={config[1]}, data_sources={config[2]}",
                file=report,
            )
            for column, r2 in column_scores.items():
                print(f"  {experiment.target_names[column]}: R² = {r2:.4f}", file=report)
            print(file=report)

    # Best configuration = highest mean R² across targets
    mean_r2 = {
        config: np.mean(list(column_scores.values()))
        for config, column_scores in results.items()
    }
    best_key = max(mean_r2, key=mean_r2.get)
    best_score = mean_r2[best_key]

    with open(_output_path('best_configuration.txt'), 'w') as summary:
        summary.write(
            f"Best configuration: k={best_key[0]}, weight_type={best_key[1]}, data_sources={best_key[2]}\n"
            f"Average R² score: {best_score:.4f}\n"
        )

    # Embeddings of the best configuration
    best_embeddings = all_embeddings[best_key]
    best_embeddings.to_csv(_output_path('best_embeddings.csv'))

    # Top and bottom ranked embeddings
    experiment.save_top_bottom_embeddings(results, all_embeddings, top_n=3)

    # Column counts of the best embedding and of every produced embedding
    with open(_output_path('dimensionality_info.txt'), 'w') as dims:
        print(f"Final embedding dimensionality: {best_embeddings.shape[1]}", file=dims)
        print("Original dimensionalities:", file=dims)
        for config, frame in all_embeddings.items():
            print(
                f"  k={config[0]}, weight_type={config[1]}, data_sources={config[2]}: {frame.shape[1]}",
                file=dims,
            )

    print(f"Experiment results saved in: {experiment.experiment_dir}")
    print("Experiment completed successfully.")

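The cells below provide additional functionality for existing experiments: regenerating plots from saved results and recalculating R-squared scores after applying PCA to the saved embeddings. They are commented out; to use one, uncomment it and point `experiment_dir` at the experiment folder to update.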

In [None]:
# from Plotting import pca_plot
# import warnings
# warnings.filterwarnings("ignore")
# # plot top 1 embedding
# experiment_dir = r"D:\tu delft\Afstuderen\Phase 5 learning strategy comparison\experiments\20240701_run12_res9_learnt"


In [None]:
# # Import necessary libraries
# import os
# import json
# 
# # Load existing results
# experiment_dir = r"D:\tu delft\Afstuderen\Phase 5 learning strategy comparison\experiments\20240702_run05_res9_learnt"
# with open(os.path.join(experiment_dir, 'results.json'), 'r') as f:
#     results = json.load(f)
# 
# # Convert string keys back to tuples
# results = {eval(k): v for k, v in results.items()}
# 
# # Create LearntAggregationExperiment instance
# experiment = LearntAggregationExperiment(resolution=9)  # Adjust resolution as needed
# 
# # Override the experiment directory with the existing one
# experiment.experiment_dir = experiment_dir
# 
# # Update plots
# experiment.plot_results(results)
# 
# print(f"Updated plots saved in: {experiment_dir}")

In [None]:
# warnings.filterwarnings("ignore")
# 
# # Load existing results
# experiment_dir = r"D:\tu delft\Afstuderen\Phase 5 learning strategy comparison\experiments\20240701_run12_res9_learnt"
# with open(os.path.join(experiment_dir, 'results.json'), 'r') as f:
#     old_results = json.load(f)
# 
# # Convert string keys back to tuples
# old_results = {eval(k): v for k, v in old_results.items()}
# 
# # Create LearntAggregationExperiment instance
# experiment = LearntAggregationExperiment(resolution=9)  # Adjust resolution as needed
# 
# # Override the experiment directory with the existing one
# experiment.experiment_dir = experiment_dir
# 
# # Recalculate R-squared scores with PCA
# new_results = {}
# 
# # Determine the number of components to use (minimum dimensionality of all data sources)
# min_dim = min(emb.shape[1] for emb in experiment.embeddings.values())
# 
# for params, _ in tqdm(old_results.items(), desc="Recalculating R-squared scores"):
#     k, weight_type, data_sources = params
# 
#     # Load the corresponding embedding file
#     filename = f"embeddings_k{k}_{weight_type}_{'-'.join(data_sources)}.csv"
#     embedding_file = os.path.join(experiment_dir, filename)
#     embeddings = pd.read_csv(embedding_file, index_col=0)
# 
#     # Apply PCA
#     pca = PCA(n_components=min_dim)
#     X_pca = pca.fit_transform(embeddings)
# 
#     r2_scores = {}
#     for column in experiment.target_columns:
#         y = experiment.regions_gdf[column]
# 
#         mask = ~(np.isnan(y) | np.isnan(X_pca).any(axis=1))
#         X_valid = X_pca[mask]
#         y_valid = y[mask]
# 
#         if len(y_valid) == 0:
#             print(f"Warning: No valid data for {column} after removing NaN values.")
#             r2_scores[column] = np.nan
#             continue
# 
#         X_train, X_test, y_train, y_test = train_test_split(X_valid, y_valid, test_size=0.3, random_state=42)
#         model = LinearRegression()
#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_test)
#         r2_scores[column] = r2_score(y_test, y_pred)
# 
#     new_results[params] = r2_scores
# 
# # Save updated results
# with open(os.path.join(experiment_dir, 'results_with_pca.json'), 'w') as f:
#     json.dump({str(k): v for k, v in new_results.items()}, f)
# 
# # Plot updated results
# experiment.plot_results(new_results)
# 
# print(f"Updated results and plots saved in: {experiment_dir}")
# 
# # Print best configuration
# best_key = max(new_results, key=lambda k: np.mean(list(new_results[k].values())))
# best_score = np.mean(list(new_results[best_key].values()))
# 
# print(f"\nBest configuration: k={best_key[0]}, weight_type={best_key[1]}, data_sources={best_key[2]}")
# print(f"Average R² score: {best_score:.4f}")
# 
# # Print detailed results for best configuration
# print("\nDetailed R² scores for best configuration:")
# for column, score in new_results[best_key].items():
#     print(f"  {experiment.target_names[column]}: R² = {score:.4f}")