In [None]:
import os
from pathlib import Path
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from torchvision.models import convnext_large, ConvNeXt_Large_Weights
from tqdm import tqdm
from PIL import Image
import pandas as pd
from sklearn.decomposition import PCA

In [None]:
class AerialImageDataset(Dataset):
    """Dataset that yields one preprocessed aerial image per region.

    Items are addressed by position in ``regions_gdf.index``. For each region
    the image is looked up at ``<image_dir>/<region_id>.jpg``, converted to
    RGB, resized to ``image_size``, turned into a tensor and normalised.

    Args:
        regions_gdf: Table-like object whose ``.index`` holds the region ids
            (only ``len()`` and ``.index`` are used here).
        image_dir: Directory containing the ``<region_id>.jpg`` files.
        image_size: ``(height, width)`` the images are resized to. The same
            size is used for the all-zero placeholder returned for missing
            images, so both always agree. Defaults to ``(224, 224)``.
    """

    def __init__(self, regions_gdf, image_dir, image_size=(224, 224)):
        self.regions_gdf = regions_gdf
        self.image_dir = image_dir
        self.image_size = tuple(image_size)
        self.transform = transforms.Compose([
            transforms.Resize(self.image_size),
            transforms.ToTensor(),
            # Channel mean/std commonly used with ImageNet-pretrained torchvision models.
            transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
        ])

    def __len__(self):
        """Return the number of regions."""
        return len(self.regions_gdf)

    def __getitem__(self, idx):
        """Return ``(image_tensor, region_id)`` for the region at position ``idx``.

        If the image file does not exist, an all-zero tensor of shape
        ``(3, height, width)`` is returned instead so that batching still works.
        """
        region_id = self.regions_gdf.index[idx]
        img_path = os.path.join(self.image_dir, f"{region_id}.jpg")

        if not os.path.exists(img_path):
            # Missing image: fall back to a zero placeholder of the configured size.
            return torch.zeros(3, *self.image_size), region_id

        # The context manager releases the underlying file handle as soon as
        # the pixel data has been converted and transformed.
        with Image.open(img_path) as raw_image:
            image = self.transform(raw_image.convert('RGB'))

        return image, region_id

def infer_embeddings(regions_gdf, image_dir, batch_size=128):
    """Compute ConvNeXt-Large feature embeddings for every region image.

    A ConvNeXt-Large model with its default pretrained weights is loaded, its
    last child module is dropped (presumably the classification head, leaving
    the pooled backbone features), and every image served by
    ``AerialImageDataset`` is passed through it in evaluation mode.

    Args:
        regions_gdf: Table-like object whose index holds the region ids.
        image_dir: Directory containing one ``<region_id>.jpg`` per region.
        batch_size: Number of images per forward pass.

    Returns:
        pandas.DataFrame with one row per region id (index) and one column per
        embedding dimension. If a region id occurs more than once, the last
        occurrence wins.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = convnext_large(weights=ConvNeXt_Large_Weights.DEFAULT)
    # Keep everything except the final child module so the network outputs features.
    model = torch.nn.Sequential(*list(model.children())[:-1])
    model = model.to(device)
    model.eval()

    dataset = AerialImageDataset(regions_gdf, image_dir)
    dataloader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=False
    )

    embeddings = {}
    with torch.no_grad():
        for images, region_ids in tqdm(dataloader, desc="Inferring embeddings"):
            images = images.to(device)
            # Drop the two trailing singleton spatial dimensions.
            features = model(images).squeeze(-1).squeeze(-1)

            # The default collate function turns numeric ids into a tensor;
            # 0-d tensors hash by identity and would become the dict keys /
            # DataFrame index. Convert them back to plain Python scalars.
            if torch.is_tensor(region_ids):
                region_ids = region_ids.tolist()

            # One device-to-host transfer per batch instead of one per row.
            batch_features = features.cpu().numpy()
            for region_id, vector in zip(region_ids, batch_features):
                embeddings[region_id] = vector

    df_embeddings = pd.DataFrame.from_dict(embeddings, orient='index')
    return df_embeddings

def apply_pca(df_embeddings, dimensions=(50, 200), random_state=0):
    """Create PCA-reduced versions of an embedding table.

    Args:
        df_embeddings: DataFrame with one row per sample and one column per
            embedding dimension.
        dimensions: Iterable of ``n_components`` values; one independent PCA
            is fitted per entry. Defaults to ``(50, 200)``.
        random_state: Seed forwarded to ``PCA`` so that results are
            reproducible if a randomized SVD solver is selected.

    Returns:
        dict mapping ``'original'`` to the unmodified input and ``'pca_<dim>'``
        to a DataFrame holding the reduced embeddings, indexed like the input.
    """
    results = {'original': df_embeddings}

    for dim in dimensions:
        pca = PCA(n_components=dim, random_state=random_state)
        embeddings_reduced = pca.fit_transform(df_embeddings)
        # Preserve the region ids as the index of the reduced table.
        results[f'pca_{dim}'] = pd.DataFrame(embeddings_reduced, index=df_embeddings.index)

    return results

In [None]:
# Usage
if __name__ == "__main__":
    import geopandas as gpd

    # Read the buffered regions, reproject them to EPSG:28992 and index them by region_id
    regions_buffered_gdf = (
        gpd.read_file("selected_regions_buffered_10.geojson")
        .to_crs(epsg=28992)
        .set_index('region_id')
    )

    image_dir = "D://tu delft//Afstuderen//aerial_images_10"

    # Run the image model over every region
    df_embeddings = infer_embeddings(regions_buffered_gdf, image_dir, batch_size=128)

    # Build the PCA-reduced variants alongside the original embeddings
    embedding_versions = apply_pca(df_embeddings)

    # Write each variant to its own CSV file
    for name, df in embedding_versions.items():
        if name != 'original':
            dim = name.split('_')[1]
            filename = f"embeddings_aerial_10_dim{dim}.csv"
        else:
            filename = "embeddings_aerial_10_original.csv"
        df.to_csv(filename)
        print(f"Saved: {filename}")

    print("Embeddings inference and PCA completed. Results saved to CSV files.")