In [1]:
from pathlib import Path
import argparse
import itertools
from itertools import product
from PIL import Image

import random

from collections import Counter
from collections import defaultdict

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, random_split, DataLoader
from torchvision.io import read_image
from torchvision import transforms

import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import pandas as pd

from tqdm import tqdm

from process_model_weights import *
from celeb_backbone import *

In [2]:
# construct file indexer

classes = list(range(0,10))

dataset = [
    "train",
    "train", 
    "train",
    "train",
    "train", 
    "test",
    "test",
    "test", 
    "test",
    "test",
]

original_files = [
 "057876.jpg", #002514.jpg",
 "039593.jpg", #
 "094741.jpg", #
 "152801.jpg", #
 "126154.jpg", #
 "034026.jpg", #
 "012857.jpg", #
 "016242.jpg", #
 "153728.jpg", #
 "068982.jpg",]

older_files = [
    "002514.jpg",     
    "017554.jpg",     
    "093689.jpg",     
    "052500.jpg",     
    "162586.jpg",     
    "120490.jpg",     
    "048720.jpg",     
    "031455.jpg",     
    "036953.jpg",     
    "158002.jpg",     
]

younger_files = [
    "119296.jpg",     
    "029394.jpg",     
    "014392.jpg",     
    "055349.jpg",     
    "040676.jpg",     
    "042867.jpg",     
    "052814.jpg",     
    "114366.jpg",     
    "041553.jpg",     
    "117606.jpg",     
]

c = np.tile(classes, 3)
f = np.concatenate([original_files, younger_files, older_files])
all_files = pd.DataFrame()
all_files["classes"] = c
all_files["filenames"] = f

In [3]:
df = pd.DataFrame()
df["classes"] = classes
df["og_files"] = original_files 
df["older_files"] = older_files
df["younger_files"] = younger_files
df["dataset"] = dataset

In [10]:
import pandas as pd
from itertools import product

def populate_sequences(sequences, positions, labels, ns_vectors, df):
    def generate_combinations(s, p, label, ns_vector):
        repeat = s[p[0]]
        older_file = df[df["classes"] == repeat]["older_files"].iloc[0]
        younger_file = df[df["classes"] == repeat]["younger_files"].iloc[0]
        
        s_o = s.copy()
        s_y = s.copy()
        s_o[p[0]] = older_file
        s_o[p[1]] = younger_file
        s_y[p[0]] = younger_file
        s_y[p[1]] = older_file
        
        other_positions = [i for i in range(len(s)) if i not in p]
        other_classes = [s[i] for i in other_positions]
        
        choices = []
        for cls in other_classes:
            choices.append([df[df["classes"] == cls]["older_files"].iloc[0], 
                            df[df["classes"] == cls]["younger_files"].iloc[0]])
        
        new_sequences = []
        new_labels = []
        new_positions = []
        new_ns_vectors = []
        
        for combo in product(*choices):
            new_s_o = s_o.copy()
            new_s_y = s_y.copy()
            for i, choice in zip(other_positions, combo):
                new_s_o[i] = choice
                new_s_y[i] = choice
            new_sequences.extend([new_s_o, new_s_y])
            new_labels.extend([label, label])
            new_positions.extend([p, p])
            new_ns_vectors.extend([ns_vector, ns_vector])
        
        return new_sequences, new_labels, new_positions, new_ns_vectors

    all_sequences = []
    all_labels = []
    all_positions = []
    all_ns_vectors = []

    for s, p, label, ns_vector in zip(sequences, positions, labels, ns_vectors):
        seq, lab, pos, ns = generate_combinations(s, p, label, ns_vector)
        all_sequences.extend(seq)
        all_labels.extend(lab)
        all_positions.extend(pos)
        all_ns_vectors.extend(ns)
    
    return all_sequences, all_labels, all_positions, all_ns_vectors


In [28]:
sequence_length = 5

split = "test"
train_classes = np.array(df[df["dataset"] == split]["classes"])
sequences, labels, positions, ns = generate_sequences(train_classes, sequence_length=sequence_length)

completed_sequences = []
for s in sequences: 
    temp = []
    for c in s:
        filename = df[df["classes"] == c]["og_files"].iloc[0]
        temp.append(filename)

    completed_sequences.append(temp)
sequences = completed_sequences

In [29]:


save_dir = "celebA_embeddings"
path = Path(f"{save_dir}/exp_basic/seq{sequence_length}/{split}")
path.mkdir(exist_ok=True, parents=True)
print(path)

backbone = VGGEmbeds()

checkpoint = 1
filenames = [f"{i:06d}.pt" for i in range(checkpoint, len(labels) + 1)]
print(f"Len filenames: {len(filenames)}")

dataset = EmbeddingDataset(sequences, labels, positions, ns)
dataloader = DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True, collate_fn=custom_collate)

# Process data in batches
with torch.no_grad():  # Disable gradient calculation
    for i, (batch_sequences, batch_labels, batch_positions, batch_ns, ) in enumerate(tqdm(dataloader)):
        batch_tensors = []
        batch_seq_filenames = []
        for sequence in batch_sequences:
            sequence_tensors = []
            sequence_filenames = []
            for file in sequence:
                
                sequence_filenames.append(file)
                sequence_tensors.append(backbone.embedding(file))

            batch_tensors.append(torch.stack(sequence_tensors))
            batch_seq_filenames.append(sequence_filenames)

        batch_tensor = torch.stack(batch_tensors)
        
        for j, tensor in enumerate(batch_tensor):
            idx = i * len(batch_sequences) + j
            saver = {
                "sequence": tensor.cpu(),  # Move back to CPU for saving
                "label": batch_labels[j],
                "positions": batch_positions[j],
                "n-distance": batch_ns[j],
                "sequence_filenames": batch_seq_filenames[j],
            }
            
            # file1 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            # file2 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            # class1 = all_files[all_files["filenames"] == file1]["classes"].iloc[0] 
            # class2 = all_files[all_files["filenames"] == file2]["classes"].iloc[0] 

            # assert class1 == class2

            # dataset = df[df["classes"] == class1]["dataset"].iloc[0]
            save_path = path / split 
            save_path.mkdir(exist_ok=True, parents=True)
            
            torch.save(saver, save_path / filenames[idx])

celebA_embeddings/exp_basic/seq5/test


  state_dict = torch.load(weights_path)


Len filenames: 1200


100%|██████████| 38/38 [00:16<00:00,  2.37it/s]


In [None]:
sequence_length = 5

split = "train"
train_classes = np.array(df[df["dataset"] == split]["classes"])
sequences, labels, positions, ns = generate_sequences(train_classes, sequence_length=sequence_length)

sequences, labels, positions, ns = populate_sequences(sequences, positions, labels, ns, df)

assert len(sequences) == len(labels) == len(positions) == len(ns)


save_dir = "celebA_embeddings"
path = Path(f"{save_dir}/exp_age_final/seq{sequence_length}/{split}")
path.mkdir(exist_ok=True, parents=True)
print(path)

backbone = VGGEmbeds()

checkpoint = 1
filenames = [f"{i:06d}.pt" for i in range(checkpoint, len(labels) + 1)]
print(f"Len filenames: {len(filenames)}")

dataset = EmbeddingDataset(sequences, labels, positions, ns)
dataloader = DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True, collate_fn=custom_collate)

# Process data in batches
with torch.no_grad():  # Disable gradient calculation
    for i, (batch_sequences, batch_labels, batch_positions, batch_ns, ) in enumerate(tqdm(dataloader)):
        batch_tensors = []
        batch_seq_filenames = []
        for sequence in batch_sequences:
            sequence_tensors = []
            sequence_filenames = []
            for file in sequence:
                
                sequence_filenames.append(file)
                sequence_tensors.append(backbone.embedding(file))

            batch_tensors.append(torch.stack(sequence_tensors))
            batch_seq_filenames.append(sequence_filenames)

        batch_tensor = torch.stack(batch_tensors)
        
        for j, tensor in enumerate(batch_tensor):
            idx = i * len(batch_sequences) + j
            saver = {
                "sequence": tensor.cpu(),  # Move back to CPU for saving
                "label": batch_labels[j],
                "positions": batch_positions[j],
                "n-distance": batch_ns[j],
                "sequence_filenames": batch_seq_filenames[j],
            }
            
            # file1 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            # file2 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            # class1 = all_files[all_files["filenames"] == file1]["classes"].iloc[0] 
            # class2 = all_files[all_files["filenames"] == file2]["classes"].iloc[0] 

            # assert class1 == class2

            # dataset = df[df["classes"] == class1]["dataset"].iloc[0]
            save_path = path / split 
            save_path.mkdir(exist_ok=True, parents=True)
            
            torch.save(saver, save_path / filenames[idx])

celebA_embeddings/exp_age_final/seq5/train


  state_dict = torch.load(weights_path)


Len filenames: 19200


100%|██████████| 600/600 [04:39<00:00,  2.15it/s]


In [17]:
sequence_length = 5

split = "test"
test_classes = np.array(df[df["dataset"] == split]["classes"])
sequences, labels, positions, ns = generate_sequences(test_classes, sequence_length=sequence_length)

sequences, labels, positions, ns = populate_sequences(sequences, positions, labels, ns, df)

assert len(sequences) == len(labels) == len(positions) == len(ns)


save_dir = "celebA_embeddings"
path = Path(f"{save_dir}/exp_age_final/seq{sequence_length}/{split}")
path.mkdir(exist_ok=True, parents=True)
print(path)

backbone = VGGEmbeds()

checkpoint = 1
filenames = [f"{i:06d}.pt" for i in range(checkpoint, len(labels) + 1)]
print(f"Len filenames: {len(filenames)}")

dataset = EmbeddingDataset(sequences, labels, positions, ns)
dataloader = DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True, collate_fn=custom_collate)

# Process data in batches
with torch.no_grad():  # Disable gradient calculation
    for i, (batch_sequences, batch_labels, batch_positions, batch_ns, ) in enumerate(tqdm(dataloader)):
        batch_tensors = []
        batch_seq_filenames = []
        for sequence in batch_sequences:
            sequence_tensors = []
            sequence_filenames = []
            for file in sequence:
                
                sequence_filenames.append(file)
                sequence_tensors.append(backbone.embedding(file))

            batch_tensors.append(torch.stack(sequence_tensors))
            batch_seq_filenames.append(sequence_filenames)

        batch_tensor = torch.stack(batch_tensors)
        
        for j, tensor in enumerate(batch_tensor):
            idx = i * len(batch_sequences) + j
            saver = {
                "sequence": tensor.cpu(),  # Move back to CPU for saving
                "label": batch_labels[j],
                "positions": batch_positions[j],
                "n-distance": batch_ns[j],
                "sequence_filenames": batch_seq_filenames[j],
            }
            
            # file1 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            # file2 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            # class1 = all_files[all_files["filenames"] == file1]["classes"].iloc[0] 
            # class2 = all_files[all_files["filenames"] == file2]["classes"].iloc[0] 

            # assert class1 == class2

            # dataset = df[df["classes"] == class1]["dataset"].iloc[0]
            save_path = path / split 
            save_path.mkdir(exist_ok=True, parents=True)
            
            torch.save(saver, save_path / filenames[idx])

celebA_embeddings/exp_age_final/seq5/test


  state_dict = torch.load(weights_path)


Len filenames: 19200


100%|██████████| 600/600 [04:33<00:00,  2.20it/s]


# prior effort

In [10]:
sequence_length = 5

def alter_sequences(df, sequence_length):
    sequences, labels, positions, ns = generate_sequences(df['og_files'], sequence_length)

    for i in tqdm(range(len(sequences))):

        og_file = sequences[i][positions[i][0]]
        row = df.loc[df["og_files"] == og_file].iloc[0]
        older_filename = row["older_files"]
        younger_filename = row["younger_files"]

        sequences[i][positions[i][0]] = younger_filename
        sequences[i][positions[i][1]] = older_filename

    return sequences

# 5

In [5]:
sequence_length = 5

sequences, labels, positions, ns = generate_sequences(files, sequence_length=sequence_length)
sequences = alter_sequences(df, sequence_length)

sample_size = 50000

samples = uniform_random_samples(ns, sample_size)
print(Counter(np.array(ns)[samples]))

sequences = np.array(sequences)[samples]
labels = np.array(labels)[samples]
positions = np.array(positions)[samples]
ns = np.array(ns)[samples]


  1%|          | 531/50400 [00:00<00:09, 5306.64it/s]

100%|██████████| 50400/50400 [00:09<00:00, 5413.96it/s]


Counter({4: 12500, 2: 12500, 1: 12500, 3: 12500})


In [6]:
save_dir = "celebA_embeddings"
path = Path(f"{save_dir}/exp_age/seq{sequence_length}")
path.mkdir(exist_ok=True, parents=True)
print(path)

backbone = VGGEmbeds()

checkpoint = 1
filenames = [f"{i:06d}.pt" for i in range(checkpoint, len(labels) + 1)]
print(f"Len filenames: {len(filenames)}")

if checkpoint > 1:
    sequences = sequences[checkpoint-1:]
    labels = labels[checkpoint-1:]
    positions = positions[checkpoint-1:]
    ns = ns[checkpoint-1:]

assert len(filenames) == len(sequences) == len(labels)

dataset = EmbeddingDataset(sequences, labels, positions, ns)
dataloader = DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True, collate_fn=custom_collate)

# Process data in batches
with torch.no_grad():  # Disable gradient calculation
    for i, (batch_sequences, batch_labels, batch_positions, batch_ns, ) in enumerate(tqdm(dataloader)):
        batch_tensors = []
        batch_seq_filenames = []
        for sequence in batch_sequences:
            sequence_tensors = []
            sequence_filenames = []
            for file in sequence:
                
                sequence_filenames.append(file)
                sequence_tensors.append(backbone.embedding(file))

            batch_tensors.append(torch.stack(sequence_tensors))
            batch_seq_filenames.append(sequence_filenames)

        batch_tensor = torch.stack(batch_tensors)
        
        for j, tensor in enumerate(batch_tensor):
            idx = i * len(batch_sequences) + j
            saver = {
                "sequence": tensor.cpu(),  # Move back to CPU for saving
                "label": batch_labels[j],
                "positions": batch_positions[j],
                "n-distance": batch_ns[j],
                "sequence_filenames": batch_seq_filenames[j],
            }
            
            file1 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            file2 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            class1 = all_files[all_files["filenames"] == file1]["classes"].iloc[0] 
            class2 = all_files[all_files["filenames"] == file2]["classes"].iloc[0] 

            assert class1 == class2

            dataset = df[df["classes"] == class1]["dataset"].iloc[0]
            save_path = path / dataset 
            save_path.mkdir(exist_ok=True, parents=True)
            
            torch.save(saver, save_path / filenames[idx])

celebA_embeddings/exp_age/seq5


  state_dict = torch.load(weights_path)


Len filenames: 50000


100%|██████████| 1563/1563 [12:49<00:00,  2.03it/s]


# 6

In [7]:
sequence_length = 6

sequences, labels, positions, ns = generate_sequences(files, sequence_length=sequence_length)
sequences = alter_sequences(df, sequence_length)

sample_size = 50000

samples = uniform_random_samples(ns, sample_size)
print(Counter(np.array(ns)[samples]))

sequences = np.array(sequences)[samples]
labels = np.array(labels)[samples]
positions = np.array(positions)[samples]
ns = np.array(ns)[samples]


100%|██████████| 453600/453600 [01:26<00:00, 5247.39it/s]


Counter({4: 10000, 3: 10000, 5: 10000, 1: 10000, 2: 10000})


In [8]:
save_dir = "celebA_embeddings"
path = Path(f"{save_dir}/exp_age/seq{sequence_length}")
path.mkdir(exist_ok=True, parents=True)
print(path)

backbone = VGGEmbeds()

checkpoint = 1
filenames = [f"{i:06d}.pt" for i in range(checkpoint, len(labels) + 1)]
print(f"Len filenames: {len(filenames)}")

if checkpoint > 1:
    sequences = sequences[checkpoint-1:]
    labels = labels[checkpoint-1:]
    positions = positions[checkpoint-1:]
    ns = ns[checkpoint-1:]

assert len(filenames) == len(sequences) == len(labels)

dataset = EmbeddingDataset(sequences, labels, positions, ns)
dataloader = DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True, collate_fn=custom_collate)

# Process data in batches
with torch.no_grad():  # Disable gradient calculation
    for i, (batch_sequences, batch_labels, batch_positions, batch_ns, ) in enumerate(tqdm(dataloader)):
        batch_tensors = []
        batch_seq_filenames = []
        for sequence in batch_sequences:
            sequence_tensors = []
            sequence_filenames = []
            for file in sequence:
                
                sequence_filenames.append(file)
                sequence_tensors.append(backbone.embedding(file))

            batch_tensors.append(torch.stack(sequence_tensors))
            batch_seq_filenames.append(sequence_filenames)

        batch_tensor = torch.stack(batch_tensors)
        
        for j, tensor in enumerate(batch_tensor):
            idx = i * len(batch_sequences) + j
            saver = {
                "sequence": tensor.cpu(),  # Move back to CPU for saving
                "label": batch_labels[j],
                "positions": batch_positions[j],
                "n-distance": batch_ns[j],
                "sequence_filenames": batch_seq_filenames[j],
            }
            
            file1 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            file2 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            class1 = all_files[all_files["filenames"] == file1]["classes"].iloc[0] 
            class2 = all_files[all_files["filenames"] == file2]["classes"].iloc[0] 

            assert class1 == class2

            dataset = df[df["classes"] == class1]["dataset"].iloc[0]
            save_path = path / dataset 
            save_path.mkdir(exist_ok=True, parents=True)
            
            torch.save(saver, save_path / filenames[idx])

celebA_embeddings/exp_age/seq6


  state_dict = torch.load(weights_path)


Len filenames: 50000


100%|██████████| 1563/1563 [15:20<00:00,  1.70it/s]


In [13]:
sequence_length = 10

sequences, labels, positions, ns = generate_sequences(files, sequence_length=sequence_length)
sequences = alter_sequences(df, sequence_length)

sample_size = 50000

samples = uniform_random_samples(ns, sample_size)
print(Counter(np.array(ns)[samples]))

sequences = np.array(sequences)[samples]
labels = np.array(labels)[samples]
positions = np.array(positions)[samples]
ns = np.array(ns)[samples]


  1%|▏         | 2299572/163296000 [07:26<8:40:49, 5151.92it/s] 


KeyboardInterrupt: 

In [None]:
save_dir = "celebA_embeddings"
path = Path(f"{save_dir}/exp_age/seq{sequence_length}")
path.mkdir(exist_ok=True, parents=True)
print(path)

backbone = VGGEmbeds()

checkpoint = 1
filenames = [f"{i:06d}.pt" for i in range(checkpoint, len(labels) + 1)]
print(f"Len filenames: {len(filenames)}")

if checkpoint > 1:
    sequences = sequences[checkpoint-1:]
    labels = labels[checkpoint-1:]
    positions = positions[checkpoint-1:]
    ns = ns[checkpoint-1:]

assert len(filenames) == len(sequences) == len(labels)

dataset = EmbeddingDataset(sequences, labels, positions, ns)
dataloader = DataLoader(dataset, batch_size=32, num_workers=4, pin_memory=True, collate_fn=custom_collate)

# Process data in batches
with torch.no_grad():  # Disable gradient calculation
    for i, (batch_sequences, batch_labels, batch_positions, batch_ns, ) in enumerate(tqdm(dataloader)):
        batch_tensors = []
        batch_seq_filenames = []
        for sequence in batch_sequences:
            sequence_tensors = []
            sequence_filenames = []
            for file in sequence:
                
                sequence_filenames.append(file)
                sequence_tensors.append(backbone.embedding(file))

            batch_tensors.append(torch.stack(sequence_tensors))
            batch_seq_filenames.append(sequence_filenames)

        batch_tensor = torch.stack(batch_tensors)
        
        for j, tensor in enumerate(batch_tensor):
            idx = i * len(batch_sequences) + j
            saver = {
                "sequence": tensor.cpu(),  # Move back to CPU for saving
                "label": batch_labels[j],
                "positions": batch_positions[j],
                "n-distance": batch_ns[j],
                "sequence_filenames": batch_seq_filenames[j],
            }
            
            file1 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            file2 = np.array(batch_seq_filenames[j])[batch_positions[j][1]]
            class1 = all_files[all_files["filenames"] == file1]["classes"].iloc[0] 
            class2 = all_files[all_files["filenames"] == file2]["classes"].iloc[0] 

            assert class1 == class2

            dataset = df[df["classes"] == class1]["dataset"].iloc[0]
            save_path = path / dataset 
            save_path.mkdir(exist_ok=True, parents=True)
            
            torch.save(saver, save_path / filenames[idx])

celebA_embeddings/exp_age/seq8


  state_dict = torch.load(weights_path)


Len filenames: 50000


100%|██████████| 1563/1563 [19:45<00:00,  1.32it/s]
