In [1]:
import torch
from kwisatzHaderach import KwisatzHaderach
import json
import os
import tqdm
import numpy as np
from datagen import generate_dataset, generate_dataset_memory, generate_dataset_memory_bh

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
def euclidean_distance(a, b):
    """Return the L2 distance between ``a`` and ``b`` taken over the last axis.

    The squared differences are summed along ``dim=-1`` and 1e-12 is added
    under the square root, so the result is never exactly zero.
    """
    squared_diff = (a - b) ** 2
    summed = torch.sum(squared_diff, dim=-1)
    return torch.sqrt(summed + 1e-12)

def loss_fn(pr_acc, gt_acc, num_neighbors, gamma=0.5, neighbor_scale=1/50):
    """Neighbour-weighted distance loss plus the plain mean distance.

    Args:
        pr_acc: predicted vectors; the distance is taken over the last axis.
        gt_acc: ground-truth vectors, broadcastable against ``pr_acc``.
        num_neighbors: tensor of neighbour counts used to weight each entry.
            An empty tensor disables the weighting (weight 1.0 everywhere).
        gamma: exponent applied to each distance before weighting.
        neighbor_scale: decay rate of the weight with the neighbour count.

    Returns:
        ``(weighted_loss, mean_distance)`` where ``weighted_loss`` is
        ``mean(exp(-neighbor_scale * num_neighbors) * distance**gamma)`` and
        ``mean_distance`` is the unweighted mean Euclidean distance.
    """
    # The weight DEcays with the neighbour count: entries with more neighbours
    # contribute less. (An earlier comment claimed the minus sign had been
    # removed; the code has always kept it.)
    importance = torch.exp(-neighbor_scale * num_neighbors)
    euclidean_distances = euclidean_distance(pr_acc, gt_acc)
    # numel() also copes with a 0-dim neighbour count, where size()[0] raises.
    if importance.numel() == 0:
        importance = 1.0
    weighted_loss = torch.mean(importance * euclidean_distances**gamma)
    return weighted_loss, torch.mean(euclidean_distances)

In [3]:
def get_new_pos_vel(acc, pos, vel, dt=0.01):
    """Advance ``pos`` and ``vel`` by one step of length ``dt``.

    The velocity is updated from ``acc`` first; the position is then advanced
    with the *updated* velocity. Returns ``(new_pos, new_vel)``.
    """
    velocity_delta = acc * dt
    updated_vel = vel + velocity_delta
    position_delta = updated_vel * dt
    updated_pos = pos + position_delta
    return updated_pos, updated_vel

In [4]:
def train_epoch(model, file, batch_size, loss_fn, optimizer, device):
    """Run one optimisation pass over the samples stored in a JSON file.

    Each entry of the JSON list must provide the keys ``'masses'``, ``'pos'``,
    ``'vel'``, ``'pos_next1'`` and ``'pos_next2'``. For every sample the model
    is rolled out for two steps (the second step is fed the first step's
    prediction) and both predicted positions are compared with the targets,
    each step contributing half of the sample loss.

    Args:
        model: callable as ``model(pos, vel, masses)`` returning ``(pos, vel)``;
            ``model.num_neighbors`` is read after every forward pass.
        file: path of the JSON file to train on.
        batch_size: number of samples per optimiser step. Samples beyond the
            last full batch are not used.
        loss_fn: callable ``(pred, target, num_neighbors) -> (loss, mean_dist)``.
        optimizer: optimiser that is zeroed and stepped once per batch.
        device: device the batch tensors are moved to.

    Prints the mean batch loss and the mean per-step distance; returns None.
    """
    model.train()
    with open(file) as f:
        data = json.load(f)

    num_batches = len(data) // batch_size
    all_losses = []
    all_dists = []
    for i in tqdm.tqdm(range(num_batches)):
        batch = data[i*batch_size:(i+1)*batch_size]
        m = torch.tensor([b['masses'] for b in batch], dtype=torch.float32).to(device)
        pos0 = torch.tensor([b['pos'] for b in batch], dtype=torch.float32).to(device)
        vel0 = torch.tensor([b['vel'] for b in batch], dtype=torch.float32).to(device)
        pos1 = torch.tensor([b['pos_next1'] for b in batch], dtype=torch.float32).to(device)
        pos2 = torch.tensor([b['pos_next2'] for b in batch], dtype=torch.float32).to(device)

        optimizer.zero_grad()
        losses = []
        for j in range(len(batch)):
            sample_masses = m[j].unsqueeze(1)

            # Step 1: predict from the ground-truth initial state.
            pr_pos1, pr_vel1 = model(pos0[j], vel0[j], sample_masses)
            loss1, dists1 = loss_fn(pr_pos1, pos1[j], model.num_neighbors)
            # Log the raw distance. It used to be multiplied by 0.5, which
            # halved the reported "Train L2" relative to the validation metric.
            all_dists.append(dists1.item())

            # Step 2: predict again from the model's own step-1 output.
            pr_pos2, pr_vel2 = model(pr_pos1, pr_vel1, sample_masses)
            loss2, dists2 = loss_fn(pr_pos2, pos2[j], model.num_neighbors)
            all_dists.append(dists2.item())

            losses.append(0.5*loss1 + 0.5*loss2)

        total_loss = sum(losses) / len(batch)
        all_losses.append(total_loss.item())
        total_loss.backward()

        optimizer.step()

    if not all_losses:
        # Fewer samples than batch_size: nothing was trained, avoid dividing by zero.
        print(f'Train: no full batch in {file} ({len(data)} samples, batch_size={batch_size})')
        return

    print(f'Train Loss: {sum(all_losses)/len(all_losses)}, Train L2: {sum(all_dists)/len(all_dists)}')

def train_epoch_memory(model, data, batch_size, loss_fn, optimizer, device, use_custom_loss=False):
    """Run one optimisation pass over an in-memory list of samples.

    Each entry of ``data`` must provide the keys ``'masses'``, ``'pos'``,
    ``'vel'`` and ``'acc'``. The model output for the initial state is compared
    with ``'acc'``; only this single step is supervised.

    Args:
        model: callable as ``model(pos, vel, masses)``; ``model.num_neighbors``
            is read after the forward pass when ``use_custom_loss`` is set.
        data: sequence of sample dicts (must support slicing).
        batch_size: number of samples per optimiser step. Samples beyond the
            last full batch are not used.
        loss_fn: callable ``(pred, target, num_neighbors) -> (loss, mean_dist)``,
            used only when ``use_custom_loss`` is True.
        optimizer: optimiser that is zeroed and stepped once per batch.
        device: device the batch tensors are moved to.
        use_custom_loss: if False, the loss is the plain mean Euclidean distance.

    Prints the mean batch loss and the mean distance; returns None.
    """
    # NOTE(review): constant factor applied to every batch loss; the reason for
    # the value 128 is not documented here. The printed "Train Loss" includes it.
    loss_scale = 128

    model.train()

    num_batches = len(data) // batch_size
    all_losses = []
    all_dists = []
    for i in tqdm.tqdm(range(num_batches)):
        batch = data[i*batch_size:(i+1)*batch_size]
        m = torch.tensor([b['masses'] for b in batch], dtype=torch.float32).to(device)
        pos0 = torch.tensor([b['pos'] for b in batch], dtype=torch.float32).to(device)
        vel0 = torch.tensor([b['vel'] for b in batch], dtype=torch.float32).to(device)
        acc0 = torch.tensor([b['acc'] for b in batch], dtype=torch.float32).to(device)

        optimizer.zero_grad()
        losses = []
        for j in range(len(batch)):
            sample_masses = m[j].unsqueeze(1)
            sample_acc0 = acc0[j]

            pr_acc0 = model(pos0[j], vel0[j], sample_masses)

            if use_custom_loss:
                loss0, dists0 = loss_fn(pr_acc0, sample_acc0, model.num_neighbors)
            else:
                loss0 = torch.mean(euclidean_distance(pr_acc0, sample_acc0))
                dists0 = loss0

            # Store a plain float. Appending the tensor itself kept every
            # sample's autograd graph alive for the whole epoch and made the
            # summary print a tensor instead of a number.
            all_dists.append(dists0.item())

            losses.append(loss0)

        total_loss = loss_scale * sum(losses) / len(batch)
        all_losses.append(total_loss.item())
        total_loss.backward()

        optimizer.step()

    if not all_losses:
        # Fewer samples than batch_size: nothing was trained, avoid dividing by zero.
        print(f'Train: no full batch available ({len(data)} samples, batch_size={batch_size})')
        return

    print(f'Train Loss: {sum(all_losses)/len(all_losses)}, Train L2: {sum(all_dists)/len(all_dists)}')

def train_epoch_memory_black_hole_info(model, data, batch_size, loss_fn, optimizer, device,use_custom_loss=False):
    """Run one optimisation pass for the model that receives black-hole inputs.

    Each entry of ``data`` must provide the keys ``'masses'``, ``'pos'``,
    ``'vel'``, ``'acc'``, ``'acc_next1'`` and ``'bh_index'``. The model is
    rolled out for two steps: its first output is compared with ``'acc'``,
    the state is advanced with ``get_new_pos_vel`` and the second output is
    compared with ``'acc_next1'``. Each step contributes half of the sample loss.

    Args:
        model: callable as ``model(pos, vel, masses, bh_pos, bh_vel, bh_masses)``;
            ``model.num_neighbors`` is read after each forward pass when
            ``use_custom_loss`` is set.
        data: sequence of sample dicts (must support slicing).
        batch_size: number of samples per optimiser step. Samples beyond the
            last full batch are not used.
        loss_fn: callable ``(pred, target, num_neighbors) -> (loss, mean_dist)``,
            used only when ``use_custom_loss`` is True.
        optimizer: optimiser that is zeroed and stepped once per batch.
        device: device the batch tensors are moved to.
        use_custom_loss: if False, both steps use the plain mean Euclidean distance.

    Prints the mean batch loss and the mean per-step distance; returns None.
    """

    def _step_loss(pr_acc, gt_acc):
        # Must be called right after the matching forward pass, because
        # model.num_neighbors is read at call time.
        if use_custom_loss:
            return loss_fn(pr_acc, gt_acc, model.num_neighbors)
        plain = torch.mean(euclidean_distance(pr_acc, gt_acc))
        return plain, plain

    model.train()

    num_batches = len(data) // batch_size
    all_losses = []
    all_dists = []
    for i in tqdm.tqdm(range(num_batches)):
        batch = data[i*batch_size:(i+1)*batch_size]
        m = torch.tensor([b['masses'] for b in batch], dtype=torch.float32).to(device)
        pos0 = torch.tensor([b['pos'] for b in batch], dtype=torch.float32).to(device)
        vel0 = torch.tensor([b['vel'] for b in batch], dtype=torch.float32).to(device)
        acc0 = torch.tensor([b['acc'] for b in batch], dtype=torch.float32).to(device)
        acc1 = torch.tensor([b['acc_next1'] for b in batch], dtype=torch.float32).to(device)
        # Index tensor lives on the same device as the data it indexes; long is
        # the index dtype accepted by every PyTorch version.
        black_hole_indexes = torch.tensor(np.array([b['bh_index'] for b in batch]), dtype=torch.long).to(device)

        optimizer.zero_grad()
        losses = []
        for j in range(len(batch)):
            bh_idx = black_hole_indexes[j]
            sample_masses = m[j].unsqueeze(1)
            sample_pos0 = pos0[j]
            sample_vel0 = vel0[j]
            sample_masses_bh = sample_masses[bh_idx]

            # Step 1: predict from the ground-truth initial state.
            pr_acc0 = model(sample_pos0, sample_vel0, sample_masses, sample_pos0[bh_idx], sample_vel0[bh_idx], sample_masses_bh)
            loss0, dists0 = _step_loss(pr_acc0, acc0[j])
            all_dists.append(dists0.item())

            # Step 2: integrate the prediction and predict again.
            pr_pos1, pr_vel1 = get_new_pos_vel(pr_acc0, sample_pos0, sample_vel0)
            pr_acc1 = model(pr_pos1, pr_vel1, sample_masses, pr_pos1[bh_idx], pr_vel1[bh_idx], sample_masses_bh)
            # This step used to call loss_fn unconditionally, ignoring
            # use_custom_loss; both steps now use the same criterion.
            loss1, dists1 = _step_loss(pr_acc1, acc1[j])
            all_dists.append(dists1.item())

            losses.append(loss0 * 0.5 + loss1 * 0.5)

        total_loss = sum(losses) / len(batch)
        all_losses.append(total_loss.item())
        total_loss.backward()

        optimizer.step()

    if not all_losses:
        # Fewer samples than batch_size: nothing was trained, avoid dividing by zero.
        print(f'Train: no full batch available ({len(data)} samples, batch_size={batch_size})')
        return

    print(f'Train Loss: {sum(all_losses)/len(all_losses)}, Train L2: {sum(all_dists)/len(all_dists)}')

        

def val(model, val_dir, batch_size, loss_fn, device):
    """Evaluate a two-step position rollout on every JSON file in ``val_dir``.

    Each file must hold a list of dicts with the keys ``'masses'``, ``'pos'``,
    ``'vel'``, ``'pos_next1'`` and ``'pos_next2'``. For every sample the model
    is applied twice (the second call is fed the first call's output) and both
    predictions are scored, each step contributing half of the sample loss.

    Args:
        model: callable as ``model(pos, vel, masses)`` returning ``(pos, vel)``;
            ``model.num_neighbors`` is read after every forward pass.
        val_dir: directory whose files are all read as JSON.
        batch_size: unused; kept so existing callers keep working.
        loss_fn: callable ``(pred, target, num_neighbors) -> (loss, mean_dist)``.
        device: device the sample tensors are moved to.

    Prints the mean per-file loss and the mean per-step distance; returns None.
    Files containing an empty list are skipped.
    """
    files = os.listdir(val_dir)

    all_losses = []
    all_dists = []

    def _file_loss(data):
        # Working in a helper lets the per-sample tensors go out of scope on
        # return, so the cache can be emptied without explicit `del`s.
        loss = 0
        for sample in data:
            sample_masses = torch.tensor(sample['masses'], dtype=torch.float32).unsqueeze(1).to(device)
            sample_pos0 = torch.tensor(sample['pos'], dtype=torch.float32).to(device)
            sample_vel0 = torch.tensor(sample['vel'], dtype=torch.float32).to(device)
            sample_pos1 = torch.tensor(sample['pos_next1'], dtype=torch.float32).to(device)
            sample_pos2 = torch.tensor(sample['pos_next2'], dtype=torch.float32).to(device)

            pr_pos1, pr_vel1 = model(sample_pos0, sample_vel0, sample_masses)
            loss1, dist1 = loss_fn(pr_pos1, sample_pos1, model.num_neighbors)
            loss += 0.5*loss1
            all_dists.append(dist1.item())

            pr_pos2, _ = model(pr_pos1, pr_vel1, sample_masses)
            loss2, dist2 = loss_fn(pr_pos2, sample_pos2, model.num_neighbors)
            loss += 0.5*loss2
            all_dists.append(dist2.item())

        return (loss / len(data)).item()

    model.eval()
    with torch.no_grad():
        for file in files:
            with open(os.path.join(val_dir, file)) as f:
                data = json.load(f)

            # An empty file used to crash with ZeroDivisionError.
            if not data:
                continue

            all_losses.append(_file_loss(data))

            # clear memory
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

    if not all_losses:
        print(f'Val: no samples found in {val_dir}')
        return

    print(f'Val Loss: {sum(all_losses)/len(all_losses)}, Val L2: {sum(all_dists)/len(all_dists)}')


            

In [5]:
def train(model, train_dir, val_dir, batch_size, loss_fn, optimizer, num_epochs, weights_dir=None, device='cuda', eval=True):
    """Train on every file in ``train_dir`` for ``num_epochs`` epochs and save the weights.

    Args:
        model: module to train; it is moved to ``device``.
        train_dir: directory of training files, each passed to ``train_epoch``.
        val_dir: directory handed to ``val`` when ``eval`` is True.
        batch_size: forwarded to ``train_epoch`` and ``val``.
        loss_fn: forwarded to ``train_epoch`` and ``val``.
        optimizer: forwarded to ``train_epoch``.
        num_epochs: number of passes over all training files.
        weights_dir: optional directory of checkpoints named like
            ``model_<n>.pt``. The one with the highest ``<n>`` is loaded and the
            new checkpoint is numbered ``<n> + 1``. Files whose name does not
            follow that pattern are ignored.
        device: device the model (and loaded weights) are placed on.
        eval: run ``val`` after every epoch.

    Returns:
        The trained model. Its state dict is written to
        ``./models/model_<index>.pt``; the index is 0 when ``weights_dir`` is
        None or holds no checkpoint.
    """
    model.to(device)

    def _checkpoint_index(name):
        try:
            return int(name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            return None

    # Default covers weights_dir=None, which used to leave last_model unbound
    # and crash with NameError at save time.
    last_model = 0
    if weights_dir is not None:
        candidates = []
        for name in os.listdir(weights_dir):
            idx = _checkpoint_index(name)
            if idx is not None:
                candidates.append((idx, name))
        if candidates:
            # Compare numerically: a plain string sort ranks model_2 above model_10.
            latest_idx, latest_name = max(candidates)
            # Always number past the newest checkpoint, even if loading fails,
            # so an existing file is never overwritten.
            last_model = latest_idx + 1
            try:
                model.load_state_dict(torch.load(os.path.join(weights_dir, latest_name), map_location=device))
            except Exception as err:
                # Best effort as before, but no longer silent.
                print(f'Could not load weights from {latest_name}: {err}')

    train_files = os.listdir(train_dir)
    for epoch in range(num_epochs):
        print(f'Epoch {epoch}')
        for t_file in train_files:
            full_path = os.path.join(train_dir, t_file)
            train_epoch(model, full_path, batch_size, loss_fn, optimizer, device)

        if eval:
            val(model, val_dir, batch_size, loss_fn, device)

    os.makedirs('./models', exist_ok=True)
    torch.save(model.state_dict(), f'./models/model_{last_model}.pt')

    return model

def train_memory(model, train_data, val_dir, batch_size, loss_fn, optimizer, num_epochs, weights_dir=None, device='cuda', eval=True, use_custom_loss=False):
    """Train on an in-memory dataset for ``num_epochs`` epochs and save the weights.

    Args:
        model: module to train; it is moved to ``device``.
        train_data: sequence of sample dicts passed to ``train_epoch_memory``.
        val_dir: directory handed to ``val`` when ``eval`` is True.
        batch_size: forwarded to ``train_epoch_memory`` and ``val``.
        loss_fn: forwarded to ``train_epoch_memory`` and ``val``.
        optimizer: forwarded to ``train_epoch_memory``.
        num_epochs: number of passes over ``train_data``.
        weights_dir: optional directory of checkpoints named like
            ``model_<n>.pt``. The one with the highest ``<n>`` is loaded and the
            new checkpoint is numbered ``<n> + 1``. Files whose name does not
            follow that pattern are ignored.
        device: device the model (and loaded weights) are placed on.
        eval: run ``val`` after every epoch.
        use_custom_loss: forwarded to ``train_epoch_memory``.

    Returns:
        The trained model. Its state dict is written to
        ``./models/model_<index>.pt``; the index is 0 when ``weights_dir`` is
        None or holds no checkpoint.

    NOTE(review): ``val`` unpacks the model output as a ``(pos, vel)`` pair,
    while ``train_epoch_memory`` treats it as a single tensor, so ``eval=True``
    may not work with the model used here -- confirm.
    """
    model.to(device)

    def _checkpoint_index(name):
        try:
            return int(name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            return None

    # Default covers weights_dir=None, which used to leave last_model unbound
    # and crash with NameError at save time.
    last_model = 0
    if weights_dir is not None:
        # Skip names that do not parse; the old sort key raised on any stray
        # entry (e.g. a .ipynb_checkpoints folder).
        candidates = []
        for name in os.listdir(weights_dir):
            idx = _checkpoint_index(name)
            if idx is not None:
                candidates.append((idx, name))
        if candidates:
            latest_idx, latest_name = max(candidates)
            # Always number past the newest checkpoint, even if loading fails,
            # so an existing file is never overwritten.
            last_model = latest_idx + 1
            try:
                model.load_state_dict(torch.load(os.path.join(weights_dir, latest_name), map_location=device))
                print(f'Loaded weights from {latest_name}')
            except Exception as err:
                # Best effort as before, but no longer silent.
                print(f'Could not load weights from {latest_name}: {err}')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}')
        train_epoch_memory(model, train_data, batch_size, loss_fn, optimizer, device, use_custom_loss)

        if eval:
            val(model, val_dir, batch_size, loss_fn, device)

    os.makedirs('./models', exist_ok=True)
    torch.save(model.state_dict(), f'./models/model_{last_model}.pt')

    return model

def train_memory_black_hole_info(model, train_data, val_dir, batch_size, loss_fn, optimizer, num_epochs, weights_dir=None, device='cuda', eval=True, use_custom_loss=False):
    """Train the black-hole-aware model on an in-memory dataset and save the weights.

    Args:
        model: module to train; it is moved to ``device``.
        train_data: sequence of sample dicts passed to
            ``train_epoch_memory_black_hole_info``.
        val_dir: directory handed to ``val`` when ``eval`` is True.
        batch_size: forwarded to the epoch function and ``val``.
        loss_fn: forwarded to the epoch function and ``val``.
        optimizer: forwarded to the epoch function.
        num_epochs: number of passes over ``train_data``.
        weights_dir: optional directory of checkpoints named like
            ``model_<n>.pt``. The one with the highest ``<n>`` is loaded and the
            new checkpoint is numbered ``<n> + 1``. Files whose name does not
            follow that pattern are ignored.
        device: device the model (and loaded weights) are placed on.
        eval: run ``val`` after every epoch.
        use_custom_loss: forwarded to the epoch function.

    Returns:
        The trained model. Its state dict is written to
        ``./modelsbh/model_<index>.pt``; the index is 0 when ``weights_dir`` is
        None or holds no checkpoint.

    NOTE(review): ``val`` calls the model with three arguments, while the epoch
    function calls it with six, so ``eval=True`` may not work with the model
    used here -- confirm.
    """
    model.to(device)

    def _checkpoint_index(name):
        try:
            return int(name.split('_')[1].split('.')[0])
        except (IndexError, ValueError):
            return None

    # Default covers weights_dir=None, which used to leave last_model unbound
    # and crash with NameError at save time.
    last_model = 0
    if weights_dir is not None:
        # Skip names that do not parse; the old sort key raised on any stray
        # entry (e.g. a .ipynb_checkpoints folder).
        candidates = []
        for name in os.listdir(weights_dir):
            idx = _checkpoint_index(name)
            if idx is not None:
                candidates.append((idx, name))
        if candidates:
            latest_idx, latest_name = max(candidates)
            # Always number past the newest checkpoint, even if loading fails,
            # so an existing file is never overwritten.
            last_model = latest_idx + 1
            try:
                model.load_state_dict(torch.load(os.path.join(weights_dir, latest_name), map_location=device))
                print(f'Loaded weights from {latest_name}')
            except Exception as err:
                # Best effort as before, but no longer silent.
                print(f'Could not load weights from {latest_name}: {err}')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}')
        train_epoch_memory_black_hole_info(model, train_data, batch_size, loss_fn, optimizer, device, use_custom_loss)

        if eval:
            val(model, val_dir, batch_size, loss_fn, device)

    os.makedirs('./modelsbh', exist_ok=True)
    torch.save(model.state_dict(), f'./modelsbh/model_{last_model}.pt')

    return model

In [6]:
# Disabled training run for the KwisatzHaderach model (no black-hole inputs).
# The code is wrapped in a string literal so nothing here executes; the cell
# only evaluates to that string, which is why its text is echoed as the output.
'''
model_files = os.listdir('./models/')
model_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))
try:
    last_model_id = model_files[-1].split('_')[1].split('.')[0]
except IndexError:
    last_model_id = -1
last_model_id = int(last_model_id)

model = KwisatzHaderach(activation=True, layer_channels=[128, 128, 64, 64, 3], calc_neighbors=True)



optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7, verbose=True)

for i in range(40):
    # remove all files from train directory
    files = os.listdir('./train')
    for file in files:
        os.remove(os.path.join('./train', file))
    
    dataset = generate_dataset_memory(25)

    
    model = train_memory(model, dataset, './val', 16, loss_fn, optimizer, 1, './models', device='cuda', eval=False, use_custom_loss=True)
    del dataset
    last_model_id += 1
    torch.save(model.state_dict(), f'./models/model_{last_model_id}.pt')

    if i % 5 == 0:
        scheduler.step()
'''


"\nmodel_files = os.listdir('./models/')\nmodel_files.sort(key=lambda x: int(x.split('_')[1].split('.')[0]))\ntry:\n    last_model_id = model_files[-1].split('_')[1].split('.')[0]\nexcept IndexError:\n    last_model_id = -1\nlast_model_id = int(last_model_id)\n\nmodel = KwisatzHaderach(activation=True, layer_channels=[128, 128, 64, 64, 3], calc_neighbors=True)\n\n\n\noptimizer = torch.optim.Adam(model.parameters(), lr=1e-2)\nscheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7, verbose=True)\n\nfor i in range(40):\n    # remove all files from train directory\n    files = os.listdir('./train')\n    for file in files:\n        os.remove(os.path.join('./train', file))\n    \n    dataset = generate_dataset_memory(25)\n\n    \n    model = train_memory(model, dataset, './val', 16, loss_fn, optimizer, 1, './models', device='cuda', eval=False, use_custom_loss=True)\n    del dataset\n    last_model_id += 1\n    torch.save(model.state_dict(), f'./models/model_{last_model_id}.pt')\n\

In [7]:
from kwisatzHaderach_bh import KwisatzHaderachBH

In [8]:
# Train the black-hole-aware model on freshly generated scenes, writing one
# checkpoint per round.
MODEL_DIR = './modelsbh'
TRAIN_DIR = './train'
VAL_DIR = './val'
NUM_ROUNDS = 40
BATCH_SIZE = 16
LR_DECAY_EVERY = 5

# Create the directories up front so the cell also runs on a fresh checkout
# (os.listdir used to raise FileNotFoundError when either was missing).
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(TRAIN_DIR, exist_ok=True)

# Highest existing checkpoint number, or -1 when there is none. Names that do
# not look like model_<n>.pt (e.g. .ipynb_checkpoints) are skipped instead of
# crashing the numeric parse.
model_files = os.listdir(MODEL_DIR)
last_model_id = -1
for name in model_files:
    try:
        last_model_id = max(last_model_id, int(name.split('_')[1].split('.')[0]))
    except (IndexError, ValueError):
        continue

model = KwisatzHaderachBH(activation=True, layer_channels=[128, 64, 64, 32, 3], calc_neighbors=True)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, 0.7, verbose=True)

for i in range(NUM_ROUNDS):
    # remove all files from train directory
    files = os.listdir(TRAIN_DIR)
    for file in files:
        os.remove(os.path.join(TRAIN_DIR, file))

    dataset = generate_dataset_memory_bh(1, window_size=2)

    # One epoch per round; the function reloads the newest checkpoint from
    # MODEL_DIR and saves its own checkpoint as well.
    model = train_memory_black_hole_info(model, dataset, VAL_DIR, BATCH_SIZE, loss_fn, optimizer, 1, MODEL_DIR, device='cuda', eval=False, use_custom_loss=True)
    del dataset
    last_model_id += 1
    torch.save(model.state_dict(), os.path.join(MODEL_DIR, f'model_{last_model_id}.pt'))

    # NOTE(review): this fires on rounds 0, 5, 10, ..., so the first decay
    # happens right after the very first round -- confirm that is intended
    # rather than (i + 1) % LR_DECAY_EVERY == 0.
    if i % LR_DECAY_EVERY == 0:
        scheduler.step()

Adjusting learning rate of group 0 to 1.0000e-02.
Generating dataset with 1 scenes...


100%|██████████| 1/1 [00:17<00:00, 17.48s/it]


Epoch 0


  black_hole_indexes = torch.tensor([b['bh_index'] for b in batch], dtype=torch.int)
100%|██████████| 62/62 [00:48<00:00,  1.27it/s]


Train Loss: 1.5187266981304293, Train L2: 768583.4188400907
Adjusting learning rate of group 0 to 7.0000e-03.
Generating dataset with 1 scenes...


100%|██████████| 1/1 [00:15<00:00, 15.40s/it]


Loaded weights from model_0.pt
Epoch 0


100%|██████████| 62/62 [00:46<00:00,  1.32it/s]


Train Loss: 0.006750807699356829, Train L2: 431.9071403276175
Generating dataset with 1 scenes...


100%|██████████| 1/1 [00:14<00:00, 14.22s/it]


Loaded weights from model_1.pt
Epoch 0


100%|██████████| 62/62 [00:47<00:00,  1.31it/s]


Train Loss: 0.0070351732616883616, Train L2: 9.908531690917668
Generating dataset with 1 scenes...


100%|██████████| 1/1 [00:14<00:00, 14.01s/it]


Loaded weights from model_2.pt
Epoch 0


100%|██████████| 62/62 [00:49<00:00,  1.25it/s]


Train Loss: 0.0036425845990438136, Train L2: 4.859474384856801
Generating dataset with 1 scenes...


100%|██████████| 1/1 [00:14<00:00, 14.05s/it]


Loaded weights from model_3.pt
Epoch 0


100%|██████████| 62/62 [00:47<00:00,  1.29it/s]


Train Loss: 0.008151102893715423, Train L2: 6.373239514207648
Generating dataset with 1 scenes...


  0%|          | 0/1 [00:09<?, ?it/s]


KeyboardInterrupt: 