In [1]:
import torch
from torch_geometric.data import InMemoryDataset, download_url, Data, Batch
from torch import nn
from torch import functional as F
import os
import pandas as pd
import numpy as np
import pickle
import itertools
import jax
from jax import numpy as jnp
import networkx as nx
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import MinMaxScaler
import mendeleev

In [2]:
class SCDataset(InMemoryDataset):
    """Per-molecule graph dataset built from the CSV tables under ``raw/``.

    Workflow visible in this class: ``preprocess`` writes one set of
    whitespace-delimited text files per molecule into ``./processed/``,
    ``mem_load`` reads them back into ``Data`` objects cached in ``self.mem``,
    and ``__getitem__`` serves graphs from that cache.

    All file paths are relative to the current working directory, not to
    ``root``.

    Relies on names defined outside this class: ``get_distance_matrix`` plus
    ``get_edge_features``, ``edge_to_int`` and ``props``, which must exist in
    the notebook namespace when ``preprocess`` / ``mem_load`` run.
    """

    def __init__(self, root, transform=None, pre_transform=None, training= True):
        """Read every raw table and select the molecules to serve.

        Args:
            root, transform, pre_transform: forwarded to ``InMemoryDataset``.
            training: when true, keep only the molecules flagged in
                ``./raw/training_mask2.csv`` (the file ``preprocess`` writes).
        """
        super().__init__(root, transform, pre_transform)
        self.filenames = pd.read_csv("raw/processed_names.csv")
        self.charges = pd.read_csv("raw/mulliken_charges.csv")
        self.magnetic_shieldings = pd.read_csv("raw/magnetic_shielding_tensors.csv")
        self.dipole_moments = pd.read_csv("raw/dipole_moments.csv")
        self.potential_energy = pd.read_csv("raw/potential_energy.csv")
        self.target = pd.read_csv("raw/train.csv")
        self.structures = pd.read_csv("raw/structures.csv")
        self.molecule_names = np.unique(self.potential_energy["molecule_name"])
        if training:
            self.training_mask = np.loadtxt("./raw/training_mask2.csv").astype(bool)
            self.molecule_names = self.molecule_names[self.training_mask]

    def len(self) -> int:
        """Return the number of molecules currently selected."""
        return len(self.molecule_names)

    def standarize(self):
        """Rescale the numeric columns of the raw tables to [-4, 4], in place.

        The same scaler object is re-fitted for each table, so every table is
        scaled independently.
        """
        # feature_range is documented as a tuple; newer scikit-learn releases
        # validate the type, so do not pass a list.
        mms = MinMaxScaler(feature_range=(-4, 4))
        self.charges["mulliken_charge"] = mms.fit_transform(self.charges[["mulliken_charge"]]).squeeze()
        self.magnetic_shieldings.iloc[:, 2:] = mms.fit_transform(self.magnetic_shieldings.iloc[:, 2:])
        self.dipole_moments.iloc[:, 1:] = mms.fit_transform(self.dipole_moments.iloc[:, 1:])
        self.potential_energy["potential_energy"] = mms.fit_transform(self.potential_energy[["potential_energy"]]).squeeze()

    def preprocess(self, k = None):
        """Write the per-molecule feature files and the training mask.

        For every molecule in ``self.molecule_names`` this writes, under
        ``./processed/``: the distance matrix, node attributes, targets, graph
        features, edge list and edge attributes. Afterwards it writes
        ``./raw/training_mask2.csv`` with one flag per molecule telling whether
        the molecule has rows in the target table.

        Args:
            k: forwarded to ``get_distance_matrix``; ``None`` keeps all
                distances.
        """
        molecule_names = self.molecule_names
        dfs = [self.charges, self.magnetic_shieldings, self.dipole_moments,
               self.potential_energy, self.target, self.structures]
        # Index every table by molecule so that .loc[name] selects one molecule.
        dfs = [df.set_index("molecule_name", drop=True) for df in dfs]
        charges, magnetic_shieldings, dipole_moments, potential_energy, target, structures = dfs
        atoms = structures["atom"].unique()
        atoms_id = {atoms[i]:i for i in range(len(atoms))}
        training_mask = []
        for x, name in enumerate(list(molecule_names)):
            coords = structures.loc[name][["x", "y", "z"]].to_numpy()
            n_nodes = coords.shape[0]
            print("{}/{}".format(x + 1, len(molecule_names)), end = "\r")
            # adj_mat
            with open("./processed/{}_adj_mat.csv".format(name), "wb") as f:
                np.savetxt(f, get_distance_matrix(coords, k))
            # Node features: charge, shielding components, one-hot atom type.
            atom_types = structures.loc[name]["atom"].replace(atoms_id).to_numpy()
            atom_onehot = np.zeros([n_nodes, len(atoms)])
            atom_onehot[np.arange(0, n_nodes), atom_types] = 1
            charge = charges.loc[name]["mulliken_charge"].to_numpy()
            # NOTE(review): standarize() slices from column 2 before the index
            # is set, whereas here the same slice is taken after molecule_name
            # became the index, so one more leading column is skipped.
            # Confirm this is intended.
            shieldings = magnetic_shieldings.loc[name].iloc[:, 2:].to_numpy()
            node_features = np.concatenate([charge[:, None], shieldings, atom_onehot], axis=1)
            with open("./processed/{}_node_attr.csv".format(name), "wb") as f:
                np.savetxt(f, node_features)
            # Targets: molecules missing from the target table get a placeholder.
            try:
                # .copy() so the column assignment below works on an owned frame.
                edges_target = target.loc[[name]].copy()
            except KeyError:
                training_mask.append(False)
                # Must be 2-D (one row, four columns): a 1-D array is saved by
                # np.savetxt as four one-value lines, which mem_load cannot
                # index as target[:, 2].
                scalar_coupling = np.array([[-1, -1, -1, 0]])
            else:
                training_mask.append(True)
                edges_target["type"] = edges_target["type"].replace(edge_to_int).astype(np.int64)
                scalar_coupling = edges_target.loc[:, ["atom_index_0", "atom_index_1","type","scalar_coupling_constant"]].to_numpy()
            with open("./processed/{}_target.csv".format(name), "wb") as f:
                np.savetxt(f, scalar_coupling)
            # Graph features
            dipole_moment = dipole_moments.loc[name]
            norm_dipole = np.array([np.linalg.norm(dipole_moment)])
            potential = potential_energy.loc[name]
            graph_features = (np.concatenate([dipole_moment, norm_dipole, potential, np.array([n_nodes])]))
            # The "feautures" spelling is part of the on-disk file name that
            # mem_load reads back; keep both in sync.
            with open("./processed/{}_graph_feautures.csv".format(name), "wb") as f:
                np.savetxt(f, graph_features)
            # edge_features
            edgelist, edgeattr = get_edge_features(coords, dipole_moment)
            with open("./processed/{}_edge_list.csv".format(name), "wb") as f:
                np.savetxt(f, edgelist)
            with open("./processed/{}_edge_attr.csv".format(name), "wb") as f:
                np.savetxt(f, edgeattr)
            # # metaedge
            # incident_edges = edgelist[edgeattr[:,1] == 1]
            # metaedge_list, metaedge_attr = get_metaedge_features(incident_edges, coords)
            # with open("./processed/{}_metaedge_list.csv".format(name), "wb") as f:
            #     np.savetxt(f,  metaedge_list)
            # with open("./processed/{}_metaedge_attr.csv".format(name), "wb") as f:
            #     np.savetxt(f, metaedge_attr)
        # NOTE(review): the mask has one entry per molecule in
        # self.molecule_names; if this instance was built with training=True
        # that list is already filtered, so the rewritten mask is shorter than
        # the full molecule list. Confirm preprocess is only run on an
        # unfiltered instance.
        with open("./raw/training_mask2.csv", "wb") as f:
            np.savetxt(f, np.array(training_mask))

    def mem_load(self):
        """Load every selected molecule from ``./processed/`` into ``self.mem``.

        Builds one ``Data`` object per molecule. Targets, their node pairs and
        their types are duplicated so that each pair appears in both
        directions.
        """
        self.mem = {}
        for i, molecule in enumerate(self.molecule_names):
            graph_features = pd.read_csv("./processed/{}_graph_feautures.csv".format(molecule), sep=" ", header=None).to_numpy()
            node_features = pd.read_csv("./processed/{}_node_attr.csv".format(molecule), sep=" ", header=None).to_numpy()
            # The last five node columns are read as the one-hot atom type.
            atomtypes = node_features[:,-5:].argmax(axis=1)
            # `props` is a notebook-level table indexed by atom-type id.
            prop_atoms = props[atomtypes,:]
            n_nodes = node_features.shape[0]
            # Repeat the graph-level feature column once per node.
            graph_features = np.tile(graph_features, [1, n_nodes]).T
            node_features = np.concatenate([prop_atoms, node_features, graph_features], axis=1)
            target =  pd.read_csv("./processed/{}_target.csv".format(molecule), sep=" ", header=None).to_numpy()
            edge_type = target[:,2]
            edge_type = np.concatenate([edge_type, edge_type], axis=0)
            edges_target = target[:,0:2]
            target = target[:,3]
            target = np.concatenate([target, target])
            # Second half holds the same pairs with the endpoints swapped.
            edges_target = np.concatenate([edges_target, edges_target[:,::-1]], axis=0)
            edge_list = pd.read_csv("./processed/{}_edge_list.csv".format(molecule), sep=" ", header=None).to_numpy()
            edge_attr = pd.read_csv("./processed/{}_edge_attr.csv".format(molecule), sep=" ", header=None).to_numpy()
            # NOTE(review): the code that writes the metaedge files is
            # commented out in preprocess; these reads need files produced by
            # some other run.
            metaedge_list = pd.read_csv("./processed/{}_metaedge_list.csv".format(molecule), sep=" ", header=None).to_numpy()
            metaedge_attr = pd.read_csv("./processed/{}_metaedge_attr.csv".format(molecule), sep=" ", header=None).to_numpy()
            data = Data(x=torch.Tensor(node_features), edge_index = torch.Tensor(edge_list).T, y=torch.Tensor(target), edge_attr = torch.Tensor(edge_attr))
            data.nodes_target = torch.Tensor(edges_target)
            data.nodes = n_nodes
            data.edges = edge_list.shape[0]
            data.types = torch.Tensor(edge_type)
            data.metaedge_list = metaedge_list
            # Store the attributes under their own name instead of overwriting
            # metaedge_list.
            data.metaedge_attr = metaedge_attr
            # data.edge_cross = edgelist
            # data.nodes = node_features.shape[0]
            self.mem[molecule] = data
            print("{}/{}".format(i, len(self.molecule_names)), end = "\r")

    def __getitem__(self, idx):
        """Return the cached graph for index ``idx``; needs ``mem_load`` first."""
        molecule = self.molecule_names[idx]
        return self.mem[molecule]
        

def get_distance_matrix(X, k=None):
    """Build the dense pairwise distance matrix of the rows of ``X``.

    Args:
        X: array of point coordinates, one point per row.
        k: if given, each row keeps only the entries in its first ``k + 1``
            positions of the ascending distance ordering and every other
            entry of that row is set to 0. ``None`` keeps the full matrix.

    Returns:
        Square array of distances (``pdist`` with its default metric). With
        ``k`` set, the rows are pruned independently, so the result need not
        be symmetric.
    """
    distances = squareform(pdist(X))
    if k is None:
        return distances
    # Columns ranked after position k in each row's ascending ordering.
    far_columns = np.argsort(distances, axis=1)[:, k + 1:]
    row_ids = np.arange(distances.shape[0]).reshape(-1, 1)
    distances[row_ids, far_columns] = 0
    return distances

def to_batch(list_graphs):
    """Collate graphs into one ``Batch`` with batch-global ``nodes_target``.

    Each graph's ``nodes_target`` holds node indices local to that graph. They
    are shifted by the number of nodes of all preceding graphs so that they
    index into the concatenated node tensor of the batch.

    The shift is applied to clones. The input graphs are the objects cached by
    the dataset, and shifting them in place would add a new offset every time
    the same graph is collated again.

    Args:
        list_graphs: sequence of graph objects exposing ``nodes_target`` and
            ``nodes`` (node count).

    Returns:
        The result of ``Batch.from_data_list`` on the shifted clones.
    """
    shifted_graphs = []
    n_nodes = 0
    for graph in list_graphs:
        shifted = graph.clone()
        # Out-of-place add so the tensor owned by `graph` is never modified.
        shifted["nodes_target"] = shifted["nodes_target"] + n_nodes
        n_nodes += graph.nodes
        shifted_graphs.append(shifted)
    return Batch.from_data_list(shifted_graphs)

In [3]:
# Load the train/test data loaders that were pickled by an earlier run.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load pickles that come from a trusted source.
# NOTE(review): presumably the pickles reference SCDataset / to_batch, which
# would be why those definitions have to exist before this cell runs — confirm.
with open("./train_dataloader.pkl", "rb") as f:
    train_dataloader = pickle.load(f)
with open("./test_dataloader.pkl", "rb") as f:
    test_dataloader = pickle.load(f)

In [4]:
from torch_geometric.nn import TransformerConv, PDNConv, GATv2Conv
import torch.nn as nn
import torch.nn.functional as F

def init_weights(m):
    """Initialise an ``nn.Linear`` module; leave every other module alone.

    Intended for ``module.apply(init_weights)``: the weight gets Xavier-uniform
    values and the bias, when the layer has one, is set to 0.01.

    Args:
        m: any module visited by ``apply``.
    """
    if not isinstance(m, nn.Linear):
        return
    torch.nn.init.xavier_uniform_(m.weight)
    # Layers built with bias=False have m.bias set to None.
    if m.bias is not None:
        torch.nn.init.constant_(m.bias, 0.01)

In [5]:
class TransformerConvEncoderGated(nn.Module):
    """Stack of ``TransformerConv`` layers combined through learned gates.

    An input convolution maps the node features to ``hidden_features``. It is
    followed by ``n_layers`` further convolutions, each mixed with its own
    input through a sigmoid gate: ``g * layer(x) + (1 - g) * x``.

    Args:
        num_node_features: width of the input node features.
        hidden_features: width of every layer's output.
        heads: number of attention heads of every convolution.
        n_layers: number of gated convolutions after the input one (> 1).
        p_dropout: dropout probability passed to every convolution.

    Every convolution is built with ``edge_dim=8``, so edge attributes are
    expected to have 8 features.
    """
    def __init__(self, num_node_features, hidden_features, heads, n_layers, p_dropout):
        super().__init__()
        self.p_dropout = p_dropout
        assert n_layers > 1
        # Use the `heads` argument rather than a notebook-level global so the
        # module does not depend on hidden kernel state.
        self.init_conv = TransformerConv(num_node_features, hidden_features, heads=heads, dropout=p_dropout,  edge_dim=8, concat=False)
        self.layers = nn.ModuleList([TransformerConv(hidden_features, hidden_features, heads=heads, dropout=p_dropout,  edge_dim=8, concat=False) for i in range(n_layers)])
        # torch.Tensor(n) returns uninitialised memory; start the gates at 0
        # (sigmoid -> 0.5) so a fresh model is deterministic. Shape and name
        # are unchanged, so existing state dicts still load.
        self.gates = nn.Parameter(torch.zeros(n_layers))
        self.init_conv.apply(init_weights)
        for conv in self.layers:
            conv.apply(init_weights)
    def forward(self, x, edge_index, edge_attr):
        """Return the node embeddings after the gated convolution stack."""
        range_gates = torch.sigmoid(self.gates)
        x = self.init_conv(x, edge_index, edge_attr)
        for i, layer in enumerate(self.layers):
            x = F.leaky_relu(x)
            # Gated residual: blend the new layer output with its input.
            x = (range_gates[i])*layer(x, edge_index, edge_attr) + (1-range_gates[i])*x
        return x
    
class ResNetGated(nn.Module):
    """Stack of two-layer MLP blocks combined through learned sigmoid gates.

    Each block maps ``init_dim -> hidden_dim -> init_dim`` and is mixed with
    its (ReLU-ed) input as ``g * block(x) + (1 - g) * x``.

    Args:
        init_dim: input and output width.
        hidden_dim: inner width of every block.
        layers: number of blocks.
        p_dropout: dropout probability inside every block.
    """
    def __init__(self, init_dim, hidden_dim, layers, p_dropout):
        super().__init__()
        self.p_dropout = p_dropout
        # NOTE(review): `n_layers` is not a parameter of this class; it is read
        # from the notebook namespace and also sizes `self.gates` below, while
        # the number of blocks comes from `layers`. Sizing the gates by
        # `layers` would change the parameter shape and may break loading
        # existing checkpoints — confirm before changing.
        assert n_layers > 1
        self.layers = nn.ModuleList([nn.Sequential(nn.Linear(init_dim, hidden_dim),
                             nn.ReLU(),
                             nn.Dropout(p=p_dropout),
                             nn.Linear( hidden_dim, init_dim)) for i in range(layers)])
        # torch.Tensor(n) returns uninitialised memory; start the gates at 0
        # (sigmoid -> 0.5) so a fresh model is deterministic. Shape and name
        # are unchanged.
        self.gates = nn.Parameter(torch.zeros(n_layers))
        self.layers.apply(init_weights)
    def forward(self, x):
        """Apply every gated block in order and return the result."""
        range_gates = torch.sigmoid(self.gates)
        for i, layer in enumerate(self.layers):
            x = F.relu(x)
            x = (range_gates[i])*layer(x) + (1-range_gates[i])*x
        return x

    
class GCN(torch.nn.Module):
    """TransformerConv encoder followed by one regression head per pair type.

    Node embeddings from the encoder are gathered for both endpoints of every
    target pair, flattened into one vector per pair, and routed to the head
    matching the pair's type id (0..7).
    """

    def __init__(self, num_node_features, out_features, n_heads, n_layers, n_res, p_dropout):
        super().__init__()
        # NOTE(review): the encoder depth is fixed at 3 here; the `n_layers`
        # argument is not forwarded.
        self.conv = TransformerConvEncoderGated(
            num_node_features, out_features, heads=n_heads, p_dropout=p_dropout, n_layers=3
        )
        type_heads = []
        for _ in range(8):
            residual_stack = ResNetGated(out_features*2, out_features*64, n_res, p_dropout)
            readout = nn.Linear(2 * out_features, 1)
            type_heads.append(nn.Sequential(residual_stack, readout))
        self.fcs = nn.ModuleList(type_heads)
        self.fcs.apply(init_weights)

    def forward(self, x, edge_index, edge_attr, edge_cross, types, return_embeddings = False):
        """Predict one value per target pair.

        Args:
            x, edge_index, edge_attr: graph inputs for the encoder.
            edge_cross: integer node-index pairs to predict for.
            types: type id of every pair in ``edge_cross``.
            return_embeddings: also return the encoder's node embeddings.

        Returns:
            Predictions concatenated head by head, i.e. grouped by type id in
            ascending order rather than in the order of ``edge_cross``; plus
            the node embeddings when ``return_embeddings`` is true.
        """
        node_embeddings = self.conv(x, edge_index, edge_attr)
        pair_features = node_embeddings[edge_cross]
        dims = pair_features.shape
        # Interleave both endpoints' features into a single vector per pair.
        pair_features = pair_features.transpose(1, 2).reshape([dims[0], dims[2]*2])
        per_type = [head(pair_features[types == type_id]) for type_id, head in enumerate(self.fcs)]
        predictions = torch.cat(per_type, dim=0)
        if return_embeddings:
            return predictions, node_embeddings
        return predictions

In [6]:
### Define the loss function
# NOTE(review): this binds the MSELoss class itself, not an instance; nothing
# in this notebook calls loss_fn. Use nn.MSELoss() if it is meant to be
# applied to (prediction, target).
loss_fn = nn.MSELoss

p_dropout = 0.001
conv_features = 128
n_heads = 6
n_layers = 3
n_res = 2
### Set the random seed for reproducible results
torch.manual_seed(0)

# Check if the GPU is available
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f'Selected device: {device}')

# 29 is the number of input node features the model is built for.
gcn = GCN(29, conv_features, n_heads, n_layers, n_res, p_dropout=p_dropout)
saved_model = "./saved_models/with_resnet_2022-02-02-14:46:07_lr=0.001_wd=1e-06_p=0.001_conv_features=128_n_layers=3_n_res=2.pth"
# map_location lets a checkpoint saved from a CUDA model load on a CPU-only
# machine as well.
gcn.load_state_dict(torch.load(saved_model, map_location=device))

# Move the model to the selected device
gcn.to(device)

Selected device: cuda


GCN(
  (conv): TransformerConvEncoderGated(
    (init_conv): TransformerConv(29, 128, heads=6)
    (layers): ModuleList(
      (0): TransformerConv(128, 128, heads=6)
      (1): TransformerConv(128, 128, heads=6)
      (2): TransformerConv(128, 128, heads=6)
    )
  )
  (fcs): ModuleList(
    (0): Sequential(
      (0): ResNetGated(
        (layers): ModuleList(
          (0): Sequential(
            (0): Linear(in_features=256, out_features=8192, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.001, inplace=False)
            (3): Linear(in_features=8192, out_features=256, bias=True)
          )
          (1): Sequential(
            (0): Linear(in_features=256, out_features=8192, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.001, inplace=False)
            (3): Linear(in_features=8192, out_features=256, bias=True)
          )
        )
      )
      (1): Linear(in_features=256, out_features=1, bias=True)
    )
    (1): Sequential(
      (0): ResNetGat

In [7]:
### Embedding extraction
def embed(model, device, data, return_edge_features = False):
    """Run ``model`` over ``data`` and collect one feature row per target pair.

    Args:
        model: module called as ``model(x, edge_index, edge_attr, edge_cross,
            types, return_embeddings=True)`` and returning
            ``(predictions, node_embeddings)``; the predictions are discarded.
        device: device the batch tensors are moved to before the forward pass.
        data: iterable of batches indexable by ``"x"``, ``"edge_index"``,
            ``"edge_attr"``, ``"y"``, ``"nodes_target"`` and ``"types"``.
        return_edge_features: when true, the attributes of the graph edge
            joining each pair are appended to the pair's embedding.

    Returns:
        ``(embeddings, targets)`` as NumPy arrays stacked over all batches.
        ``targets`` has two columns: pair type and target value.
    """
    embeddings = []
    targets = []
    # Evaluation mode disables dropout in the model.
    model.eval()
    with torch.no_grad(): # No need to track the gradients
        for batch in data:
            x = batch["x"].to(device)
            edge_index = batch["edge_index"].long().to(device)
            edge_attr = batch["edge_attr"].to(device)
            target = batch["y"].to(device)
            edge_cross = batch["nodes_target"].long().to(device)
            types = batch["types"].long().to(device)
            _, x = model(x, edge_index, edge_attr, edge_cross, types, return_embeddings=True)
            # Gather both endpoint embeddings of every pair and flatten them.
            x = x.cpu()[edge_cross.cpu()]
            shp = x.shape
            x = x.transpose(1, 2).reshape([shp[0], shp[2]*2])
            if return_edge_features:
                # The column labels 256/257 and 8/9 assume 256 embedding
                # columns and 8 edge-attribute columns.
                # NOTE(review): merge defaults to an inner join, so a pair with
                # no matching row in edge_index is dropped and a duplicated
                # edge duplicates the pair; either would break row alignment
                # with `targets` — confirm this cannot happen.
                target_edges = pd.DataFrame(np.concatenate([x, edge_cross.cpu().numpy()], axis=1))
                all_edges = pd.DataFrame(np.concatenate([edge_attr.cpu().numpy(), edge_index.cpu().numpy().T], axis=1))
                features = target_edges.merge(all_edges, left_on=[256, 257], right_on=[8, 9]).drop([256, 257, "8_y", "9_y"], axis=1)
                embeddings.append(features.to_numpy())
            else:
                embeddings.append(x)
            labels = torch.concat([types.cpu()[:,None], target.cpu()[:,None]], axis=1)
            targets.append(labels.numpy())
    return np.concatenate(embeddings, axis=0), np.concatenate(targets, axis=0)

In [8]:
# Pair embeddings from the TransformerConv model currently bound to `gcn`.
# `targets` / `targets_test` from this call are the labels written out at the
# end of the notebook.
embeddings_transformer, targets = embed(gcn, device, train_dataloader, return_edge_features = False)
embeddings_transformer_test, targets_test = embed(gcn, device, test_dataloader, return_edge_features = False)

In [9]:
class GATv2EncoderGated(nn.Module):
    """Stack of ``GATv2Conv`` layers combined through learned sigmoid gates.

    An input convolution maps the node features to ``hidden_features``. It is
    followed by ``n_layers`` further convolutions, each mixed with its own
    input as ``g * layer(x) + (1 - g) * x``.

    Args:
        num_node_features: width of the input node features.
        hidden_features: width of every layer's output.
        heads: number of attention heads of every convolution.
        n_layers: number of gated convolutions after the input one (> 1).
        p_dropout: dropout probability passed to every convolution.

    Every convolution is built with ``edge_dim=8``, so edge attributes are
    expected to have 8 features.
    """
    def __init__(self, num_node_features, hidden_features, heads, n_layers, p_dropout):
        super().__init__()
        self.p_dropout = p_dropout
        assert n_layers > 1
        # Use the `heads` argument rather than a notebook-level global so the
        # module does not depend on hidden kernel state.
        self.init_conv = GATv2Conv(num_node_features, hidden_features, heads=heads, dropout=p_dropout,  edge_dim=8, concat=False)
        self.layers = nn.ModuleList([GATv2Conv(hidden_features, hidden_features, heads=heads, dropout=p_dropout,  edge_dim=8, concat=False) for i in range(n_layers)])
        # torch.Tensor(n) returns uninitialised memory; start the gates at 0
        # (sigmoid -> 0.5) so a fresh model is deterministic. Shape and name
        # are unchanged, so existing state dicts still load.
        self.gates = nn.Parameter(torch.zeros(n_layers))
        self.init_conv.apply(init_weights)
        for conv in self.layers:
            conv.apply(init_weights)
    def forward(self, x, edge_index, edge_attr):
        """Return the node embeddings after the gated convolution stack."""
        range_gates = torch.sigmoid(self.gates)
        x = self.init_conv(x, edge_index, edge_attr)
        for i, layer in enumerate(self.layers):
            x = F.leaky_relu(x)
            # Gated residual: blend the new layer output with its input.
            x = (range_gates[i])*layer(x, edge_index, edge_attr) + (1-range_gates[i])*x
        return x
    
class ResNetGated(nn.Module):
    """Stack of two-layer MLP blocks combined through learned sigmoid gates.

    Each block maps ``init_dim -> hidden_dim -> init_dim`` and is mixed with
    its (ReLU-ed) input as ``g * block(x) + (1 - g) * x``.

    Args:
        init_dim: input and output width.
        hidden_dim: inner width of every block.
        layers: number of blocks.
        p_dropout: dropout probability inside every block.
    """
    def __init__(self, init_dim, hidden_dim, layers, p_dropout):
        super().__init__()
        self.p_dropout = p_dropout
        # NOTE(review): `n_layers` is not a parameter of this class; it is read
        # from the notebook namespace and also sizes `self.gates` below, while
        # the number of blocks comes from `layers`. Sizing the gates by
        # `layers` would change the parameter shape and may break loading
        # existing checkpoints — confirm before changing.
        assert n_layers > 1
        self.layers = nn.ModuleList([nn.Sequential(nn.Linear(init_dim, hidden_dim),
                             nn.ReLU(),
                             nn.Dropout(p=p_dropout),
                             nn.Linear( hidden_dim, init_dim)) for i in range(layers)])
        # torch.Tensor(n) returns uninitialised memory; start the gates at 0
        # (sigmoid -> 0.5) so a fresh model is deterministic. Shape and name
        # are unchanged.
        self.gates = nn.Parameter(torch.zeros(n_layers))
        self.layers.apply(init_weights)
    def forward(self, x):
        """Apply every gated block in order and return the result."""
        range_gates = torch.sigmoid(self.gates)
        for i, layer in enumerate(self.layers):
            x = F.relu(x)
            x = (range_gates[i])*layer(x) + (1-range_gates[i])*x
        return x

    
class GCN(torch.nn.Module):
    """GATv2 encoder followed by one regression head per pair type.

    Node embeddings from the encoder are gathered for both endpoints of every
    target pair, flattened into one vector per pair, and routed to the head
    matching the pair's type id (0..7).
    """

    def __init__(self, num_node_features, out_features, n_heads, n_layers, n_res, p_dropout):
        super().__init__()
        # NOTE(review): the encoder depth is fixed at 3 here; the `n_layers`
        # argument is not forwarded.
        self.conv = GATv2EncoderGated(
            num_node_features, out_features, heads=n_heads, p_dropout=p_dropout, n_layers=3
        )
        type_heads = []
        for _ in range(8):
            residual_stack = ResNetGated(out_features*2, out_features*64, n_res, p_dropout)
            readout = nn.Linear(2 * out_features, 1)
            type_heads.append(nn.Sequential(residual_stack, readout))
        self.fcs = nn.ModuleList(type_heads)
        self.fcs.apply(init_weights)

    def forward(self, x, edge_index, edge_attr, edge_cross, types, return_embeddings = False):
        """Predict one value per target pair.

        Args:
            x, edge_index, edge_attr: graph inputs for the encoder.
            edge_cross: integer node-index pairs to predict for.
            types: type id of every pair in ``edge_cross``.
            return_embeddings: also return the encoder's node embeddings.

        Returns:
            Predictions concatenated head by head, i.e. grouped by type id in
            ascending order rather than in the order of ``edge_cross``; plus
            the node embeddings when ``return_embeddings`` is true.
        """
        node_embeddings = self.conv(x, edge_index, edge_attr)
        pair_features = node_embeddings[edge_cross]
        dims = pair_features.shape
        # Interleave both endpoints' features into a single vector per pair.
        pair_features = pair_features.transpose(1, 2).reshape([dims[0], dims[2]*2])
        per_type = [head(pair_features[types == type_id]) for type_id, head in enumerate(self.fcs)]
        predictions = torch.cat(per_type, dim=0)
        if return_embeddings:
            return predictions, node_embeddings
        return predictions

In [10]:
p_dropout = 0.001
conv_features = 128
n_heads = 6
n_layers = 3
n_res = 2
### Set the random seed for reproducible results
torch.manual_seed(0)

# Rebinds `gcn` to the GATv2 variant defined in the previous cell.
gcn = GCN(29, conv_features, n_heads, n_layers, n_res, p_dropout=p_dropout)
saved_model = "./saved_models/with_resnet_2022-02-02-10:50:52_lr=0.001_wd=1e-06_p=0.001_conv_features=128_n_layers=3_n_res=2.pth"

# `device` was selected in the earlier model-loading cell. map_location lets a
# checkpoint saved from a CUDA model load on a CPU-only machine as well.
gcn.load_state_dict(torch.load(saved_model, map_location=device))
gcn.to(device)

GCN(
  (conv): GATv2EncoderGated(
    (init_conv): GATv2Conv(29, 128, heads=6)
    (layers): ModuleList(
      (0): GATv2Conv(128, 128, heads=6)
      (1): GATv2Conv(128, 128, heads=6)
      (2): GATv2Conv(128, 128, heads=6)
    )
  )
  (fcs): ModuleList(
    (0): Sequential(
      (0): ResNetGated(
        (layers): ModuleList(
          (0): Sequential(
            (0): Linear(in_features=256, out_features=8192, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.001, inplace=False)
            (3): Linear(in_features=8192, out_features=256, bias=True)
          )
          (1): Sequential(
            (0): Linear(in_features=256, out_features=8192, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.001, inplace=False)
            (3): Linear(in_features=8192, out_features=256, bias=True)
          )
        )
      )
      (1): Linear(in_features=256, out_features=1, bias=True)
    )
    (1): Sequential(
      (0): ResNetGated(
        (layers): ModuleList(


In [11]:
# Pair embeddings from the GATv2 model currently bound to `gcn`; the labels
# returned by embed are discarded here.
embeddings_gatv2, _ = embed(gcn, device, train_dataloader, return_edge_features = False)
embeddings_gatv2_test, _ = embed(gcn, device, test_dataloader, return_edge_features = False)

In [12]:
class PDNConvEncoderGated(nn.Module):
    """``PDNConv`` input layer followed by gated ``GATv2Conv`` layers.

    The input convolution maps the node features to ``hidden_features``. It is
    followed by ``n_layers - 1`` GATv2 convolutions (4 heads each), each mixed
    with its own input as ``g * layer(x) + (1 - g) * x``.

    Args:
        num_node_features: width of the input node features.
        hidden_features: width of every layer's output.
        hidden_edge: ``hidden_channels`` of the ``PDNConv`` layer.
        n_layers: total depth; must be > 1.

    Every convolution is built with ``edge_dim=8``, so edge attributes are
    expected to have 8 features.
    """
    def __init__(self, num_node_features, hidden_features, hidden_edge, n_layers):
        super().__init__()
        # NOTE(review): `p_dropout` is not a parameter of this class; it is
        # read from the notebook namespace. The stored value is not used
        # anywhere in this class.
        self.p_dropout = p_dropout
        assert n_layers > 1
        self.init_conv = PDNConv(num_node_features, hidden_features, hidden_channels = hidden_edge,  edge_dim=8)
        self.layers = nn.ModuleList([GATv2Conv(hidden_features, hidden_features, heads=4,  edge_dim=8, concat=False) for i in range(n_layers - 1)])
        # torch.Tensor(n) returns uninitialised memory; start the gates at 0
        # (sigmoid -> 0.5) so a fresh model is deterministic. The size stays
        # n_layers (one more than the number of gated layers) so the parameter
        # shape, and therefore existing state dicts, are unaffected.
        self.gates = nn.Parameter(torch.zeros(n_layers))
    def forward(self, x, edge_index, edge_attr):
        """Return the node embeddings after the gated convolution stack."""
        range_gates = torch.sigmoid(self.gates)
        x = self.init_conv(x, edge_index, edge_attr)
        for i, layer in enumerate(self.layers):
            x = F.leaky_relu(x)
            # Gated residual: blend the new layer output with its input.
            x = (range_gates[i])*layer(x, edge_index, edge_attr) + (1-range_gates[i])*x
        return x
    
class ResNetGated(nn.Module):
    """Stack of two-layer MLP blocks combined through learned sigmoid gates.

    Each block maps ``init_dim -> hidden_dim -> init_dim`` and is mixed with
    its (ReLU-ed) input as ``g * block(x) + (1 - g) * x``.

    Args:
        init_dim: input and output width.
        hidden_dim: inner width of every block.
        layers: number of blocks.
        p_dropout: dropout probability inside every block.
    """
    def __init__(self, init_dim, hidden_dim, layers, p_dropout):
        super().__init__()
        self.p_dropout = p_dropout
        # NOTE(review): `n_layers` is not a parameter of this class; it is read
        # from the notebook namespace and also sizes `self.gates` below, while
        # the number of blocks comes from `layers`. Sizing the gates by
        # `layers` would change the parameter shape and may break loading
        # existing checkpoints — confirm before changing.
        assert n_layers > 1
        self.layers = nn.ModuleList([nn.Sequential(nn.Linear(init_dim, hidden_dim),
                             nn.ReLU(),
                             nn.Dropout(p=p_dropout),
                             nn.Linear( hidden_dim, init_dim)) for i in range(layers)])
        # torch.Tensor(n) returns uninitialised memory; start the gates at 0
        # (sigmoid -> 0.5) so a fresh model is deterministic. Shape and name
        # are unchanged.
        self.gates = nn.Parameter(torch.zeros(n_layers))
        self.layers.apply(init_weights)
    def forward(self, x):
        """Apply every gated block in order and return the result."""
        range_gates = torch.sigmoid(self.gates)
        for i, layer in enumerate(self.layers):
            x = F.relu(x)
            x = (range_gates[i])*layer(x) + (1-range_gates[i])*x
        return x

    
class GCN(torch.nn.Module):
    """PDNConv/GATv2 encoder followed by one regression head per pair type.

    Node embeddings from the encoder are gathered for both endpoints of every
    target pair, flattened into one vector per pair, and routed to the head
    matching the pair's type id (0..7).
    """

    def __init__(self, num_node_features, out_features, hidden_edge, n_layers, n_res, p_dropout):
        super().__init__()
        # NOTE(review): the encoder depth is fixed at 3 here; the `n_layers`
        # argument is not forwarded.
        self.conv = PDNConvEncoderGated(
            num_node_features, out_features, hidden_edge, n_layers=3
        )
        type_heads = []
        for _ in range(8):
            residual_stack = ResNetGated(out_features*2, out_features*64, n_res, p_dropout)
            readout = nn.Linear(2 * out_features, 1)
            type_heads.append(nn.Sequential(residual_stack, readout))
        self.fcs = nn.ModuleList(type_heads)
        self.fcs.apply(init_weights)

    def forward(self, x, edge_index, edge_attr, edge_cross, types, return_embeddings = False):
        """Predict one value per target pair.

        Args:
            x, edge_index, edge_attr: graph inputs for the encoder.
            edge_cross: integer node-index pairs to predict for.
            types: type id of every pair in ``edge_cross``.
            return_embeddings: also return the encoder's node embeddings.

        Returns:
            Predictions concatenated head by head, i.e. grouped by type id in
            ascending order rather than in the order of ``edge_cross``; plus
            the node embeddings when ``return_embeddings`` is true.
        """
        node_embeddings = self.conv(x, edge_index, edge_attr)
        pair_features = node_embeddings[edge_cross]
        dims = pair_features.shape
        # Interleave both endpoints' features into a single vector per pair.
        pair_features = pair_features.transpose(1, 2).reshape([dims[0], dims[2]*2])
        per_type = [head(pair_features[types == type_id]) for type_id, head in enumerate(self.fcs)]
        predictions = torch.cat(per_type, dim=0)
        if return_embeddings:
            return predictions, node_embeddings
        return predictions

In [13]:
p_dropout = 0.001
conv_features = 128
hidden_edge = 512
# NOTE(review): ResNetGated sizes its gates from this notebook-level value, so
# it has to match the value that was in effect when the checkpoint was saved.
n_layers = 2
n_res = 2
### Set the random seed for reproducible results
torch.manual_seed(0)

# Resolve the device first so the checkpoint can be mapped onto it.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Rebinds `gcn` to the PDNConv variant defined in the previous cell.
gcn = GCN(29, conv_features, hidden_edge, n_layers, n_res, p_dropout=p_dropout)
saved_model = "./saved_models/with_resnet_2022-02-03-19:57:29_lr=0.001_wd=1e-06_p=0.001_conv_features=128_n_layers=2_n_res=2.pth"
# map_location lets a checkpoint saved from a CUDA model load on a CPU-only
# machine as well.
gcn.load_state_dict(torch.load(saved_model, map_location=device))
gcn.to(device)

GCN(
  (conv): PDNConvEncoderGated(
    (init_conv): PDNConv(29, 128)
    (layers): ModuleList(
      (0): GATv2Conv(128, 128, heads=4)
      (1): GATv2Conv(128, 128, heads=4)
    )
  )
  (fcs): ModuleList(
    (0): Sequential(
      (0): ResNetGated(
        (layers): ModuleList(
          (0): Sequential(
            (0): Linear(in_features=256, out_features=8192, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.001, inplace=False)
            (3): Linear(in_features=8192, out_features=256, bias=True)
          )
          (1): Sequential(
            (0): Linear(in_features=256, out_features=8192, bias=True)
            (1): ReLU()
            (2): Dropout(p=0.001, inplace=False)
            (3): Linear(in_features=8192, out_features=256, bias=True)
          )
        )
      )
      (1): Linear(in_features=256, out_features=1, bias=True)
    )
    (1): Sequential(
      (0): ResNetGated(
        (layers): ModuleList(
          (0): Sequential(
            (0): Linea

In [14]:
# Pair embeddings from the PDNConv model currently bound to `gcn`, this time
# with the attributes of the matching graph edge appended
# (return_edge_features=True); the labels returned by embed are discarded.
embeddings_PDN_ef, _ = embed(gcn, device, train_dataloader, return_edge_features = True)
embeddings_PDN_ef_test, _ = embed(gcn, device, test_dataloader, return_edge_features = True)

In [15]:
# Place the three models' per-pair features side by side (axis=1), one row per
# target pair, for the train and the test split.
# NOTE(review): this needs all three arrays of a split to have the same number
# of rows in the same order. The PDN arrays go through an inner DataFrame.merge
# inside embed — confirm that merge never drops or duplicates rows.
X_train = np.concatenate([embeddings_transformer, embeddings_gatv2, embeddings_PDN_ef], axis=1)
X_train = pd.DataFrame(X_train)
X_test = np.concatenate([embeddings_transformer_test, embeddings_gatv2_test, embeddings_PDN_ef_test], axis=1)
X_test = pd.DataFrame(X_test)

In [16]:
# Drop the per-model arrays; their contents were copied into X_train / X_test
# by the concatenation above. Re-running this cell without re-running the
# embedding cells raises NameError.
del embeddings_transformer, embeddings_gatv2, embeddings_PDN_ef, embeddings_transformer_test, embeddings_gatv2_test, embeddings_PDN_ef_test

In [17]:
def chunk_hd5(folder, name, df, chunk_size=2**16):
    """Write ``df`` to consecutive HDF5 files of at most ``chunk_size`` rows.

    Each file is named ``<folder><name>_<start>.h5`` where ``<start>`` is the
    index of the chunk's first row, zero-padded to 8 digits. ``folder`` is
    prepended as a plain string, so it must end with a path separator.
    Existing files are overwritten (``mode='w'``); an empty frame writes
    nothing.

    Args:
        folder: path prefix of the output files.
        name: base name of the output files.
        df: frame to split; only ``shape``, ``iloc`` and ``to_hdf`` are used.
        chunk_size: maximum number of rows per file.

    Raises:
        ValueError: if ``chunk_size`` is smaller than 1.
    """
    if chunk_size < 1:
        raise ValueError("chunk_size must be a positive integer")
    for start in range(0, df.shape[0], chunk_size):
        # Slicing past the end is clipped, so the last chunk needs no special
        # case.
        chunk = df.iloc[start:start + chunk_size, :]
        chunk.to_hdf('{}{}_{}.h5'.format(folder, name, str(start).zfill(8)),
                     index=False,
                     mode = 'w',
                     key="f")

In [18]:
# Write the training features in chunks.
# NOTE(review): absolute, machine-specific output directory; it has to exist
# and be writable on the machine running this notebook.
chunk_hd5("/mnt/10edb508-27ad-4f92-9467-37a536784b53/temp/train/", "data", X_train)

In [19]:
# Training labels (pair type, target value) from the first embed call, written
# next to the training features with the same chunking.
y_train = pd.DataFrame(targets)
chunk_hd5("/mnt/10edb508-27ad-4f92-9467-37a536784b53/temp/train/", "labels", y_train)

In [20]:
# Write the test features in chunks.
# NOTE(review): absolute, machine-specific output directory; it has to exist
# and be writable on the machine running this notebook.
chunk_hd5("/mnt/10edb508-27ad-4f92-9467-37a536784b53/temp/test/", "data", X_test)

In [21]:
# Test labels (pair type, target value) from the first embed call on the test
# loader.
y_test = pd.DataFrame(targets_test)

In [22]:
# Write the test labels next to the test features with the same chunking.
chunk_hd5("/mnt/10edb508-27ad-4f92-9467-37a536784b53/temp/test/", "labels", y_test)