In [1]:
import json
import os
import pandas as pd
import torch
import warnings
from pymatgen.core import Structure
from pymatgen import analysis
from pymatgen.analysis.graphs import StructureGraph
from pymatgen.analysis.local_env import JmolNN, VoronoiNN
warnings.filterwarnings("ignore")



In [2]:
import torch_geometric
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [3]:
def read_pymatgen_dict(file):
    with open(file, "r") as f:
        d = json.load(f)
    return d, Structure.from_dict(d)

In [4]:
TRAIN_DIR = "C:\\Users\\Cherry\\PycharmProjects\\IDAO_2022\\data\\dichalcogenides_public\\structures"
train_structure_names = os.listdir(TRAIN_DIR)
targets = pd.read_csv("C:\\Users\\Cherry\\PycharmProjects\\IDAO_2022\\data\\dichalcogenides_public\\targets.csv")
len(train_structure_names)

2966

In [5]:
def specifications_of_structure(path):
    crystall_dict, crystall_struct = read_pymatgen_dict(path)
    graph = StructureGraph.with_local_env_strategy(crystall_struct, JmolNN())

    adj = graph.as_dict()["graphs"]["adjacency"]
    f = []
    t = []
    edge_features = []
    for from_id in range(len(adj)):
        for to in adj[from_id]:
            f.append(from_id)
            t.append(to["id"])
            edge_features.append(to["to_jimage"])

    y = torch.Tensor(targets[targets._id == path.split("/")[-1][:-5]].band_gap.to_numpy())

    oh_atoms = {
        "Mo": [1, 0, 0, 0],
        "W": [0, 1, 0, 0],
        "Se": [0, 0, 1, 0],
        "S": [0, 0, 0, 1]
    }

    X = []
    for atom_i in crystall_dict['sites']:
        x = []
        xyz = atom_i["xyz"]
        abc = atom_i["abc"]
        element = oh_atoms[atom_i["label"]]
        x.extend(xyz)
        x.extend(abc)
        x.extend(element)
        X.append(x)
    edge_index = torch.LongTensor([f, t])
    X = torch.tensor(X)
    return Data(x=X, y=y, edge_index=edge_index)

In [15]:
list_1 = []
list_1.append(specifications_of_structure('C:\\Users\\Cherry\\PycharmProjects\\IDAO_2022\\data\\dichalcogenides_public\\structures\\6141cf0f51c1cbd9654b8870.json'))

In [22]:
dataloader = DataLoader(list_1)
for i in dataloader:
    print(i)
torch.save(dataloader, "test_dataloader.pth")

DataBatch(x=[190, 10], edge_index=[2, 570], y=[0], batch=[190], ptr=[2])


In [7]:
targets

Unnamed: 0,_id,band_gap
0,6141cf0f51c1cbd9654b8870,1.0843
1,6141cf1051c1cbd9654b8872,1.1102
2,6141cf11cc0e69a0cf28ab35,1.1484
3,6141cf11b842c2e72e2f2d48,1.8068
4,6141cf11ae4fb853db2e3f14,0.3600
...,...,...
2961,6146d0b54e27a1844a5f0b02,1.1461
2962,6146dd853ac25c70a5c6cdeb,0.3550
2963,6146e9103ac25c70a5c6cded,0.3491
2964,6146ecdb3ac25c70a5c6cdef,0.3506


In [8]:
y = torch.Tensor(targets.band_gap.to_numpy())

In [23]:
from tqdm import tqdm
dataset = []
for filename in tqdm(train_structure_names):
    dataset.append(specifications_of_structure(TRAIN_DIR + '\\' + filename))
dataloader = DataLoader(dataset)
torch.save(dataloader, "dataloader.pth")

100%|██████████| 2966/2966 [3:33:17<00:00,  4.31s/it]  


In [46]:
dataset

[Data(x=[190, 10], edge_index=[2, 570], y=[0]),
 Data(x=[190, 10], edge_index=[2, 570], y=[0]),
 Data(x=[191, 10], edge_index=[2, 564], y=[0]),
 Data(x=[192, 10], edge_index=[2, 576], y=[0]),
 Data(x=[191, 10], edge_index=[2, 573], y=[0]),
 Data(x=[191, 10], edge_index=[2, 573], y=[0]),
 Data(x=[190, 10], edge_index=[2, 561], y=[0]),
 Data(x=[192, 10], edge_index=[2, 576], y=[0]),
 Data(x=[191, 10], edge_index=[2, 573], y=[0]),
 Data(x=[190, 10], edge_index=[2, 561], y=[0]),
 Data(x=[189, 10], edge_index=[2, 558], y=[0]),
 Data(x=[191, 10], edge_index=[2, 564], y=[0]),
 Data(x=[192, 10], edge_index=[2, 576], y=[0]),
 Data(x=[191, 10], edge_index=[2, 573], y=[0]),
 Data(x=[191, 10], edge_index=[2, 573], y=[0]),
 Data(x=[190, 10], edge_index=[2, 561], y=[0]),
 Data(x=[190, 10], edge_index=[2, 561], y=[0]),
 Data(x=[192, 10], edge_index=[2, 576], y=[0]),
 Data(x=[191, 10], edge_index=[2, 564], y=[0]),
 Data(x=[190, 10], edge_index=[2, 570], y=[0]),
 Data(x=[191, 10], edge_index=[2, 573], 

In [47]:
dataloader = DataLoader(dataset, batch_size=16)

In [53]:
from torch_geometric.nn import GCNConv, BatchNorm, global_mean_pool
from torch.nn import Linear, Softmax


class GNNPass(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GNNPass, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GCNConv(10, 64)
        self.conv2 = GCNConv(64, 64)
        self.conv3 = GCNConv(64, 64)
        self.conv4 = GCNConv(64, 64)
        self.conv5 = GCNConv(64, 64)
        self.conv6 = GCNConv(64 * 5, 64)

        self.norm1 = BatchNorm(64)
        self.norm2 = BatchNorm(64)
        self.norm3 = BatchNorm(64)
        self.norm4 = BatchNorm(64)
        self.norm5 = BatchNorm(64)
        self.norm6 = BatchNorm(64)

        self.lin = Linear(64, 1)

    def forward(self, data):
        x, edge_index, batch = data.x, data.edge_index, data.batch
        # 1. Obtain node embeddings
        x1 = self.conv1(x, edge_index)
        x1 = x1.relu()
        x1 = self.norm1(x1)

        x2 = self.conv2(x1, edge_index)
        x2 = x2.relu()
        x2 = self.norm2(x2)

        x3 = self.conv3(x2, edge_index)
        x3 = x3.relu()
        x3 = self.norm3(x3)

        x4 = self.conv4(x3, edge_index)
        x4 = x4.relu()
        x4 = self.norm4(x4)

        x5 = self.conv5(x4, edge_index)
        x5 = x5.relu()
        x5 = self.norm5(x5)

        x6 = self.conv6(torch.hstack([x1, x2, x3, x4, x5]), edge_index)

        # 2. Readout layer
        x = global_mean_pool(x6, batch)  # [batch_size, hidden_channels]
        # print(x5.shape)

        # 3. Apply a final classifier
        x = self.norm6(x)

        class_outputs = self.lin(x)

        return class_outputs

In [59]:
loss_function = torch.nn.MSELoss()
#loss_function.to(device)

In [55]:
import random
import os
from torch.optim import Optimizer

def train(train_loader):
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
    for data in train_loader:  # Iterate in batches over the training dataset.
        out = model(data).class_outputs  # Perform a single forward pass.
        loss = loss_function(out, data.y.type(torch.LongTensor))  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.


def test(test_loader, preds):
    labels = np.array([])
    for data in test_loader:
        labels = np.append(labels, data.y.numpy())

    return mean_absolute_error(labels, preds)


def predict(test_loader):
    preds = np.array([])
    for data in test_loader:
        data = data.to(device)
        out = model(data)
        out = out.cpu().detach().numpy().reshape(1, -1)[0]
        preds = np.append(preds, out)

    return preds


def stack(base_model, name, data_list, y, epochs, start_fold, loss_function, batch_size, random_state, device):
    random.seed(random_state),
    np.random.seed(random_state)
    torch.manual_seed(random_state)
    torch.cuda.manual_seed_all(random_state)


    # loss_function.to(device)
    scores = []
    kfold = KFold(n_splits=4)

    for fold, (train_ids, test_ids) in enumerate(kfold.split(data_list, y)):
        if fold >= start_fold:
            sub_scores = []
            model = base_model

            optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

            train_subsampler = [data_list[id] for id in train_ids]
            train_loader = torch_geometric.loader.DataLoader(
                train_subsampler,
                batch_size=batch_size)

            test_subsampler = [data_list[id] for id in test_ids]
            test_loader = torch_geometric.loader.DataLoader(
                test_subsampler,
                batch_size=batch_size)

            epochs = epochs
            total_steps = len(train_loader) * epochs

            for epoch in range(epochs):

                model.train()
                for data in train_loader:  # Iterate in batches over the training dataset.
                    data = data.to(device)
                    out = model(data)  # Perform a single forward pass.
                    y = data.y.to(device)
                    loss = loss_function(out, y)  # Compute the loss.
                    loss.backward()  # Derive gradients.
                    optimizer.step()  # Update parameters based on gradients.
                    optimizer.zero_grad()  # Clear gradients.

                train_preds = predict(train_loader)
                test_preds = predict(test_loader)

                train_mae = test(train_loader, train_preds)
                test_mae = test(test_loader, test_preds)

                sub_scores.append(test_mae)
                print(f'Fold: {fold} Epoch: {epoch:03d}, Train MAE: {train_mae:.4f}, Test MAE: {test_mae:.4f}')

                with open(f"{name}_{epoch}_epochs_{fold}_fold.npy", 'wb') as fin:
                    np.save(fin, test_preds)

                torch.save(model.state_dict(), f"{name}_{epoch}_epochs_{fold}_fold.h5")

            best_epoch = sub_scores.index(min(sub_scores))
            print(f"{name}_{best_epoch}_epochs_{fold}_fold saved!")

            for path in os.listdir():
                if f"{name}" in path:
                    os.remove(path)

            scores.append(min(sub_scores))

    return scores

In [66]:
import torch.cuda
#x = torch.randn(3, 4, 5, device='cuda:0')
#torch.cuda.is_available()

None


In [67]:
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

model = GNNPass(64)
#device = torch.device("cuda")
#model.to(device)
scores = stack(name="gnnpass", base_model=model, data_list=dataset, y=y, epochs=50, start_fold=0, loss_function=loss_function, batch_size=16, random_state=42, device=device)
scores

AssertionError: Torch not compiled with CUDA enabled