In [1]:
import yaml
import json

import pandas as pd
import numpy as np
import tensorflow as tf

from pathlib import Path
from pymatgen.core import Structure
from sklearn.model_selection import train_test_split
from megnet.models import MEGNetModel
from megnet.data.crystal import CrystalGraph

2023-01-18 02:10:51.675266: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


In [2]:
# Report the NumPy version in use so the run can be reproduced later.
print('Numpy version:', np.__version__)

Numpy version: 1.19.2


In [3]:
def read_pymatgen_dict(file):
    """Load a pymatgen ``Structure`` from a JSON file.

    Args:
        file: Path (str or path-like) of a JSON file holding the dict
            representation of a structure.

    Returns:
        The object produced by ``Structure.from_dict`` for the decoded JSON.
    """
    # JSON is UTF-8 by specification; be explicit so decoding does not depend
    # on the platform's default locale encoding.
    with open(file, "r", encoding="utf-8") as f:
        d = json.load(f)
    return Structure.from_dict(d)

In [4]:
def energy_within_threshold(prediction, target):
    """Fraction of entries whose absolute error is strictly below 0.02.

    Computes the element-wise absolute difference between ``target`` and
    ``prediction``, counts the entries under the threshold and divides that
    count by the total number of elements in ``target``.
    """
    tolerance = 0.02
    abs_error = tf.math.abs(target - prediction)
    within_tolerance = abs_error < tolerance

    n_hits = tf.math.count_nonzero(within_tolerance)
    n_total = tf.cast(tf.size(target), tf.int64)
    return n_hits / n_total

In [5]:
def prepare_dataset(dataset_path):
    """Load structures and targets from ``dataset_path`` and split them.

    Expects ``targets.csv`` (first column used as the index) and a
    ``structures`` sub-directory of ``<id>.json`` files inside
    ``dataset_path``.

    Args:
        dataset_path: Directory containing the dataset (str or ``Path``).

    Returns:
        The result of ``train_test_split`` (``test_size=0.25``,
        ``random_state=666``) applied to a frame indexed by structure id with
        the columns ``structures`` and ``targets``.
    """
    dataset_path = Path(dataset_path)
    targets = pd.read_csv(dataset_path / "targets.csv", index_col=0)

    # ``Path.stem`` removes exactly one ".json" suffix. ``str.strip(".json")``
    # strips any of the characters ".", "j", "s", "o", "n" from both ends, so
    # an id such as "jason" would be mangled to "a" and lose its target when
    # the frames are aligned on the index.
    # Sorting the paths makes the row order, and therefore the seeded split,
    # independent of the arbitrary order in which the OS lists the directory;
    # the "*.json" pattern skips stray non-structure entries.
    struct = {
        item.stem: read_pymatgen_dict(item)
        for item in sorted((dataset_path / "structures").glob("*.json"))
    }

    data = pd.DataFrame(columns=["structures"], index=list(struct))
    data = data.assign(structures=list(struct.values()), targets=targets)

    return train_test_split(data, test_size=0.25, random_state=666)

In [6]:
def prepare_model(cutoff, lr):
    """Build a MEGNet model for the given graph cutoff and learning rate.

    Args:
        cutoff: Cutoff passed to ``CrystalGraph``; the Gaussian centres are
            spread evenly over ``[0, cutoff + 1]``.
        lr: Learning rate forwarded to ``MEGNetModel``.

    Returns:
        A ``MEGNetModel`` configured with MAE loss, ``npass=2`` and the
        ``energy_within_threshold`` metric.
    """
    n_centers = 10  # number of Gaussian centres
    model_kwargs = dict(
        graph_converter=CrystalGraph(cutoff=cutoff),
        centers=np.linspace(0, cutoff + 1, n_centers),
        width=0.8,
        loss=["MAE"],
        npass=2,
        lr=lr,
        metrics=energy_within_threshold,
    )
    return MEGNetModel(**model_kwargs)

In [7]:
 # Build the train/test frames from the public dataset directory.
 train, test = prepare_dataset("data/dichalcogenides_public")

  return array(a, dtype, copy=False, order=order)


In [23]:
# Hyper-parameters for this run: learning rate and graph cutoff.
lr = 1e-4
cutoff = 4
model = prepare_model(float(cutoff), float(lr))

In [24]:
# Training schedule: number of epochs and mini-batch size.
epochs, batch_size = 20, 128

In [25]:
# Fit on the training split while validating on the held-out split.
# The training call is issued under an explicit request for device "/gpu:3".
# NOTE(review): the GPU index is hard-coded - confirm that device exists on
# the machine used for a re-run.
with tf.device("/gpu:3"):
    model.train(
        train.structures, train.targets,
        validation_structures=test.structures,
        validation_targets=test.targets,
        epochs=int(epochs), batch_size=int(batch_size),
    )

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [26]:
# Build a new model instance with the same hyper-parameters.
model = prepare_model(float(cutoff), float(lr))

In [27]:
# Restore the weights stored in the given checkpoint file.
# NOTE(review): the file name is hard-coded - presumably a checkpoint written
# during an earlier training run; confirm it exists before re-running.
model.load_weights("callback/val_mae_00007_0.459013.hdf5")

In [28]:
 # Directory of the private dataset used for the final predictions.
 dataset_path = Path("data/dichalcogenides_private")

In [29]:
# Map each structure id (file name without its ".json" suffix) to the parsed
# structure. ``Path.stem`` drops exactly one suffix, whereas
# ``str.strip('.json')`` removes any of the characters ".", "j", "s", "o", "n"
# from both ends of the name and can therefore corrupt the id itself.
struct = {
    item.stem: read_pymatgen_dict(item)
    for item in (dataset_path / 'structures').iterdir()
}

In [30]:
# Collect the private structures in a frame keyed by id, then attach the
# model's prediction for every structure.
private_test = pd.DataFrame(
    columns=['id', 'structures'], index=struct.keys()
).assign(structures=struct.values())
private_test = private_test.assign(
    predictions=lambda frame: model.predict_structures(frame.structures)
)

In [31]:
# Write only the predictions column, with the frame index labelled "id".
private_test.to_csv('./submission.csv', columns=['predictions'], index_label='id')