This document demonstrates building, training, saving, loading, and using a sklearn-compliant CGCNN model.

In this script the dataset is randomly split 8:2 into a training set and a test set.

### Import packages

In [None]:
import matplotlib.pyplot as plt
import multiprocess as mp
import numpy as np
import pandas as pd
import pickle
import random
import tqdm
import torch
import skorch.callbacks.base

import os
import sys
sys.path.insert(0, 'cgcnn/') # you will need to clone the CGCNN repository and add the path here
import mongo
import cgcnn

from cgcnn.data import collate_pool, MergeDataset, StructureDataTransformer
from cgcnn.model import CrystalGraphConvNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ShuffleSplit, train_test_split 
from sklearn.metrics import mean_absolute_error, mean_squared_error
from skorch.callbacks import Checkpoint, LoadInitState 
from skorch.callbacks.lr_scheduler import WarmRestartLR, LRScheduler
from skorch.dataset import CVSplit
from skorch import NeuralNetRegressor
from torch.utils.data import Dataset, DataLoader
from torch.optim import Adam, SGD

# Select which GPU to use if necessary.
# These are IPython `%env` magics: they set the two environment variables below
# for the running kernel process.
# NOTE(review): presumably these must take effect before CUDA is first
# initialised by torch in order to influence device selection -- confirm.
%env CUDA_DEVICE_ORDER=PCI_BUS_ID
%env CUDA_VISIBLE_DEVICES=0

### Load the dataset as mongo docs

In [None]:
# Load the pickled dataset. The context manager closes the file handle as soon
# as unpickling has finished instead of leaving it open for the whole session.
# NOTE: unpickling can execute arbitrary code -- only load a dataset file that
# comes from a trusted source.
with open('../surface_energy_dataset/surface_energy_dataset.pkl', 'rb') as dataset_file:
    data = pickle.load(dataset_file)
docs = data['docs']
sort = data['sort']

# Shuffle the documents in place; the fixed seed keeps the order reproducible.
random.seed(123)
random.shuffle(docs)

### Process data

#### input

In [None]:
# Configure the transformer that turns the docs into the inputs consumed by
# the CGCNN model. The keyword values below are passed through unchanged to
# StructureDataTransformer (defined in cgcnn.data).
SDT = StructureDataTransformer(atom_init_loc='atom_init.json',
                               max_num_nbr=12,
                               step=0.2,
                               radius=8,
                               use_voronoi=False,
                               use_tag=False,
                               use_fixed_info=False,
                               use_distance=False,
                               train_geometry = 'initial'
                               )

# Transform the docs and take the first transformed item as a sample to read
# the feature sizes from.
SDT_out = SDT.transform(docs)
structures = SDT_out[0]

# Settings necessary to build the model: the sizes of the input vectors, read
# from the last dimension of the first two entries of the sample item.
# (presumably entry 0 holds atom features and entry 1 neighbour features --
# inferred from the variable names, confirm against cgcnn.data)
orig_atom_fea_len = structures[0].shape[-1]
nbr_fea_len = structures[1].shape[-1]

# NOTE(review): the docs are transformed a second time here, rebinding SDT_out
# to a fresh object. This looks redundant with the call above -- confirm
# whether a fresh object is actually required before removing it.
SDT_out = SDT.transform(docs)
# Materialise every item of SDT_out into a plain list using 4 worker
# processes, handing out indices in chunks of 40, with a tqdm progress bar.
# NOTE(review): the worker callable is a lambda closing over SDT_out;
# presumably this is why `multiprocess` is used rather than the stdlib
# `multiprocessing` -- confirm.
with mp.Pool(4) as pool:
    SDT_list = list(tqdm.tqdm(pool.imap(lambda x: SDT_out[x],range(len(SDT_out)),chunksize=40),total=len(SDT_out)))

#### labels

In [None]:
# Make the target list.
# Each row is [position of the document in `docs`, log of its 'intercept'].
# We keep the position of each document so it can still be tracked after the
# rows are shuffled and split later on.
# `enumerate` yields the true position directly; the previous
# `docs.index(doc)` lookup rescanned the list for every document (quadratic)
# and returned the first match whenever two documents compared equal.
# Going through np.array keeps both columns as floats, as before.
target_list = np.array([[position, np.log(doc['intercept'])] for position, doc in enumerate(docs)])
target_list = pd.DataFrame(target_list, columns = ['doc_index', 'intercept'])

### Shuffle and split the data into training and test set

In [None]:
# Hold out 20% of the samples (inputs and their targets together) as the test
# set; the fixed random_state makes the split reproducible.
SDT_training, SDT_test, target_training, target_test = train_test_split(
    SDT_list,
    target_list,
    test_size=0.2,
    random_state=42,
)

### CGCNN model with skorch to make it sklearn compliant

In [None]:
# Train on the GPU when torch reports one is available, otherwise on the CPU.
cuda = torch.cuda.is_available()
device = torch.device("cuda") if cuda else 'cpu'

#Make a checkpoint to save parameters every time there is a new best for validation loss
cp = Checkpoint(monitor='valid_loss_best',fn_prefix='valid_best_')


#Callback to load the checkpoint with the best validation loss at the end of training
class train_end_load_best_valid_loss(skorch.callbacks.base.Callback):
    """Reload the checkpointed parameters into the net when training ends.

    The file name 'valid_best_params.pt' is hard-coded; it is expected to be
    the parameter file written by the `cp` checkpoint above (its
    'valid_best_' prefix plus, presumably, Checkpoint's default parameter
    file name -- confirm against the installed skorch version).
    """

    def on_train_end(self, net, X=None, y=None, **kwargs):
        """Load 'valid_best_params.pt' into `net`.

        `X` and `y` are accepted only to satisfy the callback hook and are
        not used. They are optional, and extra keyword arguments are
        tolerated, so the hook keeps working if additional arguments are
        forwarded to it.
        """
        net.load_params('valid_best_params.pt')


load_best_valid_loss = train_end_load_best_valid_loss()

### Train a CGCNN model

In [None]:
# Further split the training data into train and validation sets by an 8:2
# ratio for CGCNN (handed to skorch through `train_split` below).
train_test_splitter = ShuffleSplit(test_size=0.2, random_state=42)

# Learning-rate schedule: 'MultiStepLR' policy with one milestone at 100 and
# gamma=0.1.
LR_schedule = LRScheduler('MultiStepLR', milestones=[100], gamma=0.1)

net = NeuralNetRegressor(
    CrystalGraphConvNet,
    # --- arguments forwarded to the CrystalGraphConvNet module ---
    module__orig_atom_fea_len=orig_atom_fea_len,
    module__nbr_fea_len=nbr_fea_len,
    module__atom_fea_len=43,
    module__h_fea_len=114,
    module__n_conv=8,
    module__n_h=3,
    module__classification=False,
    # --- optimisation settings ---
    optimizer=Adam,
    criterion=torch.nn.L1Loss,
    lr=np.exp(-6.465085550816676),
    max_epochs=218,
    batch_size=87,
    device=device,
    # --- data handling ---
    dataset=MergeDataset,
    train_split=CVSplit(cv=train_test_splitter),
    # Training iterator: shuffling is flagged as VERY IMPORTANT by the author.
    iterator_train__shuffle=True,
    iterator_train__collate_fn=collate_pool,
    iterator_train__num_workers=0,
    iterator_train__pin_memory=True,
    # Validation iterator: shuffle should be False, which is the default.
    iterator_valid__shuffle=False,
    iterator_valid__collate_fn=collate_pool,
    iterator_valid__num_workers=0,
    iterator_valid__pin_memory=True,
    # --- callbacks: checkpointing, reload of the best weights, LR schedule ---
    callbacks=[cp, load_best_valid_loss, LR_schedule],
)

In [None]:
# Initialise the network, then fit it on the training inputs. Selecting
# [['intercept']] with double brackets keeps the targets two-dimensional
# (one column) when converted to an array.
net.initialize()
training_targets = np.array(target_training[['intercept']])
net.fit(SDT_training, training_targets)

### Make predictions and save the results

In [None]:
def _prediction_table(features, targets):
    """Collect document indices, actual values and predictions for one subset.

    `targets` must provide the 'doc_index' and 'intercept' columns. The
    'intercept' values and the network outputs are passed through np.exp,
    undoing the np.log applied when the targets were built.
    """
    return {'doc_index': list(targets['doc_index']),
            'actual_value': np.exp(targets['intercept']),
            'predicted_value': np.exp(net.predict(features).reshape(-1))}


# Further split the training set into train & validation sets to look at the
# prediction accuracy on each.
# NOTE(review): this re-split is assumed to reproduce the train/validation
# partition used during fitting (same test_size and random_state) -- confirm.
SDT_train, SDT_valid, target_train, target_valid = train_test_split(SDT_training, target_training, test_size=0.2, random_state=42)

training_data = _prediction_table(SDT_train, target_train)
validation_data = _prediction_table(SDT_valid, target_valid)
test_data = _prediction_table(SDT_test, target_test)

# The original referenced the undefined name `train_data` here, which raised a
# NameError; the dict built above is `training_data`.
df_training = pd.concat([pd.DataFrame(training_data), pd.DataFrame(validation_data)])
df_test = pd.DataFrame(test_data)

# df_training.to_csv('../analyze_prediction_results/CGCNN_prediction_results/randomsplit_training.csv', sep='\t')
# df_test.to_csv('../analyze_prediction_results/CGCNN_prediction_results/randomsplit_test.csv', sep='\t')