# Prototipo de preprocesamiento del dataset

In [1]:
import glob

from typing import Tuple

import numpy             as np
import pandas            as pd
import seaborn           as sns
import matplotlib.pyplot as plt
import sklearn
import torch

import libs.mol_process as molp

In [2]:
# Directory that is searched for the sample *.XV and *.xyz files.
data_dir = 'datasets/test_systems'

# glob returns matches in an order that depends on the file system; sorting
# makes xyz_files[0] and the [:3] previews below reproducible across machines.
xv_files  = sorted(glob.glob(f'{data_dir}/*.XV'))
xyz_files = sorted(glob.glob(f'{data_dir}/*.xyz'))

# Sanity check: number of files of each kind, then the first three paths.
print(f'[{len(xv_files)}, {len(xyz_files)}]')
print(f'\n{xv_files [:3]}\n...')
print(f'\n{xyz_files[:3]}\n...')

[10, 10]

['datasets/test_systems\\sys1.XV', 'datasets/test_systems\\sys10.XV', 'datasets/test_systems\\sys2.XV']
...

['datasets/test_systems\\sys1.xyz', 'datasets/test_systems\\sys10.xyz', 'datasets/test_systems\\sys2.xyz']
...


In [3]:
# Coordinate range [-3.368006, 19.782751], width 23.150757000000002.
# NOTE(review): presumably measured over the whole dataset beforehand -
# confirm where these numbers come from before reusing them on new data.
MIN_COOR = -3.368006
MAX_COOR = 19.782751
RANGE    = MAX_COOR - MIN_COOR

# NOTE(review): by their names these look like the smallest and the average
# separation found in the data - confirm how they were computed.
MIN_DIFF     = 0.04481336368763274
AVERAGE_DIFF = 0.8968847383141587

# Number of bins per axis needed so that one bin is as wide as the average /
# the minimum separation (fractional; rounded up by hand in the next cell).
av_bins_f = RANGE / AVERAGE_DIFF
mn_bins_f = RANGE / MIN_DIFF

print(f'Número de cubetas del promedio: {av_bins_f: 8.3f}')
print(f'Número de cubetas del mínimo  : {mn_bins_f: 8.3f}')

Número de cubetas del promedio:   25.812
Número de cupetas del mínimo  :  516.604


In [4]:
# Bins per axis: RANGE/AVERAGE_DIFF (~25.8) and RANGE/MIN_DIFF (~516.6),
# each rounded up to the next integer.
AVERAGE_BINS = 26
MINIMUM_BINS = 517

# Cubic occupancy grids; a cell holds the atomic number of the atom placed in
# it, 0 meaning empty. uint8 is wide enough for any atomic number and keeps
# the 517^3 grid at roughly 138 MB instead of ~1.1 GB with 64-bit integers.
av_coor_space = np.zeros((AVERAGE_BINS, AVERAGE_BINS, AVERAGE_BINS), dtype=np.uint8)
mn_coor_space = np.zeros((MINIMUM_BINS, MINIMUM_BINS, MINIMUM_BINS), dtype=np.uint8)

# Element symbol -> atomic number, grouped by period.
elements = {
	'H' : 1,
	'C' : 6, 'N' : 7, 'O' : 8, 'F' : 9,
	'Si':14, 'P' :15, 'S' :16, 'Cl':17,
	'Ca':20,
	'Mo':42  # molybdenum is Z=42 (44 is ruthenium)
}

def get_element(line:str):
	"""Split one atom record into its element symbol and numeric fields.

	The line is split on whitespace. The first token is returned unchanged
	as the symbol; every remaining token is converted with float().

	Returns:
		A (symbol, values) pair, where values is a list of floats.

	Raises:
		IndexError: the line contains no tokens.
		ValueError: a token after the symbol is not a valid float.
	"""
	tokens = line.split()
	return tokens[0], list(map(float, tokens[1:]))

# Place every atom of the first .xyz file on both grids and count how many
# cells end up occupied (fewer cells than atoms means atoms shared a cell).
with open(xyz_files[0], 'r') as coor_file:
	# The first line holds the number of atom records that follow.
	n_mol = int(coor_file.readline())
	print(n_mol)
	for _ in range(n_mol):
		line = coor_file.readline()
		# Skip blank lines. readline() returns '' only at end of file, so stop
		# there instead of looping forever on a truncated file.
		while line and not line.split():
			line = coor_file.readline()
		if not line:
			raise ValueError(
				f'{xyz_files[0]}: file ended before {n_mol} atom records were read'
			)
		symbol, coordinates = get_element(line)

		atomic_number = elements[symbol]
		# Normalise to [0, 1] using the coordinate range defined above.
		coordinates   = (np.array(coordinates)-MIN_COOR) / RANGE

		# floor maps [0, 1) onto bin indices 0..n-1 (same rule as space_layout);
		# rint would produce index n for the upper half of the last bin. The
		# clamp keeps a coordinate equal to MAX_COOR inside the last bin.
		av_coor = np.minimum(np.floor(coordinates * AVERAGE_BINS), AVERAGE_BINS - 1).astype(int)
		mn_coor = np.minimum(np.floor(coordinates * MINIMUM_BINS), MINIMUM_BINS - 1).astype(int)

		av_coor_space[av_coor[0], av_coor[1], av_coor[2]] = atomic_number
		mn_coor_space[mn_coor[0], mn_coor[1], mn_coor[2]] = atomic_number
print(np.count_nonzero(mn_coor_space))
print(np.count_nonzero(av_coor_space))

136
136
136


In [5]:
def space_layout(
	filename    :str,
	element_dict:dict,
	n_bins      :int,
	range_      :float,
	offset      :float      = 0,
	out         :np.ndarray = None,
	return_n_mol:bool       = False
) -> np.ndarray:
	"""Place the atoms of one coordinate file on a cubic grid.

	The first line of the file is read as the number of atom records. Each
	following non-blank line is parsed with get_element(); the atom goes to
	cell floor((coordinate - offset) * n_bins / range_) on every axis and the
	cell is set to element_dict[symbol]. A later atom that falls in the same
	cell overwrites the earlier one.

	Args:
		filename: path of the coordinate file.
		element_dict: maps an element symbol to the value stored in the grid.
		n_bins: number of bins per axis.
		range_: coordinate width covered by the n_bins bins.
		offset: coordinate that maps to bin 0.
		out: grid to write into; a zero-filled (n_bins, n_bins, n_bins) array
			is created when None.
		return_n_mol: also return the atom count read from the file.

	Returns:
		The grid, or the tuple (grid, n_mol) when return_n_mol is true.

	Raises:
		ValueError: the file ends before all announced atoms were read.
		KeyError: an element symbol is missing from element_dict.
		IndexError: an atom falls outside the grid.
	"""
	if out is None:
		out = np.zeros((n_bins, n_bins, n_bins))
	n_mol:int = 0
	with open(filename, 'r') as coor_file:
		n_mol = int(coor_file.readline())
		for _ in range(n_mol):
			line = coor_file.readline()
			# Skip blank lines. readline() returns '' only at end of file, so
			# stop there instead of looping forever on a truncated file.
			while line and not line.split():
				line = coor_file.readline()
			if not line:
				raise ValueError(
					f'{filename}: file ended before {n_mol} atom records were read'
				)
			symbol, coors = get_element(line)

			value = element_dict[symbol]
			cell  = np.floor((np.array(coors)-offset) * n_bins / range_).astype(int)
			# A negative index would silently wrap around to the far side of
			# the grid; indices that are too large already raise IndexError.
			if (cell < 0).any():
				raise IndexError(
					f'{filename}: coordinates {coors} fall below offset {offset}'
				)

			out[cell[0], cell[1], cell[2]] = value
	if return_n_mol:
		return out, n_mol
	return out

In [6]:
# Grid used for the clash check: four more bins per axis than AVERAGE_BINS,
# a range 1 % wider than RANGE, anchored at the smallest coordinate.
used_min_coor = MIN_COOR
used_range    = RANGE + (RANGE/100)
used_bins     = AVERAGE_BINS + 4

# A clash is an atom that landed in a cell already taken by another atom of
# the same file: atoms read minus occupied cells, summed over all files.
n_clashes:int = 0
for i, path in enumerate(xyz_files):
	print(f'\r[{i+1:2d}]', end='')  # single-line progress counter
	grid, n_atoms = space_layout(
		path,
		elements,
		n_bins=used_bins,
		range_=used_range,
		offset=used_min_coor,
		return_n_mol=True,
	)
	occupied   = np.count_nonzero(grid)
	n_clashes += n_atoms - occupied

print()
print(f'Número de choques      : {n_clashes}')
print(f'Cubetas usadas         : {used_bins}')
print(f'Rango usado            : {used_range}')
print(f'Coordenada mínima usada: {used_min_coor}')

[10]
Número de choques      : 0
Cubetas usadas         : 30
Rango usado            : 23.382264570000004
Coordenada mínima usada: -3.368006


## Usando el DataLoader

In [17]:
# torch and libs.mol_process (molp) are imported in the notebook's first cell.

# Both loaders are asked to work on the CPU; build the device object once.
device = torch.device('cpu')

# Loader for the *.xyz files, configured with the same grid parameters that
# were used for the clash check above.
dataLoader = molp.MolDataLoader(
	f'{data_dir}/*.xyz', elements,
	offset=used_min_coor,
	_range=used_range,
	n_bins=used_bins,
	batch_size=4,
	shuffle=True,
	device=device
)
# Loader for the *.XV files.
# NOTE(review): the recorded output shows these tensors on cuda:0 although the
# CPU device is passed here - confirm BandGapDataLoader honours `device`.
targetLoader = molp.BandGapDataLoader(
	f'{data_dir}/*.XV',
	device=device
)

# Print the shape and the number of non-zero entries of every item.
for mol in dataLoader:
	print(f'{mol.shape} {np.count_nonzero(mol.cpu())}')
print()
for tar in targetLoader:
	print(tar)

# Random access by index.
# NOTE(review): dataLoader is built with shuffle=True - confirm that index 5
# refers to the same system in both loaders.
mol = dataLoader[5]
print(f'\n{mol.shape} {np.count_nonzero(mol.cpu())}')
print(targetLoader[5])

torch.Size([30, 30, 30]) 94
torch.Size([30, 30, 30]) 96
torch.Size([30, 30, 30]) 128
torch.Size([30, 30, 30]) 56
torch.Size([30, 30, 30]) 124
torch.Size([30, 30, 30]) 58
torch.Size([30, 30, 30]) 32
torch.Size([30, 30, 30]) 136
torch.Size([30, 30, 30]) 102
torch.Size([30, 30, 30]) 104

tensor(14.3808, device='cuda:0')
tensor(12.9692, device='cuda:0')
tensor(17.8012, device='cuda:0')
tensor(16.0494, device='cuda:0')
tensor(8.9384, device='cuda:0')
tensor(14.7229, device='cuda:0')
tensor(10.0080, device='cuda:0')
tensor(15.9644, device='cuda:0')
tensor(10.5088, device='cuda:0')
tensor(13.4171, device='cuda:0')

torch.Size([30, 30, 30]) 58
tensor(14.7229, device='cuda:0')
