In [1]:
import numpy as np
from pathlib import Path
import pandas as pd
from tqdm import tqdm
tqdm.pandas()
from fastai.core import parallel
from fastai.data_block import get_files
import traceback

In [2]:
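# One-time data preparation (shell commands, kept here for provenance):
#   1. download the QM9 archive from figshare,
#   2. unpack it into the QM9/ directory (tar -C needs that directory to exist already),
#   3. rewrite the '*^' exponent marker inside every extracted file to 'E'
#      (presumably so the numeric columns parse as ordinary floats -- TODO confirm).
# wget https://springernature.figshare.com/ndownloader/files/3195389 -O dsgdb9nsd.xyz.tar.bz2
# tar xvjf dsgdb9nsd.xyz.tar.bz2 -C QM9
# !ls QM9/ | while read i; do sed -i 's/*^/E/g' QM9/$i; done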

In [3]:
# Suffix inserted into the data file names this notebook reads and writes
# (e.g. train{ext}.npz, x_qm9_mulliken{ext}.npy).
ext = "_ext"

In [4]:
# Load the array stored under key 'm' of the training archive.
# Load failures (missing file, missing key) now propagate as the original
# FileNotFoundError / KeyError instead of being replaced by a bare
# `assert False`, which hid the cause and is stripped entirely under `python -O`.
train_fname = Path(f'train{ext}.npz')
npzfile = np.load(train_fname)
m = npzfile['m']

In [5]:
# Training table, read from the CSV that shares the `ext` suffix with the npz archive.
t = pd.read_csv('train' + ext + '.csv')

In [6]:
# Unique molecule names in order of first appearance (the "uniques" half of factorize).
m_map = t['molecule_name'].factorize()[1]

In [7]:
# One row per entry of `m`, holding the molecule name that index refers to.
df = pd.DataFrame(dict(m=m_map[m]))

In [8]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,m
0,dsgdb9nsd_000001
1,dsgdb9nsd_000001
2,dsgdb9nsd_000001
3,dsgdb9nsd_000001
4,dsgdb9nsd_000001


In [9]:
def r_csv(mol, i, max_atoms=29, qm9_dir='QM9'):
    """Read column 4 of one QM9 ``.xyz`` file into a zero-padded float32 row.

    The notebook stores the result under the name "mulliken", so column 4 is
    presumably the per-atom Mulliken charge -- confirm against the QM9 readme.

    Parameters
    ----------
    mol : str
        File stem; the file read is ``{qm9_dir}/{mol}.xyz``.
    i : int
        Unused. Kept so the function has the ``(value, index)`` signature
        that the notebook's ``parallel(r_csv, mols)`` call appears to need
        (see the fastai ``parallel`` docs).
    max_atoms : int, default 29
        Width of the padded output row.
    qm9_dir : str or path-like, default 'QM9'
        Directory containing the ``.xyz`` files.

    Returns
    -------
    tuple
        ``(mol, row)`` where ``row`` is a ``(1, max_atoms)`` float32 array whose
        leading entries are the column-4 values and whose tail is zero.

    Raises
    ------
    ValueError
        If the file declares more atom rows than ``max_atoms``.
    """
    fname = f'{qm9_dir}/{mol}.xyz'

    # The first line of the file holds the number of atom rows. Use a context
    # manager so the handle is closed (the original leaked one handle per file).
    with open(fname) as fh:
        n_atoms = int(fh.readline().rstrip())

    if n_atoms > max_atoms:
        raise ValueError(
            f'{fname} declares {n_atoms} atoms, more than max_atoms={max_atoms}'
        )

    padded = np.zeros((1, max_atoms), dtype=np.float32)
    # Skip the count line and the line after it, then parse only the atom rows
    # and only the column that is actually used.
    qm = pd.read_csv(fname,
                     skiprows=2,
                     nrows=n_atoms,
                     sep='\t',
                     header=None,
                     usecols=[4]).values.T
    padded[:, :qm.shape[1]] = qm
    return mol, padded

In [22]:
# Molecule identifiers = file stems of the .xyz files found in QM9/.
mols = [
    xyz_path.stem
    for xyz_path in get_files(path=Path('QM9'), extensions='.xyz')
]

In [24]:
# Build the molecule-name -> padded row lookup from the (name, row) pairs
# that r_csv returns for every molecule.
mulliken = {mol_name: row for mol_name, row in parallel(r_csv, mols)}

In [25]:
# Attach each row's padded vector by molecule name.
# Series.map leaves NaN for names that are absent from `mulliken`; that would
# only fail later, and cryptically, when the column is stacked into a float
# array, so check coverage up front and name the offending molecules.
if not set(df['m']).issubset(mulliken):
    raise KeyError(
        'molecules without an entry in `mulliken`, e.g. '
        f"{sorted(set(df['m']).difference(mulliken))[:5]}"
    )
df['x_qm9_mulliken'] = df['m'].map(mulliken)

In [26]:
# Preview the first five rows, now including the mapped column.
df.head(n=5)

Unnamed: 0,m,x_qm9_mulliken
0,dsgdb9nsd_000001,"[[-0.535689, 0.133921, 0.133922, 0.133923, 0.1..."
1,dsgdb9nsd_000001,"[[-0.535689, 0.133921, 0.133922, 0.133923, 0.1..."
2,dsgdb9nsd_000001,"[[-0.535689, 0.133921, 0.133922, 0.133923, 0.1..."
3,dsgdb9nsd_000001,"[[-0.535689, 0.133921, 0.133922, 0.133923, 0.1..."
4,dsgdb9nsd_000001,"[[-0.535689, 0.133921, 0.133922, 0.133923, 0.1..."


In [27]:
# Pull the mapped column out as a NumPy array (one array-valued element per row).
x_qm9_mulliken = df.loc[:, 'x_qm9_mulliken'].values

In [28]:
# Stack the per-row arrays into one dense float32 array and persist it as .npy.
x_qm9_mulliken = np.array(x_qm9_mulliken.tolist(), dtype=np.float32)
np.save('x_qm9_mulliken' + ext + '.npy', x_qm9_mulliken)

In [29]:
from functools import partial

# Reload the saved array, either fully in memory or as a read-only memmap.
use_memmap = True
if use_memmap:
    load_fn = partial(np.lib.format.open_memmap, mode='r')
else:
    load_fn = np.load
x_qm9_mulliken = load_fn(f'x_qm9_mulliken{ext}.npy')
x_qm9_mulliken.shape

(1405126, 1, 29)

In [30]:
!ls -altrh x_qm9_mulliken.npy

-rw-rw-r-- 1 antor antor 87M jun 28 10:43 x_qm9_mulliken.npy


In [32]:
# Load the array stored under key 'm' of the test archive.
# Load failures now propagate as the original FileNotFoundError / KeyError
# instead of being replaced by a bare `assert False`, which hid the cause and
# is stripped entirely under `python -O`.
test_fname = Path(f'test{ext}.npz')
npzfile_test = np.load(test_fname)
m_test = npzfile_test['m']

# Unique test molecule names in order of first appearance.
test = pd.read_csv(f'test{ext}.csv')
m_map_test = pd.factorize(test['molecule_name'])[1]

# Re-base the indices so that the first entry of m_test points at position 0
# of m_map_test.
# NOTE(review): this assumes m_test[0] is the smallest index in m_test -- confirm.
df_test = pd.DataFrame({'m': m_map_test[m_test - m_test[0]]})

In [33]:
# Preview the first five test rows.
df_test.head(n=5)

Unnamed: 0,m
0,dsgdb9nsd_000004
1,dsgdb9nsd_000004
2,dsgdb9nsd_000004
3,dsgdb9nsd_000004
4,dsgdb9nsd_000015


In [36]:
# Attach each test row's padded vector by molecule name.
# Series.map leaves NaN for names that are absent from `mulliken`; that would
# only fail later, and cryptically, when the column is stacked into a float
# array, so check coverage up front and name the offending molecules.
if not set(df_test['m']).issubset(mulliken):
    raise KeyError(
        'test molecules without an entry in `mulliken`, e.g. '
        f"{sorted(set(df_test['m']).difference(mulliken))[:5]}"
    )
df_test['x_qm9_mulliken'] = df_test['m'].map(mulliken)

In [38]:
# Pull the mapped test column out as a NumPy array (one array-valued element per row).
xt_qm9_mulliken = df_test.loc[:, 'x_qm9_mulliken'].values

In [39]:
# Stack the per-row arrays into one dense float32 array and persist it as .npy.
xt_qm9_mulliken = np.array(xt_qm9_mulliken.tolist(), dtype=np.float32)
np.save('xt_qm9_mulliken' + ext + '.npy', xt_qm9_mulliken)

In [40]:
from functools import partial

# Reload the saved test array, either fully in memory or as a read-only memmap.
use_memmap = True
if use_memmap:
    load_fn = partial(np.lib.format.open_memmap, mode='r')
else:
    load_fn = np.load
xt_qm9_mulliken = load_fn(f'xt_qm9_mulliken{ext}.npy')
xt_qm9_mulliken.shape

(756113, 1, 29)