In [99]:
import json
from itertools import groupby
import numpy as np
import sys
import re
sys.path.append('..')
import geomlib
from pathlib import Path
from tabulate import tabulate

bohr = 0.52917721092

In [100]:
with open('res/suppl-info.json') as f:
    pages = json.load(f)

In [107]:
def get_lines(pages):
    for page in pages:
        top_last = 0
        for top, tokens in groupby(page['text'], lambda tok: tok['top']):
            if top < top_last+12:
                print(page['number'], list(tokens))
                return
            top_last = top
            token_data = [tok['data'] for tok in tokens]
            if len(token_data) == 1:
                continue
            yield token_data

In [108]:
lines = get_lines(pages)
while next(lines)[-1] != 'COORDINATES':
    pass
next(lines)
geoms = {}
try:
    for line in lines:
        if len(line) in (2, 3):
            geom = {'atoms': []}
            if len(line) == 3:
                geom['charge'] = int(line[2].split('CHARGE=')[1])
            geoms[(int(line[0]), line[1].lower())] = geom
        elif len(line) == 4:
            geom['atoms'].append((line[3].capitalize(), tuple(float(x)*bohr for x in line[0:3])))
except:
    print('Line: ', line)
    raise

1 [{'data': '[a]', 'font': 3, 'height': 11, 'top': 565, 'left': 509, 'width': 15}]


StopIteration: 

In [95]:
print(tabulate(
        (idx, system, geom.get('charge'), len(geom['atoms']))
        for (idx, system), geom in sorted(geoms.items())
    ))

-  --------  -  ---
1  complex       24
1  monomer       12
2  complex1      92
2  complex2      86
2  host          72
2  monomer1      20
2  monomer2      14
3  complex1     126
3  complex2     113
3  host          98
3  monomer1      28
3  monomer2      15
4  complex1     148
4  complex2     158
4  host          88
4  monomer1      60
4  monomer2      70
5  complex1     144
5  complex2     142
5  host         130
5  monomer1      14
5  monomer2      12
6  complex1  1  125
6  complex2  1  122
6  host         108
6  monomer1  1   17
6  monomer2  1   14
7  complex1  2  177
7  complex2     153
7  host         126
7  monomer1  2   51
7  monomer2      27
-  --------  -  ---


In [96]:
for (idx, system), geom in sorted(geoms.items()):
    system, subidx = re.findall(r'(host|complex|monomer)(\d)?', system)[0]
    subidx = int(subidx) if subidx else 0
    filename = '{}-{}-{}.xyz'.format(idx, system, subidx)
    with (Path('geoms')/filename).open('w') as f:
        f.write('{}\n'.format(len(geom['atoms'])))
        if 'charge' in geom:
            f.write('charge: {}\n'.format(geom['charge']))
        else:
            f.write('\n')
        for specie, coord in geom['atoms']:
            f.write('{:>2} {:.6} {:.6} {:.6}\n'.format(specie, *coord))