In [1]:
import json
from os import listdir
from os.path import isfile, join

import numpy as np
import pandas as pd
import seaborn as sns
from rdkit import Chem
from rdkit.Chem import Descriptors

In [None]:
# Exploratory check on a single record: locate the UV-Vis peak with the
# largest integer wavelength and read the metadata stored on that same peak.
file = './JSON/ACKIPRQROWPKNS-UHFFFAOYSA-N.json'

# Context manager so the file handle is released even if parsing fails.
with open(file) as jsonfile:
    jdata = json.load(jsonfile)

peaks = jdata['PRISTINE'][0]['uvvis'][0]['peaks']

# Parse every peak once, remembering its position next to the parsed value.
# Peaks whose 'lambda' is absent, None or not an integer string are skipped.
lambda_list = []
parsed_peaks = []  # (position in `peaks`, wavelength as int)
for position, peak in enumerate(peaks):
    try:
        wavelength = int(peak['lambda'])
    except (KeyError, TypeError, ValueError):
        continue
    lambda_list.append(wavelength)
    parsed_peaks.append((position, wavelength))

if not lambda_list:
    raise ValueError('no peak with an integer wavelength in ' + file)

lambda_exp = max(lambda_list)

# Positions are matched on the parsed wavelength itself. Searching for
# str(lambda_exp) among all of a peak's values could hit another field that
# happens to hold the same text, or miss a wavelength stored as ' 371' or 371.
lambda_index = [position for position, wavelength in parsed_peaks if wavelength == lambda_exp]

# Ties resolve to the first peak carrying the maximum wavelength.
best_peak = peaks[lambda_index[0]]
extinction_unit = best_peak['extinction_unit']
extinction = best_peak['extinction']
solvent = best_peak['solvent']
lambda_unit = best_peak['lambda_unit']

In [2]:
# Retrieve SMILES, lambda_sTDA, f_sTDA and the HOMO-LUMO gap from the JSON
# database, together with the experimental UV-Vis peak of largest wavelength.


def _longest_wavelength_peak(peaks):
    """Return ``(wavelength, peak)`` for the peak with the largest integer 'lambda'.

    Peaks whose 'lambda' is missing, None or not parseable by ``int`` are
    ignored. Ties resolve to the first such peak. Returns ``None`` when no
    peak qualifies.
    """
    best = None
    for peak in peaks:
        try:
            wavelength = int(peak['lambda'])
        except (KeyError, TypeError, ValueError):
            continue
        if best is None or wavelength > best[0]:
            best = (wavelength, peak)
    return best


dict_list = []

path = './JSON/'

jsonfiles = [f for f in listdir(path) if isfile(join(path, f)) and f.endswith('.json')]

for fname in jsonfiles:
    with open(join(path, fname)) as fi:
        jdata = json.load(fi)

    pristine = jdata['PRISTINE'][0]

    # Reset the experimental fields for every file. Without this, a record
    # lacking 'uvvis' (or lacking any parseable peak) would silently reuse the
    # values left over from the previous file, or raise NameError on a fresh
    # kernel. Missing experimental data is recorded as None instead.
    smi_exp = pristine.get('SMI')
    lambda_exp = lambda_unit = extinction = extinction_unit = solvent = None

    if 'uvvis' in pristine:
        found = _longest_wavelength_peak(pristine['uvvis'][0]['peaks'])
        if found is not None:
            # The metadata is read from the very peak that supplied the
            # maximum, not from a text search for the wavelength among the
            # peak's values (which could match an unrelated field).
            lambda_exp, best_peak = found
            extinction_unit = best_peak['extinction_unit']
            extinction = best_peak['extinction']
            solvent = best_peak['solvent']
            lambda_unit = best_peak['lambda_unit']

    # Only records that carry a FILTERED entry with ORCA results are kept.
    if 'FILTERED' not in jdata:
        continue

    filtered = jdata['FILTERED'][0]
    smi_teorico = filtered['SMI']

    if 'orca' not in filtered:
        continue

    orca = filtered['orca'][0]
    first_state = orca['excited_states']['orbital_energy_list'][0]

    # NOTE(review): lambda_sTDA is read from the 'amplitude' key of the first
    # excited state - confirm this is the intended field.
    lambda_sTDA = first_state['amplitude']
    f_sTDA = first_state['oscillator_strength']
    gap = orca['ground_states']['properties']['gap']

    properties = {'SMILES_exp': smi_exp, 'SMILES_teorico': smi_teorico, 'lambda_exp': lambda_exp,
                  'lambda_unit': lambda_unit, 'extinction': extinction, 'extinction_unit': extinction_unit,
                  'solvent': solvent, 'lambda_sTDA': lambda_sTDA, 'f_sTDA': f_sTDA, 'gap_HL': gap}
    dict_list.append(properties)

# Collect the SMILES and the gap values into a dataframe (one row per record).
dados_json = pd.DataFrame(dict_list)

# Compute the molecular weight (rounded to 2 decimals) of every theoretical
# SMILES in the dataframe and store it in the 'molWt' column.

molWt_list = []

for smiles in dados_json['SMILES_teorico']:
    # MolFromSmiles returns None instead of raising when a string cannot be
    # parsed; handing that None to MolWt would abort the whole loop, so such
    # rows (and non-string entries) get NaN instead.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        molWt_list.append(float('nan'))
        continue
    molWt = round(Descriptors.MolWt(mol), 2)
    molWt_list.append(molWt)

dados_json['molWt'] = molWt_list

In [3]:
# Turn every literal 'NULL' string in the table into NaN so pandas treats it
# as missing (presumably 'NULL' is the JSON's missing-value marker - confirm).
# numpy is imported in the notebook's top import cell rather than here.
dados_json = dados_json.replace({'NULL': np.nan})

In [4]:
# Show the assembled table; the rendered output below is truncated to the
# first and last rows.
dados_json

Unnamed: 0,SMILES_exp,SMILES_teorico,lambda_exp,lambda_unit,extinction,extinction_unit,solvent,lambda_sTDA,f_sTDA,gap_HL,molWt
0,C[Si]1(C)c2cc(ccc2c2c1cc(cc2)c1ccc(cc1)N(c1ccc...,C[Si]1(C)c2cc(ccc2c2c1cc(cc2)c1ccc(cc1)N(c1ccc...,371,nm,,,,326.0,2.094090e+00,7.3040,696.97
1,Sc1[nH]nc(n1)N=Nc1c(c2ccccc2)n(c2c1cccc2)C,Sc1[nH]nc(n1)N=Nc1c(c2ccccc2)n(c2c1cccc2)C,394,,,,DMSO,434.5,3.531655e-03,7.5156,334.41
2,CCCCCCCCCCCCOc1cc2nc(C=Cc3ccc(cc3)c3cc(nc(c3)c...,COc1cc2nc(C=Cc3ccc(cc3)c3cc(nc(c3)c3ccccn3)c3c...,503,nm,,,,355.1,1.376181e+00,6.8510,655.76
3,N#CC(=CN1C(Cl)CCc2c1cccc2)C(=O)O,N#CC(=CN1C(Cl)CCc2c1cccc2)C(=O)O,450,,,,,269.8,6.929888e-01,8.5706,262.70
4,Cc1cc(c(s1)C)C1=C(CC(=Cc2ccc(cc2)N(C)C)C1=O)c1...,Cc1cc(c(s1)C)C1=C(CC(=Cc2ccc(cc2)N(C)C)C1=O)c1...,411,nm,4,,,325.6,2.501992e-02,7.3413,433.64
...,...,...,...,...,...,...,...,...,...,...,...
6137,[O-][N+](=O)c1ccc2c(c1)c1ccccc1[nH]2,[O-][N+](=O)c1ccc2c(c1)c1ccccc1[nH]2,336,,,,,313.0,5.020000e-07,8.3115,212.21
6138,C1=CCc2c(C1)cc1c(c2C=Nc2ccc3c(c2)cccc3)cccc1,C1=CCc2c(C1)cc1c(c2C=Nc2ccc3c(c2)cccc3)cccc1,405,nm,17770,l mol − 1 cm − 1,,323.0,5.779404e-01,7.7012,333.43
6139,CCCCN1c2ccc(cc2Oc2c1cccc2)C=C(C(=O)O)C#N,CN1c2ccc(cc2Oc2c1cccc2)C=C(C(=O)O)C#N,464,nm,,,,363.3,5.437133e-01,6.8095,292.29
6140,c1ccc(cc1)C=Cc1nc(c(n1Cc1ccccc1)c1ccco1)c1ccco1,c1ccc(cc1)C=Cc1nc(c(n1Cc1ccccc1)c1ccco1)c1ccco1,345,nm,,,,314.6,1.296807e+00,7.3731,392.46


In [5]:
# Write the table to a CSV file in the working directory, without the index column.
dados_json.to_csv('DBcomparativa_from_json.csv', index=False)