In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
import json

from DataAnalysisClass import *
from clustering_work import *
from pathlib import Path
from matplotlib import pyplot as plt
import seaborn as sns

# Basis-set names grouped into four families. `all_basis_sets` is their
# concatenation in the order single, single_polarized, double, double_polarized.
single = [
    'aug-cc-pVDZ',
    'aug-cc-pVTZ',
    'aug-cc-pVQZ',
    'aug-cc-pV5Z',
    'aug-cc-pV6Z',
]
single_polarized = [
    'aug-cc-pCVDZ',
    'aug-cc-pCVTZ',
    'aug-cc-pCVQZ',
]
double = [
    'd-aug-cc-pVDZ',
    'd-aug-cc-pVTZ',
    'd-aug-cc-pVQZ',
    'd-aug-cc-pV5Z',
    'd-aug-cc-pV6Z',
]
double_polarized = [
    'd-aug-cc-pCVDZ',
    'd-aug-cc-pCVTZ',
    'd-aug-cc-pCVQZ',
]
all_basis_sets = [*single, *single_polarized, *double, *double_polarized]

In [2]:

# NOTE(review): this is a machine-specific absolute path; anyone running the
# notebook elsewhere has to edit it here.
data_base = Path("/home/ahurta92/data/may2024")
mol_list_path = data_base.joinpath("mols_list")

# Read the molecule names, one per line.  Surrounding whitespace is stripped
# and blank lines are skipped so they cannot turn into bogus molecule names
# (an empty name would resolve to the data directory itself downstream).
# The handle gets its own name so it does not collide with the `mol_list`
# list that a later cell builds.
with open(mol_list_path) as mol_list_file:
    full_mol_list = [name.strip() for name in mol_list_file if name.strip()]

# split into N lists

def split_list(lst, n):
    """Lazily split ``lst`` into ``n`` consecutive slices of near-equal length.

    The first ``len(lst) % n`` slices receive one extra element.  A generator
    is returned; the slices are produced as it is iterated.
    """
    base, extra = divmod(len(lst), n)

    def _start(idx):
        # Offset of chunk `idx`: `base` items per earlier chunk, plus one for
        # each earlier chunk that absorbed a leftover element.
        return idx * base + min(idx, extra)

    return (lst[_start(idx):_start(idx + 1)] for idx in range(n))


In [3]:
N = 4
# Materialize the generator so the chunks can be iterated and indexed later.
mol_list = list(split_list(full_mol_list, N))

# Print each chunk as one space-separated line.
for chunk in mol_list:
    print(' '.join(chunk))

FNO CH3SH N2H2 CH3NH2 HOCl SiO HBS HBO CH2BH PH3O ClF HF BH2Cl SiH4 SF2 P2H4 NaCN OCl2
CH3Cl SCl2 HCCF LiH BF SiH3F HCONH2 CS S2H2 FCN NaCl SiH3Cl N2H4 NH2Cl LiCl NH3O NH2OH BH3
SO2 HCN CH3BH2 NaLi HOF CH3F HNS CH3OH O3 CH2NH ClCN Mg2 NH3 H2O PH3 BHF2 LiCN NH2F
HCl HCCCl HCOOH HCHS LiH_s HCHO SH2 CSO HOOH CO HCP NOCl HNO NaH OF2 HNC BH2F LiBH4


In [4]:
def query_beta_data(df,omega_b,omega_c):
    """Return the rows of ``df`` for one (Bfreq, Cfreq) pair selected by index.

    ``omega_b`` and ``omega_c`` are both positions into the array of distinct
    ``Bfreq`` values of ``df`` (as returned by ``Series.unique``); the rows
    whose ``Bfreq`` and ``Cfreq`` equal the two selected values are returned.
    """
    freqs = df.Bfreq.unique()
    # `b` and `c` are referenced by name (`@b`, `@c`) in the query string.
    b, c = freqs[omega_b], freqs[omega_c]
    return df.query('Bfreq==@b and Cfreq==@c')
def query_alpha_data(df,omega):
    """Return the rows of ``df`` at one frequency selected by index.

    ``omega`` is a position into the array of distinct ``omega`` values of
    ``df`` (as returned by ``Series.unique``).
    """
    # `b` is referenced by name (`@b`) in the query string.
    b = df.omega.unique()[omega]
    return df.query('omega==@b')
def process_beta_df(beta_df):
    """Return a cleaned copy of a raw beta table.

    The ``Afreq``/``Bfreq``/``Cfreq`` columns are rounded to 3 decimals, the
    ``A``, ``B`` and ``C`` columns are concatenated (as strings) into a new
    ``ijk`` column, and ``A``/``B``/``C`` are dropped.

    The input frame is left untouched: everything is built on a new frame, so
    the caller's DataFrame is neither rounded nor given an ``ijk`` column.

    Parameters
    ----------
    beta_df : pandas.DataFrame
        Must contain the columns ``A``, ``B``, ``C``, ``Afreq``, ``Bfreq``
        and ``Cfreq``.

    Returns
    -------
    pandas.DataFrame
        New frame with rounded frequencies, the ``ijk`` column appended and
        ``A``/``B``/``C`` removed.

    Raises
    ------
    KeyError
        If any of the required columns is missing.
    """
    freq_cols = ['Afreq', 'Bfreq', 'Cfreq']
    rounded = beta_df[freq_cols].round(3)
    # `assign` returns a new frame and keeps the existing column positions.
    result = beta_df.assign(**{col: rounded[col] for col in freq_cols})
    result['ijk'] = (
        beta_df['A'].astype(str) + beta_df['B'].astype(str) + beta_df['C'].astype(str)
    )
    return result.drop(columns=['A', 'B', 'C'])


class MRAData:
    """Per-molecule container for MRA alpha, beta and energy results.

    On construction the class reads ``<data_base>/<mol>/calc_path.json`` and
    follows the relative paths stored in it to load:

    * ``alpha``  - DataFrame built from the ``alpha_json_path`` file, tagged
      with ``basis='MRA'`` and ``mol=<mol>``;
    * ``beta``   - DataFrame built from the ``quadratic_json_path`` file,
      passed through ``process_beta_df`` and tagged with ``basis='MRA'`` and
      ``molecule=<mol>``;
    * ``energy`` - the ``return_energy`` entry of the moldft ``calc_info`` file.

    Every JSON file is opened in a ``with`` block so the handle is closed
    right after parsing instead of being left for the garbage collector.

    Parameters
    ----------
    mol : str
        Molecule name; also the name of its sub-directory under ``data_base``.
    data_base : pathlib.Path
        Directory holding one sub-directory per molecule.

    Raises
    ------
    FileNotFoundError
        If ``calc_path.json`` or any file it points to does not exist.
    KeyError
        If an expected key is missing from one of the JSON documents.
    """

    def __init__(self,mol,data_base):
        self.mol = mol
        self.data_dir = data_base.joinpath(mol)
        self.calc_path_json = self._load_json(self.data_dir.joinpath("calc_path.json"))

        alpha_path = self.data_dir.joinpath(self.calc_path_json['alpha_json_path'])
        self.alpha = pd.DataFrame(self._load_json(alpha_path))
        self.alpha['basis'] = 'MRA'
        # NOTE(review): alpha is tagged with 'mol' while beta uses 'molecule';
        # kept as-is because other code may rely on these column names.
        self.alpha['mol'] = mol

        beta_path = self.data_dir.joinpath(self.calc_path_json['quadratic_json_path'])
        self.beta = process_beta_df(pd.DataFrame(self._load_json(beta_path)))
        self.beta['basis'] = 'MRA'
        self.beta['molecule'] = mol

        calc_info_path = self.data_dir.joinpath(
            self.calc_path_json['moldft']['outfiles']['calc_info']
        )
        self.moldft_calc_info = self._load_json(calc_info_path)
        self.energy = self.moldft_calc_info['return_energy']

    @staticmethod
    def _load_json(path):
        """Parse the JSON file at ``path``, closing the file afterwards."""
        with open(path) as handle:
            return json.load(handle)

    def get_beta(self,omega_b,omega_c):
        """Return the beta rows for the frequency pair selected by index."""
        return query_beta_data(self.beta,omega_b,omega_c)

    def get_alpha(self,omega):
        """Return the alpha rows for the frequency selected by index."""
        return query_alpha_data(self.alpha,omega)
        


In [5]:
# Load a single molecule from the "output" sub-directory of the data root.
mol = 'CO'
output_data = data_base / "output"
mra_data = MRAData(mol, output_data)

In [6]:
# Show the alpha rows whose omega equals the first distinct omega value
# (index 0) in mra_data.alpha.
mra_data.get_alpha(0)

Unnamed: 0,omega,ij,alpha,basis,mol
0,0.0,XX,11.273423,MRA,CO
1,0.0,XY,-5e-06,MRA,CO
2,0.0,XZ,8.6e-05,MRA,CO
3,0.0,YX,-7e-06,MRA,CO
4,0.0,YY,11.273931,MRA,CO
5,0.0,YZ,0.000117,MRA,CO
6,0.0,ZX,0.000361,MRA,CO
7,0.0,ZY,3.1e-05,MRA,CO
8,0.0,ZZ,14.465446,MRA,CO


In [7]:
# Show the beta rows where Bfreq and Cfreq both equal the first distinct
# Bfreq value (index 0) in mra_data.beta.
mra_data.get_beta(0,0)

Unnamed: 0,Afreq,Bfreq,Cfreq,Beta,ijk,basis,molecule
0,-0.0,0.0,0.0,0.000678,XXX,MRA,CO
1,-0.0,0.0,0.0,0.000116,XXY,MRA,CO
2,-0.0,0.0,0.0,4.899628,XXZ,MRA,CO
3,-0.0,0.0,0.0,0.000116,XYX,MRA,CO
4,-0.0,0.0,0.0,8.4e-05,XYY,MRA,CO
5,-0.0,0.0,0.0,-0.000147,XYZ,MRA,CO
6,-0.0,0.0,0.0,4.899628,XZX,MRA,CO
7,-0.0,0.0,0.0,-0.000147,XZY,MRA,CO
8,-0.0,0.0,0.0,-0.001812,XZZ,MRA,CO
9,-0.0,0.0,0.0,0.000135,YXX,MRA,CO


In [8]:
# Load every molecule listed in full_mol_list and sort the outcome into:
#   data               - molecule name -> successfully loaded MRAData
#   available_data     - names that loaded successfully
#   not_available_data - names whose files were missing (FileNotFoundError)
#   failed_data        - molecule name -> exception, for any other load error
data = {}
output_data = data_base.joinpath("output")
available_data = []
not_available_data = []
failed_data = {}

for mol in full_mol_list:
    try:
        data[mol] = MRAData(mol, output_data)
    except FileNotFoundError:
        not_available_data.append(mol)
    except Exception as err:
        # Keep the error instead of storing None in `data`: code that walks
        # every entry of `data` and reads .energy/.alpha/.beta would otherwise
        # fail on the None.  Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        failed_data[mol] = err
    else:
        available_data.append(mol)

if failed_data:
    print(f"{len(failed_data)} molecule(s) failed to load: {sorted(failed_data)}")

In [9]:
# Report how many molecules loaded, then list them on the next line.
print(len(available_data), available_data, sep='\n')

21
['FNO', 'CH3SH', 'N2H2', 'CH3NH2', 'HOCl', 'SiO', 'CH3Cl', 'SO2', 'HCN', 'CH3BH2', 'HOF', 'CH3F', 'HNS', 'CH3OH', 'HCl', 'HCCCl', 'HCOOH', 'HCHS', 'LiH_s', 'HCHO', 'CO']


In [10]:
# Stack the per-molecule tables of every successfully loaded molecule.
all_alpha_data = pd.concat(
    [data[mol].alpha for mol in available_data]
)
all_beta_data = pd.concat(
    [data[mol].beta for mol in available_data]
)
# One row per molecule: its name and its energy.
all_energy_data = pd.DataFrame({
    'molecule': list(available_data),
    'energy': [data[mol].energy for mol in available_data],
})




In [11]:
class MRADataFrames():
    """Combined energy / alpha / beta tables with CSV persistence.

    The three tables live in ``self.energy``, ``self.alpha`` and ``self.beta``
    and are written to / read from ``<database_root>/csv_data`` as
    ``mra_energy.csv``, ``mra_alpha.csv`` and ``mra_beta.csv``.

    Parameters
    ----------
    database_root : pathlib.Path
        Directory under which the ``csv_data`` folder is created if missing.
    """

    ENERGY_COLUMNS = ['energy', 'basis', 'molecule']

    def __init__(self,database_root):
        self.database_root = database_root
        self.csv_data = database_root.joinpath("csv_data")
        # exist_ok avoids the check-then-create race of `exists()` + `mkdir()`.
        self.csv_data.mkdir(exist_ok=True)
        self.energy = pd.DataFrame()
        self.alpha = pd.DataFrame()
        self.beta = pd.DataFrame()

    def from_dict(self,data_dict):
        """Populate the three tables from a ``{molecule: data}`` mapping.

        Each value must expose ``energy``, ``alpha`` and ``beta`` attributes.
        Entries whose value is ``None`` are skipped.  An empty mapping (or one
        holding only ``None`` values) yields empty tables instead of raising.

        The energy table is built from records, so ``energy`` gets a numeric
        dtype, matching what ``from_csv`` produces.
        """
        loaded = {mol: entry for mol, entry in data_dict.items() if entry is not None}
        self.energy = pd.DataFrame(
            [
                {'energy': entry.energy, 'basis': 'MRA', 'molecule': mol}
                for mol, entry in loaded.items()
            ],
            columns=self.ENERGY_COLUMNS,
        )
        # pd.concat raises on an empty list, hence the explicit empty fallback.
        self.alpha = (
            pd.concat([entry.alpha for entry in loaded.values()]) if loaded else pd.DataFrame()
        )
        self.beta = (
            pd.concat([entry.beta for entry in loaded.values()]) if loaded else pd.DataFrame()
        )

    def to_csv(self):
        """Write the three tables to the ``csv_data`` directory (no index column)."""
        self.energy.to_csv(self.csv_data.joinpath("mra_energy.csv"),index=False)
        self.alpha.to_csv(self.csv_data.joinpath("mra_alpha.csv"),index=False)
        self.beta.to_csv(self.csv_data.joinpath("mra_beta.csv"),index=False)

    def from_csv(self):
        """Load the three tables back from the ``csv_data`` directory."""
        self.energy = pd.read_csv(self.csv_data.joinpath("mra_energy.csv"))
        self.alpha = pd.read_csv(self.csv_data.joinpath("mra_alpha.csv"))
        self.beta = pd.read_csv(self.csv_data.joinpath("mra_beta.csv"))

In [12]:
# Create the combined-table container rooted at data_base (its constructor
# creates data_base/csv_data when that directory does not exist yet), then
# fill the energy/alpha/beta tables from the per-molecule dict.
mradata=MRADataFrames(data_base)
mradata.from_dict(data)

In [13]:
# Display the energy table filled by from_dict (columns: energy, basis, molecule).
mradata.energy

Unnamed: 0,energy,basis,molecule
0,-228.729672,MRA,FNO
1,-437.765348,MRA,CH3SH
2,-110.039195,MRA,N2H2
3,-95.263091,MRA,CH3NH2
4,-534.934746,MRA,HOCl
5,-363.855333,MRA,SiO
6,-499.153871,MRA,CH3Cl
7,-547.327243,MRA,SO2
8,-92.915446,MRA,HCN
9,-65.472057,MRA,CH3BH2


In [15]:
# Number of molecules whose MRAData construction raised FileNotFoundError
# in the loading loop.
len(not_available_data)

51