# This notebook is a work area to develop code to scrape log files to build a PFAS database

## First step is to make a list of log files in the directory that will be iterated over

In [1]:
from pathlib import Path  
import os
import glob
import re


In [12]:
class logFileObj(object):
    """One log file on disk plus the metadata scraped from it.

    The marker strings searched for ('SCF Done', 'Sum of electronic and thermal
    Free Energies', 'Alpha  occ. eigenvalues') look like Gaussian output.
    NOTE(review): confirm before pointing this class at logs from another program.

    The file is re-opened inside each parsing method instead of being held open
    on the instance, so no file handle outlives a method call.
    """

    # Compiled once for the class rather than on every method call.
    _negativeDecimalReg = re.compile(r'-\d*\.\d*')
    _occOrbsReg = re.compile(r'(Alpha\s{2}occ[.]\seigenvalues\s-{2})((\s*-?\d*[.]\d*)*)')
    _virtualOrbsReg = re.compile(r'(Alpha\svirt[.]\seigenvalues\s-{2})((\s*-?\d*[.]\d*)*)')
    # Pulls individual numbers out of an eigenvalue line even when two values
    # are printed with no whitespace between them (e.g. "-101.11111-100.22222").
    _orbitalValueReg = re.compile(r'-?\d*[.]\d*')

    def __init__(self, fileStringPath, filePath=None, name=None, xyz=None, energyFree=None, energySCF=None,
                 molFormula=None, program=None, homo=None, lumo=None, alphaOcc=None, alphaVirtual=None):
        """Create the object for the log file at *fileStringPath*.

        Args:
            fileStringPath: path to the log file, as a string.
            filePath: accepted for backward compatibility but ignored;
                ``self.filePath`` is always ``Path(fileStringPath)``.
            name: accepted for backward compatibility but ignored;
                ``self.name`` is always the file name of ``self.filePath``.
            xyz: stored unchanged on ``self.xyz``.
            energyFree, energySCF, molFormula, program, homo, lumo, alphaOcc,
            alphaVirtual: optional initial values, stored unchanged on the
                attribute of the same name.
        """
        self.fileStringPath = fileStringPath
        self.filePath = Path(fileStringPath)
        self.name = self.filePath.name
        # Previously this argument was accepted and then dropped.
        self.xyz = xyz
        # Open question from the author: these may become method results
        # instead of attributes.
        self.energyFree = energyFree
        self.energySCF = energySCF
        self.molFormula = molFormula
        self.program = program
        self.homo = homo
        self.lumo = lumo
        self.alphaOcc = alphaOcc
        self.alphaVirtual = alphaVirtual

    def getText(self):
        """Return a fixed string; only used to check that the object was created."""
        return 'Test object creation by returning this text'

    def _lastNegativeNumberOnLine(self, marker):
        """Return the first negative decimal on the last line containing *marker*.

        Returns None when no line contains *marker* or that line holds no
        negative decimal number.
        """
        lastLine = None
        # errors='replace': a stray undecodable byte should not abort the scan.
        with open(self.filePath, errors='replace') as dataBlock:
            for line in dataBlock:
                if marker in line:
                    lastLine = line
        if lastLine is None:
            return None
        numberMatch = self._negativeDecimalReg.search(lastLine)
        if numberMatch is None:
            return None
        return float(numberMatch.group())

    def getFinalSCF(self):
        """Return the energy from the last 'SCF Done' line as a float, or None if absent."""
        return self._lastNegativeNumberOnLine('SCF Done')

    def getEnergyFree(self):
        """Return the value from the last 'Sum of electronic and thermal Free Energies'
        line as a float, or None if absent."""
        return self._lastNegativeNumberOnLine('Sum of electronic and thermal Free Energies')

    @staticmethod
    def _lastAscendingRun(values):
        """Return the tail of *values* that starts at the last point where a value drops.

        Each printed set of orbital energies is ascending, so a drop marks the
        start of a newer set; everything before the last drop is discarded.
        """
        start = 0
        for i in range(1, len(values)):
            if values[i] < values[i - 1]:
                start = i
        return values[start:]

    def getMOs(self):
        """Return the last printed set of alpha orbital energies.

        Returns:
            dict with keys 'occOrbs' and 'virtOrbs', each a list of floats
            taken from the 'Alpha  occ. eigenvalues' / 'Alpha virt. eigenvalues'
            lines. When the file holds several sets, only the last one is kept.

        Raises:
            KeyError: if the file has no occupied ('occOrbs') or no virtual
                ('virtOrbs') eigenvalue line. Callers use this to skip files
                without orbital output.
        """
        patterns = (('occOrbs', self._occOrbsReg), ('virtOrbs', self._virtualOrbsReg))
        tokens = {'occOrbs': [], 'virtOrbs': []}
        seen = set()
        with open(self.filePath, errors='replace') as dataBlock:
            for line in dataBlock:
                for orbKey, orbReg in patterns:
                    lineMatch = orbReg.search(line)
                    if lineMatch is not None:
                        seen.add(orbKey)
                        tokens[orbKey].extend(self._orbitalValueReg.findall(lineMatch.group(2)))

        molecOrbDict = {}
        for orbKey, _ in patterns:
            if orbKey not in seen:
                raise KeyError(orbKey)
            energies = [float(tok) for tok in tokens[orbKey]]
            molecOrbDict[orbKey] = self._lastAscendingRun(energies)
        return molecOrbDict


                
#   def isNormalterm(self):
    
#    def whatIsJobType(self):
        
#    def 
    
    
        

In [3]:
# This snippet creates a dictionary of logfile objects
# to do: extend this to input dir
def listLogsMakelogFileObj(workingDir=None):
    '''Build a dictionary of logFileObj instances for every ``*.log`` file in a directory.

    Args:
        workingDir: directory to search, as a string or Path. Defaults to the
            current working directory, which was the only behaviour before
            this argument existed.

    Returns:
        dict mapping each log file's path string to the logFileObj created
        from that path. Only the directory itself is searched (no recursion);
        an empty dict is returned when it holds no ``*.log`` files.
    '''
    searchDir = Path.cwd() if workingDir is None else Path(workingDir)

    logObjectDictionary = {}
    # glob yields entries in arbitrary order; sort so the dictionary (and
    # anything written out from it) has the same order on every run.
    for logPath in sorted(searchDir.glob('*.log')):
        logPathString = str(logPath)
        logObjectDictionary[logPathString] = logFileObj(logPathString)

    return logObjectDictionary
    


    
    

In [4]:
# Build the dictionary of logFileObj instances for the current working directory.
# Keys are the log file path strings; values are the objects created from them.
logFileObjs = listLogsMakelogFileObj()
# Printing shows the keys next to the default object representation of each value.
print(logFileObjs)

{'/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/HSO4-minus-12Water.log': <__main__.logFileObj object at 0x10aefd250>, '/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/secondTest.log': <__main__.logFileObj object at 0x108eab910>, '/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/SO4-2minus-24Water-OCT.log': <__main__.logFileObj object at 0x10aefdcd0>, '/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/test.log': <__main__.logFileObj object at 0x10aefdf90>, '/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/test2.log': <__main__.logFileObj object at 0x10aefded0>}


## To do
* I need a column indicating the completion level of the job with regards to frequency calculations

## This section is going to parse what kind of job was run in order to collect appropriate data

In [5]:
# This code snippet is to establish which kind of job was run
def whichJobType():
    """Compile the case-insensitive patterns intended to recognise each job type.

    Work in progress: the patterns are compiled but not yet applied to any
    text, and the function returns None.

    The flags argument of ``re.compile`` must be a flag constant; the string
    "i" that was passed before raised TypeError on every call.
    """
    optimizationReg = re.compile(r'\sopt\W', re.IGNORECASE)
    thermoFrequencyReg = re.compile(r'\sfreq\s', re.IGNORECASE)
    # TO DO: exclude the whitespace case for this one. Use groups.
    otherFreqReg = re.compile(r'\sfreq\W', re.IGNORECASE)
    ircReg = re.compile(r'\sirc\s', re.IGNORECASE)
    transitionStateReg = re.compile(r'\sopt\W\s', re.IGNORECASE)
    
# probably just identify thermo jobs for now and pull out the rest of the parameters regardless

In [6]:
# This code snippet establishes the level of theory, basis set, and solvent model
def whichLevelOfTheory():
    """Placeholder for detecting the level of theory and the solvent model.

    Both patterns are still empty and nothing is searched yet; the function
    compiles them and returns None.
    """
    emptyPattern = r''
    theoryPattern = re.compile(emptyPattern)
    solventPattern = re.compile(emptyPattern)
    

In [7]:
# Fill in the energy attributes of every logFileObj from its own parsing methods.
# First pass: SCF energy for all objects.
for key, logObj in logFileObjs.items():
    logObj.energySCF = logObj.getFinalSCF()
# Second pass: free energy for all objects.
for key, logObj in logFileObjs.items():
    logObj.energyFree = logObj.getEnergyFree()

In [8]:
# Spot-check the SCF energy assignment by printing the value stored for one log file.
# NOTE(review): the key is a hard-coded absolute path; if logFileObjs was built from a
# different directory this lookup raises KeyError.
print(logFileObjs['/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/HSO4-minus-12Water.log'].energySCF)

-1617.70693639


## HOMO LUMO energies

In [9]:
# Parse the alpha orbital energies of one example log file; getMOs returns a dict
# with the keys 'occOrbs' and 'virtOrbs'.
# NOTE(review): the key is a hard-coded absolute path; if logFileObjs was built from a
# different directory this lookup raises KeyError.
MOs = logFileObjs['/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/HSO4-minus-12Water.log'].getMOs()
# Bare expression so the notebook displays the dictionary.
MOs

{'occOrbs': [-89.10268,
  -19.1716,
  -19.14644,
  -19.13781,
  -19.13688,
  -19.1367,
  -19.13607,
  -19.13005,
  -19.12504,
  -19.12501,
  -19.12437,
  -19.12397,
  -19.12396,
  -19.12346,
  -19.12306,
  -19.12231,
  -19.12188,
  -8.15445,
  -6.11883,
  -6.11698,
  -6.11688,
  -1.18074,
  -1.05018,
  -1.0345,
  -1.02792,
  -1.02611,
  -1.02399,
  -1.02246,
  -1.02143,
  -1.01995,
  -1.01195,
  -1.00707,
  -1.00669,
  -1.00654,
  -1.0062,
  -1.00502,
  -1.00449,
  -0.66011,
  -0.57295,
  -0.55616,
  -0.55048,
  -0.53873,
  -0.53708,
  -0.53683,
  -0.53649,
  -0.53114,
  -0.53073,
  -0.52481,
  -0.52455,
  -0.52437,
  -0.52238,
  -0.50353,
  -0.5029,
  -0.4694,
  -0.42801,
  -0.42162,
  -0.41465,
  -0.41309,
  -0.41144,
  -0.40883,
  -0.40579,
  -0.40562,
  -0.40234,
  -0.40142,
  -0.39875,
  -0.39709,
  -0.38802,
  -0.37733,
  -0.3719,
  -0.36946,
  -0.34886,
  -0.33662,
  -0.3313,
  -0.33059,
  -0.32975,
  -0.32776,
  -0.32531,
  -0.31942,
  -0.31675,
  -0.31636,
  -0.31591,
  -0.315

In [10]:
# Store each log file's alpha orbital energies on its object, together with the
# HOMO (last occupied value) and the LUMO (first virtual value).
# A KeyError for a file leaves the remaining attributes of that object untouched
# and moves on to the next file.
for key, logObj in logFileObjs.items():
    try:
        alphaEnergiesDict = logObj.getMOs()
        occupied = alphaEnergiesDict['occOrbs']
        logObj.alphaOcc = occupied
        virtual = alphaEnergiesDict['virtOrbs']
        logObj.alphaVirtual = virtual
        logObj.homo = occupied[-1]
        logObj.lumo = virtual[0]
    except KeyError:
        pass

In [15]:
# Bare expression so the notebook displays the LUMO value stored on one log file object.
# NOTE(review): the key is a hard-coded absolute path; if logFileObjs was built from a
# different directory this lookup raises KeyError.
logFileObjs['/Users/Asa/Documents/Science/Research/dataScienceForPFAS/gaussMiner/HSO4-minus-12Water.log'].lumo




-0.02008

## Writing File Attributes To a .CSV

In [38]:
# Write one comma-separated line per log file to '<current directory name>_csv.txt'.
workDir = Path.cwd()
outPutFileName = str(workDir.name) + '_csv.txt'
print(outPutFileName)
# The file is opened in append mode, so the header must only be written when the
# file is new or empty; otherwise every re-run adds another header line in the
# middle of the data.
outPutPath = Path(outPutFileName)
needsHeader = not outPutPath.exists() or outPutPath.stat().st_size == 0
with open(outPutFileName, "a") as fOut:
    if needsHeader:
        fOut.write('name, energySCF, energyFree, homo, lumo\n')
    # NOTE(review): rows are still appended on every run, so re-running the cell
    # duplicates the data rows -- confirm whether append or overwrite is wanted.
    for logObj in logFileObjs.values():
        fOut.write(f'{logObj.name}, {logObj.energySCF}, {logObj.energyFree}, '
                   f'{logObj.homo}, {logObj.lumo}\n')
        
    

gaussMiner_csv.txt
