In [1]:
import numpy as np
import pandas as pd
import os
import csv


In [2]:
# define csv file
csv_file = '../Dataset/dataset_left.csv'
df = pd.DataFrame(columns=('Pos1', 'Pos2', 'Pos3', 'Pos4', 'Pos5', 'Pos6', 'Pos7', 'Pos8', 'Pos9', 'Pos10', 'Pos11', 'Pos12', 'Pos13', 'Pos14', 'Pos15', 'Pos16', 'Nsubs', 'R', 'lambd'))

In [3]:
# Read logfiles

dir_path = '../6Helicenos'
logfiles = []
for path in os.listdir(dir_path):
    if os.path.isfile(os.path.join(dir_path, path)):
        logfiles.append(path)

# atomic masses
mass = np.zeros(5)
mass[0] = 1.0     # H
mass[1] = 18.998  # F
mass[2] = 35.453  # Cl
mass[3] = 79.904  # Br
mass[4] = 126.904 # I

y = np.zeros(2)
i_data = 0
for logfile in logfiles:

    infile = open(os.path.join(dir_path,logfile),'r')

    # first, full of 16 hydrogens (id 0)
    x = [0]*16
    n_substitutes = 0

    ### Use logfile name to define the feature vector
    if(logfile!="6Heliceno.log"):

        # Remove rubish from the logfile name
        logfile = logfile.replace('6Heliceno_','')
        logfile = logfile.replace('.log','')
        #print(logfile)
        # Split substituent elements 
        elements = logfile.split('_')
        #print(elements)
        for element in elements:
            xaux = element.split('-')
            i = int(xaux[0])-1
            halogen = xaux[1]
            if(halogen=='fluor'):
                halogen_id = 1
                n_substitutes = n_substitutes + 1
            elif(halogen=='cloro'):
                halogen_id = 2
                n_substitutes = n_substitutes + 1
            elif(halogen=='bromo'):
                halogen_id = 3
                n_substitutes = n_substitutes + 1
            elif(halogen=='yodo'):
                halogen_id = 4
                n_substitutes = n_substitutes + 1

            x[i] = halogen_id
        #print(x)
    else:
        print("6Heliceno")

    ### Define targets from higher Rotatory Strength (R) and wave length
    # read all lines of logfile into a list
    lines = infile.readlines()

    # Find R data
    for line in lines:
        # check if string is present on the current line
        word = 'R(length)'
        if line.find(word) != -1:
            j_init = lines.index(line) + 1
            break

    # compute maximum R
    R_max = 0
    for j in range(100):
        line = lines[j+j_init]
        # read R
        columns = line.split()
        state = int(columns[0])
        R = float(columns[4])
        if(R>R_max): 
            R_max = R
            state_max = state
    #print(R_max)
    y[0] = R_max

    # Find wave length data
    for line in lines:
        words = 'oscillator strengths:'
        if line.find(words) != -1:
            j_init = lines.index(line) + 2
            break

    # Find wave length of the state with maximum R
    for line in lines[j_init:]:
        columns = line.split()
        #print(columns)
        if(len(columns)>0):
            word1 = columns[0]
            if(word1=='Excited'):
                state = columns[2]
                state = state.replace(':','')
                state = int(state)
                if(state==state_max):
                    #print(state)
                    lambd = float(columns[6])
                    break                
    #print(lambd)
    y[1] = lambd

    #print(y)

    # define symmetric molecule
    x_reverse = x.copy()
    x_reverse.reverse()

    # compute center of masses of the molecules
    mu = 0
    mu_reverse = 0
    m_total = 0
    for i in range(16):
        m_total = m_total + mass[x[i]]
        mu = mu + float(i+1) * mass[x[i]]
        mu_reverse = mu_reverse + float(i+1) * mass[x_reverse[i]]
    mu = mu / m_total
    mu_reverse = mu_reverse / m_total
    #print(mu)
    #print(mu_reverse)

    # select the most left molecule
    if(mu_reverse < mu):
        x.reverse()

    ### Save data into a dataframe
    datum = []
    for i in range(16):
        datum.append(str(x[i]))
    datum.append(str(n_substitutes)) # append number of subtitutes to data
    datum.append(str(y[0]))
    datum.append(str(y[1]))
    print(datum)
    df.loc[i_data] = datum
    i_data = i_data+1

['0', '0', '0', '0', '0', '4', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '484.4822', '350.23']
['0', '4', '2', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '774.6917', '347.5']
['3', '3', '0', '0', '0', '0', '0', '1', '2', '0', '0', '0', '0', '0', '0', '0', '4', '644.614', '364.11']
['1', '3', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '630.3364', '347.1']
['0', '1', '0', '0', '4', '0', '0', '0', '0', '4', '0', '3', '0', '0', '0', '0', '4', '392.8587', '358.78']
['3', '0', '2', '0', '4', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '3', '404.2231', '366.31']
['0', '2', '0', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '698.8705', '344.2']
['0', '0', '0', '0', '0', '0', '0', '2', '0', '0', '0', '0', '0', '0', '0', '0', '1', '668.6168', '344.16']
['0', '0', '3', '1', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '2', '722.1028', '343.5']
['0', '4', '0', '0', '0', '0', '0

In [4]:
# Save dataframe into a csv file 
df.to_csv(csv_file)
df.shape

(253, 19)