In [6]:
def read_data_file(file_path):
    """
    Reads the .data file and extracts HOMO, LUMO, and GAP values for each conformer.

    A line starting with "Conformer" opens a new record keyed by the integer in
    its last whitespace-separated field. Each following line starting with
    "QChem" is split on commas: the second whitespace-separated token of the
    first field is used as the method name, and fields 1-3 are parsed as the
    HOMO, LUMO and GAP values. All other lines are ignored.

    NOTE(review): records are keyed by conformer number only, so if the same
    number occurs more than once in the file the later record replaces the
    earlier one. Confirm that conformer numbers are unique in the input.

    Args:
        file_path (str): Path to the .data file.

    Returns:
        dict: A dictionary with conformer numbers as keys and their properties (HOMO, LUMO, GAP) as values,
            nested per method: {conformer: {method: {"HOMO": float, "LUMO": float, "GAP": float}}}.

    Raises:
        KeyError: If a "QChem" line appears before any "Conformer" line.
        ValueError: If a conformer number or a property field is not numeric.
        IndexError: If a "QChem" line has fewer fields than expected.
    """
    conformers = {}  # Store data for each conformer
    conformer_idx = None  # Number of the conformer currently being read

    # Stream the file line by line instead of loading it fully into memory.
    with open(file_path, 'r') as f:
        for line_no, raw_line in enumerate(f, start=1):
            line = raw_line.strip()

            if line.startswith("Conformer"):  # Start of a new conformer
                conformer_idx = int(line.split()[-1])  # Extract conformer number
                conformers[conformer_idx] = {}
            elif line.startswith("QChem"):
                parts = line.split(",")
                method = parts[0].split()[1]  # Extract the DFT method (e.g., B3LYP)
                homo = float(parts[1])  # HOMO value
                lumo = float(parts[2])  # LUMO value
                gap = float(parts[3])  # Gap value

                if conformer_idx is None:
                    # Same exception type as the bare dict lookup would give,
                    # but with a message that points at the offending line.
                    raise KeyError(
                        f"QChem record on line {line_no} appears before any 'Conformer' header"
                    )
                conformers[conformer_idx][method] = {"HOMO": homo, "LUMO": lumo, "GAP": gap}

    return conformers


def display_conformer_data(conformers):
    """
    Prints a per-conformer summary of the HOMO, LUMO, and GAP values.

    For every conformer a "Conformer <id>:" heading is printed, followed by one
    indented line per method with the three values formatted to three decimal
    places, and then a blank line.

    Args:
        conformers (dict): Conformer data as returned by `read_data_file`.
    """
    for conf_id in conformers:
        per_method = conformers[conf_id]
        print(f"Conformer {conf_id}:")
        for name, props in per_method.items():
            print(f"  {name}: HOMO = {props['HOMO']:.3f}, LUMO = {props['LUMO']:.3f}, GAP = {props['GAP']:.3f}")
        # Blank separator line between conformers
        print()

import os
import os
from pathlib import Path


# Locate the data directory relative to the notebook's working directory: <cwd>/../data
notebook_dir = Path(os.getcwd())
parent_dir = os.path.dirname(notebook_dir)
data_dir = os.path.join(parent_dir, 'data')

# Build the path with os.path.join (consistent with data_dir above) instead of
# concatenating strings with a hard-coded separator.
file_path = os.path.join(data_dir, "HOPV_15_revised.data")

# Fail early with an actionable message when the dataset is not where this
# cell expects it, rather than surfacing the error from inside the parser.
if not os.path.isfile(file_path):
    raise FileNotFoundError(
        f"Expected the data file at '{file_path}'. "
        f"Place it in '{data_dir}' or point file_path at its actual location."
    )

conformer_data = read_data_file(file_path)
display_conformer_data(conformer_data)


FileNotFoundError: [Errno 2] No such file or directory: '/home/maria/dyn-detanet/data/HOPV_15_revised.data'

In [7]:
import os
from torch_geometric.data import Data

def read_harvard_opv(file_path):
    """
    Reads the Harvard OPV dataset file and transforms it into a list of Data objects.

    The file is scanned for lines starting with "Conformer". Each such header is
    expected to be followed by:
      1. one line holding the number of atoms,
      2. that many atom lines of the form "<symbol> <x> <y> <z>",
      3. one comma-separated property line whose fields 1-3 are HOMO, LUMO and gap.
    Only the single line directly after the atom block is read as the property
    line; lines outside this layout are skipped.

    NOTE(review): `pos` and `z` are stored on the Data object as plain Python
    lists, not tensors. Confirm that downstream code expects this.

    Args:
        file_path (str): Path to the Harvard OPV .data file.

    Returns:
        data_list (list): List of Data objects, one per "Conformer" record, each
            carrying `pos`, `z`, `homo`, `lumo`, `gap` and `conformer`.

    Raises:
        IndexError: If the file ends in the middle of a record or a line has
            fewer fields than expected.
        ValueError: If a numeric field cannot be parsed.
        KeyError: If `get_atomic_number` does not recognise an element symbol.
    """
    data_list = []

    with open(file_path, 'r') as f:
        lines = f.readlines()

    # Manual index so the record body can be consumed while scanning for headers.
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if line.startswith("Conformer"):
            # Parse the conformer number. Split on any whitespace so repeated
            # spaces or tabs after "Conformer" do not yield an empty field.
            conformer_number = int(line.split()[1])

            # Get the number of atoms
            i += 1
            num_atoms = int(lines[i].strip())

            # Parse atomic positions and numbers
            pos = []
            z = []
            for _ in range(num_atoms):
                i += 1
                atom_line = lines[i].strip().split()
                z.append(get_atomic_number(atom_line[0]))  # Convert element symbol to atomic number
                pos.append([float(atom_line[1]), float(atom_line[2]), float(atom_line[3])])

            # Parse the next line for property values (HOMO, LUMO, gap)
            i += 1
            properties = lines[i].strip().split(",")
            homo = float(properties[1])  # HOMO value
            lumo = float(properties[2])  # LUMO value
            gap = float(properties[3])  # Gap value

            # Create a Data object
            data = Data(
                pos=pos,                # Atomic positions
                z=z,                    # Atomic numbers
                homo=homo,              # HOMO value
                lumo=lumo,              # LUMO value
                gap=gap,                # Gap value
                conformer=conformer_number  # Conformer ID
            )
            data_list.append(data)

        i += 1  # Move to the next line

    return data_list


# Element symbols in order of atomic number (index 0 -> Z = 1).
_ELEMENT_SYMBOLS = (
    "H He Li Be B C N O F Ne "
    "Na Mg Al Si P S Cl Ar "
    "K Ca Sc Ti V Cr Mn Fe Co Ni Cu Zn Ga Ge As Se Br Kr "
    "Rb Sr Y Zr Nb Mo Tc Ru Rh Pd Ag Cd In Sn Sb Te I Xe "
    "Cs Ba La Ce Pr Nd Pm Sm Eu Gd Tb Dy Ho Er Tm Yb Lu "
    "Hf Ta W Re Os Ir Pt Au Hg Tl Pb Bi Po At Rn "
    "Fr Ra Ac Th Pa U Np Pu Am Cm Bk Cf Es Fm Md No Lr "
    "Rf Db Sg Bh Hs Mt Ds Rg Cn Nh Fl Mc Lv Ts Og"
).split()

# Built once at definition time; the lookup function is called once per atom.
_ATOMIC_NUMBERS = {
    element: number for number, element in enumerate(_ELEMENT_SYMBOLS, start=1)
}


def get_atomic_number(symbol):
    """
    Converts an atomic symbol to its atomic number.

    The symbol is first looked up exactly as given. If that fails and the
    input is a string, it is retried with surrounding whitespace removed and
    normalised to "first letter upper-case, rest lower-case", so inputs such
    as 'CL' or ' c ' are accepted.

    Args:
        symbol (str): Atomic symbol (e.g., 'C', 'H').

    Returns:
        atomic_number (int): Corresponding atomic number (1-118).

    Raises:
        KeyError: If the symbol does not match any known element.
    """
    atomic_number = _ATOMIC_NUMBERS.get(symbol)
    if atomic_number is None and isinstance(symbol, str):
        atomic_number = _ATOMIC_NUMBERS.get(symbol.strip().capitalize())
    if atomic_number is None:
        raise KeyError(f"Unknown element symbol: {symbol!r}")
    return atomic_number


# Example usage: parse the dataset file into `data_list`, which later cells read.
# NOTE(review): the path below is an absolute location on one machine; consider
# deriving it from a configurable data directory so the notebook runs elsewhere.
file_path = "/media/maria/work_space/capsule-3259363/data/HOPV_15_revised_2.data"  # Replace with the actual file path
data_list = read_harvard_opv(file_path)

# Print the first data object as an example
print(data_list[0])


Data(pos=[68], z=[68], homo=-0.187, lumo=-0.099, gap=0.088, conformer=1)


In [8]:
# Show the `z` attribute (atomic numbers) of the first parsed Data object
print(data_list[0].z)

[6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 16, 6, 6, 6, 6, 6, 16, 6, 6, 6, 16, 6, 8, 8, 16, 6, 16, 16, 6, 6, 6, 6, 6, 6, 6, 16, 6, 6, 6, 16, 6, 16, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [9]:
# Number of Data objects parsed from the file
print(len(data_list))

4855
