In [2]:
def find_repeated_smiles(file_content: str) -> set:
    """Return the SMILES strings that follow more than one ``<LIGAND>`` tag.

    The text is scanned line by line; the line right after every ``<LIGAND>``
    line is taken as that block's SMILES string (surrounding whitespace is
    ignored when comparing).

    Args:
        file_content: Full text of the ligand file.

    Returns:
        The set of SMILES strings seen at least twice. Empty when there are
        no duplicates (or no ``<LIGAND>`` tags at all).
    """
    lines = file_content.strip().split("\n")
    seen = set()
    repeated = set()

    # Pair each line with its successor. zip() stops one short of the end, so
    # a dangling <LIGAND> on the very last line is ignored instead of raising
    # IndexError on the missing SMILES line.
    for tag, following in zip(lines, lines[1:]):
        if tag != "<LIGAND>":
            continue
        smiles = following.strip()
        if smiles in seen:
            repeated.add(smiles)
        else:
            seen.add(smiles)

    return repeated

In [1]:
# Read the raw ligand file into memory. The encoding is given explicitly so the
# result does not depend on the platform's default locale encoding.
with open('../data/xyz_mols/input.txt', 'r', encoding='utf-8') as f:
    file_content = f.read()

In [3]:
# Collect every SMILES string that appears under more than one <LIGAND> tag
# in the raw input, and display the resulting set.
repeated_smiles = find_repeated_smiles(file_content)
repeated_smiles

{'N[C@@H]1C[S@](=O)C[C@@H]1C(=O)O',
 'N[C@@H]1C[S@](=O)C[C@H]1C(=O)O',
 'OC1C2OC3OC1[C@H](O)C(O3)[C@H]2O',
 'O[C@H]1[C@H](O)[C@@H](O)[C@@H](O)[C@H](O)[C@H]1O',
 'O[C@H]1[C@H](O)[C@H](O)[C@H](O)[C@@H](O)[C@H]1O'}

In [4]:
def remove_repeated_smiles(file_content: str) -> str:
    """Drop every ligand block whose SMILES string was already seen.

    A block starts at a ``<LIGAND>`` line, whose next line is the SMILES
    string, and ends at the following ``<eos>`` line. The first block for a
    given SMILES (compared after stripping whitespace) is kept verbatim; later
    blocks with the same SMILES are removed, ``<eos>`` included.

    Malformed input is handled conservatively:

    * a ``<LIGAND>`` on the very last line (no SMILES after it) is copied
      through unchanged instead of raising ``IndexError``;
    * a duplicate block that lacks its ``<eos>`` is removed only up to the
      next ``<LIGAND>``, so the following block is not swallowed with it.

    Args:
        file_content: Full text of the ligand file.

    Returns:
        The remaining lines joined with ``"\\n"``. Leading and trailing
        whitespace of the input is stripped, so the result has no trailing
        newline.
    """
    lines = file_content.strip().split("\n")
    total = len(lines)
    seen = set()
    kept = []
    i = 0

    while i < total:
        line = lines[i]

        # Ordinary line, or a dangling <LIGAND> with nothing after it.
        if line != "<LIGAND>" or i + 1 >= total:
            kept.append(line)
            i += 1
            continue

        smiles = lines[i + 1].strip()
        if smiles not in seen:
            seen.add(smiles)
            kept.append(line)          # <LIGAND>
            kept.append(lines[i + 1])  # SMILES, kept exactly as written
            i += 2
            continue

        # Duplicate: skip the block body. Stop at <eos> (consumed below) or at
        # the next <LIGAND> (left in place so that block is still processed).
        i += 1
        while i < total and lines[i] not in ("<eos>", "<LIGAND>"):
            i += 1
        if i < total and lines[i] == "<eos>":
            i += 1

    return "\n".join(kept)

In [5]:
# Build the de-duplicated text: only the first block for each SMILES is kept.
cleaned_content = remove_repeated_smiles(file_content)
# NOTE(review): the bare expression below echoes the entire cleaned text as the
# cell output; consider displaying a slice if only a preview is needed.
cleaned_content

'<LIGAND>\nCn1c(=O)c2c(ncn2C)n(C)c1=O\n<XYZ>\nC 3.2932 0.3895 0.2537\nN 2.1108 -0.4198 0.1026\nC 2.0698 -1.7874 0.0333\nN 0.8358 -2.2304 -0.1052\nC 0.0712 -1.1032 -0.1240\nC 0.8238 0.0247 0.0014\nC 0.2701 1.3335 0.0113\nO 0.9576 2.3440 0.1250\nN -1.1216 1.3332 -0.1210\nC -1.8079 2.6102 -0.1253\nC -1.9352 0.1881 -0.2531\nO -3.1606 0.2905 -0.3648\nN -1.2957 -1.0520 -0.2516\nC -2.0603 -2.2787 -0.3829\n<eos>\n<LIGAND>\nO=C(O)[C@@H]1/C(=C/CO)O[C@@H]2CC(=O)N12\n<XYZ>\nO 0.3990 2.7262 -1.1579\nC -0.1430 2.3432 -0.1315\nO -0.9607 3.1860 0.5403\nC 0.1305 1.0292 0.6271\nC 0.6167 -0.2112 -0.1060\nC 1.4080 -0.2522 -1.1889\nC 1.8543 -1.4905 -1.8916\nO 3.2699 -1.5542 -1.8172\nO 0.1589 -1.4067 0.4791\nC -0.7524 -0.9707 1.5189\nC -2.2552 -1.1558 1.2731\nC -2.1894 0.2998 0.8051\nO -2.8553 1.0092 0.0896\nN -0.9694 0.4876 1.4145\n<eos>\n<LIGAND>\nCn1c(=O)c2ncnnc2n(C)c1=O\n<XYZ>\nC -0.7375 2.5467 0.0523\nN -0.2162 1.1884 0.0249\nC -1.1452 0.1366 0.1170\nN -2.4623 0.3968 0.2477\nN -3.3494 -0.6399 0.3272\nC

In [6]:
# Re-run the duplicate search on the cleaned text; an empty set means no
# SMILES string is left under more than one <LIGAND> tag.
# Note: this rebinds `repeated_smiles`, replacing the result for the raw input.
repeated_smiles = find_repeated_smiles(cleaned_content)
repeated_smiles

set()

In [7]:
def check_format(file_content: str) -> tuple:
    """Check that the text is a sequence of well-formed ligand blocks.

    Each block is expected to be laid out as::

        <LIGAND>
        <SMILES string>
        <XYZ>
        <zero or more lines not starting with "<">
        <eos>

    Line numbers in the reported issues are 1-based and refer to the text
    after leading/trailing whitespace has been stripped.

    After reporting a problem the scan resynchronises at the next
    ``<LIGAND>`` line, so a single broken block does not make the lines of
    the following blocks show up as errors too.

    Args:
        file_content: Full text of the ligand file.

    Returns:
        A ``(valid_format, issues)`` pair: ``valid_format`` is ``True`` when
        no issue was found, and ``issues`` is the list of human-readable
        messages (empty when valid).
    """
    lines = file_content.strip().split("\n")
    total = len(lines)
    issues = []
    i = 0

    while i < total:
        if lines[i] != "<LIGAND>":
            issues.append(f"Line {i+1}: Expected <LIGAND>, found '{lines[i]}'")
            i += 1
            continue

        smiles_idx = i + 1
        # A line starting with "<" is one of the file's tags, not a SMILES
        # string: the SMILES line is missing altogether.
        tag_instead_of_smiles = smiles_idx < total and lines[smiles_idx].startswith("<")
        if (
            smiles_idx >= total
            or tag_instead_of_smiles
            or not lines[smiles_idx].strip()  # empty or whitespace-only
        ):
            issues.append(f"Line {smiles_idx + 1}: Missing SMILES string after <LIGAND>")
        # Leave a tag where it is so it is still checked as <XYZ> below;
        # otherwise step over the SMILES line.
        i = smiles_idx if tag_instead_of_smiles else smiles_idx + 1

        if i >= total or lines[i] != "<XYZ>":
            issues.append(
                f"Line {i+1}: Expected <XYZ>, found '{lines[i]}'"
                if i < total
                else f"Line {i+1}: Missing <XYZ>"
            )
            # Skip the offending line unless it opens the next block.
            if not (i < total and lines[i] == "<LIGAND>"):
                i += 1
            continue

        i += 1  # first line after <XYZ>

        # Everything up to the next tag is treated as atomic coordinates.
        while i < total and not lines[i].startswith("<"):
            i += 1

        if i >= total or lines[i] != "<eos>":
            issues.append(
                f"Line {i+1}: Expected <eos>, found '{lines[i]}'"
                if i < total
                else f"Line {i+1}: Missing <eos>"
            )
            # Skip the offending tag unless it opens the next block; this
            # avoids reporting the same line a second time as a bad <LIGAND>.
            if not (i < total and lines[i] == "<LIGAND>"):
                i += 1
            continue

        i += 1  # move to the next block

    return not issues, issues

# Validate the block layout of the de-duplicated text. The result is a
# (valid_format, issues) pair; (True, []) means no issue was found.
valid_format, format_issues = check_format(cleaned_content)
valid_format, format_issues

(True, [])

In [8]:
# Save the cleaned content next to the original input. The encoding is given
# explicitly so the written bytes do not depend on the platform's default
# locale encoding.
with open('../data/xyz_mols/cleaned_input.txt', 'w', encoding='utf-8') as f:
    f.write(cleaned_content)