In [8]:
from pathlib import Path
import yaml

import pandas as pd
from rdkit import Chem

property_name = "Tg"

# Load the per-property settings. The context manager guarantees the file
# handle is closed; passing a bare open() to safe_load leaves it dangling.
with open(f"configs/{property_name}.yml", "r", encoding="utf-8") as config_file:
    configs = yaml.safe_load(config_file)

polymer = configs["polymer"]
# Copy the list so that appending "*" below does not modify
# configs["computable_atoms"] itself.
permitted_atomes = list(configs["computable_atoms"])
if polymer:
    # "*" is added to the allowed symbols only when the config flags the
    # dataset as polymer.
    permitted_atomes.append("*")
permitted_atomes_set = set(permitted_atomes)

file_path = Path(f"data/raw/{property_name}.csv")
prop_col = configs["dataset"]["prop_col_name"]
# A bare string would be splatted into single characters by `*`, so wrap it;
# a list/tuple of column names passes through unchanged.
prop_col_names = [prop_col] if isinstance(prop_col, str) else list(prop_col)

# Keep only the SMILES column and the configured property column(s).
df = pd.read_csv(file_path)
df = df[["smiles", *prop_col_names]]
display(df.head())
# Printed label reads "number of samples before preprocessing".
print("前処理前のサンプル数:", len(df))
def check_atoms(smiles):
    """Return ``smiles`` if it parses and contains only permitted atoms.

    Parameters
    ----------
    smiles : str
        SMILES string to validate. Values that RDKit cannot accept or parse
        are rejected rather than raising.

    Returns
    -------
    str or None
        The input unchanged when every atom symbol of the parsed molecule is
        in the module-level ``permitted_atomes_set``; ``None`` when parsing
        fails or at least one symbol is not permitted.
    """
    try:
        mol = Chem.MolFromSmiles(smiles)
    except Exception:
        # Best-effort filter: unusable input is dropped, but unlike a bare
        # `except:` this still lets KeyboardInterrupt/SystemExit propagate.
        return None
    if mol is None:
        # MolFromSmiles signals an unparsable SMILES by returning None.
        return None

    atom_symbols = {atom.GetSymbol() for atom in mol.GetAtoms()}
    # Non-strict subset test: a molecule that uses *every* permitted element
    # is still valid. The previous strict `>` comparison rejected that case.
    if atom_symbols <= permitted_atomes_set:
        return smiles
    return None

# Replace each SMILES that fails the atom check with None so it can be dropped.
df["smiles"] = df["smiles"].map(check_atoms)
# dropna() without `subset` removes a row if ANY column is missing, so rows
# lacking a property value are dropped together with the rejected SMILES.
df = df.dropna().reset_index(drop=True)

# Make sure the output directory exists; to_csv does not create it and would
# otherwise raise OSError on a fresh checkout.
processed_path = Path(f"./data/processed/{property_name}.csv")
processed_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(processed_path, index=False)
# Printed label reads "number of samples after preprocessing".
print("前処理後のサンプル数:", len(df))
df

Unnamed: 0,smiles,tg
0,*C*,-120.0
1,*CC*,-120.0
2,*CC(*)C,-3.0
3,*CC(*)CC,-24.1
4,*CC(*)CCC,-37.0


前処理前のサンプル数: 6696
前処理後のサンプル数: 6348


Unnamed: 0,smiles,tg
0,*C*,-120.0
1,*CC*,-120.0
2,*CC(*)C,-3.0
3,*CC(*)CC,-24.1
4,*CC(*)CCC,-37.0
...,...,...
6343,*CC(*)(F)C(=O)OCCC,62.0
6344,*CC(F)(F)C1(F)C(*)CC(O)(C(F)(F)F)C1(F)F,152.0
6345,*CC(F)(F)C1(F)CC(CC(O)(C(F)(F)F)C(F)(F)F)CC1*,98.0
6346,*CC(F)(F)C1(F)CC(C(O)(C(F)(F)F)C(F)(F)F)CC1*,118.0
