# In [None]:
import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import AllChem
import os
import pandas as pd

# read excel file
def read_excel(file):
    """Convert every SMILES string in an Excel sheet into a 3D PDB file.

    Reads the ``SMILES`` column of sheet ``Sheet1``, builds an RDKit molecule
    for each entry, adds hydrogens, embeds 3D coordinates, runs a UFF
    optimization and writes the result to ``PDB/ext_<row index>.pdb``.
    Rows that fail at any step are reported on stdout and skipped, so one bad
    SMILES does not abort the whole batch.

    Args:
        file: Path (or file-like object) accepted by ``pandas.read_excel``.

    Returns:
        None. Output is written to the ``PDB`` folder, which is created in the
        current working directory if it does not exist.
    """
    df = pd.read_excel(file, sheet_name='Sheet1')
    smiles = df['SMILES'].values

    # Create the output folder once instead of re-checking it for every row.
    os.makedirs('PDB', exist_ok=True)

    for i, smile in enumerate(smiles):
        print(i)
        print(smile)
        try:
            # Convert the SMILES string into an RDKit molecule object.
            molecule = Chem.MolFromSmiles(smile)
            if molecule is None:
                # MolFromSmiles signals a parse failure by returning None.
                raise ValueError('RDKit could not parse the SMILES string')

            # Generate 3D coordinates (hydrogens are needed for a sensible geometry).
            molecule = Chem.AddHs(molecule)
            if AllChem.EmbedMolecule(molecule) < 0:
                # EmbedMolecule returns -1 when no conformer could be generated.
                raise ValueError('3D embedding failed, no conformer generated')

            status = AllChem.UFFOptimizeMolecule(molecule)
            if status != 0:
                # Non-zero means not converged (1) or force field setup failed (-1);
                # the geometry is still written, as before, but the user is told.
                print(f"Warning: UFF optimization returned status {status} "
                      f"for SMILES {smile} at index {i}")

            # Write the molecule to a PDB file.
            with open(f'PDB/ext_{i}.pdb', 'w') as pdb_file:
                pdb_file.write(Chem.MolToPDBBlock(molecule))
        except Exception as e:
            print(f"Error processing SMILES {smile} at index {i}: {e}")

# Collect all PDB files in the PDB folder
def get_pdb_files():
    """Return the paths of all ``.pdb`` files directly inside the ``PDB`` folder.

    Returns:
        list[str]: ``os.path.join('PDB', name)`` for every directory entry
        whose name ends with ``.pdb``, in the order ``os.listdir`` yields
        them. An empty list when ``PDB`` does not exist.
    """
    folder = 'PDB'
    if not os.path.exists(folder):
        return []
    return [
        os.path.join(folder, entry)
        for entry in os.listdir(folder)
        if entry.endswith('.pdb')
    ]

# Read the atom coordinates from a single PDB file
def _pdb_fields_from_columns(record):
    """Return (atom_name, x, y, z) read from the standard PDB columns, or None."""
    if len(record) < 54:
        return None
    name = record[12:16].strip()  # atom name: columns 13-16
    if not name:
        return None
    try:
        x = float(record[30:38])  # columns 31-38
        y = float(record[38:46])  # columns 39-46
        z = float(record[46:54])  # columns 47-54
    except ValueError:
        return None
    return name, x, y, z


def _pdb_fields_from_tokens(record):
    """Return (atom_name, x, y, z) using whitespace-token positions, or None."""
    tokens = record.split()
    # Token layouts the script has always accepted; adjust to the actual PDB flavour.
    if len(tokens) in (9, 11):
        first = 5
    elif len(tokens) == 8:
        first = 4
    else:
        return None
    x, y, z = map(float, tokens[first:first + 3])
    return tokens[2], x, y, z


def _element_from_atom_name(name):
    """Derive an element symbol from a PDB atom name such as ``C1`` or ``CL3``."""
    if len(name) == 1:
        return name
    if name[0].isalpha() and name[1].isdigit():
        return name[0]
    if name[0].isalpha() and name[1].isalpha():
        return name[0] + name[1].lower()
    return None


def read_pdb_file(file):
    """Read element symbols and Cartesian coordinates from the HETATM records of a PDB file.

    Coordinates are taken from the fixed PDB columns (31-54) first, because
    splitting on whitespace loses atoms whenever adjacent fields touch (for
    example ``-100.000-100.000``) or an optional field changes the token
    count. Lines that do not follow the column layout fall back to the
    whitespace-token positions (8, 9 or 11 tokens).

    The element symbol is derived from the atom name: a single character is
    used as is, ``<letter><digit>...`` yields the letter, and
    ``<letter><letter>...`` yields the two letters with the second lowercased.
    Records whose name matches none of these are skipped.

    Args:
        file: Path of the PDB file to read.

    Returns:
        numpy.ndarray: Object array with one ``[element, x, y, z]`` row per
        accepted atom (shape ``(n, 4)``), or an empty array of shape ``(0,)``
        when no atom was accepted.

    Raises:
        ValueError: If a line handled by the token fallback has a
            non-numeric coordinate token.
    """
    pos = []
    with open(file, 'r') as f:
        records = [raw.rstrip('\r\n') for raw in f if raw.startswith('HETATM')]

    for record in records:
        fields = _pdb_fields_from_columns(record)
        if fields is None:
            fields = _pdb_fields_from_tokens(record)
        if fields is None:
            continue

        atom_name, x, y, z = fields
        element = _element_from_atom_name(atom_name)
        if element is not None:
            pos.append([element, x, y, z])
    return np.array(pos, dtype=object)

# Create a subfolder to hold the gjf files that are about to be written
def make_dir():
    """Create the ``gjf_files`` folder in the current working directory.

    Does nothing when something already exists at that path.
    """
    target = 'gjf_files'
    if os.path.exists(target):
        return
    os.mkdir(target)
        
# Write the gjf files
def write_gjf():
    """Write one Gaussian input file (``.gjf``) per PDB file in the ``PDB`` folder.

    For every path returned by ``get_pdb_files`` the atoms are read with
    ``read_pdb_file`` and written to ``gjf_files/<stem>.gjf``, where
    ``<stem>`` is the PDB file name without its last four characters. Each
    file has two sections: the first uses the route line
    ``# opt freq b3lyp/def2svp em=gd3bj`` and lists the atoms; the second,
    after ``--link1--``, names the first section's checkpoint as ``%oldchk``
    and uses ``# b3lyp/def2tzvp em=gd3bj geom=allcheck``.

    Returns:
        None.
    """
    suffix_len = len('.pdb')
    pdb_paths = get_pdb_files()

    make_dir()
    for pdb_path in pdb_paths:
        atoms = read_pdb_file(pdb_path)
        # Keep only the file name, then drop the extension.
        stem = os.path.basename(pdb_path)[:-suffix_len]

        header = [
            '%nprocshared=14\n',
            '%mem=4GB\n',
            '%chk=' + stem + '.chk\n',
            '# opt freq b3lyp/def2svp em=gd3bj  \n\n',
            'Title Card Required\n\n',
            # Charge and spin multiplicity; edit by hand as needed.
            '0 1\n',
        ]
        footer = [
            '\n\n',
            '--link1--\n',
            '%oldchk=' + stem + '.chk\n',
            '%chk=' + stem + 'sp' + '.chk\n',
            '# b3lyp/def2tzvp em=gd3bj geom=allcheck\n\n',
        ]

        with open('gjf_files/' + stem + '.gjf', 'w') as out:
            out.writelines(header)
            # One line per atom: element symbol followed by x, y, z.
            for symbol, x, y, z in atoms:
                out.write(
                    f' {symbol:<2} {float(x):>12.6f} {float(y):>12.6f} {float(z):>12.6f}\n'
                )
            out.writelines(footer)
            
# Generate a bash script for running g16 on all inputs in batch
def write_bash():
    """Write ``gjf_files/run_g16.sh``, a script that runs g16 on every input.

    One command is emitted per path returned by ``get_pdb_files``, of the
    form ``g16 < <stem>.gjf |tee <stem>.out``. ``<stem>`` is the PDB file
    name without folder and extension, derived with ``os.path.basename`` as
    in ``write_gjf`` rather than by slicing off a fixed number of leading
    characters. The commands use bare file names, so the script presumably
    has to be run from inside ``gjf_files`` (confirm against how it is
    launched).

    Returns:
        None. ``gjf_files`` is created if it does not exist yet, so this
        function can be called without ``write_gjf`` having run first.
    """
    os.makedirs('gjf_files', exist_ok=True)
    suffix_len = len('.pdb')
    with open('gjf_files/' + 'run_g16.sh', 'w') as f:
        f.write('#!/bin/bash\n')
        for file in get_pdb_files():
            stem = os.path.basename(file)[:-suffix_len]
            f.write('g16 < ' + stem + '.gjf |tee ' + stem + '.out\n')

     
# Run the whole pipeline in order: SMILES spreadsheet -> PDB files ->
# Gaussian .gjf inputs -> batch script that runs g16 on each input.
# NOTE(review): these calls also execute when the file is imported; consider
# an `if __name__ == "__main__":` guard once it is confirmed that nothing
# relies on import-time execution.
read_excel('selected_data_with_smiles.xlsx')
write_gjf()
write_bash()

