# In [None]:
"""
Developed for Linux systems only.
On Windows the end of line is '\r\n', whereas on Linux the end of line is '\n',
so the end of line is forced to '\n' in the open() call.
The workflow is: get the PDB files from the SMILES file, then
traverse the PDB files to read the atom coordinates, then
write the gjf input files for the Gaussian 16 calculations, then
write the bash file for running the gjf files, then
write the Slurm file for submitting the jobs to the cluster.
"""

import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import AllChem
import os

# # Read the SMILES file
# with open('smiles_screen.csv', 'r') as file:
#     smiles = file.read().strip()
#     smiles = smiles.split('\n')

#     for i, smile in enumerate(smiles):
#         print(i)
#         print(smile)
#         try:
#             # Convert the SMILES string into an RDKit molecule object
#             molecule = Chem.MolFromSmiles(smile)
#             heavymolecule = molecule.GetNumHeavyAtoms()
#             # Generate the 3D coordinates
#             molecule = Chem.AddHs(molecule)
#             AllChem.EmbedMolecule(molecule)
#             AllChem.UFFOptimizeMolecule(molecule)
#             if not os.path.exists('PDB'):
#                 os.makedirs('PDB')

#             # Write the molecule object to a PDB file
#             with open(f'PDB/ext_{i}.pdb', 'w') as file:
#                 file.write(Chem.MolToPDBBlock(molecule))
#         except Exception as e:
#             print(f"Error processing SMILES {smile} at index {i}: {e}")


# Collect every PDB file stored in the 'PDB' folder of the working directory
def get_pdb_files():
    """Return the paths of all ``.pdb`` files inside the ``PDB`` folder.

    The folder is resolved relative to the current working directory.
    ``os.listdir`` gives no ordering guarantee, so the names are sorted
    (lexicographically) to make any numbering derived from this list
    reproducible between runs and file systems.

    Returns:
        list: ``os.path.join('PDB', name)`` for every entry whose name ends
        with ``.pdb``; an empty list when the folder is missing or holds no
        such entry.
    """
    pdb_folder = 'PDB'
    if not os.path.isdir(pdb_folder):
        return []
    return [
        os.path.join(pdb_folder, entry)
        for entry in sorted(os.listdir(pdb_folder))
        if entry.endswith('.pdb')
    ]

# Read the atom coordinates from one PDB file
def read_pdb_file(file):
    """Parse the HETATM records of a PDB file into element/coordinate rows.

    Each HETATM line is split on whitespace, so the position of the
    coordinates depends on how many fields the line contains (adapt the
    layouts below to the PDB flavour actually in use):

    * 11 or 9 fields -- x, y, z are fields 5, 6, 7 (0-based);
    * 8 fields       -- x, y, z are fields 4, 5, 6.

    The element symbol is derived from the atom name (field 2):

    * a one-character name is used unchanged;
    * two leading letters give a two-letter symbol with the second letter
      lower-cased (``CL1`` -> ``Cl``);
    * a letter followed by anything else gives the single letter
      (``C12`` -> ``C``, ``O'`` -> ``O``).

    Records with another field count, or whose multi-character atom name
    does not start with a letter, cannot be interpreted; they are skipped
    and a message is printed so that an incomplete geometry does not go
    unnoticed.

    Args:
        file: Path of the PDB file to read.

    Returns:
        numpy.ndarray: Object array holding one ``[symbol, x, y, z]`` row
        per accepted atom (``x``, ``y``, ``z`` are floats). The array is
        empty when no record is accepted.

    Raises:
        ValueError: If a coordinate field cannot be converted to ``float``.
    """
    with open(file, 'r') as f:
        records = [line.split() for line in f if line.startswith('HETATM')]

    pos = []
    for number, fields in enumerate(records, start=1):
        # Check the field count first: indexing a short record would raise.
        if len(fields) in (9, 11):
            first = 5
        elif len(fields) == 8:
            first = 4
        else:
            print(f"Skipping HETATM record {number} in {file}: "
                  f"unexpected number of fields ({len(fields)})")
            continue
        x, y, z = map(float, fields[first:first + 3])

        atom_name = fields[2]
        if len(atom_name) == 1:
            symbol = atom_name
        elif not atom_name[0].isalpha():
            print(f"Skipping HETATM record {number} in {file}: "
                  f"cannot derive an element from atom name {atom_name!r}")
            continue
        elif atom_name[1].isalpha():
            symbol = atom_name[0] + atom_name[1].lower()
        else:
            # Digit or punctuation after the first letter: one-letter element.
            symbol = atom_name[0]
        pos.append([symbol, x, y, z])

    return np.array(pos, dtype=object)

# Create the sub-folder that will hold the gjf files to be written
def make_dir():
    """Ensure the ``gjf_files`` folder exists in the current working directory.

    Calling it again when the folder is already present is a no-op. The
    creation is done in a single call instead of "check, then create", so
    there is no window in which the folder can appear between the two steps.

    Raises:
        FileExistsError: If ``gjf_files`` exists but is not a directory.
    """
    os.makedirs('gjf_files', exist_ok=True)
        
# Write the gjf input files
def write_gjf(nproc=48, mem='80GB', charge=0, multiplicity=1):
    """Write one Gaussian input (.gjf) file per PDB file, each in its own folder.

    The PDB files come from ``get_pdb_files()`` and their atoms from
    ``read_pdb_file()``. The i-th file (counting from 1) is written to
    ``gjf_files<i>/<pdb name without extension>.gjf``; the folder is created
    when missing. Each input holds two job steps separated by ``--link1--``:

    1. route ``# opt=Cartesian b3lyp/def2svp em=gd3bj`` with the title, the
       charge/multiplicity line and the atom list, checkpointed to
       ``<name>.chk``;
    2. route ``# b3lyp/def2tzvp em=gd3bj geom=allcheck`` with ``%oldchk``
       pointing at ``<name>.chk`` and ``%chk`` at ``<name>sp.chk``.

    With the default arguments the generated text is identical to what the
    previously hard-coded values produced.

    Args:
        nproc: Value written after ``%nprocshared=`` in both job steps.
        mem: Value written after ``%mem=`` in both job steps.
        charge: Total charge written on the charge/multiplicity line.
        multiplicity: Spin multiplicity written on the same line.
    """
    link0_resources = f'%nprocshared={nproc}\n%mem={mem}\n'

    # One folder per input file; the folder suffix starts at 1.
    for index, pdb_path in enumerate(get_pdb_files(), start=1):
        folder_name = f'gjf_files{index}'
        os.makedirs(folder_name, exist_ok=True)

        # get_pdb_files() only returns names ending in '.pdb', so dropping
        # the last four characters of the base name removes the extension.
        stem = os.path.basename(pdb_path)[:-4]
        atoms = read_pdb_file(pdb_path)

        # newline='\n' keeps Linux line endings whatever platform runs this.
        gjf_filename = os.path.join(folder_name, stem + '.gjf')
        with open(gjf_filename, 'w', newline='\n') as f:
            # Job step 1
            f.write(link0_resources)
            f.write(f'%chk={stem}.chk\n')
            f.write('# opt=Cartesian b3lyp/def2svp em=gd3bj  \n\n')
            f.write('Title Card Required\n\n')
            f.write(f'{charge} {multiplicity}\n')
            for symbol, x, y, z in atoms:
                f.write(' {:<2} {:>12.6f} {:>12.6f} {:>12.6f}\n'.format(
                    symbol, float(x), float(y), float(z)))
            f.write('\n\n')

            # Job step 2: uses the same base name as step 1 for the
            # checkpoint files so that %oldchk matches the %chk above.
            f.write('--link1--\n')
            f.write(link0_resources)
            f.write(f'%oldchk={stem}.chk\n')
            f.write(f'%chk={stem}sp.chk\n')
            f.write('# b3lyp/def2tzvp em=gd3bj geom=allcheck\n\n\n')

# Generate a bash file per job folder for running g16 in batch
def write_bash():
    """Write a ``run_g16.sh`` script into every consecutive ``gjf_files<N>`` folder.

    Folders are visited as ``gjf_files1``, ``gjf_files2``, ... and the scan
    stops at the first number for which no such folder exists. Each script
    starts with a bash shebang and contains one line
    ``g16 < <name>.gjf | tee <name>.out`` per ``.gjf`` file of its folder,
    in sorted order so the script content is reproducible.

    The file is opened with ``newline='\\n'`` so the script keeps Linux line
    endings even when it is generated on another platform.
    """
    folder_index = 1  # folder suffix starts at 1
    while os.path.isdir(f'gjf_files{folder_index}'):
        folder_name = f'gjf_files{folder_index}'
        gjf_names = sorted(
            name for name in os.listdir(folder_name) if name.endswith('.gjf')
        )

        script_path = os.path.join(folder_name, 'run_g16.sh')
        with open(script_path, 'w', newline='\n') as f:
            f.write('#!/bin/bash\n')
            for name in gjf_names:
                f.write(f'g16 < {name} | tee {name[:-4]}.out\n')

        folder_index += 1

# Write a Slurm file per job folder for submitting the jobs in batch
def write_slurm():
    """Write a ``gaussian_slurm.sh`` script into every consecutive ``gjf_files<N>`` folder.

    Folders are visited as ``gjf_files1``, ``gjf_files2``, ... and the scan
    stops at the first number for which no such folder exists. Each script
    consists of the fixed ``#SBATCH`` header, one ``GJF=<file>`` assignment
    per ``.gjf`` file of the folder, the ``gaussbin=g16`` assignment, the
    line sourcing the Gaussian profile and finally the command
    ``$gaussbin  $GJF``.

    The command line previously read ``$guassbin  $GJF``: the misspelled
    variable is never assigned, so it expanded to nothing and the shell
    tried to execute the input file itself. It now uses the variable that
    the script actually defines.

    The file is opened with ``newline='\\n'`` so the script keeps Linux line
    endings even when it is generated on another platform.
    """
    header_lines = (
        '#!/bin/bash\n',
        '#SBATCH -o Gaussian.%j.out\n',
        '#SBATCH -J Gaussian\n',
        '#SBATCH --nodes=1\n',
        '#SBATCH --ntasks-per-node=48\n',
        '#SBATCH --exclude=node[0100,0121,0181,0227,0233-0234,0235,0283,0364,0371,0382-0385,0405,0547-0548,0575,0599,0602-0603,0664-0665,0734,0819,0824]\n\n',
    )

    folder_index = 1  # folder suffix starts at 1
    while os.path.isdir(f'gjf_files{folder_index}'):
        folder_name = f'gjf_files{folder_index}'
        gjf_names = sorted(
            name for name in os.listdir(folder_name) if name.endswith('.gjf')
        )

        slurm_filename = os.path.join(folder_name, 'gaussian_slurm.sh')
        with open(slurm_filename, 'w', newline='\n') as f:
            f.writelines(header_lines)
            # NOTE: every assignment overwrites the previous one, so with
            # several .gjf files in a folder only the last (in sorted order)
            # is the one the script runs.
            f.writelines(f'GJF={name}\n' for name in gjf_names)
            f.write('gaussbin=g16\n')
            f.write('source /public/software/gaussian/g16/bsd/g16.profile\n\n')
            f.write('$gaussbin  $GJF\n')

        folder_index += 1

# Script entry: the order matters. write_gjf() creates the gjf_files<N>
# folders, and write_slurm() only visits folders that already exist.
write_gjf()
# write_bash()  # disabled: would also write a run_g16.sh script into each folder
write_slurm()
