## Select the top-1 molecule that has the specific properties required for the task

In [None]:

import os
import re
import datetime
from mordred import Calculator, descriptors
from rdkit import Chem
import numpy as np
import pandas as pd
from rdkit.Chem import PandasTools
import sys
import torch
# get encoder info in X
import sys
sys.path.append('VAE_model/cpu')
from fast_jtnn import *
from rdkit import Chem

# VAE checkpoint and vocabulary locations (relative to the notebook's working directory)
model_path = "VAE_model/model.epoch-39"
vocab_path = "VAE_model/model.epoch-39/smi_vocab-2.txt"


# Load the vocabulary: one entry per line, stripped of line endings and spaces.
# A context manager is used so the file handle is closed deterministically.
with open(vocab_path) as vocab_file:
    vocab = [x.strip("\r\n ") for x in vocab_file]
vocab = Vocab(vocab)

# Build the junction-tree VAE, restore its weights onto the CPU and switch it
# to inference mode.
# NOTE(review): torch.load unpickles the checkpoint; only load trusted files.
vae_model = JTNNVAE(vocab, hidden_size=450, latent_size=32, depthT=3, depthG=20)
vae_model.load_state_dict(torch.load(model_path, map_location='cpu'))
vae_model.cpu()
vae_model.eval()

In [None]:
# Build one "<date>pvk_Report.csv" per optimisation method.
#
# For every "Result*" folder of a method, each "*.sh.o*" log is parsed and the
# best-ranked molecule (among the first 10 ranks) that the VAE can encode and
# that has valid Mordred descriptors is kept. The kept rows are then sorted by
# fitness and the top 10 are written to the report.
today = datetime.date.today().strftime("%Y%m%d")

# NOTE: `methods` is reassigned several times; only the LAST assignment takes
# effect. The earlier ones are kept as a record of previous runs.
#methods = ['ABC', 'GA', 'PSO', 'SA']
methods = ['PSO', 'ABC']
methods = ['PSO/round6', 'PSO/round7', 'PSO/round8', 'PSO/round9',  'PSO/round10']
methods = ['BO/round1', 'BO/round2', 'BO/round3', 'BO/round4','BO/round5','BO/round6', 'BO/round7', 'BO/round8', 'BO/round9',  'BO/round10']
methods = ['Random/round1', 'Random/round2', 'Random/round3', 'Random/round4', 'Random/round5','Random/round6', 'Random/round7', 'Random/round8', 'Random/round9', 'Random/round10']

methods = ['PSO-re/round1', 'PSO-re/round2', 'PSO-re/round3', 'PSO-re/round4',  'PSO-re/round5',
'PSO-re/round6', 'PSO-re/round7', 'PSO-re/round8','PSO-re/round9',  'PSO-re/round10']


methods = ['ABC_T/round4', 'ABC_T/round5',
            'ABC_T/round6', 'ABC_T/round7', 'ABC_T/round8', 'ABC_T/round9', 'ABC_T/round10']

methods = [ 'ABC_T/round5',
            'ABC_T/round6', 'ABC_T/round7','ABC_T/round9']

subfolder = 'c2'

# Mordred descriptors a molecule must provide (numeric, non-missing) to be kept.
required_descriptors = ['ATSC5v', 'AATSC5Z', 'MATS8se']

# The descriptor calculator does not depend on the molecule, so build it once
# instead of once per candidate.
calc = Calculator(descriptors, ignore_3D=True)

for method in methods:
    print(f"Processing {method} method...")
    # Directory that holds this method's "Result*" folders for the chosen cycle
    folder_path_cn = f'/home/ianlee/opt_ian/Model_Create_and_Results1/Pvk_additives/1_Preprocessing_for_Analysis/{method}/{subfolder}'
    folder_list = [
        item for item in os.listdir(folder_path_cn)
        if os.path.isdir(os.path.join(folder_path_cn, item)) and item.startswith("Result")
    ]

    data = []
    for folder_name in folder_list:
        folder_path = os.path.join(folder_path_cn, folder_name)
        if not os.path.exists(folder_path):
            print(f"Folder '{folder_name}' does not exist.")
            continue

        file_list = [file for file in os.listdir(folder_path) if '.sh.o' in file]
        for file in file_list:
            print(f'Processing {file}')
            file_path = os.path.join(folder_path, file)
            with open(file_path, 'r') as f:
                file_content = f.read()

            # The ranking summary is the last blank-line-separated section of
            # the log; individual ranks are separated by a dashed line.
            rank_details = file_content.split('\n\n')[-1]
            ranks = rank_details.split("-------------------------")

            # Walk the first 10 ranks in order and keep the first valid one.
            # Slicing (instead of indexing 0..9) tolerates logs with fewer ranks.
            for rank_block in ranks[:10]:
                rank_data = [line.strip() for line in rank_block.split('\n') if line]
                if len(rank_data) < 12:
                    # Incomplete entry (e.g. trailing fragment after the last
                    # separator): indices up to 11 are read below.
                    continue
                file_data = {
                    "Folder Name": folder_name,
                    "Smiles": rank_data[1].split(": ")[1].strip(),
                    "Fitness": rank_data[2].split(": ")[1].strip(),
                    "Reagent1_(ul)": rank_data[5].split(": ")[1].strip(),
                    "Reagent2_(ul)": rank_data[6].split(": ")[1].strip(),
                    "Reagent3_(ul)": rank_data[7].split(": ")[1].strip(),
                    "Reagent4_(ul)": rank_data[8].split(": ")[1].strip(),
                    "lab_code": rank_data[9].split(": ")[1].strip(),
                    "crystal_size": rank_data[11].split(": ")[1].strip()}
                try:
                    # Called only as a validity check: the molecule must be
                    # encodable by the VAE (the result itself is not used).
                    vae_model.encode_latent_mean([file_data['Smiles']])
                    mol = Chem.MolFromSmiles(file_data['Smiles'])
                    mordred_data = calc.pandas([mol])
                except Exception as exc:
                    # Catch Exception (not a bare except) so Ctrl-C still
                    # interrupts the run, and report what actually failed.
                    print(f"Error: {file_data['Smiles']}", "encode falied", repr(exc))
                    continue

                # Keep only columns that are numeric and free of missing values
                valid_columns = (
                    mordred_data.dropna(axis='columns')
                    .select_dtypes(include='number')
                    .columns
                )
                if all(item in valid_columns for item in required_descriptors):
                    data.append(file_data)
                    # Top-1 found for this log: skip the (expensive)
                    # evaluation of the remaining ranks.
                    break

    if not data:
        # Nothing to report; building the frame below would fail on 'Smiles'.
        print(f"No valid molecule found for {method}; report not created.")
        continue

    df = pd.DataFrame(data)
    # Compute the Mordred descriptors for the selected molecules
    smiles_list = list(df['Smiles'])
    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]
    mordred_df = calc.pandas(mols)

    df = pd.concat([df] + [mordred_df[col] for col in required_descriptors], axis=1)

    # "crystal_size" holds two comma-separated numbers; the second one is
    # stored as "crystal_size_Std".
    df[["crystal_size", "crystal_size_Std"]] = df["crystal_size"].str.split(",", expand=True).astype(float)
    df["Fitness"] = df["Fitness"].astype(float)
    df = df.sort_values(by='Fitness', ascending=False)
    df.reset_index(drop=True, inplace=True)
    df = df[['Folder Name', 'Smiles', 'Fitness', 'ATSC5v', 'AATSC5Z',
            'MATS8se', 'Reagent1_(ul)', 'Reagent2_(ul)', 'Reagent3_(ul)', 'Reagent4_(ul)', 'lab_code',
            'crystal_size', 'crystal_size_Std']]
    # dropna returns a new frame; the result must be assigned to take effect.
    df = df.dropna()

    # Keep the top 10 rows (head is a no-op for shorter frames)
    df = df.head(10)
    df.to_csv(os.path.join(folder_path_cn, f"{today}pvk_Report.csv"), index=False)
    print(f"CSV file '{today}pvk_Report.csv' has been created.")

    # Report rows whose SMILES already appeared earlier in the report
    duplicates = df[df.duplicated('Smiles')]
    print('Duplicate Smiles:',duplicates['Smiles'])

# Show oracle results

In [None]:
# from mordred import Calculator, descriptors
from sklearn.ensemble import RandomForestRegressor
# from rdkit.Chem import PandasTools
import pickle

In [None]:
def pvk_crystal_predict(df: pd.DataFrame,
                        pvk_rfr: RandomForestRegressor,
                        ):
    """Run the size regressor on the feature columns of ``df``.

    Args:
        df: Frame that contains at least the eight feature columns listed in
            the body (four reagent volumes, ``lab_code`` and three Mordred
            descriptors). Extra columns are ignored.
        pvk_rfr: Regressor whose ``predict`` method is called on the
            feature columns.

    Returns:
        A ``(predictions, features)`` tuple: the value returned by
        ``pvk_rfr.predict`` and the feature sub-frame selected from ``df``.

    Raises:
        KeyError: If any feature column is missing from ``df``.
    """
    feature_columns = [
        'Reagent1 (ul)',
        'Reagent2 (ul)',
        'Reagent3 (ul)',
        'Reagent4 (ul)',
        'lab_code',
        'ATSC5v',
        'AATSC5Z',
        'MATS8se',
    ]
    predictions = pvk_rfr.predict(df[feature_columns])
    return predictions, df[feature_columns]


In [None]:
# Pickled regressor used for crystal-size prediction (stored under the
# "0_Create_Ground_Truth_Model" directory).
size_model_path = '/home/ianlee/opt_ian/Model_Create_and_Results1/Pvk_additives/0_Create_Ground_Truth_Model/pvkadditives/pvk_rfr_size.pkl'

# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load model files from a trusted source.
with open(size_model_path, 'rb') as f:
    rf_regressor = pickle.load(f)

In [None]:
# Display (as the cell output) the feature names stored on the loaded regressor.
rf_regressor.feature_names_in_

In [None]:
# Average leaf count across the estimators of the loaded forest
leaf_counts = [tree.get_n_leaves() for tree in rf_regressor]
mean_leaves = np.mean(leaf_counts)
print(mean_leaves)

In [None]:
# Load the initial dataset (presumably cycle 0, per the file name) and take
# its 'crystal_size' column.
inital_csv = pd.read_csv('/home/ianlee/opt_ian/Model_Create_and_Results1/Pvk_additives/0_Create_Surrogate_Model/cycle0_new.csv')
size_array = inital_csv['crystal_size']
# The 100th percentile is the largest value in the column; `criteria` is later
# used as the lower bound of the highlighted region in the plot.
criteria = np.percentile(size_array, 100)
print(criteria)
# Cell output: maximum and mean of the initial crystal sizes
size_array.max(), size_array.mean() 

In [None]:

from matplotlib import colors
import matplotlib.pyplot as plt
import numpy as np
# Scatter plot of the crystal size stored in each method's report (x axis)
# against the value predicted by `rf_regressor` for the same rows (y axis).
#
# NOTE: `methods` is reassigned several times; only the LAST assignment takes
# effect. The earlier ones are kept as a record of previous runs.
#methods = ['ABC', 'GA', 'PSO', 'SA']
methods = ['PSO','ABC']
methods = ['PSO/round6', 'PSO/round7', 'PSO/round8', 'PSO/round9',  'PSO/round10']
methods = ['Random/round1', 'Random/round2', 'Random/round3', 'Random/round4', 'Random/round5','Random/round6', 'Random/round7', 'Random/round8', 'Random/round9', 'Random/round10']

methods = ['PSO-re/round1', 'PSO-re/round2', 'PSO-re/round3', 'PSO-re/round4',  'PSO-re/round5',
'PSO-re/round6', 'PSO-re/round7', 'PSO-re/round8', 'PSO-re/round9',  'PSO-re/round10']

methods = ['ABC_T/round4', 'ABC_T/round5',
            'ABC_T/round6', 'ABC_T/round7', 'ABC_T/round8', 'ABC_T/round9', 'ABC_T/round10']


#methods = ['PSO-re/round1', 'PSO-re/round2', 'PSO-re/round3', 'PSO-re/round4',  'PSO-re/round5',
#'PSO-re/round6', 'PSO-re/round7', 'PSO-re/round8']

subfolder = 'c2'
# Reports are looked up by today's date, so they must have been generated today.
today = datetime.date.today().strftime("%Y%m%d")

# Root directory that contains one sub-directory per method
base_dir = '/home/ianlee/opt_ian/Model_Create_and_Results1/Pvk_additives/1_Preprocessing_for_Analysis'
folder_path_cn = f'{base_dir}/SA/{subfolder}'

#methods = ['ABC', 'GA', 'PSO', 'SA']
#methods = ['ABC/10_iter']
# One colour per method; zip() below stops at the shorter of methods/colors,
# so at most 10 methods are drawn.
colors = [
    "#1f77b4",  # blue
    "#ff7f0e",  # orange
    "#2ca02c",  # green
    "#d62728",  # red
    "#9467bd",  # purple
    "#8c564b",  # brown
    "#e377c2",  # pink
    "#7f7f7f",  # gray
    "#bcbd22",  # olive
    "#17becf",  # cyan
]


#colors = ['#EE6352']
domain_max, domain_min = 3.0, 0.0
#domain_max, domain_min = 2.0, 0.0

# Columns needed for the prediction plus the reported crystal size; both are
# loop-invariant, so they are defined once.
pvk_size_feature_list = ['Reagent1 (ul)','Reagent2 (ul)','Reagent3 (ul)','Reagent4 (ul)','lab_code','ATSC5v', 'AATSC5Z', 'MATS8se', 'crystal_size']
# The report uses underscores in the reagent column names; the regressor's
# feature list uses spaces.
reagent_renames = {
    'Reagent1_(ul)': 'Reagent1 (ul)',
    'Reagent2_(ul)': 'Reagent2 (ul)',
    'Reagent3_(ul)': 'Reagent3 (ul)',
    'Reagent4_(ul)': 'Reagent4 (ul)',
}

plt.figure(figsize=(8, 6), dpi=400)
# Highlight the square region where both axes exceed `criteria`
plt.fill_betweenx([criteria, domain_max], criteria, domain_max, color='yellow', alpha=0.3)
for method, color in zip(methods, colors):
    # Build the path from its parts rather than str.replace('SA', ...), which
    # would also rewrite any other 'SA' substring in the path.
    new_folder_path_cn = os.path.join(base_dir, method, subfolder)
    file_path = os.path.join(new_folder_path_cn, f'{today}pvk_Report.csv')
    if not os.path.isfile(file_path):
        # A missing report for one method should not abort the whole figure.
        print(f"Report not found for {method}: {file_path} (skipped)")
        continue
    cyclen_pred = pd.read_csv(file_path)
    cyclen_pred = cyclen_pred.rename(columns=reagent_renames)
    cyclen_pred = cyclen_pred[pvk_size_feature_list]
    # Non-numeric cells become NaN and the affected rows are dropped
    size_features = cyclen_pred.apply(pd.to_numeric, errors='coerce')
    size_features_nonull = size_features.dropna()
    if size_features_nonull.empty:
        # Nothing left to predict on for this method.
        print(f"No usable rows for {method} (skipped)")
        continue
    # print(cyclen_pred.columns)
    gt_prediction = pvk_crystal_predict(size_features_nonull, rf_regressor)[0]

    plt.scatter(size_features_nonull['crystal_size'], gt_prediction, color=color, label=method)


# Identity line with a +/-0.1 band around it
x = np.linspace(domain_min, domain_max)
plt.plot(x, x, color='black')

plt.xlim(domain_min, domain_max)
plt.ylim(domain_min, domain_max)


plt.plot(x, x + 0.1, color='gray', linestyle='dashed')
plt.plot(x, x - 0.1, color='gray', linestyle='dashed')


# plt.text(1.3, 1.8, 'Underestimate', fontsize=12)
# plt.text(1.7, 1.25, 'Overestimate', fontsize=12)
# Parse every digit after the leading letter so 'c10' gives 10, not 1.
plt.title(f'Prediction vs. Ground Truth (After {int(subfolder[1:]) + 1} Cycle)')
plt.xlabel('Prediction (mm)')
plt.ylabel('Ground Truth (mm)')
plt.legend()
plt.show()