### 1.Import classes and set the src path

In [None]:
import pandas as pd
from tqdm import tqdm
from rdkit import Chem
from rdkit import RDLogger
import numpy as np
import re
import sys
import os
import random
import time
from pandarallel import pandarallel
import shutil

# Make the local 'gdb_ml' package importable: append its 'src' directory to
# sys.path once. The ".../" prefix looks like an elided placeholder — replace
# it with the real checkout location before running.
src_path = os.path.abspath(".../gdb_ml/src")  
if src_path not in sys.path:
    sys.path.append(src_path)

# These imports must come after the sys.path update above.
from gdb_ml import ChemUtils
# Shared helper instance used by the SMILES conversion cells
chem_utils = ChemUtils()

from gdb_ml import PropertiesCalculator
# Shared helper instance used by the filtering / evaluation / profiling cells
properties_calculator = PropertiesCalculator()

from gdb_ml import DataProcessor
# Shared helper instance used for loading, saving and (de)tokenizing
data_processor = DataProcessor()

# Silence RDKit's 'rdApp.error' log channel (only that channel is disabled here)
RDLogger.DisableLog('rdApp.error')

# Initialize pandarallel so that Series.parallel_apply is available below
pandarallel.initialize(progress_bar=True)  # Enable progress bar for tracking

### 2.Canonicalize the SMILES

In [None]:
# Input file for canonicalization ("..." looks like an elided path prefix — fill in before running).
FILE_PATH_READ = "...graphs_or_mols.txt"

In [None]:
# Load the input; the canonicalization loop reads the resulting 'SMILES' column.
df = data_processor.load_data(FILE_PATH_READ, has_header=0, add_header=1)
df

In [None]:
# Canonicalize every entry of df['SMILES'] into df['Canonicalized SMILES'].
# Rows whose conversion raises keep the "" placeholder; their row labels are
# collected in my_list_invaild and tallied in count.
df["Canonicalized SMILES"] = ""
count = 0
my_list_invaild = []
for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        canonical_smiles = str(chem_utils.canonicalize_smiles(df.loc[q, 'SMILES']))
    except Exception:
        # Catch Exception rather than using a bare except so that a kernel
        # interrupt (KeyboardInterrupt) can still stop a long-running loop.
        count += 1
        my_list_invaild.append(q)
        continue
    df.loc[q, 'Canonicalized SMILES'] = canonical_smiles
print(count, 'invalids')
df

In [None]:
# Output path for the canonicalized SMILES ("..." looks like an elided prefix).
FILE_PATH_SAVE =  "...graphs_or_mols_canonicalized.txt"

In [None]:
# Only the 'Canonicalized SMILES' column is written out.
data_processor.save_to_file(df['Canonicalized SMILES'], FILE_PATH_SAVE)

### 3.Randomize the SMILES (alternative)

In [None]:
# Input file for the randomization alternative ("..." looks like an elided prefix).
FILE_PATH_READ = "...graphs_or_mols.txt"

In [None]:
# Load the comma-separated input, naming its column 'SMILES'.
df = data_processor.load_data(FILE_PATH_READ, SEPRATOR= ",", has_header=0, add_header=1, COLUMN_NAME_INPUT= ["SMILES"])
df

In [None]:
# Keep only the 'SMILES' column.
df = df[["SMILES"]]
df

In [None]:
# Drop repeated SMILES (the last occurrence is the one kept) and renumber rows
# from 0 so the label-based loop that follows can index by position.
df = df.drop_duplicates(subset=["SMILES"], keep='last').reset_index(drop=True)
df

In [None]:
# Produce one randomized SMILES per row of df['SMILES'] and store it in
# df['Randomized SMILES']. Rows whose conversion raises keep the ""
# placeholder; their row labels go to my_list_invaild and count is incremented.
df["Randomized SMILES"] = ""
count = 0
my_list_invaild = []
for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        randomized_smiles = str(chem_utils.smiles_randomization(df.loc[q, 'SMILES']))
    except Exception:
        # Exception instead of a bare except: KeyboardInterrupt / SystemExit
        # must not be counted as "invalid molecule".
        count += 1
        my_list_invaild.append(q)
        continue
    df.loc[q, 'Randomized SMILES'] = randomized_smiles
print(count, 'invalids')
df

### 4.FDV filter (MC1) 

In [None]:
# Fill df['FDV'] with the result of divalent_nodes_fraction for each
# canonicalized SMILES (stored as text; the next cell converts it to numeric).
# Rows whose computation raises keep the "" placeholder, are recorded in
# my_list_invaild and counted in count.
df["FDV"] = ""
count = 0
my_list_invaild = []
for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        fdv_value = str(
            properties_calculator.divalent_nodes_fraction(df.loc[q, 'Canonicalized SMILES'])
        )
    except Exception:
        # Exception instead of a bare except so the loop stays interruptible.
        count += 1
        my_list_invaild.append(q)
        continue
    df.loc[q, 'FDV'] = fdv_value
print(count, 'invalids')
df

In [None]:
# Turn the text FDV values into numbers; anything unparsable (including the
# "" placeholder of failed rows) becomes NaN.
df['FDV'] = pd.to_numeric(df['FDV'], errors='coerce')

# Keep rows with FDV strictly above 0.4 (NaN rows never satisfy the
# comparison) and renumber the index from 0.
df = df[df['FDV'].gt(0.4)].reset_index(drop=True)
df

In [None]:
# Output path for the FDV-filtered molecules ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...fdv_filtered.txt"

In [None]:
# Only the 'Canonicalized SMILES' column of the filtered frame is written.
data_processor.save_to_file(df['Canonicalized SMILES'], FILE_PATH_SAVE)

### 5.Graph extraction (from SMILES to character-based graphs)

In [None]:
# Input SMILES file for graph extraction ("..." looks like an elided prefix).
FILE_PATH_READ = "...mols.smi"

In [None]:
# Load the comma-separated input, naming its column 'SMILES'.
df = data_processor.load_data(FILE_PATH_READ, has_header=0, add_header=1, SEPRATOR = "," ,COLUMN_NAME_INPUT= ["SMILES"])
df

In [None]:
# Convert every SMILES in df['SMILES'] with chem_utils.graph_convert and store
# the result in df['Character-based Conversion']. Rows whose conversion raises
# keep the "" placeholder, are recorded in my_list_invaild and counted.
df['Character-based Conversion'] = ""

count = 0
my_list_invaild = []

for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        graph_string = chem_utils.graph_convert(df.loc[q, 'SMILES'])
    except Exception:
        # Exception instead of a bare except so the loop stays interruptible.
        count += 1
        my_list_invaild.append(q)
        continue
    df.loc[q, 'Character-based Conversion'] = graph_string

print(count, 'invalids')

In [None]:
# Output path for the extracted graphs ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...extracted_graphs.txt"

In [None]:
# Variant 1: write the whole frame (SMILES plus converted graphs).
# NOTE(review): this cell and the next one target the same FILE_PATH_SAVE;
# presumably only one of them is meant to be run — confirm, or use two paths.
data_processor.save_to_file(df, FILE_PATH_SAVE)

In [None]:
# Variant 2: write only the 'Character-based Conversion' column.
data_processor.save_to_file(df['Character-based Conversion'], FILE_PATH_SAVE)

### 6.Concatenate the SMILES with dots

In [None]:
# Input file of canonicalized SMILES ("..." looks like an elided prefix).
FILE_PATH_READ = "...canonicalized.txt"

In [None]:
# Load the tab-separated input with columns 'SMILES' and 'Length'.
# NOTE(review): the concatenation cell names rows "shortest"/"longest", so the
# file is presumably pre-sorted by length — confirm.
df = data_processor.load_data(FILE_PATH_READ, SEPRATOR= "\t", has_header=0, add_header=1, COLUMN_NAME_INPUT= ["SMILES","Length"])
df

In [None]:
# Pair the leading 33% of rows with the trailing 33% (first with last, second
# with second-to-last, ...) as dot-joined SMILES, leave the middle rows as
# they are, then drop the trailing rows that were consumed.
n = len(df)
split = int(n * 0.33)      # size of both the leading and the trailing slice
middle_start = split
middle_end = n - split     # first row label of the trailing slice

# Default: every row keeps its own SMILES.
df["Concatenated SMILES"] = df["SMILES"]

# Walk inwards from both ends at the same time.
for head_row, tail_row in zip(range(split), range(n - 1, n - 1 - split, -1)):
    df.loc[head_row, "Concatenated SMILES"] = "{}.{}".format(
        df.loc[head_row, "SMILES"], df.loc[tail_row, "SMILES"]
    )

# Keep the (now concatenated) leading rows plus the untouched middle rows.
df = df.head(middle_end).reset_index(drop=True)

df

In [None]:
# Output path for the concatenated SMILES ("..." looks like an elided prefix).
FILE_PATH_SAVE =  "...concatenated.txt"

In [None]:
# Only the 'Concatenated SMILES' column is written.
data_processor.save_to_file(df['Concatenated SMILES'], FILE_PATH_SAVE)

### 7.Tokenize the SMILES

In [None]:
# Input file to tokenize ("..." looks like an elided prefix).
FILE_PATH_READ = "...concatenated.txt"

In [None]:
# Load the input; the tokenization loop reads the resulting 'SMILES' column.
df = data_processor.load_data(FILE_PATH_READ, has_header=0, add_header=1)
df

In [None]:
# Tokenize every entry of df['SMILES'] and store the text form of the result
# in df['Token']. Rows whose tokenization raises keep the "" placeholder, are
# recorded in my_list_invaild and counted in count.
df["Token"] = ""

count = 0
my_list_invaild = []
for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        token = str(data_processor.tokenize_smiles(df.loc[q, 'SMILES']))
    except Exception:
        # Exception instead of a bare except so the loop stays interruptible.
        count += 1
        my_list_invaild.append(q)
        continue
    df.loc[q, 'Token'] = token

print(count, 'invalids')
df

In [None]:
# Output path for the tokenized SMILES ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...tokenized.txt"

In [None]:
# Only the 'Token' column is written.
data_processor.save_to_file(df['Token'], FILE_PATH_SAVE)

### 8.Shuffle the data

In [None]:
# Shuffle two line-aligned files (keys / values of the train or validation
# set) with the same permutation, overwriting both files in place.
file1 = "...keys_tokenized.txt"
file2 = "...values_tokenized.txt"

# Read both files. The line terminator is stripped here and re-added on
# write: if the last line of a file has no trailing newline, keeping the raw
# lines would glue it to whichever line follows it after shuffling.
with open(file1, 'r', encoding='utf-8') as f1, open(file2, 'r', encoding='utf-8') as f2:
    lines1 = [line.rstrip('\n') for line in f1]
    lines2 = [line.rstrip('\n') for line in f2]

# The files must stay aligned pair by pair.
if len(lines1) != len(lines2):
    raise ValueError("Files have different numbers of lines!")

# Shuffle the pairs together so key i still matches value i afterwards.
combined = list(zip(lines1, lines2))
random.shuffle(combined)

# Split back into separate lists (comprehensions also work for empty files,
# where unpacking zip(*combined) would raise).
shuffled_lines1 = [key_line for key_line, _ in combined]
shuffled_lines2 = [value_line for _, value_line in combined]

# Write the shuffled lines back, one per line, each newline-terminated.
with open(file1, 'w', encoding='utf-8') as f1, open(file2, 'w', encoding='utf-8') as f2:
    f1.writelines(f"{line}\n" for line in shuffled_lines1)
    f2.writelines(f"{line}\n" for line in shuffled_lines2)

print("Files shuffled successfully!")

### 9.Detokenize the generated mols from transformer

In [None]:
# File of generated (tokenized) molecules ("..." looks like an elided prefix).
FILE_PATH_READ = "...generated_mols.txt"

In [None]:
# Column name given to the loaded data; the detokenization loop reads it.
COLUMN_NAME_OUTPUT = ["Generated SMILES"]

In [None]:
# Load with the helper's bad-line-handling loader (see DataProcessor for details).
df = data_processor.load_file_with_badlines(FILE_PATH_READ, COLUMN_NAME_OUTPUT)
df

In [None]:
# Detokenize every entry of df['Generated SMILES'] into df['Detokenized'].
# Rows whose detokenization raises keep the "" placeholder, are recorded in
# my_list_invaild and counted in count.
df['Detokenized'] = ""

count = 0
my_list_invaild = []
for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        detokenized_string = data_processor.detokenize_smiles(df.loc[q, 'Generated SMILES'])
    except Exception:
        # Exception instead of a bare except so the loop stays interruptible.
        count += 1
        my_list_invaild.append(q)
        continue
    df.loc[q, 'Detokenized'] = detokenized_string

print(count, 'invalids')
df

In [None]:
# Output path for the detokenized molecules ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...detokenized.txt"

In [None]:
# Only the 'Detokenized' column is written.
data_processor.save_to_file(df['Detokenized'], FILE_PATH_SAVE)

### 10.Evaluations of the generated mols

#### (1) Append the log probs and calculate the validity, with canonicalization

##### a. For single file

In [None]:
# Generated molecules ("..." looks like an elided prefix).
FILE_PATH_READ = "...generated_mols.txt"

In [None]:
# Load the generated molecules into a 'SMILES' column.
df = data_processor.load_data(FILE_PATH_READ, COLUMN_NAME_INPUT=['SMILES'])
df

In [None]:
# Matching log-probability file; FILE_PATH_READ is reused for the second input.
FILE_PATH_READ = "..._log_probs.txt"

In [None]:
# Load the log probabilities into a 'log prob' column.
df_cs = data_processor.load_data(FILE_PATH_READ, COLUMN_NAME_INPUT=['log prob'])
df_cs

In [None]:
# Put both columns side by side. pandas aligns the two Series on their index,
# so this assumes both files load with the same number of rows in the same
# order — TODO confirm for the files being used.
df = pd.DataFrame({'SMILES': df['SMILES'], 'Log Probs': df_cs['log prob']})
df

In [None]:
# Returns a validity value and the frame of valid rows (see PropertiesCalculator.validity).
validity, df_valid = properties_calculator.validity(df)
df_valid

In [None]:
# Output path for the valid molecules with log probs ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...valid_canonicalized_with_log_probs.txt"

In [None]:
data_processor.save_to_file(df_valid, FILE_PATH_SAVE)

##### b. For multiple files

###### i. Irregular file names

In [None]:
# Folder holding the generated-molecule files
folder_path = ".../generated_mols"

# Names of the regular files directly inside the folder (sub-directories are
# skipped), in sorted order
file_list = sorted(
    name
    for name in os.listdir(folder_path)
    if os.path.isfile(os.path.join(folder_path, name))
)

print(len(file_list), file_list)

In [None]:
# Run the detokenize / append-log-prob step for every name in file_list in
# parallel and report the wall-clock runtime.

# Start the timer in this cell: without it the runtime print below fails with
# a NameError, or measures from whichever earlier cell last set start_time.
start_time = time.time()


def process_file(file):
    """Process one generated-molecule file together with its log-prob file.

    Args:
        file: File name (not a full path) taken from file_list.

    Returns:
        The (validity, df_valid) pair returned by
        data_processor.detokenize_append_log_prob.
    """
    # Define file paths
    FILE_PATH_READ_1 = f"...generated_mols/{file}"
    FILE_PATH_READ_2 = f"...log_prob/{file}_log_probs"

    # Name up to the first ".txt", used to build the output file name
    file2 = file.split(".txt")[0]
    FILE_PATH_SAVE = f"...valid_canonicalized_with_log_probs/{file2}_valid_canonicalized_with_log_probs.txt"

    # Process the file and return the results
    validity, df_valid = data_processor.detokenize_append_log_prob(FILE_PATH_READ_1, FILE_PATH_READ_2, FILE_PATH_SAVE)
    return validity, df_valid


# Convert file list to a pandas Series so pandarallel can distribute it
file_series = pd.Series(file_list)

# Use pandarallel to apply the process_file function to each file
results = file_series.parallel_apply(process_file)

# End time
end_time = time.time()

# Print runtime
print(f"Runtime: {end_time - start_time:.4f} seconds")

###### ii. File names with suffix (alternative)

In [None]:
from itertools import product

# All two-letter lowercase suffixes in lexicographic order: 'aa', 'ab', ..., 'zz'
file_range = list(map(''.join, product('abcdefghijklmnopqrstuvwxyz', repeat=2)))

# Restrict to an inclusive sub-range of suffixes (here 'at' through 'aw')
start, end = file_range.index('at'), file_range.index('aw') + 1
file_range = file_range[start:end]

print(file_range, len(file_range))

In [None]:
# Silence RDKit's 'rdApp.error' log channel (repeated here so this cell can be run on its own)
RDLogger.DisableLog('rdApp.error')

# Run the detokenize / append-log-prob step once per suffix in file_range.
# validity and df_valid are overwritten on every iteration, so after the loop
# they hold the result for the last suffix only.
for suffix in file_range:
    # NOTE(review): unlike the other two paths this one has no "..." prefix — confirm it is intended.
    FILE_PATH_READ_1 = f"generated_mols_{suffix}.txt"

    FILE_PATH_READ_2 = f"...{suffix}.txt_log_probs"

    FILE_PATH_SAVE = f"..._{suffix}_valid_canonicalized_with_log_probs.txt"
    
    validity, df_valid = data_processor.detokenize_append_log_prob(FILE_PATH_READ_1, FILE_PATH_READ_2, FILE_PATH_SAVE)


###### iii. Merge all the valid dfs appended with log probs

In [None]:
# Folder with the per-file results to merge ("/..." looks like an elided path).
FOLDER_PATH = "/...valid_canonicalized_with_log_probs/"

In [None]:
# Column names passed to the folder loader.
COLUMN_NAME_INPUT = ["SMILES", "Log Probs"]

In [None]:
# Combine the files found in FOLDER_PATH into one frame (see DataProcessor.append_dfs_in_folder).
df_valid_log_prob = data_processor.append_dfs_in_folder(FOLDER_PATH, COLUMN_NAME_INPUT)
df_valid_log_prob 

In [None]:
# Output path for the merged frame ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...valid_canonicalized_with_log_prob.txt"

In [None]:
data_processor.save_to_file(df_valid_log_prob , FILE_PATH_SAVE)

#### (2) Calculate the uniqueness

In [None]:
# Merged valid molecules with log probs ("..." looks like an elided prefix).
FILE_PATH_READ = "...valid_canonicalized_with_log_prob.txt"

In [None]:
# Load the file with columns 'SMILES' and 'Log Probs'.
df = data_processor.load_data(FILE_PATH_READ, has_header=0, add_header=1, COLUMN_NAME_INPUT= ["SMILES", "Log Probs"])
df

In [None]:
# Returns a uniqueness value and the de-duplicated frame (see PropertiesCalculator.uniqueness).
uniqueness, df_unique = properties_calculator.uniqueness(df)

In [None]:
uniqueness

In [None]:
df_unique

In [None]:
# Output path for the valid, unique molecules ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...valid_unique.txt"

In [None]:
data_processor.save_to_file(df_unique, FILE_PATH_SAVE)

#### (3) Calculate the novelty

In [None]:
# Valid, unique generated molecules ("..." looks like an elided prefix).
FILE_PATH_READ = "...valid_unique.txt"

In [None]:
# Load the file with columns 'SMILES' and 'Log Probs'.
df_unique = data_processor.load_data(FILE_PATH_READ, has_header=0, add_header=1, COLUMN_NAME_INPUT= ["SMILES", "Log Probs"])
df_unique

In [None]:
# Reference set to compare against; FILE_PATH_READ is reused for the second input.
FILE_PATH_READ = "...train_or_validation_values.txt"

In [None]:
# Loaded with load_data's default options.
df_train = data_processor.load_data(FILE_PATH_READ)
df_train

In [None]:
# Returns a novelty value and the frame of novel rows (see PropertiesCalculator.novelty).
novelty, df_novel = properties_calculator.novelty(df_unique, df_train)

In [None]:
novelty

### 11.Properties profiling of the generated mols (QED, SAS, Fsp3, C-atoms Fraction, logP, NPscore, weight...)

#### a. For single file 

In [None]:
# Valid, unique molecules to profile ("..." looks like an elided path prefix — fill in before running).
FILE_PATH_READ = "...valid_unique.txt"

In [None]:
# Load the valid/unique molecules. The second column is labelled "Log Probs",
# the same label the uniqueness and novelty cells use when they read this file
# (it was "Log Prob" here, which left one file with two different headers).
df = data_processor.load_data(FILE_PATH_READ, has_header=0, add_header=1, COLUMN_NAME_INPUT=["SMILES", "Log Probs"])
df

In [None]:
# Quick sanity calls on one hard-coded SMILES string. In a notebook only the
# value of the last expression (logP) is displayed; the first two results are
# discarded.
properties_calculator.weight("c1ccccc1N")
properties_calculator.npscore("c1ccccc1N")
properties_calculator.logP("c1ccccc1N")

In [None]:
# Compute SAscore, QED, Fsp3 and C-atoms Fraction for every entry of
# df['SMILES']. A row is written only when all four computations succeed;
# otherwise it keeps the "" placeholders, is recorded in my_list_invaild and
# counted in count.
df['QED'] = ""
df['SAscore'] = ""
df['Fsp3'] = ""
df['C-atoms Fraction'] = ""

count = 0
my_list_invaild = []

for q in tqdm(range(len(df)), desc='Loop 1'):
    try:
        smiles = df.loc[q, 'SMILES']
        # Compute everything first so a failure part-way through cannot leave
        # a row that is counted as invalid but still holds some values.
        row_values = {
            'SAscore': properties_calculator.sascore(smiles),
            'QED': properties_calculator.qed(smiles),
            'Fsp3': properties_calculator.fsp3(smiles),
            'C-atoms Fraction': properties_calculator.fraction_c(smiles),
        }
    except Exception:
        # Exception instead of a bare except so the loop stays interruptible.
        count += 1
        my_list_invaild.append(q)
        continue
    for column, value in row_values.items():
        df.loc[q, column] = value

print(count, 'invalids')

In [None]:
# Output path for the property table ("..." looks like an elided prefix).
FILE_PATH_SAVE = "...properties.txt"

In [None]:
# The whole frame (inputs plus computed property columns) is written.
data_processor.save_to_file(df, FILE_PATH_SAVE)

#### b. For multiple files

In [None]:
# Folder holding the valid/unique files to profile
folder_path = ".../valid_unique"

# Sorted names of the regular files directly inside the folder
file_list = sorted(
    filter(
        lambda name: os.path.isfile(os.path.join(folder_path, name)),
        os.listdir(folder_path),
    )
)

print(len(file_list), file_list)

In [None]:
# Profile every file in file_list in parallel and report the wall-clock runtime.
start_time = time.time()


def process_file(file):
    """Compute the QED / SAS / Fsp3 / C-fraction table for one input file.

    Args:
        file: File name (not a full path) taken from file_list.
    """
    # Part of the name before the first ".txt", reused for the output name
    stem = file.split(".txt")[0]

    source_path = f".../with_log_prob/{file}"
    target_path = f".../{stem}_log_prob.txt"

    properties_calculator.multi_qed_sas_fsp3_cf(source_path, target_path)


# pandarallel works on pandas objects, so wrap the names in a Series
file_series = pd.Series(file_list)
file_series.parallel_apply(process_file)

end_time = time.time()
print(f"Runtime: {end_time - start_time:.4f} seconds")

### 12.Check the undesired FG or structures

#### a. For single file

In [None]:
# Run the undesired-functional-group check on every SMILES; the per-row
# result is stored in 'Filter1-10'.
df['Filter1-10'] = df['SMILES'].apply(properties_calculator.undesired_FG_check)
df

In [None]:
# Keep the rows whose check result is a non-empty list with a first element
# equal to False, and renumber them from 0.
# NOTE(review): presumably element 0 is the overall pass/fail flag — confirm
# against undesired_FG_check.
df_failed = df[
    df['Filter1-10'].apply(
        lambda x: isinstance(x, list) and len(x) > 0 and x[0] == False
    )
].reset_index(drop=True)
df_failed

#### b. For multiple files 

In [None]:
# Folder holding the files to check
folder_path = ".../failed"

# Collect the regular files directly inside the folder, then order them by name
file_list = sorted(
    [
        name
        for name in os.listdir(folder_path)
        if os.path.isfile(os.path.join(folder_path, name))
    ]
)

print(len(file_list), file_list)

##### i. Check the passed and failed molecules

In [None]:
# Split every file in file_list into passed / failed molecules in parallel
# and report the wall-clock runtime.

# Start the timer in this cell: without it the runtime print below measures
# from whichever earlier cell last set start_time (or fails with a NameError).
start_time = time.time()


def process_file(file):
    """Run the undesired-functional-group check for one input file.

    Args:
        file: File name (not a full path) taken from file_list.
    """
    FILE_PATH_READ = f".../failed/{file}"

    # Name up to the first ".txt", used to build both output file names
    file2 = file.split(".txt")[0]

    FILE_PATH_SAVE_PASS = f".../{file2}_desired_mols.txt"

    FILE_PATH_SAVE_FAILED = f".../{file2}_undesired_mols.txt"

    # Call the properties_calculator function
    properties_calculator.undesired_FG_details(FILE_PATH_READ, FILE_PATH_SAVE_PASS, FILE_PATH_SAVE_FAILED)


# Convert the file list into a pandas Series so pandarallel can distribute it
file_series = pd.Series(file_list)

# Use pandarallel to process files in parallel
file_series.parallel_apply(process_file)

# End time
end_time = time.time()

# Print runtime
print(f"Runtime: {end_time - start_time:.4f} seconds")


##### ii. Show the details of the failed molecules

In [None]:
# File of molecules that failed the check ("..." looks like an elided prefix).
FILE_PATH_READ = "...undesired_mols.txt"

# Display the failure details for that file (see PropertiesCalculator.show_undesired_FG_details).
properties_calculator.show_undesired_FG_details(FILE_PATH_READ)