# Miscellaneous

In [4]:
from rdkit import Chem
import pandas as pd
import csv

df = pd.read_csv('merged_molecules1.csv')

# Count the occurrences of each value in the 'validity' column
validity_counts = df['validity'].value_counts()

# Print the counts
print(validity_counts)

# Optionally, you can access the counts for 'valid' and 'invalid' specifically
valid_count = validity_counts.get('valid', 0)
invalid_count = validity_counts.get('invalid', 0)

print(f"Valid molecules: {valid_count}")
print(f"Invalid molecules: {invalid_count}")


validity
valid      84452
invalid    15548
Name: count, dtype: int64
Valid molecules: 84452
Invalid molecules: 15548


In [2]:
import re
import csv

def extract_losses_and_validities(file_path):
    # Initialize an empty list to store the results
    results = []
    
    # Regular expression pattern to match lines with disc_loss, gen_loss, and valid
    pattern = re.compile(r'disc_loss:\s*([\d.]+),\s*gen_loss:\s*([\d.-]+),\s*valid\s*\d+\s*=\s*([\d.]+)')
    
    # Open the file and read it line by line
    with open(file_path, 'r') as file:
        for line in file:
            # Search for the pattern in the current line
            match = pattern.search(line)
            if match:
                # Extract values and add them to the results list
                disc_loss, gen_loss, valid = match.groups()
                results.append({
                    'disc_loss': float(disc_loss),
                    'gen_loss': float(gen_loss),
                    'valid': float(valid)
                })
    
    return results

def save_to_csv(data, output_file):
    # Specify the fieldnames based on the keys in the dictionaries
    fieldnames = ['disc_loss', 'gen_loss', 'valid']
    
    # Open the CSV file in write mode
    with open(output_file, 'w', newline='') as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)
        
        # Write the header
        writer.writeheader()
        
        # Write the data rows
        writer.writerows(data)

# Specify the path to your input .txt file and output .csv file
input_file_path = 'losses.txt'
output_file_path = 'losses.csv'

# Extract data from the .txt file
extracted_data = extract_losses_and_validities(input_file_path)

# Save the extracted data to a CSV file
save_to_csv(extracted_data, output_file_path)
extracted_data

[{'disc_loss': 0.69738722, 'gen_loss': 1.15852427, 'valid': 0.02},
 {'disc_loss': 0.23383799, 'gen_loss': -4.01097965, 'valid': 0.0},
 {'disc_loss': 0.04292502, 'gen_loss': 0.79418868, 'valid': 0.03},
 {'disc_loss': 0.00702529, 'gen_loss': -0.15746461, 'valid': 0.14},
 {'disc_loss': 0.01574175, 'gen_loss': 0.22207528, 'valid': 0.08},
 {'disc_loss': 0.0039141, 'gen_loss': 0.00193376, 'valid': 0.09},
 {'disc_loss': 0.0300383, 'gen_loss': -0.04787258, 'valid': 0.06},
 {'disc_loss': 0.00552681, 'gen_loss': 0.17580704, 'valid': 0.04},
 {'disc_loss': 0.00537377, 'gen_loss': -0.16845562, 'valid': 0.07},
 {'disc_loss': 0.00143724, 'gen_loss': -0.10243212, 'valid': 0.05},
 {'disc_loss': 0.00341556, 'gen_loss': 0.12107533, 'valid': 0.05},
 {'disc_loss': 0.00047034, 'gen_loss': -0.09414044, 'valid': 0.03},
 {'disc_loss': 0.01540486, 'gen_loss': -0.10632857, 'valid': 0.04},
 {'disc_loss': 0.00109082, 'gen_loss': -0.05116373, 'valid': 0.04},
 {'disc_loss': 0.00459421, 'gen_loss': 0.41004258, 'valid

In [3]:
import pandas as pd
import glob

# Define the path to the folder containing the CSV files
folder_path = 'generated/'

# Get a list of all CSV files in the folder
csv_files = glob.glob(folder_path + 'generated_molecules*.csv')

# Initialize an empty list to store dataframes
dataframes = []

# Loop through the list of CSV files and read each one into a dataframe
for file in csv_files:
    df = pd.read_csv(file)
    dataframes.append(df)

# Concatenate all the dataframes into one
merged_df = pd.concat(dataframes, ignore_index=True)

# Drop duplicates based on the 'canonical_smiles' column
merged_df.drop_duplicates(subset='canonical_smiles', inplace=True)

# Save the merged dataframe to a new CSV file
merged_df.to_csv('merged_molecules.csv', index=False)

print("CSV files have been merged and duplicates have been dropped.")


CSV files have been merged and duplicates have been dropped.


In [3]:
import pandas as pd

# Step 1: Load the CSV file
df = pd.read_csv('merged_molecules.csv')

# Step 2: Randomly sample 100,000 rows
df_sampled = df.sample(n=100000, random_state=42)

# Step 3: Save the sampled DataFrame to a new CSV file
df_sampled.to_csv('merged_molecules.csv', index=False)


In [1]:
import pandas as pd
from chembl_webresource_client.new_client import new_client
from rdkit import Chem
from rdkit.Chem import Descriptors, Crippen

# Define the ChEMBL IDs to be added
chembl_ids = [
    'CHEMBL1200633', 'CHEMBL684', 'CHEMBL529', 'CHEMBL267345', 'CHEMBL55', 
    'CHEMBL125', 'CHEMBL290960', 'CHEMBL374478', 'CHEMBL1292'
]

# Initialize the ChEMBL molecule and activity clients
molecule = new_client.molecule
activity = new_client.activity

# Function to check Lipinski's rule of five
def passes_lipinski(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return False
    h_donors = Descriptors.NumHDonors(mol)
    h_acceptors = Descriptors.NumHAcceptors(mol)
    mol_weight = Descriptors.MolWt(mol)
    logp = Crippen.MolLogP(mol)
    if h_donors <= 5 and h_acceptors <= 10 and mol_weight <= 500 and logp <= 5:
        return True
    return False

# Function to get molecule details from ChEMBL
def get_molecule_details(chembl_id):
    try:
        res = molecule.get(chembl_id)
        activities = activity.filter(molecule_chembl_id=chembl_id).only(['standard_value', 'standard_units', 'standard_type', 'pchembl_value', 'target_chembl_id', 'bao_label'])

        # Aggregate the first activity entry if available
        if activities:
            first_activity = activities[0]
            standard_value = first_activity['standard_value'] if 'standard_value' in first_activity else ''
            standard_units = first_activity['standard_units'] if 'standard_units' in first_activity else ''
            standard_type = first_activity['standard_type'] if 'standard_type' in first_activity else ''
            pchembl_value = first_activity['pchembl_value'] if 'pchembl_value' in first_activity else ''
            target_pref_name = first_activity['target_chembl_id'] if 'target_chembl_id' in first_activity else ''
            bao_label = first_activity['bao_label'] if 'bao_label' in first_activity else ''
        else:
            standard_value = ''
            standard_units = ''
            standard_type = ''
            pchembl_value = ''
            target_pref_name = ''
            bao_label = ''

        canonical_smiles = res['molecule_structures']['canonical_smiles'] if 'molecule_structures' in res else ''
        lipinski = passes_lipinski(canonical_smiles)

        return {
            'molecule_chembl_id': chembl_id,
            'canonical_smiles': canonical_smiles,
            'standard_value': standard_value,
            'standard_units': standard_units,
            'standard_type': standard_type,
            'pchembl_value': pchembl_value,
            'target_pref_name': target_pref_name,
            'bao_label': bao_label,
            'passes_lipinski': lipinski,
            'molecule_name': res['pref_name'] if 'pref_name' in res else ''
        }
    except Exception as e:
        print(f"Error retrieving data for {chembl_id}: {e}")
        return None

# Get details for all specified ChEMBL IDs
new_entries = [get_molecule_details(chembl_id) for chembl_id in chembl_ids]

# Filter out None entries
new_entries = [entry for entry in new_entries if entry is not None]

# Create a DataFrame for the new entries
new_entries_df = pd.DataFrame(new_entries)

# Read the existing CSV file
input_file_path = 'Updated_NTD_Drugs_with_names.csv'
existing_data_df = pd.read_csv(input_file_path)

# Append the new entries to the existing data
updated_data_df = pd.concat([existing_data_df, new_entries_df], ignore_index=True)
updated_data_df.drop_duplicates(subset='canonical_smiles', inplace=True)

# Save the updated data to a new CSV file
output_file_path = 'Final_NTD_Drugs_with_names.csv'
updated_data_df.to_csv(output_file_path, index=False)

print(f"CSV file '{output_file_path}' has been created successfully.")


CSV file 'Final_NTD_Drugs_with_names.csv' has been created successfully.


In [1]:
import pandas as pd
from chembl_webresource_client.new_client import new_client

# Load the CSV file
file_path = 'NTD_Drugs.csv'
df = pd.read_csv(file_path)

# Initialize the ChEMBL molecule client
molecule = new_client.molecule

# Function to get molecule name from SMILES using ChEMBL
def get_molecule_name(smiles):
    try:
        result = molecule.filter(molecule_structures__canonical_smiles=smiles).only(['molecule_structures', 'molecule_properties', 'molecule_synonyms'])
        if result:
            return result[0]['molecule_synonyms'][0]['synonyms']
        else:
            return None
    except Exception as e:
        print(f"Error fetching molecule name for SMILES {smiles}: {e}")
        return None

# Add a new column for molecule names
df['molecule_name'] = df['canonical_smiles'].apply(get_molecule_name)

df


Unnamed: 0,molecule_chembl_id,canonical_smiles,standard_value,standard_units,standard_type,pchembl_value,target_pref_name,bao_label,passes_lipinski,molecule_name
0,CHEMBL1082,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,29410.0,nM,IC50,4.53,HaCaT,cell-based format,True,AMIX 125
1,CHEMBL777,O=C(O)[C@H]1/C(=C/CO)O[C@@H]2CC(=O)N21,4.0,nM,IC50,8.4,Unchecked,assay format,True,BRL 14151
2,CHEMBL290960,CC1CS(=O)(=O)CCN1/N=C/c1ccc([N+](=O)[O-])o1,110.0,nM,IC50,6.96,Leishmania donovani,organism-based format,True,BAY 2502
3,CHEMBL1631694,CSc1ccc(OCc2ncc([N+](=O)[O-])n2C)cc1,400.0,nM,IC50,6.4,Trypanosoma brucei brucei,organism-based format,True,FEXINIDAZOLE
4,CHEMBL110,O=C(Cn1ccnc1[N+](=O)[O-])NCc1ccccc1,1.8,nM,IC50,8.74,Trypanosoma cruzi,organism-based format,True,BENZNIDAZOLE
5,CHEMBL20,CC(=O)Nc1nnc(S(N)(=O)=O)s1,2.58,nM,IC50,8.59,Carbonic anhydrase IX,single protein format,True,ACETAMOX
6,CHEMBL1483,CCCSc1ccc2[nH]c(NC(=O)OC)nc2c1,37.0,nM,IC50,7.43,Giardia intestinalis,organism-based format,True,ALBENDAZOLE
7,CHEMBL42442,COC(=O)Nc1nc2cc([S+]([O-])c3ccccc3)ccc2[nH]1,1.0,nM,IC50,9.0,Unchecked,assay format,True,NSC-758943
8,CHEMBL1503,COc1ccc2[nH]c([S+]([O-])Cc3ncc(C)c(OC)c3C)nc2c1,95.5,nM,IC50,7.02,Giardia intestinalis,organism-based format,True,Antra
9,CHEMBL384467,C[C@@H]1C[C@H]2[C@@H]3CCC4=CC(=O)C=C[C@]4(C)[C...,0.1,nM,IC50,10.0,Mus musculus,organism-based format,True,AEROSEB-DEX


In [None]:
df.to_csv('NTD_Drugs_with_names.csv', index=False)


In [1]:
import torch

class OriginalTokenizer(object):
    def __init__(self, data):
        # Define the additional tokens to be included
        self.additional_tokens = ['[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[n+]', '[S+]']

        # Build the unique character set and remove '@'
        unique_char = list(set(''.join(data))) + self.additional_tokens + ['<eos>'] + ['<sos>']
        unique_char = [char for char in unique_char if char != '@']  # Remove '@'
        
        self.test_char = unique_char
        self.mapping = {'<pad>': 0}

        for i, c in enumerate(unique_char, start=1):
            self.mapping[c] = i

        self.inv_mapping = {v: k for k, v in self.mapping.items()}
        self.start_token = self.mapping['<sos>']
        self.end_token = self.mapping['<eos>']
        self.vocab_size = len(self.mapping.keys())

    def encode_smile(self, mol, add_eos=True):
        i = 0
        out = []
        while i < len(mol):
            matched = False
            for token in self.additional_tokens:
                if mol[i:i+len(token)] == token:
                    out.append(self.mapping[token])
                    i += len(token)
                    matched = True
                    break
            if not matched:
                out.append(self.mapping[mol[i]])
                i += 1
        if add_eos:
            out.append(self.end_token)
        return torch.LongTensor(out)

    # testing purposes
    def decode_tensor(self, tensor):
        return ''.join([self.inv_mapping[token.item()] for token in tensor if token.item() != self.end_token])

    def batch_tokenize(self, batch):
        out = map(lambda x: self.encode_smile(x), batch)
        return torch.nn.utils.rnn.pad_sequence(list(out), batch_first=True)


# Example data
data = ['CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)C[S+]C1.Br']

original_tokenizer = OriginalTokenizer(data)
# Print token mappings
print("Original Tokenizer Mapping:", original_tokenizer.mapping)
print("Vocab Size: ", original_tokenizer.vocab_size)
print("start_token: ", original_tokenizer.start_token)
print("end_token: ", original_tokenizer.end_token)

# Encode and print each SMILES string
for smile in data:
    encoded_smile = original_tokenizer.encode_smile(smile)
    print(f"Original Tokenizer Encoding for {smile}: {encoded_smile}")
    decoded_smile = original_tokenizer.decode_tensor(encoded_smile)
    print(f"Decoded SMILES: {decoded_smile}")


  data = ['CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)C[S+]C1.Br']


Original Tokenizer Mapping: {'<pad>': 0, 'N': 1, '\\': 2, 'O': 3, 'B': 4, '1': 5, '-': 6, '+': 7, 'n': 8, '2': 9, 'H': 10, '=': 11, '[': 12, 'r': 13, '3': 14, 'C': 15, '(': 16, '/': 17, 'S': 18, '#': 19, ']': 20, '.': 21, ')': 22, 'c': 23, '[C@H]': 24, '[C@@H]': 25, '[nH]': 26, '[O-]': 27, '[C@]': 28, '[N+]': 29, '[C@@]': 30, '[n+]': 31, '[S+]': 32, '<eos>': 33, '<sos>': 34}
Vocab Size:  35
start_token:  34
end_token:  33
Original Tokenizer Encoding for CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)C[S+]C1.Br: tensor([15, 15,  3, 17,  1, 11, 15, 16,  2, 23,  5, 23, 23, 23, 16,  4, 13, 22,
        23, 23,  5, 22, 15,  5, 15, 25, 15,  1, 16, 15,  9, 16, 15, 19, 15, 22,
        15, 15,  1, 16, 15, 16, 11,  3, 22, 23, 14, 23, 16, 15, 22, 23, 23, 31,
        16, 27, 22, 23, 14, 15, 22, 15, 15,  9, 22, 15, 32, 15,  5, 21,  4, 13,
        33])
Decoded SMILES: CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)C[S+]C1.Br


In [3]:
import torch

class Tokenizer(object):
    def __init__(self, data):
        # Define the additional tokens to be included
        self.additional_tokens = ['[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[n+]']

        # Build the unique character set
        unique_char = list(set(''.join(data))) + self.additional_tokens + ['<eos>'] + ['<sos>']
        self.test_char = unique_char
        self.mapping = {'<pad>': 0}

        for i, c in enumerate(unique_char, start=1):
            self.mapping[c] = i

        self.inv_mapping = {v: k for k, v in self.mapping.items()}
        self.start_token = self.mapping['<sos>']
        self.end_token = self.mapping['<eos>']
        self.vocab_size = len(self.mapping.keys())

    def encode_smile(self, mol, add_eos=True):
        i = 0
        out = []
        while i < len(mol):
            matched = False
            for token in self.additional_tokens:
                if mol[i:i+len(token)] == token:
                    out.append(self.mapping[token])
                    i += len(token)
                    matched = True
                    break
            if not matched:
                out.append(self.mapping[mol[i]])
                i += 1
        if add_eos:
            out.append(self.end_token)
        return torch.LongTensor(out)

    # testing purposes
    def decode_tensor(self, tensor):
        return ''.join([self.inv_mapping[token.item()] for token in tensor if token.item() != self.end_token])

    def batch_tokenize(self, batch):
        out = map(lambda x: self.encode_smile(x), batch)
        return torch.nn.utils.rnn.pad_sequence(list(out), batch_first=True)

# Example usage
data = ["CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC[S@@]1.Br"]
tokenizer = Tokenizer(data)
print(tokenizer.mapping)
# Encode a sample SMILES string
encoded_smile = tokenizer.encode_smile("CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC[S@@]1.Br")
print("Encoded SMILES:", encoded_smile)

# Decode the tensor back to SMILES string
decoded_smile = tokenizer.decode_tensor(encoded_smile)
print("Decoded SMILES:", decoded_smile)


{'<pad>': 0, '.': 1, 'B': 2, '@': 3, 'C': 4, '2': 5, '3': 6, 'H': 7, ')': 8, '[': 9, 'O': 10, 'r': 11, '\\': 12, '1': 13, '/': 14, '=': 15, 'S': 16, '#': 17, '+': 18, 'n': 19, ']': 20, '(': 21, 'c': 22, '-': 23, 'N': 24, '[C@H]': 25, '[C@@H]': 26, '[nH]': 27, '[O-]': 28, '[C@]': 29, '[N+]': 30, '[C@@]': 31, '[n+]': 32, '<eos>': 33, '<sos>': 34}
Encoded SMILES: tensor([ 4,  4, 10, 14, 24, 15,  4, 21, 12, 22, 13, 22, 22, 22, 21,  2, 11,  8,
        22, 22, 13,  8,  4, 13,  4, 26,  4, 24, 21,  4,  5, 21,  4, 17,  4,  8,
         4,  4, 24, 21,  4, 21, 15, 10,  8, 22,  6, 22, 21,  4,  8, 22, 22, 32,
        21, 28,  8, 22,  6,  4,  8,  4,  4,  5,  8,  4,  4,  9, 16,  3,  3, 20,
        13,  1,  2, 11, 33])
Decoded SMILES: CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC[S@@]1.Br


  data = ["CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC[S@@]1.Br"]
  encoded_smile = tokenizer.encode_smile("CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC[S@@]1.Br")


In [1]:
import torch

class OriginalTokenizer(object):
    def __init__(self, data):
        # Define the additional tokens to be included
        additional_tokens = ['[C@H]', '[C@@H]', '[nH]', '[O-]', '[C@]', '[N+]', '[C@@]', '[n+]']
        
        # Build the unique character set
        unique_char = list(set(''.join(data))) + additional_tokens + ['<eos>'] + ['<sos>']
        self.test_char = unique_char
        self.mapping = {'<pad>': 0}

        for i, c in enumerate(unique_char, start=1):
            self.mapping[c] = i
        
        self.inv_mapping = {v: k for k, v in self.mapping.items()}
        self.start_token = self.mapping['<sos>']
        self.end_token = self.mapping['<eos>']
        self.vocab_size = len(self.mapping.keys())
        
    def encode_smile(self, mol, add_eos=True):
        out = [self.mapping[i] for i in mol]
        if add_eos:
            out = out + [self.end_token]
        return torch.LongTensor(out)

    def decode_tensor(self, tensor):
        return ''.join([self.inv_mapping[token.item()] for token in tensor if token.item() != self.end_token])

    def batch_tokenize(self, batch):
        out = map(lambda x: self.encode_smile(x), batch)
        return torch.nn.utils.rnn.pad_sequence(list(out), batch_first=True)
        

# Example data
data = ['CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC1.Br',
        'C=CCNc1c(O)cc2c(O)c1C[C@@H](C)C[C@H](OC)[C@H](O)[C@@H](C)/C=C(\C)[C@H](OC(N)=O)[C@@H](OC)/C=C\C=C(/C)C(=O)N2.Cl']
        # 'C=CCNc1c(O)cc2c(O)c1C[C@@H](C)C[C@H](OC)[C@H](O)[C@@H](C)/C=C(\C)[C@H](OC(N)=O)[C@@H](OC)/C=C\C=C(/C)C(=O)N2.Cl'
original_tokenizer = OriginalTokenizer(data)

# Print token mappings
print("Original Tokenizer Mapping:", original_tokenizer.mapping)
print("Vocab Size: ", original_tokenizer.vocab_size)
print("start_token: ", original_tokenizer.start_token)
print("end_token: ", original_tokenizer.end_token)

# Encode and print each SMILES string
for smile in data:
    encoded_smile = original_tokenizer.encode_smile(smile)
    print(f"Original Tokenizer Encoding for {smile}: {encoded_smile}")
    decoded_smile = original_tokenizer.decode_tensor(encoded_smile)
    print(f"Decoded SMILES: {decoded_smile}")

  data = ['CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC1.Br',
  'C=CCNc1c(O)cc2c(O)c1C[C@@H](C)C[C@H](OC)[C@H](O)[C@@H](C)/C=C(\C)[C@H](OC(N)=O)[C@@H](OC)/C=C\C=C(/C)C(=O)N2.Cl']


Original Tokenizer Mapping: {'<pad>': 0, '2': 1, '=': 2, 'c': 3, 'n': 4, 'l': 5, '@': 6, ')': 7, '(': 8, '.': 9, '\\': 10, 'N': 11, '[': 12, 'C': 13, 'H': 14, '3': 15, 'r': 16, '1': 17, 'B': 18, '#': 19, '+': 20, ']': 21, 'O': 22, '-': 23, '/': 24, '[C@H]': 25, '[C@@H]': 26, '[nH]': 27, '[O-]': 28, '[C@]': 29, '[N+]': 30, '[C@@]': 31, '[n+]': 32, '<eos>': 33, '<sos>': 34}
Vocab Size:  35
start_token:  34
end_token:  33
Original Tokenizer Encoding for CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)CC2)CC1.Br: tensor([13, 13, 22, 24, 11,  2, 13,  8, 10,  3, 17,  3,  3,  3,  8, 18, 16,  7,
         3,  3, 17,  7, 13, 17, 13, 12, 13,  6,  6, 14, 21, 13, 11,  8, 13,  1,
         8, 13, 19, 13,  7, 13, 13, 11,  8, 13,  8,  2, 22,  7,  3, 15,  3,  8,
        13,  7,  3,  3, 12,  4, 20, 21,  8, 12, 22, 23, 21,  7,  3, 15, 13,  7,
        13, 13,  1,  7, 13, 13, 17,  9, 18, 16, 33])
Decoded SMILES: CCO/N=C(\c1ccc(Br)cc1)C1C[C@@H]CN(C2(C#C)CCN(C(=O)c3c(C)cc[n+]([O-])c3C)

In [12]:
import pandas as pd
import re
from collections import Counter

# Load the dataset
data = pd.read_csv('NTD_filtered_smiles_dataset.csv')['canonical_smiles']

# Define the tokens to search for
tokens_to_search = ['[P@@]', '[Sr+2]', '[S@]', '[NH-]', '[Mg+2]', '[P+]', '[K+]', '[3H]', '[C-]', '[c+]', '[As]', '[C+]', '[S+]', '[C@]', '[S@@+]', '[I-]', '[N@]', '[Se+]', '[S@@]', '[o+]', '[Cl-]', '[Br-]', '[Li+]', '[Cl+3]', '[N@@+]', '[P-]', '[CaH2]', '[n+]', '[SeH]', '[Al]', '[n-]', '[N@+]', '[C@@H]', '[N+]', '[se]', '[s+]', '[cH-]', '[O]', '[Se]', '[2H]', '[Ca+2]', '[O+]', '[N-]', '[nH]', '[C@@]', '[I+]', '[B-]', '[S@+]', '[C@H]', '[Ag+]', '[NH+]', '[P@]', '[MgH2]', '[N@@]', '[Na+]', '[S-]', '[Si]', '[SH]', '[O-]', '[PH]']

# Initialize the Counter to keep track of token counts
token_counts = Counter()

# Iterate over each SMILES string in the dataset
for smile in data:
    for token in tokens_to_search:
        token_counts[token] += len(re.findall(re.escape(token), smile))

# Filter and sort the found tokens by their counts in descending order
found_tokens_sorted = sorted(token_counts.items(), key=lambda x: x[1], reverse=True)
found_tokens_sorted = [(token, count) for token, count in found_tokens_sorted if count > 0]

# Print the sorted found tokens and their counts
found_tokens_sorted



[('[C@H]', 199482),
 ('[C@@H]', 181136),
 ('[C@]', 70833),
 ('[C@@]', 64920),
 ('[nH]', 38677),
 ('[O-]', 27764),
 ('[N+]', 17734),
 ('[S+]', 8489),
 ('[n+]', 2393),
 ('[Na+]', 1077),
 ('[Br-]', 675),
 ('[Cl-]', 589),
 ('[I-]', 478),
 ('[Si]', 365),
 ('[N-]', 310),
 ('[K+]', 215),
 ('[Se]', 203),
 ('[P+]', 107),
 ('[2H]', 62),
 ('[se]', 60),
 ('[B-]', 47),
 ('[Cl+3]', 36),
 ('[C-]', 32),
 ('[I+]', 29),
 ('[3H]', 28),
 ('[S@@+]', 27),
 ('[P-]', 25),
 ('[o+]', 22),
 ('[S@+]', 21),
 ('[P@]', 19),
 ('[P@@]', 18),
 ('[Li+]', 17),
 ('[O+]', 12),
 ('[N@+]', 11),
 ('[PH]', 10),
 ('[n-]', 9),
 ('[O]', 9),
 ('[S-]', 9),
 ('[As]', 8),
 ('[N@@+]', 8),
 ('[s+]', 6),
 ('[N@]', 5),
 ('[N@@]', 5),
 ('[NH+]', 4),
 ('[Mg+2]', 3),
 ('[Al]', 3),
 ('[SH]', 3),
 ('[S@]', 2),
 ('[C+]', 2),
 ('[Ag+]', 2),
 ('[Sr+2]', 1),
 ('[NH-]', 1),
 ('[c+]', 1),
 ('[Se+]', 1),
 ('[S@@]', 1),
 ('[CaH2]', 1),
 ('[SeH]', 1),
 ('[cH-]', 1),
 ('[Ca+2]', 1),
 ('[MgH2]', 1)]

In [4]:
import pandas as pd

# Load the dataset
df = pd.read_csv('final_smiles_dataset.csv')

# Keep only the 'Original_SMILES' column
df = df[['canonical_smiles']]

# Remove duplicates
df = df.drop_duplicates(subset='canonical_smiles')


# Save the cleaned dataset to a new CSV file
df.to_csv('final_smiles_dataset.csv', index=False)


Unnamed: 0,canonical_smiles
0,O=P(OP(=O)(OP(=O)(O)OC[C@@H]1O[C@H](n2c(nc(N)c...
1,CC(N1CCC[C@H]1C(N[C@@H](CCCCN)C(=O)N[C@H](CC(=...
2,CC(C(c1ccc2cc(OCc3ccc([N+](=O)[O-])cc3)ccc2c1)...
3,c12OCOc1cc(c(c2)C(=O)O)-c1cccc(c1N(C)C)C=C
4,C1N2C(c3ccccc3)(SC1)c1c(Cl)cccc1C2=O
...,...
526838,O=S(=O)(c1ccc(C)cc1)N(CC(CNCc1ccco1)O)c1ccc(F)cc1
526839,C(C1C(=CCc2c1c(ccc2O)O)C)C=C(C)C
526840,C(OP([O-])(OCCC12CC3CC(CC(C2)C3)(CCOCCCCCCCC)C...
526841,O=C/C=C/c1ccccc1O


In [2]:
df

Unnamed: 0,canonical_smiles
0,O=P(OP(=O)(OP(=O)(O)OC[C@@H]1O[C@H](n2c(nc(N)c...
1,CC(N1CCC[C@H]1C(N[C@@H](CCCCN)C(=O)N[C@H](CC(=...
2,CC(C(c1ccc2cc(OCc3ccc([N+](=O)[O-])cc3)ccc2c1)...
3,c12OCOc1cc(c(c2)C(=O)O)-c1cccc(c1N(C)C)C=C
4,C1N2C(c3ccccc3)(SC1)c1c(Cl)cccc1C2=O
...,...
526838,O=S(=O)(c1ccc(C)cc1)N(CC(CNCc1ccco1)O)c1ccc(F)cc1
526839,C(C1C(=CCc2c1c(ccc2O)O)C)C=C(C)C
526840,C(OP([O-])(OCCC12CC3CC(CC(C2)C3)(CCOCCCCCCCC)C...
526841,O=C/C=C/c1ccccc1O


In [4]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.DataStructs import TanimotoSimilarity

def calculate_tanimoto_similarity(molecule1, molecule2):
    if molecule1 is None or molecule2 is None:
        return None

    # Generate Morgan fingerprints
    fp1 = AllChem.GetMorganFingerprintAsBitVect(molecule1, 2, nBits=2048)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(molecule2, 2, nBits=2048)


    similarity = TanimotoSimilarity(fp1, fp2)

    return similarity

mol1 = Chem.MolFromSmiles("Cc1ccn(-c2ccc(C(=O)N3CCC(F)(F)/C(=C\C(=O)NCc4ccccn4)c4ccccc43)c(Cl)c2)n1")
mol2 = Chem.MolFromSmiles("Cc1ccn(-c2ccc(C(=O)N3CCC(F)(F)/C(=C\C(=O)NCc4cccnn4)c4ccccc43)c(Cl)c2)n1")

similarity_score = calculate_tanimoto_similarity(mol1, mol2)

if similarity_score is not None:
    print(f"Tanimoto Similarity: {similarity_score}")
else:
    print("Invalid SMILES string. Unable to calculate similarity.")

Tanimoto Similarity: 0.85


  mol1 = Chem.MolFromSmiles("Cc1ccn(-c2ccc(C(=O)N3CCC(F)(F)/C(=C\C(=O)NCc4ccccn4)c4ccccc43)c(Cl)c2)n1")
  mol2 = Chem.MolFromSmiles("Cc1ccn(-c2ccc(C(=O)N3CCC(F)(F)/C(=C\C(=O)NCc4cccnn4)c4ccccc43)c(Cl)c2)n1")


In [1]:
import pandas as pd
import os

file_numbers = 20
files = [f'generated/generated_molecules_{i}.csv' for i in range(1, file_numbers + 1)]

df_list = [pd.read_csv(file) for file in files if os.path.exists(file)]
df = pd.concat(df_list, ignore_index=True)

df = df.drop_duplicates(subset=['canonical_smiles'])

valid_count = df[df['validity'] == 'valid'].shape[0]
invalid_count = df[df['validity'] == 'invalid'].shape[0]

print(f"Number of valid entries: {valid_count}")
print(f"Number of invalid entries: {invalid_count}")

Number of valid entries: 100889
Number of invalid entries: 18604


In [2]:
df

Unnamed: 0,canonical_smiles,validity
0,CCCCCc1ccc(-c2nsc(-c3ccccn3)n2)cc1,valid
1,O=C(/C=C/c1ccco1)N1CCN(Cc2ccccc2)C1,valid
2,COc1ccc(/C=N/NC(=O)c2ccccc2)cc1,valid
3,O=C(NC1CCCCC1)c1ccc2ccccc2c1O,valid
4,COc1ccc(/C=N/NC(=O)c2ccccc2OCc2ccccc2)cc1,valid
...,...,...
269462,CCCCC(C(=O)O)c1ccccc1C#Cc1ccccc1,valid
269463,CC(C)(C)c1ccc(NC(=O)Nc2ccc(NC(=O)C(C)C)cc2)cc1,valid
269465,O=S(=O)(=O)c1cccc([N+](=O)[O-])c1,invalid
269466,CCCCNC(=O)c1scnc1S(=O)(=O)c1ccccc1O,valid


In [3]:
# Save the merged DataFrame to a new CSV file
output_file = 'merged_molecules.csv'
df.to_csv(output_file, index=False)
print(f"Merged file saved as {output_file}")

Merged file saved as merged_molecules.csv


# DATA AUGMENTATION

In [1]:
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

# Load the dataset
df = pd.read_csv('NTD_Drugs.csv')

# Function to generate original and augmented SMILES
def augment_smiles(smiles, num_augments=1):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return [None] * num_augments  # Invalid SMILES string
    augmented_smiles_list = []
    for _ in range(num_augments):
        augmented_smiles = Chem.MolToSmiles(mol, doRandom=True)
        augmented_smiles_list.append(augmented_smiles)
    return augmented_smiles_list

# Function to calculate Tanimoto similarity
def calculate_similarity(smiles1, smiles2):
    mol1 = Chem.MolFromSmiles(smiles1)
    mol2 = Chem.MolFromSmiles(smiles2)
    if mol1 is None or mol2 is None:
        return 0.0  # Invalid SMILES string
    fp1 = AllChem.GetMorganFingerprintAsBitVect(mol1, 2)
    fp2 = AllChem.GetMorganFingerprintAsBitVect(mol2, 2)
    similarity = DataStructs.TanimotoSimilarity(fp1, fp2)
    return similarity

# Augment the dataset
augmented_data = []
num_augments = 1000
for smiles in df['canonical_smiles']:
    augmented_smiles_list = augment_smiles(smiles, num_augments=num_augments)
    for augmented_smiles in augmented_smiles_list:
        if augmented_smiles is not None:
            similarity = calculate_similarity(smiles, augmented_smiles)
            augmented_data.append({
                'Original_SMILES': smiles,
                'Augmented_SMILES': augmented_smiles,
                'Augmentation_Type': 'Augmented',
                'Tanimoto_Score': similarity
            })
        else:
            augmented_data.append({
                'Original_SMILES': smiles,
                'Augmented_SMILES': smiles,
                'Augmentation_Type': 'Original',
                'Tanimoto_Score': similarity
            })

# Create a new DataFrame with the augmented SMILES and augmentation type
augmented_df = pd.DataFrame(augmented_data)

augmented_df.to_csv('NTD_augmented_smiles_dataset.csv', index=False)

# Display the augmented DataFrame
augmented_df


Unnamed: 0,Original_SMILES,Augmented_SMILES,Augmentation_Type,Tanimoto_Score
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,C(N[C@H]1[C@@H]2N(C1=O)[C@H](C(C)(C)S2)C(O)=O)...,Augmented,1.0
1,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,[C@H](c1ccc(O)cc1)(N)C(N[C@H]1[C@@H]2N(C1=O)[C...,Augmented,1.0
2,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,CC1(S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)C(...,Augmented,1.0
3,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,[C@H](c1ccc(O)cc1)(N)C(N[C@@H]1C(N2[C@@H]1SC([...,Augmented,1.0
4,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,c1([C@@H](N)C(N[C@H]2[C@@H]3N([C@H](C(C)(C)S3)...,Augmented,1.0
...,...,...,...,...
32995,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,C12[C@]3(OO[C@@]4(O3)CC[C@@H](c3ccc(cc3)OCCN3C...,Augmented,1.0
32996,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,[C@H]1(CC[C@@]2(O[C@]3(C4CC5CC3CC(C5)C4)OO2)CC...,Augmented,1.0
32997,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,O1O[C@]2(C3CC4CC(CC2C4)C3)O[C@@]21CC[C@@H](CC2...,Augmented,1.0
32998,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,c1c(ccc(c1)OCCN1CCOCC1)[C@H]1CC[C@]2(O[C@@]3(O...,Augmented,1.0


In [2]:
# Remove non similar canonical smiles
augmented_df = augmented_df[augmented_df['Tanimoto_Score'] == 1.0]

# Save the augmented dataset
augmented_df.to_csv('NTD_augmented_smiles_dataset.csv', index=False)

print("Data augmentation completed and saved to 'augmented_smiles_dataset.csv'.")

Data augmentation completed and saved to 'augmented_smiles_dataset.csv'.


In [3]:
augmented_df

Unnamed: 0,Original_SMILES,Augmented_SMILES,Augmentation_Type,Tanimoto_Score
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,C(N[C@H]1[C@@H]2N(C1=O)[C@H](C(C)(C)S2)C(O)=O)...,Augmented,1.0
1,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,[C@H](c1ccc(O)cc1)(N)C(N[C@H]1[C@@H]2N(C1=O)[C...,Augmented,1.0
2,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,CC1(S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)C(...,Augmented,1.0
3,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,[C@H](c1ccc(O)cc1)(N)C(N[C@@H]1C(N2[C@@H]1SC([...,Augmented,1.0
4,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...,c1([C@@H](N)C(N[C@H]2[C@@H]3N([C@H](C(C)(C)S3)...,Augmented,1.0
...,...,...,...,...
32995,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,C12[C@]3(OO[C@@]4(O3)CC[C@@H](c3ccc(cc3)OCCN3C...,Augmented,1.0
32996,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,[C@H]1(CC[C@@]2(O[C@]3(C4CC5CC3CC(C5)C4)OO2)CC...,Augmented,1.0
32997,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,O1O[C@]2(C3CC4CC(CC2C4)C3)O[C@@]21CC[C@@H](CC2...,Augmented,1.0
32998,c1cc([C@H]2CC[C@]3(CC2)OO[C@]2(O3)C3CC4CC(C3)C...,c1c(ccc(c1)OCCN1CCOCC1)[C@H]1CC[C@]2(O[C@@]3(O...,Augmented,1.0


In [4]:
# Count how many entries are 'Original'
original_count = augmented_df[augmented_df['Augmentation_Type'] == 'Original'].shape[0]

# Print the count
print(f"Number of 'Original' entries: {original_count}")

Number of 'Original' entries: 0


In [5]:
import pandas as pd
import numpy as np

# Load the augmented dataset
df = pd.read_csv('NTD_augmented_smiles_dataset.csv')

# Convert 'Original_SMILES' and 'Augmented_SMILES' columns to arrays
original_smiles_array = df['Original_SMILES'].fillna('').to_numpy()
augmented_smiles_array = df['Augmented_SMILES'].fillna('').to_numpy()

# Concatenate arrays
combined_smiles_array = np.concatenate([original_smiles_array, augmented_smiles_array])

# Calculate sizes
size_original_smiles_array = len(original_smiles_array)
size_augmented_smiles_array = len(augmented_smiles_array)
size_combined_smiles_array = len(combined_smiles_array)

print(f"Size of original_smiles_array: {size_original_smiles_array}")
print(f"Size of augmented_smiles_array: {size_augmented_smiles_array}")
print(f"Size of combined_smiles_array: {size_combined_smiles_array}")


Size of original_smiles_array: 33000
Size of augmented_smiles_array: 33000
Size of combined_smiles_array: 66000


In [6]:
# Create a DataFrame from combined_smiles_array
combined_df = pd.DataFrame({'canonical_smiles': combined_smiles_array})

# Remove duplicates
combined_df.drop_duplicates(subset=['canonical_smiles'], keep='first', inplace=True)

# Save the processed DataFrame
combined_df.to_csv('NTD_processed_combined_smiles.csv', index=False)

print("Processed combined SMILES saved to 'processed_combined_smiles.csv'.")

Processed combined SMILES saved to 'processed_combined_smiles.csv'.


In [7]:
combined_df

Unnamed: 0,canonical_smiles
0,CC1(C)S[C@@H]2[C@H](NC(=O)[C@H](N)c3ccc(O)cc3)...
1000,O=C(O)[C@H]1/C(=C/CO)O[C@@H]2CC(=O)N21
2000,CC1CS(=O)(=O)CCN1/N=C/c1ccc([N+](=O)[O-])o1
3000,CSc1ccc(OCc2ncc([N+](=O)[O-])n2C)cc1
4000,O=C(Cn1ccnc1[N+](=O)[O-])NCc1ccccc1
...,...
65995,C12[C@]3(OO[C@@]4(O3)CC[C@@H](c3ccc(cc3)OCCN3C...
65996,[C@H]1(CC[C@@]2(O[C@]3(C4CC5CC3CC(C5)C4)OO2)CC...
65997,O1O[C@]2(C3CC4CC(CC2C4)C3)O[C@@]21CC[C@@H](CC2...
65998,c1c(ccc(c1)OCCN1CCOCC1)[C@H]1CC[C@]2(O[C@@]3(O...


In [8]:
import pandas as pd
from rdkit import Chem

# Load the processed dataset
df = pd.read_csv('NTD_processed_combined_smiles.csv')

# Function to check validity of SMILES string
def check_validity(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        return 'invalid'
    else:
        return 'valid'

# Add 'Validity' column
df['Validity'] = df['canonical_smiles'].apply(check_validity)

# Count valid and invalid entries
valid_count = df[df['Validity'] == 'valid'].shape[0]
invalid_count = df[df['Validity'] == 'invalid'].shape[0]

# Print counts
print(f"Number of valid SMILES: {valid_count}")
print(f"Number of invalid SMILES: {invalid_count}")

Number of valid SMILES: 20812
Number of invalid SMILES: 0


In [9]:
import pandas as pd

# Load the processed dataset
df = pd.read_csv('NTD_processed_combined_smiles.csv')

# Drop duplicate SMILES strings
df.drop_duplicates(subset=['canonical_smiles'], keep='first', inplace=True)

# Shuffle the DataFrame
df_shuffled = df.sample(frac=1).reset_index(drop=True)

# Save the shuffled DataFrame
df_shuffled.to_csv('NTD_final_smiles_dataset.csv', index=False)

print("Shuffled SMILES data saved to 'NTD_final_smiles_dataset.csv'.")



Shuffled SMILES data saved to 'NTD_final_smiles_dataset.csv'.


In [10]:
df_shuffled

Unnamed: 0,canonical_smiles
0,NCCCC(C(O)=O)(C(F)F)N
1,[C@@H]1([C@H](O)[C@H](CN2[C@@H]1[C@H](CC2)O)OC...
2,c1cc(cc2c1[nH]c(n2)[S+]([O-])Cc1ncc(c(c1C)OC)C)OC
3,C1COCCN1CCOc1ccc([C@H]2CC[C@]3(CC2)OO[C@]2(C4C...
4,c12[nH]c(nc1cc(cc2)SCCC)NC(=O)OC
...,...
20807,n1c(c2c(n(cn2)CC(C)C)c2c1cccc2)N
20808,C(Nc1sc(S(N)(=O)=O)nn1)(C)=O
20809,c1c(CNC(Cn2ccnc2[N+]([O-])=O)=O)cccc1
20810,O[C@@H]1[C@H]2[C@H]([C@H]3[C@@](C1)(C)[C@@](O)...


In [11]:
import pandas as pd

final_df = pd.read_csv('final_smiles_dataset.csv')
combined_df = pd.concat([final_df, df_shuffled[['canonical_smiles']]], ignore_index=True)
combined_df.drop_duplicates(subset='canonical_smiles', inplace=True)
combined_df.to_csv('NTD_final_smiles_dataset.csv', index=False)

combined_df


Unnamed: 0,canonical_smiles
0,CC(=O)N[C@@H](Cc1ccc(C(F)(F)P(=O)(O)O)cc1)C(=O...
1,N#CC1(NC(=O)[C@@H]2Cc3ccc(c(Cl)c3)OCCCCOc3cc(C...
2,COc1cc2[nH]c3ccc(Nc4cccc(OC(F)(F)F)c4)cc3c(=O)...
3,c1[nH]c2ccccc2c1CC[N+](Cc1cccs1)(Cc1sccc1)[O-]
4,C(CCC(S([O-])(=O)=O)P(=O)([O-])[O-])c1cc(ccc1)...
...,...
279201,n1c(c2c(n(cn2)CC(C)C)c2c1cccc2)N
279202,C(Nc1sc(S(N)(=O)=O)nn1)(C)=O
279203,c1c(CNC(Cn2ccnc2[N+]([O-])=O)=O)cccc1
279204,O[C@@H]1[C@H]2[C@H]([C@H]3[C@@](C1)(C)[C@@](O)...
