<a href="https://colab.research.google.com/github/abazabaaa/colab_tutorial/blob/main/pandas_mol2grd_basics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q condacolab
import condacolab
condacolab.install()
import condacolab
condacolab.check()
!mamba install -c conda-forge rdkit
!mamba install -c pyarrow
!pip install mols2grid
!wget https://raw.githubusercontent.com/abazabaaa/colab_tutorial/main/dopamine_3.csv

In [2]:
import pyarrow as pa
from pyarrow import csv
import pandas as pd
import pyarrow.feather as feather
from rdkit.Chem import PandasTools
import mols2grid
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs


The first action we will take is to ingest the CHEMBL csv which is used to store compound information associated with the dopamine 3 receptor. We will use pyarrow to do this -- it will become apparent later why we are employing this library, as it has some rather useful tools that will help us deal with very large CSV files.

In [3]:
# We select the file we are interested in (we used wget to download this from
# github)

filename = '/content/dopamine_3.csv'

# Next we will the multithreaded read options that pyarrow allows for.

opts = pa.csv.ReadOptions(use_threads=True)

# Then we tell pyarrow that the columns in our csv file are seperated by ';'
# If they were tab seperated we would use '\t' and if it was comma we would use 
# ','
parse_options= pa.csv.ParseOptions(delimiter=';')

# Now we read the CSV into a pyarrow table. This is a columular dataset. More
# on this later. Note how we specified the options above.

table = pa.csv.read_csv(filename, opts, parse_options)







In [4]:
# Now we will use a function that converts the pyarrow table into a pandas 
# dataframe. We could have done this without arrow, but again -- there are 
# very powerful tools that arrow will grant us.

df_new = table.to_pandas()

In [5]:
# A dataframe is like an excel spreadsheet. You can view the table by running 
# the cell below. Note that it will be truncated.

df_new

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Comment,Uo Units,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,Potential Duplicate,Assay ChEMBL ID,Assay Description,Assay Type,BAO Format ID,BAO Label,Assay Organism,Assay Tissue ChEMBL ID,Assay Tissue Name,Assay Cell Type,Assay Subcellular Fraction,Assay Parameters,Assay Variant Accession,Assay Variant Mutation,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties
0,CHEMBL298875,,0,439.35,0,4.62,ent3g (Optical antipode),COc1c(C(=O)N[C@@H]2CCN(Cc3ccccc3)C2)cc(Br)c2cc...,Ki,'=',84.000,nM,7.08,,,UO_0000065,16.10,0.35,2.46,17.02,False,CHEMBL674986,Displacement of [3H]spiperone [0.5 nM (Kd=0.1 ...,B,BAO_0000219,cell-based format,,,,CHO,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL1136241,1,Scientific Literature,Bioorg. Med. Chem. Lett.,2003,CHEMBL3308072,
1,CHEMBL3664844,,0,472.63,0,3.59,BDBM107528,O=C(CC1(O)CCOCC1)N[C@H]1CC[C@H](CCN2CCC(c3cccc...,Ki,'=',7.150,nM,8.15,,227001,UO_0000065,17.23,0.33,4.56,10.15,False,CHEMBL3707696,Biochemical Assay: The ability of the compound...,B,BAO_0000219,cell-based format,Homo sapiens,,,EBNA,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3638396,37,BindingDB Database,,2013,,
2,CHEMBL3642730,,0,442.60,0,2.63,BDBM106464,O=C(N[C@H]1CC[C@H](CCN2CCN(c3nccc4c3CCO4)CC2)C...,Ki,'=',28.800,nM,7.54,,225075,UO_0000065,17.04,0.32,4.91,11.27,False,CHEMBL3707834,Radioligand Binding Assay: The ability of the ...,B,BAO_0000219,cell-based format,Homo sapiens,,,EBNA,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3639183,37,BindingDB Database,,2013,,
3,CHEMBL3323013,,0,503.57,1,4.00,27,CC(C)(C)c1nc(N2CCN(CCCCNC(=O)c3nc4ccccc4[nH]3)...,Ki,'=',4.300,nM,8.37,,,UO_0000065,16.61,0.32,4.37,9.29,False,CHEMBL3379350,Displacement of [125I]IABN from human D3R expr...,B,BAO_0000219,cell-based format,Homo sapiens,,,HEK293,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3351252,1,Scientific Literature,J. Med. Chem.,2014,CHEMBL3307715,
4,CHEMBL4476784,,0,379.46,0,3.01,20; ML417,COc1ccc(OCCN2CCN(C(=O)c3cc4ccccc4[nH]3)CC2)cc1,EC50,'=',1.200,nM,8.92,,,UO_0000065,23.51,0.44,5.91,15.43,False,CHEMBL4406540,Agonist activity at RLuc8-fused human D3R expr...,B,BAO_0000219,cell-based format,Homo sapiens,,,HEK293,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL4402608,1,Scientific Literature,J Med Chem,2020,CHEMBL3307715,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5967,CHEMBL4517102,,0,455.99,0,3.95,25,CCc1cccc(N2CCN(CC(O)CCNC(=O)c3cc4ccccc4o3)CC2)...,Ki,'=',0.985,nM,9.01,,,UO_0000065,19.75,0.38,5.06,13.06,False,CHEMBL4429154,Displacement of [3H]N-methylspiperone from hum...,B,BAO_0000219,cell-based format,Homo sapiens,,,HEK293,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL4428060,1,Scientific Literature,J Med Chem,2016,CHEMBL3307715,
5968,CHEMBL4072282,,0,488.59,0,3.76,22,COc1ccccc1N1CCN(CCCNC(=O)Oc2ccc(-c3cccc(C(N)=O...,EC50,'=',7.400,nM,8.13,,,UO_0000065,16.64,0.31,4.37,8.37,False,CHEMBL4008738,Agonist activity at recombinant human dopamine...,B,BAO_0000219,cell-based format,Homo sapiens,,,,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL4007414,1,Scientific Literature,J Med Chem,2017,,
5969,CHEMBL4536530,,0,508.46,1,4.49,45,O=C(NCCN1CCN(c2ccc(Br)cc2)CC1)NN(Cc1ccccc1)c1c...,Ki,'=',1800.000,nM,5.75,,,UO_0000065,11.30,0.24,1.25,11.30,False,CHEMBL4323480,Displacement of [3H] Spiperone from human D3 r...,B,BAO_0000219,cell-based format,Homo sapiens,,,CHO,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL4321823,1,Scientific Literature,J Med Chem,2019,CHEMBL3308072,
5970,CHEMBL4466114,,0,555.46,1,4.33,47,O=C(NCCN1CCN(c2ccc(I)cc2)CC1)NN(Cc1ccccc1)c1cc...,Ki,'=',1800.000,nM,5.75,,,UO_0000065,10.34,0.24,1.41,11.30,False,CHEMBL4323480,Displacement of [3H] Spiperone from human D3 r...,B,BAO_0000219,cell-based format,Homo sapiens,,,CHO,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL4321823,1,Scientific Literature,J Med Chem,2019,CHEMBL3308072,


In [6]:
# We generally don't want to view the whole dataframe -- sometimes we just need
# to make sure it is formated properly for later tasks. You can use the syntax
# below to enable you to only see the first 5 rows.

df_new.head(5)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Comment,Uo Units,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,Potential Duplicate,Assay ChEMBL ID,Assay Description,Assay Type,BAO Format ID,BAO Label,Assay Organism,Assay Tissue ChEMBL ID,Assay Tissue Name,Assay Cell Type,Assay Subcellular Fraction,Assay Parameters,Assay Variant Accession,Assay Variant Mutation,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties
0,CHEMBL298875,,0,439.35,0,4.62,ent3g (Optical antipode),COc1c(C(=O)N[C@@H]2CCN(Cc3ccccc3)C2)cc(Br)c2cc...,Ki,'=',84.0,nM,7.08,,,UO_0000065,16.1,0.35,2.46,17.02,False,CHEMBL674986,Displacement of [3H]spiperone [0.5 nM (Kd=0.1 ...,B,BAO_0000219,cell-based format,,,,CHO,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL1136241,1,Scientific Literature,Bioorg. Med. Chem. Lett.,2003,CHEMBL3308072,
1,CHEMBL3664844,,0,472.63,0,3.59,BDBM107528,O=C(CC1(O)CCOCC1)N[C@H]1CC[C@H](CCN2CCC(c3cccc...,Ki,'=',7.15,nM,8.15,,227001.0,UO_0000065,17.23,0.33,4.56,10.15,False,CHEMBL3707696,Biochemical Assay: The ability of the compound...,B,BAO_0000219,cell-based format,Homo sapiens,,,EBNA,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3638396,37,BindingDB Database,,2013,,
2,CHEMBL3642730,,0,442.6,0,2.63,BDBM106464,O=C(N[C@H]1CC[C@H](CCN2CCN(c3nccc4c3CCO4)CC2)C...,Ki,'=',28.8,nM,7.54,,225075.0,UO_0000065,17.04,0.32,4.91,11.27,False,CHEMBL3707834,Radioligand Binding Assay: The ability of the ...,B,BAO_0000219,cell-based format,Homo sapiens,,,EBNA,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3639183,37,BindingDB Database,,2013,,
3,CHEMBL3323013,,0,503.57,1,4.0,27,CC(C)(C)c1nc(N2CCN(CCCCNC(=O)c3nc4ccccc4[nH]3)...,Ki,'=',4.3,nM,8.37,,,UO_0000065,16.61,0.32,4.37,9.29,False,CHEMBL3379350,Displacement of [125I]IABN from human D3R expr...,B,BAO_0000219,cell-based format,Homo sapiens,,,HEK293,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3351252,1,Scientific Literature,J. Med. Chem.,2014,CHEMBL3307715,
4,CHEMBL4476784,,0,379.46,0,3.01,20; ML417,COc1ccc(OCCN2CCN(C(=O)c3cc4ccccc4[nH]3)CC2)cc1,EC50,'=',1.2,nM,8.92,,,UO_0000065,23.51,0.44,5.91,15.43,False,CHEMBL4406540,Agonist activity at RLuc8-fused human D3R expr...,B,BAO_0000219,cell-based format,Homo sapiens,,,HEK293,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL4402608,1,Scientific Literature,J Med Chem,2020,CHEMBL3307715,


In [7]:
# Lets say we don't want all the columns and we need to remove some. Lets
# get a list of the column names so we can select some to be removed. We will
# cast the column names to a list with the syntax below.

list(df_new.columns)

['Molecule ChEMBL ID',
 'Molecule Name',
 'Molecule Max Phase',
 'Molecular Weight',
 '#RO5 Violations',
 'AlogP',
 'Compound Key',
 'Smiles',
 'Standard Type',
 'Standard Relation',
 'Standard Value',
 'Standard Units',
 'pChEMBL Value',
 'Data Validity Comment',
 'Comment',
 'Uo Units',
 'Ligand Efficiency BEI',
 'Ligand Efficiency LE',
 'Ligand Efficiency LLE',
 'Ligand Efficiency SEI',
 'Potential Duplicate',
 'Assay ChEMBL ID',
 'Assay Description',
 'Assay Type',
 'BAO Format ID',
 'BAO Label',
 'Assay Organism',
 'Assay Tissue ChEMBL ID',
 'Assay Tissue Name',
 'Assay Cell Type',
 'Assay Subcellular Fraction',
 'Assay Parameters',
 'Assay Variant Accession',
 'Assay Variant Mutation',
 'Target ChEMBL ID',
 'Target Name',
 'Target Organism',
 'Target Type',
 'Document ChEMBL ID',
 'Source ID',
 'Source Description',
 'Document Journal',
 'Document Year',
 'Cell ChEMBL ID',
 'Properties']

In [8]:
# Standard type, relation, value and units are all related to a compounds affinity.
# Lets combine these into one column. You may notive that some columns are blank.
# These are called NaNs, or NoneType entries. Additionally, some of the columns
# may be of different types -- strings, float, integers, etc.
# These can cause headaches for various downstream processes, so lets use some 
# tricks to handle these

df_new['ki'] = df_new[['Standard Type', 'Standard Relation', 'Standard Value', 'Standard Units']].astype(str).agg('-'.join, axis=1)

In [9]:
# Check to see if the new column is at the end. 
df_new.head(2)

Unnamed: 0,Molecule ChEMBL ID,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Compound Key,Smiles,Standard Type,Standard Relation,Standard Value,Standard Units,pChEMBL Value,Data Validity Comment,Comment,Uo Units,Ligand Efficiency BEI,Ligand Efficiency LE,Ligand Efficiency LLE,Ligand Efficiency SEI,Potential Duplicate,Assay ChEMBL ID,Assay Description,Assay Type,BAO Format ID,BAO Label,Assay Organism,Assay Tissue ChEMBL ID,Assay Tissue Name,Assay Cell Type,Assay Subcellular Fraction,Assay Parameters,Assay Variant Accession,Assay Variant Mutation,Target ChEMBL ID,Target Name,Target Organism,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,ki
0,CHEMBL298875,,0,439.35,0,4.62,ent3g (Optical antipode),COc1c(C(=O)N[C@@H]2CCN(Cc3ccccc3)C2)cc(Br)c2cc...,Ki,'=',84.0,nM,7.08,,,UO_0000065,16.1,0.35,2.46,17.02,False,CHEMBL674986,Displacement of [3H]spiperone [0.5 nM (Kd=0.1 ...,B,BAO_0000219,cell-based format,,,,CHO,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL1136241,1,Scientific Literature,Bioorg. Med. Chem. Lett.,2003,CHEMBL3308072,,Ki-'='-84.0-nM
1,CHEMBL3664844,,0,472.63,0,3.59,BDBM107528,O=C(CC1(O)CCOCC1)N[C@H]1CC[C@H](CCN2CCC(c3cccc...,Ki,'=',7.15,nM,8.15,,227001.0,UO_0000065,17.23,0.33,4.56,10.15,False,CHEMBL3707696,Biochemical Assay: The ability of the compound...,B,BAO_0000219,cell-based format,Homo sapiens,,,EBNA,,,,,CHEMBL234,Dopamine D3 receptor,Homo sapiens,SINGLE PROTEIN,CHEMBL3638396,37,BindingDB Database,,2013,,,Ki-'='-7.15-nM


In [10]:
# Ok. Now lets clean up the dataframe and only keep columns that are of interest
# to us.

df_clean = df_new[['Molecule ChEMBL ID', 'ki', 'Smiles', 'Document Journal', 'Document Year']]

In [11]:
df_clean.head(1)

Unnamed: 0,Molecule ChEMBL ID,ki,Smiles,Document Journal,Document Year
0,CHEMBL298875,Ki-'='-84.0-nM,COc1c(C(=O)N[C@@H]2CCN(Cc3ccccc3)C2)cc(Br)c2cc...,Bioorg. Med. Chem. Lett.,2003


In [12]:
# Lets add a column that allows us to see the structures.
PandasTools.AddMoleculeColumnToFrame(df_clean, smilesCol='Smiles')
df_clean.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  frame[molCol] = frame[smilesCol].map(Chem.MolFromSmiles)


Unnamed: 0,Molecule ChEMBL ID,ki,Smiles,Document Journal,Document Year,ROMol
0,CHEMBL298875,Ki-'='-84.0-nM,COc1c(C(=O)N[C@@H]2CCN(Cc3ccccc3)C2)cc(Br)c2cc...,Bioorg. Med. Chem. Lett.,2003,
1,CHEMBL3664844,Ki-'='-7.15-nM,O=C(CC1(O)CCOCC1)N[C@H]1CC[C@H](CCN2CCC(c3cccc...,,2013,


In [13]:
# OK. Lets get sophisticated and say we want to compare a molecule we have
# to an existing d3 ligand. 

ref_smiles = 'CCC1=CC(Cl)=C(OC)C(C(NC[C@H]2C[C@H](OC)CN2CC)=O)=C1O'
ref_mol = Chem.MolFromSmiles(ref_smiles)

# Now, lets take a look how similar our molecule is to existing structures.

ref_ECFP4_fps = AllChem.GetMorganFingerprintAsBitVect(ref_mol,2)
bulk_ECFP4_fps = [AllChem.GetMorganFingerprintAsBitVect(x,2) for x in df_clean['ROMol']]
similarity_efcp4 = [DataStructs.FingerprintSimilarity(ref_ECFP4_fps,x) for x in bulk_ECFP4_fps]
df_clean['Tanimoto_Similarity (ECFP4)'] = similarity_efcp4

final_df = df_clean.sort_values(by=['Tanimoto_Similarity (ECFP4)'], ascending=False)
final_df.head(100)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if sys.path[0] == '':


Unnamed: 0,Molecule ChEMBL ID,ki,Smiles,Document Journal,Document Year,ROMol,Tanimoto_Similarity (ECFP4)
417,CHEMBL8946,Ki-'='-0.11-nM,CCc1cc(Cl)c(OC)c(C(=O)NC[C@@H]2CCCN2CC)c1O,J Med Chem,2016,,0.700000
4947,CHEMBL8946,Ki-'='-0.134-nM,CCc1cc(Cl)c(OC)c(C(=O)NC[C@@H]2CCCN2CC)c1O,J. Med. Chem.,2014,,0.700000
3821,CHEMBL8946,Ki-'='-0.16-nM,CCc1cc(Cl)c(OC)c(C(=O)NC[C@@H]2CCCN2CC)c1O,J. Med. Chem.,2003,,0.700000
236,CHEMBL8809,Ki-'='-15.0-nM,CCN1CCC[C@H]1CNC(=O)c1c(O)c(Cl)cc(Cl)c1OC,Bioorg. Med. Chem.,2007,,0.587302
5926,CHEMBL8809,Ki-'='-13.4-nM,CCN1CCC[C@H]1CNC(=O)c1c(O)c(Cl)cc(Cl)c1OC,J Med Chem,2016,,0.587302
...,...,...,...,...,...,...,...
3328,CHEMBL287198,Ki-'='-8.0-nM,COc1c(C(=O)NCCN2CCN(c3cccc(Cl)c3Cl)CC2)cc(Br)c...,Bioorg. Med. Chem. Lett.,1996,,0.267442
3044,CHEMBL169229,Ki-'='-50.12-nM,COc1c(C(=O)NCCN2CCN(c3cccc(Cl)c3C)CC2)cc(Br)c2...,J. Med. Chem.,1997,,0.264368
1229,CHEMBL285816,Ki-'='-132.0-nM,COc1c(C(=O)NCCN2CCN(c3ccc(Cl)cc3)CC2)cc(Br)c2c...,Bioorg. Med. Chem. Lett.,1996,,0.261905
3228,CHEMBL3597642,Ki-'='-0.907-nM,COc1c(Cl)cccc1N1CCN(CCCCNC(=O)c2cc3ccccc3o2)CC1,J. Med. Chem.,2015,,0.261364


In [14]:
df = final_df.head(400)


In [15]:
mg = mols2grid.MolGrid(df, smiles_col="Smiles", size=(220, 180))
mg.display(subset=["Molecule ChEMBL ID", 'Tanimoto_Similarity (ECFP4)', 'ki', "img"], n_cols=7)