# SMILES lookup

Using the SMILES data extracted from Wikipedia by [Ertl et al (2015)](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-015-0061-y), and matching by molecule name to the data extracted from [PsychonautWiki](https://psychonautwiki.org/wiki/PsychonautWiki).

In [1]:
import pandas as pd

print(f"Pandas version {pd.__version__}")

pd.set_option("display.max_rows", 10)

Pandas version 1.2.4


In [2]:
# Load the PsychonautWiki data, treating all columns as strings.
psychonaut = pd.read_csv("psychonaut.csv", converters={i: str for i in range(5)})
psychonaut

Unnamed: 0,name,url,class0,class1,class2
0,"1,4-Butanediol","https://psychonautwiki.org/wiki/1,4-Butanediol",Depressant,,
1,1B-LSD,https://psychonautwiki.org/wiki/1B-LSD,Psychedelics,,
2,1P-LSD,https://psychonautwiki.org/wiki/1P-LSD,Psychedelics,,
3,1cP-LSD,https://psychonautwiki.org/wiki/1cP-LSD,Psychedelics,,
4,2-Aminoindane,https://psychonautwiki.org/wiki/2-Aminoindane,Stimulants,,
...,...,...,...,...,...
248,4-HO-DiPT,https://psychonautwiki.org/wiki/4-HO-DiPT,Psychedelics,,
249,2C-T-21,https://psychonautwiki.org/wiki/2C-T-21,Psychedelics,,
250,2C-T-7,https://psychonautwiki.org/wiki/2C-T-7,Psychedelics,,
251,DOI,https://psychonautwiki.org/wiki/DOI,Psychedelics,,


In [3]:
wikipedia = pd.read_csv("https://www.cheminfo.org/wikipedia/smiles.txt", header=0, names=["name", "smiles"], sep='\t')

# NB: the wikipedia dataset contains multiple SMILES representations for some molecules
# We remove them here.
wikipedia = wikipedia.drop_duplicates(subset=['name'])
wikipedia

Unnamed: 0,name,smiles
0,Aspirin,O=C(C)Oc1ccccc1C(=O)O
1,Acetylene,C#C
2,Adenosine triphosphate,O=P(O)(O)OP(=O)(O)OP(=O)(O)OC[C@H]3O[C@@H](n2c...
3,Ampicillin,CC1(C(N2C(S1)C(C2=O)NC(=O)C(C3=CC=CC=C3)N)C(=O...
4,Chemistry of ascorbic acid,C([C@@H]([C@@H]1C(=C(C(=O)O1)O)O)O)O
...,...,...
18634,Pentaphenylantimony,C1=CC=C(C=C1)[Sb](C2=CC=CC=C2)(C3=CC=CC=C3)(C4...
18635,CUMYL-BC-HPMEGACLONE-221,CC(C)(c1ccccc1)N1C=Cc2n(CC3CC4CCC3C4)c3ccccc3c...
18636,Para-Quaterphenyl,C1=CC=C(C=C1)C2=CC=C(C=C2)C3=CC=C(C=C3)C4=CC=C...
18637,3-Quinuclidinyl thiochromane-4-carboxylate,O=C(OC1CN2CCC1CC2)C1CCSc2ccccc21


# Merge Psychonaut substances with SMILES from WIkipedia

Merge using keys on from the psychonaut (left) dataframe, to match on the "name" column with the Wikipedia SMILES data.

In [4]:
matched = pd.merge(psychonaut, wikipedia, on="name", how="left", indicator=True, validate="one_to_one")
matched

Unnamed: 0,name,url,class0,class1,class2,smiles,_merge
0,"1,4-Butanediol","https://psychonautwiki.org/wiki/1,4-Butanediol",Depressant,,,C(CCO)CO,both
1,1B-LSD,https://psychonautwiki.org/wiki/1B-LSD,Psychedelics,,,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...,both
2,1P-LSD,https://psychonautwiki.org/wiki/1P-LSD,Psychedelics,,,CCN(CC)C(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C(=O)CC)c4...,both
3,1cP-LSD,https://psychonautwiki.org/wiki/1cP-LSD,Psychedelics,,,CCN(CC)C(=O)[C@@H]5C=C2[C@@H](Cc3cn(C(=O)C1CC1...,both
4,2-Aminoindane,https://psychonautwiki.org/wiki/2-Aminoindane,Stimulants,,,C1C(CC2=CC=CC=C21)N,both
...,...,...,...,...,...,...,...
248,4-HO-DiPT,https://psychonautwiki.org/wiki/4-HO-DiPT,Psychedelics,,,CC(C)N(CCc1c[nH]c2cccc(O)c12)C(C)C,both
249,2C-T-21,https://psychonautwiki.org/wiki/2C-T-21,Psychedelics,,,COc1cc(SCCF)c(cc1CCN)OC,both
250,2C-T-7,https://psychonautwiki.org/wiki/2C-T-7,Psychedelics,,,COc1cc(SCCC)c(cc1CCN)OC,both
251,DOI,https://psychonautwiki.org/wiki/DOI,Psychedelics,,,,left_only


# Which substances were not matched by the above process?

These are records we need to manually find SMILES for.

In [5]:
unmatched = matched[matched["_merge"] == "left_only"] 
unmatched

Unnamed: 0,name,url,class0,class1,class2,smiles,_merge
5,2-FA,https://psychonautwiki.org/wiki/2-FA,Stimulants,,,,left_only
6,2-FEA,https://psychonautwiki.org/wiki/2-FEA,Stimulants,Entactogens,,,left_only
7,2-FMA,https://psychonautwiki.org/wiki/2-FMA,Stimulants,,,,left_only
25,2M2B,https://psychonautwiki.org/wiki/2M2B,Depressant,,,,left_only
26,3-FA,https://psychonautwiki.org/wiki/3-FA,Stimulants,,,,left_only
...,...,...,...,...,...,...,...
242,MiPLA,https://psychonautwiki.org/wiki/MIPLA,Psychedelics,,,,left_only
246,MET,https://psychonautwiki.org/wiki/MET,Psychedelics,,,,left_only
247,EPT,https://psychonautwiki.org/wiki/EPT,Psychedelics,,,,left_only
251,DOI,https://psychonautwiki.org/wiki/DOI,Psychedelics,,,,left_only


In [6]:
unmatched.to_csv("intermediate/unmatched.csv", index=False)

# Sanity check: which molecules were matched?

In [7]:
resovled = matched[matched["_merge"] == "both"] 
resovled

Unnamed: 0,name,url,class0,class1,class2,smiles,_merge
0,"1,4-Butanediol","https://psychonautwiki.org/wiki/1,4-Butanediol",Depressant,,,C(CCO)CO,both
1,1B-LSD,https://psychonautwiki.org/wiki/1B-LSD,Psychedelics,,,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...,both
2,1P-LSD,https://psychonautwiki.org/wiki/1P-LSD,Psychedelics,,,CCN(CC)C(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C(=O)CC)c4...,both
3,1cP-LSD,https://psychonautwiki.org/wiki/1cP-LSD,Psychedelics,,,CCN(CC)C(=O)[C@@H]5C=C2[C@@H](Cc3cn(C(=O)C1CC1...,both
4,2-Aminoindane,https://psychonautwiki.org/wiki/2-Aminoindane,Stimulants,,,C1C(CC2=CC=CC=C21)N,both
...,...,...,...,...,...,...,...
244,AL-LAD,https://psychonautwiki.org/wiki/AL-LAD,Psychedelics,,,[H][C@@]12Cc3c[nH]c4cccc(C1=C[C@@H](C(=O)N(CC)...,both
245,1P-ETH-LAD,https://psychonautwiki.org/wiki/1P-ETH-LAD,Psychedelics,,,CCC(=O)N1C=C2C[C@@H]3C(=C[C@H](CN3CC)C(=O)N(CC...,both
248,4-HO-DiPT,https://psychonautwiki.org/wiki/4-HO-DiPT,Psychedelics,,,CC(C)N(CCc1c[nH]c2cccc(O)c12)C(C)C,both
249,2C-T-21,https://psychonautwiki.org/wiki/2C-T-21,Psychedelics,,,COc1cc(SCCF)c(cc1CCN)OC,both


# Locate SMILES for "missing" records

Looking at the "missing.txt" compound names, try to match up with Wikipedia records to find SMILES values.

In [8]:
missing = pd.read_csv("intermediate/missing.txt", header=0, names=["name"], sep="\t")
missing = missing.drop_duplicates(subset=['name'])
missing

Unnamed: 0,name
0,2-Me-DET
1,2-Me-DMT
2,25E-NBOH
3,25E-NBOMe
4,25G-NBOMe
...,...
197,2C-T-15
198,2C-T-16
199,2C-T-17
200,2C-T-18


# Match the missing substances against the Wikipedia dataset

And how many did we find from this process?

In [9]:
matched_missing = pd.merge(missing, wikipedia, on="name", how="left", indicator=True, validate="one_to_one")

In [10]:
is_found = (matched_missing["_merge"] == "both")
found_missing = matched_missing[is_found]
found_missing

Unnamed: 0,name,smiles,_merge
0,2-Me-DET,c1cccc2c1c(c([nH]2)C)CCN(CC)CC,both
2,25E-NBOH,CCC1=CC(=C(C=C1OC)CCNCC2=CC=CC=C2O)OC,both
3,25E-NBOMe,COC(C=CC=C1)=C1CNCCC2=C(OC)C=C(CC)C(OC)=C2,both
4,25G-NBOMe,CC1=C(C=C(C(=C1C)OC)CCNCC2=CC=CC=C2OC)OC,both
5,25H-NBOMe,COC1=CC(=C(C=C1)OC)CCNCC2=CC=CC=C2OC,both
...,...,...,...
189,2C-T-8,COc2cc(SCC1CC1)c(cc2CCN)OC,both
194,2C-T-13,COc1cc(SCCOC)c(cc1CCN)OC,both
196,2C-T-15,COc2cc(SC1CC1)c(cc2CCN)OC,both
197,2C-T-16,COc1cc(SCC=C)c(OC)cc1CCN,both


# Combine "found_missing" records and "matched" records 

And write to CSV

In [11]:
combined = pd.concat([matched, found_missing], axis=0, ignore_index=True)
combined

Unnamed: 0,name,url,class0,class1,class2,smiles,_merge
0,"1,4-Butanediol","https://psychonautwiki.org/wiki/1,4-Butanediol",Depressant,,,C(CCO)CO,both
1,1B-LSD,https://psychonautwiki.org/wiki/1B-LSD,Psychedelics,,,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...,both
2,1P-LSD,https://psychonautwiki.org/wiki/1P-LSD,Psychedelics,,,CCN(CC)C(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C(=O)CC)c4...,both
3,1cP-LSD,https://psychonautwiki.org/wiki/1cP-LSD,Psychedelics,,,CCN(CC)C(=O)[C@@H]5C=C2[C@@H](Cc3cn(C(=O)C1CC1...,both
4,2-Aminoindane,https://psychonautwiki.org/wiki/2-Aminoindane,Stimulants,,,C1C(CC2=CC=CC=C21)N,both
...,...,...,...,...,...,...,...
330,2C-T-8,,,,,COc2cc(SCC1CC1)c(cc2CCN)OC,both
331,2C-T-13,,,,,COc1cc(SCCOC)c(cc1CCN)OC,both
332,2C-T-15,,,,,COc2cc(SC1CC1)c(cc2CCN)OC,both
333,2C-T-16,,,,,COc1cc(SCC=C)c(OC)cc1CCN,both


In [12]:
combined.to_csv("intermediate/combined.csv", index=False)

# Merge in the manually curated records to create a library.csv

In [13]:
manually_found = pd.read_csv("intermediate/manually-found.csv", converters={i: str for i in range(8)})
manually_found

Unnamed: 0,name,url,class0,class1,class2,smiles,wikipedia_url,isomerdesign_url
0,2-FA,https://psychonautwiki.org/wiki/2-FA,Stimulants,,,CC(CC1=CC=CC=C1F)N,https://en.wikipedia.org/wiki/2-Fluoroamphetamine,
1,2-FEA,https://psychonautwiki.org/wiki/2-FEA,Stimulants,Entactogens,,CCNC(Cc1ccccc1F)C,,https://isomerdesign.com/PiHKAL/explore.php?do...
2,2-FMA,https://psychonautwiki.org/wiki/2-FMA,Stimulants,,,CC(CC1=CC=CC=C1F)NC,https://en.wikipedia.org/wiki/2-Fluoromethamph...,
3,2M2B,https://psychonautwiki.org/wiki/2M2B,Depressant,,,CCC(C)(C)O,https://en.wikipedia.org/wiki/Tert-Amyl_alcohol,
4,3-FA,https://psychonautwiki.org/wiki/3-FA,Stimulants,,,Fc1cccc(c1)CC(C)N,https://en.wikipedia.org/wiki/3-Fluoroamphetamine,
...,...,...,...,...,...,...,...,...
68,MiPLA,https://psychonautwiki.org/wiki/MIPLA,Psychedelics,,,C4N(C)C1Cc2c[nH]c(ccc3)c2c3C1=CC4C(=O)N(C)C(C)C,https://en.wikipedia.org/wiki/Methylisopropyll...,
69,MET,https://psychonautwiki.org/wiki/MET,Psychedelics,,,c1cccc2c1c(c[nH]2)CCN(CC)C,https://en.wikipedia.org/wiki/N-Methyl-N-ethyl...,
70,EPT,https://psychonautwiki.org/wiki/EPT,Psychedelics,,,CCCN(CC)CCC1=CNC2=CC=CC=C12,https://en.wikipedia.org/wiki/Ethylpropyltrypt...,
71,DOI,https://psychonautwiki.org/wiki/DOI,Psychedelics,,,IC(C=C1OC)=C(OC)C=C1CC(C)N,"https://en.wikipedia.org/wiki/2,5-Dimethoxy-4-...",


In [14]:
library = pd.concat([combined, manually_found], axis=0, ignore_index=True)
library

Unnamed: 0,name,url,class0,class1,class2,smiles,_merge,wikipedia_url,isomerdesign_url
0,"1,4-Butanediol","https://psychonautwiki.org/wiki/1,4-Butanediol",Depressant,,,C(CCO)CO,both,,
1,1B-LSD,https://psychonautwiki.org/wiki/1B-LSD,Psychedelics,,,CN1[C@](C2=C[C@@H](C(N(CC)CC)=O)C1)([H])CC3=CN...,both,,
2,1P-LSD,https://psychonautwiki.org/wiki/1P-LSD,Psychedelics,,,CCN(CC)C(=O)[C@H]1CN(C)[C@@H]2Cc3cn(C(=O)CC)c4...,both,,
3,1cP-LSD,https://psychonautwiki.org/wiki/1cP-LSD,Psychedelics,,,CCN(CC)C(=O)[C@@H]5C=C2[C@@H](Cc3cn(C(=O)C1CC1...,both,,
4,2-Aminoindane,https://psychonautwiki.org/wiki/2-Aminoindane,Stimulants,,,C1C(CC2=CC=CC=C21)N,both,,
...,...,...,...,...,...,...,...,...,...
403,MiPLA,https://psychonautwiki.org/wiki/MIPLA,Psychedelics,,,C4N(C)C1Cc2c[nH]c(ccc3)c2c3C1=CC4C(=O)N(C)C(C)C,,https://en.wikipedia.org/wiki/Methylisopropyll...,
404,MET,https://psychonautwiki.org/wiki/MET,Psychedelics,,,c1cccc2c1c(c[nH]2)CCN(CC)C,,https://en.wikipedia.org/wiki/N-Methyl-N-ethyl...,
405,EPT,https://psychonautwiki.org/wiki/EPT,Psychedelics,,,CCCN(CC)CCC1=CNC2=CC=CC=C12,,https://en.wikipedia.org/wiki/Ethylpropyltrypt...,
406,DOI,https://psychonautwiki.org/wiki/DOI,Psychedelics,,,IC(C=C1OC)=C(OC)C=C1CC(C)N,,"https://en.wikipedia.org/wiki/2,5-Dimethoxy-4-...",


In [15]:
library.to_csv("library.csv", index=False)

# Count of class frequency

Combining class0, 1, and 2 into a single class column to see how popular each class is in in the combined data.

In [16]:
classes = pd.melt(library, id_vars='name', var_name="original_class_column", value_vars=["class0", "class1", "class2"], value_name='class')
classes["class"].value_counts()

                 619
Psychedelics     104
Stimulants        82
Entactogens       36
Depressant        36
                ... 
Hallucinogens      4
Deliriant          3
Oneirogen          2
Eugeroic           1
Sedative           1
Name: class, Length: 15, dtype: int64

# Which compounds are classed as Psychedelic?

In [17]:
is_psychedeics = (classes["class"] == "Psychedelics")
classes[is_psychedeics]

Unnamed: 0,name,original_class_column,class
1,1B-LSD,class0,Psychedelics
2,1P-LSD,class0,Psychedelics
3,1cP-LSD,class0,Psychedelics
9,25B-NBOH,class0,Psychedelics
10,25C-NBOH,class0,Psychedelics
...,...,...,...
618,TMA-2,class1,Psychedelics
619,TMA-6,class1,Psychedelics
782,MDA,class1,Psychedelics
801,TMA-2,class1,Psychedelics


In [18]:
# Display all the psychedelics
pd.set_option("display.max_rows", None)
classes[is_psychedeics]

Unnamed: 0,name,original_class_column,class
1,1B-LSD,class0,Psychedelics
2,1P-LSD,class0,Psychedelics
3,1cP-LSD,class0,Psychedelics
9,25B-NBOH,class0,Psychedelics
10,25C-NBOH,class0,Psychedelics
11,25C-NBOMe,class0,Psychedelics
12,25D-NBOMe,class0,Psychedelics
13,25I-NBOH,class0,Psychedelics
14,25I-NBOMe,class0,Psychedelics
15,25N-NBOMe,class0,Psychedelics
