In [1]:
import pubchempy as pcp
import pandas as pd

# 5. Properties
## - Retrieval of specific properties using get_properties() without having to deal with entire compound records.
## - This is especially useful for retrieving the properties of a large number of compounds at once.

In [2]:
# Superstructure search on the SMILES string 'CC', requesting only the
# IsomericSMILES property for each hit instead of full compound records.
p = pcp.get_properties(
    properties='IsomericSMILES',
    identifier='CC',
    namespace='smiles',
    searchtype='superstructure',
)
p

[{'CID': 297, 'IsomericSMILES': 'C'},
 {'CID': 783, 'IsomericSMILES': '[HH]'},
 {'CID': 5462310, 'IsomericSMILES': '[C]'},
 {'CID': 6324, 'IsomericSMILES': 'CC'},
 {'CID': 24824, 'IsomericSMILES': '[3H][3H]'},
 {'CID': 24523, 'IsomericSMILES': '[2H][2H]'},
 {'CID': 26873, 'IsomericSMILES': '[14CH4]'},
 {'CID': 167583, 'IsomericSMILES': '[2HH]'},
 {'CID': 137127, 'IsomericSMILES': '[2H]C([2H])([2H])C([2H])([2H])[2H]'},
 {'CID': 123070, 'IsomericSMILES': '[2H]C([2H])([2H])[2H]'},
 {'CID': 166653, 'IsomericSMILES': '[H-]'},
 {'CID': 15244547, 'IsomericSMILES': '[2H]C([2H])C([2H])[2H]'},
 {'CID': 12053198, 'IsomericSMILES': '[2H]C([2H])C([2H])([2H])[2H]'},
 {'CID': 10953798, 'IsomericSMILES': '[2H]C([2H])[2H]'},
 {'CID': 10866248, 'IsomericSMILES': '[2H]C[2H]'},
 {'CID': 5460631, 'IsomericSMILES': '[1HH]'},
 {'CID': 5362549, 'IsomericSMILES': '[H]'},
 {'CID': 3034819, 'IsomericSMILES': '[CH3]'},
 {'CID': 644094, 'IsomericSMILES': '[CH3+]'},
 {'CID': 138525, 'IsomericSMILES': '[2H]CC'},
 {'

In [7]:
# Inspect the first record of the returned property list.
p[0]

{'CID': 297, 'IsomericSMILES': 'C'}

Multiple properties may be specified in a list, or in a comma-separated string. The available properties are: MolecularFormula, MolecularWeight, CanonicalSMILES, IsomericSMILES, InChI, InChIKey, IUPACName, XLogP, ExactMass, MonoisotopicMass, TPSA, Complexity, Charge, HBondDonorCount, HBondAcceptorCount, RotatableBondCount, HeavyAtomCount, IsotopeAtomCount, AtomStereoCount, DefinedAtomStereoCount, UndefinedAtomStereoCount, BondStereoCount, DefinedBondStereoCount, UndefinedBondStereoCount, CovalentUnitCount, Volume3D, XStericQuadrupole3D, YStericQuadrupole3D, ZStericQuadrupole3D, FeatureCount3D, FeatureAcceptorCount3D, FeatureDonorCount3D, FeatureAnionCount3D, FeatureCationCount3D, FeatureRingCount3D, FeatureHydrophobeCount3D, ConformerModelRMSD3D, EffectiveRotorCount3D, ConformerCount3D.

## Synonyms
### - Get a list of synonyms for a given input using the 'get_synonyms' function:

In [3]:
# Look up synonyms for the name 'aspirin' twice: `a` with the default domain,
# `b` restricted to the 'substance' domain.
a = pcp.get_synonyms(identifier='aspirin', namespace='name')
b = pcp.get_synonyms(identifier='aspirin', namespace='name', domain='substance')

In [6]:
# Show the synonym records returned for 'aspirin' (one dict per matching CID).
print(a)

[{'CID': 2244, 'Synonym': ['aspirin', 'ACETYLSALICYLIC ACID', '50-78-2', '2-Acetoxybenzoic acid', '2-(Acetyloxy)benzoic acid', 'O-Acetylsalicylic acid', 'Acetylsalicylate', 'o-Acetoxybenzoic acid', 'Acylpyrin', 'Easprin', 'Ecotrin', 'Salicylic acid acetate', 'Acenterine', 'Acetophen', 'Polopiryna', 'Acetosal', 'Colfarit', 'o-Carboxyphenyl acetate', 'Acidum acetylsalicylicum', 'Enterosarein', 'Aceticyl', 'Acetonyl', 'Acetosalin', 'Acetylin', 'Aspergum', 'Aspirdrops', 'Benaspir', 'Measurin', 'Micristin', 'Pharmacin', 'Premaspin', 'Salcetogen', 'Temperal', 'Ecolen', 'Empirin', 'Endydol', 'Rhodine', 'Saletin', 'Rheumintabletten', 'Solprin acid', '2-acetyloxybenzoic acid', 'Benzoic acid, 2-(acetyloxy)-', 'Acetisal', 'Acetylsal', 'Aspirine', 'Bialpirina', 'Bialpirinia', 'Claradin', 'Clariprin', 'Entericin', 'Enterophen', 'Enterosarine', 'Globentyl', 'Neuronika', 'Salacetin', 'Solpyron', 'Acesal', 'Acisal', 'Asagran', 'Asteric', 'Cemirit', 'Decaten', 'Duramax', 'Extren', 'Globoid', 'Helicon',

Inputs that match more than one SID/CID will have multiple, separate synonym lists returned.

## Identifiers
### - pcp.get_cids
### - pcp.get_sids
### - pcp.get_aids

For example, passing a CID to get_sids will return a list of SIDs corresponding to the Substance records that were standardised and merged to produce the given Compound.

# 8. pandas integration

## Usage
### - Pass as_dataframe=True to get_compounds(), get_substances(), or get_properties() to return a pandas DataFrame

In [2]:
# Ask each query function for a pandas DataFrame directly via as_dataframe=True.
df1 = pcp.get_compounds(
    identifier='C20H41Br',
    namespace='formula',
    as_dataframe=True,
)
df2 = pcp.get_substances(identifier=[1, 2, 3, 4], as_dataframe=True)
df3 = pcp.get_properties(
    properties=['isomeric_smiles', 'xlogp', 'rotatable_bond_count'],
    identifier='C20H41Br',
    namespace='formula',
    as_dataframe=True,
)

In [3]:
# Preview the first rows of the compounds DataFrame.
df1.head()

Unnamed: 0_level_0,atom_stereo_count,atoms,bond_stereo_count,bonds,cactvs_fingerprint,canonical_smiles,charge,complexity,conformer_id_3d,conformer_rmsd_3d,...,pharmacophore_features_3d,record,rotatable_bond_count,shape_fingerprint_3d,shape_selfoverlap_3d,tpsa,undefined_atom_stereo_count,undefined_bond_stereo_count,volume_3d,xlogp
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
20271,0,"[{'aid': 1, 'number': 35, 'element': 'Br', 'x'...",0,"[{'aid1': 1, 'aid2': 20, 'order': 1}, {'aid1':...",1111000001111000000000000000000000000000000100...,CCCCCCCCCCCCCCCCCCCCBr,0,167,,,...,,"{'id': {'id': {'cid': 20271}}, 'atoms': {'aid'...",18,,,0,0,0,,11.4
23148745,1,"[{'aid': 1, 'number': 35, 'element': 'Br', 'x'...",0,"[{'aid1': 1, 'aid2': 10, 'order': 1}, {'aid1':...",1111000001111000000000000000000000000000000100...,CCCCCCCCCCC(CCCCCCCC)CBr,0,179,,,...,,"{'id': {'id': {'cid': 23148745}}, 'atoms': {'a...",17,,,0,1,0,,10.6
10808570,3,"[{'aid': 1, 'number': 35, 'element': 'Br', 'x'...",0,"[{'aid1': 1, 'aid2': 21, 'order': 1}, {'aid1':...",1111000001111000000000000000000000000000000100...,CC(C)CCCC(C)CCCC(C)CCCC(C)CCBr,0,212,,,...,,"{'id': {'id': {'cid': 10808570}}, 'atoms': {'a...",14,,,0,3,0,,9.9
154322467,2,"[{'aid': 1, 'number': 35, 'element': 'Br', 'x'...",0,"[{'aid1': 1, 'aid2': 12, 'order': 1}, {'aid1':...",1111000001111000000000000000000000000000000100...,CCCCCCCCCCCCC(CC)CC(CC)CBr,0,190,,,...,,"{'id': {'id': {'cid': 154322467}}, 'atoms': {'...",16,,,0,2,0,,10.3
154285858,1,"[{'aid': 1, 'number': 35, 'element': 'Br', 'x'...",0,"[{'aid1': 1, 'aid2': 2, 'order': 1, 'style': 3...",1111000001111000000000000000000000000000000100...,CCCCCCCCCCCCC(CCC)(C(C)(C)C)Br,0,229,,,...,,"{'id': {'id': {'cid': 154285858}}, 'atoms': {'...",14,,,0,1,0,,10.0


In [4]:
# Preview the first rows of the substances DataFrame.
df2.head()

Unnamed: 0_level_0,source_id,source_name,standardized_cid,synonyms
sid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,MOLI000002,MOLI,135398523.0,[MOLI000002]
2,MOLI000003,MOLI,,[MOLI000003]
3,MOLI000005,MOLI,449639.0,[MOLI000005]
4,MOLI000006,MOLI,449640.0,[MOLI000006]


In [5]:
# Preview the first rows of the properties DataFrame.
df3.head()

Unnamed: 0_level_0,IsomericSMILES,XLogP,RotatableBondCount
CID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20271,CCCCCCCCCCCCCCCCCCCCBr,11.4,18
23148745,CCCCCCCCCCC(CCCCCCCC)CBr,10.6,17
10808570,CC(C)CCCC(C)CCCC(C)CCCC(C)CCBr,9.9,14
154322467,CCCCCCCCCCCCC(CC)CC(CC)CBr,10.3,16
154285858,CCCCCCCCCCCCC(CCC)(C(C)(C)C)Br,10.0,14


### An existing list of Compound objects can be converted into a dataframe, optionally specifying the desired columns

In [6]:
# Fetch Compound objects (not a DataFrame) for the formula C20H41Br and show
# the first one.
cs = pcp.get_compounds(identifier='C20H41Br', namespace='formula')
cs[0]

Compound(20271)

In [7]:
# Show the first ten Compound objects from the formula search.
cs[:10]

[Compound(20271),
 Compound(23148745),
 Compound(10808570),
 Compound(154322467),
 Compound(154285858),
 Compound(154147689),
 Compound(153970566),
 Compound(153855828),
 Compound(153796878),
 Compound(141727929)]

In [8]:
# Print (isomeric SMILES, XLogP, rotatable-bond count) for the first ten hits.
# Iterating over a slice instead of indexing with range(10) avoids an
# IndexError when the formula search returns fewer than ten compounds.
for compound in cs[:10]:
    cc = compound.isomeric_smiles, compound.xlogp, compound.rotatable_bond_count
    print(cc)

('CCCCCCCCCCCCCCCCCCCCBr', 11.4, 18)
('CCCCCCCCCCC(CCCCCCCC)CBr', 10.6, 17)
('CC(C)CCCC(C)CCCC(C)CCCC(C)CCBr', 9.9, 14)
('CCCCCCCCCCCCC(CC)CC(CC)CBr', 10.3, 16)
('CCCCCCCCCCCCC(CCC)(C(C)(C)C)Br', 10, 14)
('CCCCCCCCC(CC(C)CC)C(CCCCC)Br', 10, 15)
('CCCCC(CC)(CCCC)C(CCCC)(CCCC)Br', 9.4, 14)
('CCCCCCCCCCCCCCCCC(CCC)Br', 10.6, 17)
('CCCCCCCCCCC(CCCCCCCCC)Br', 10.6, 17)
('CCCCCCCCCCCCC(CCCC)CCCBr', 10.6, 17)


In [9]:
# Convert the existing list of Compound objects into a DataFrame, keeping only
# the selected properties as columns.
selected_properties = ['isomeric_smiles', 'xlogp', 'rotatable_bond_count']
df4 = pcp.compounds_to_frame(cs, properties=selected_properties)
df4.head()

Unnamed: 0_level_0,isomeric_smiles,xlogp,rotatable_bond_count
cid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
20271,CCCCCCCCCCCCCCCCCCCCBr,11.4,18
23148745,CCCCCCCCCCC(CCCCCCCC)CBr,10.6,17
10808570,CC(C)CCCC(C)CCCC(C)CCCC(C)CCBr,9.9,14
154322467,CCCCCCCCCCCCC(CC)CC(CC)CBr,10.3,16
154285858,CCCCCCCCCCCCC(CCC)(C(C)(C)C)Br,10.0,14


# 9. Download
## - The following formats are available: XML, ASNT/B, JSON, SDF, CSV, PNG, TXT.
## - SDF and PNG are only available for full Compound and Substance records
## - CSV is best suited to tables of properties and identifiers.

In [10]:
# Download a PNG depiction of Aspirin (looked up by name) and a CSV of SMILES
# properties for CIDs 1-3 into the working directory.
# overwrite=True keeps this cell re-runnable: by default pcp.download raises
# IOError when the target file already exists, which breaks Restart & Run All
# after the first execution. Note that existing asp.png / s.csv are replaced.
pcp.download('PNG', 'asp.png', 'Aspirin', 'name', overwrite=True)
pcp.download(
    'CSV',
    's.csv',
    [1, 2, 3],
    operation='property/CanonicalSMILES,IsomericSMILES',
    overwrite=True,
)

In [None]:
# Shell escape: convert Pubchempy_train03.ipynb to HTML with nbconvert.
# NOTE(review): presumably this is the notebook's own filename — confirm before running.
!jupyter nbconvert --to html Pubchempy_train03.ipynb