# Create data set with PubChem2D fingerprint

In [1]:
import pubchempy as pcp
import pandas as pd
import numpy as np

In [10]:
def _read_without_first_column(csv_path):
    """Load a CSV file and return it with its first column removed."""
    table = pd.read_csv(csv_path)
    return table.drop(columns=table.columns[0])


# Ecotoxicological records
final_db = pd.read_csv('data/processed/final_db_processed.csv')

# CAS -> SMILES and CAS -> PubChem CID lookup tables
cas_to_smiles = _read_without_first_column('data/processed/cas_to_smiles.csv')
cas_to_pubchemcid = _read_without_first_column('data/processed/cas_to_pubchemcid.csv')

# -1 stands in for a missing CID so that the column can be cast to int below
cas_to_pubchemcid['cid'] = cas_to_pubchemcid['cid'].fillna(-1)

# Join both lookups on the CAS number, then order the columns as cas / cid / smiles
data = (
    pd.merge(cas_to_smiles, cas_to_pubchemcid, on='cas')
    .astype({'cid': int})
    .loc[:, ['cas', 'cid', 'smiles']]
)
data.head()

Unnamed: 0,cas,cid,smiles
0,10108-64-2,24947,[Cl-].[Cl-].[Cd++]
1,88-30-2,6931,Oc1ccc(c(c1)C(F)(F)F)[N+]([O-])=O
2,1397-94-0,14957,CCCCCC[C@@H]1[C@@H](OC(=O)CC(C)C)[C@H](C)OC(=O...
3,540-72-7,516871,[Na+].[S-]C#N
4,72-43-5,4115,COc1ccc(cc1)C(c2ccc(OC)cc2)C(Cl)(Cl)Cl


Get the PubChem2D fingerprint of each compound from its SMILES; if the SMILES lookup fails, fall back to the CID (when one is available).

In [68]:
def get_finger(x, out_path='data/processed/cas_pubchemfinger.csv'):
    """Look up the PubChem fingerprint of one compound and append it to a CSV.

    The fingerprint is first requested by SMILES. If that lookup raises, the
    CID is used instead. When the SMILES lookup fails and the CID is missing
    (-1 or NaN), the string 'NaN' is used as the result.

    Parameters
    ----------
    x : sequence or pandas.Series
        One compound given positionally as (cas, cid, smiles).
    out_path : str, optional
        CSV file to which the line "<cas>,<fingerprint>" is appended.

    Returns
    -------
    The ``cactvs_fingerprint`` attribute of the pubchempy compound, or the
    string 'NaN' when no lookup was possible.

    Raises
    ------
    Exception
        Any error raised by the CID lookup propagates to the caller; nothing
        is written to ``out_path`` for that row.
    """
    # Unpack by position: using integer keys as positions on a label-indexed
    # Series is deprecated in pandas, and this also accepts plain tuples/lists.
    cas, cid, smiles = tuple(x)[:3]

    try:
        # try to get fingerprint from the smiles (safer)
        fingerprint = pcp.get_compounds(smiles, 'smiles')[0].cactvs_fingerprint
    except Exception:
        # `Exception` (not a bare except) so Ctrl-C can still stop a long run
        if pd.isna(cid) or cid == -1:
            # if the CID is missing, return NaN
            fingerprint = 'NaN'
        else:
            # int() turns any numpy integer width into a plain Python int
            # without modifying the caller's row
            fingerprint = pcp.Compound.from_cid(int(cid)).cactvs_fingerprint

    # Append immediately so results already fetched survive a later failure
    with open(out_path, 'a') as fd:
        fd.write('{},{}\n'.format(cas, fingerprint))
    return fingerprint

# Look up the fingerprint of every row; get_finger also appends each result to the CSV file
c = data.apply(get_finger, axis=1)

Merge the fingerprints that the cell above wrote to `data/processed/cas_pubchemfinger.csv` into the ecotoxicological data. Fingerprint rows that contain an NA value are dropped before the merge.

In [44]:
# Read the fingerprint column as text so that a digits-only fingerprint is never
# converted to a number by type inference (missing values are still parsed as NaN).
cas_to_finger = pd.read_csv(
    'data/processed/cas_pubchemfinger.csv',
    names=['test_cas', 'fingerprint'],
    dtype={'fingerprint': str},
)
# drop duplicates (there are still different CAS with the same fingerprint, which is ok)
# and drop the rows for which no fingerprint could be retrieved
cas_to_finger = cas_to_finger.drop_duplicates().dropna()

# Inner join: only records whose CAS number has a fingerprint are kept
final_db_update = pd.merge(final_db, cas_to_finger, on='test_cas')

# Remove the CAS identifier and the descriptor columns listed below from the export
final_db_update = final_db_update.drop(columns=[
    'test_cas', 'atom_number', 'alone_atom_number', 'tripleBond', 'doubleBond',
    'bonds_number', 'ring_number', 'Mol', 'MorganDensity', 'LogP',
])
final_db_update.to_csv("data/processed/final_db_update.csv", index=False)