#### This script calculates Morgan-fingerprint similarity features.
#### Test compounds are compared to themselves and to a number of other known odorants.

In [1]:
import pandas as pd
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
from rdkit.ML.Descriptors import MoleculeDescriptors


In [2]:
# Get train, leaderboard and test CIDs: the file is read line by line and
# every line is converted to an integer compound ID.
with open('CIDs.txt') as f:
    # Skip whitespace-only lines (e.g. a stray empty line at the end of the
    # file) instead of letting int() raise ValueError on them.
    CIDs = [int(line) for line in f if line.strip()]

# Get SMILES: load the pre-built table; the first CSV column becomes the
# DataFrame index that calulate_similarities() looks CIDs up in.
smiles = pd.read_csv('all_smiles.csv', index_col=0)


In [3]:
def _morgan_fingerprint(smi, radius):
    """Return the Morgan fingerprint of a single SMILES string.

    Raises:
        ValueError: if RDKit cannot parse ``smi`` (``MolFromSmiles`` returns
            ``None`` in that case), so the offending string is reported
            instead of an opaque argument error from the fingerprint call.
    """
    mol = Chem.MolFromSmiles(smi)
    if mol is None:
        raise ValueError('could not parse SMILES: %r' % (smi,))
    return AllChem.GetMorganFingerprint(mol, radius)


def calulate_similarities(ids, radius):
    """Calculate Dice similarity features from Morgan fingerprints.

    Each compound in ``ids`` is compared against every molecule listed in the
    module-level ``smiles`` DataFrame (its ``smiles`` column, indexed by
    compound ID).

    Args:
        ids: iterable of compound IDs; each must be a label in ``smiles.index``.
        radius: Morgan fingerprint radius.

    Returns:
        pandas.DataFrame with one row per entry of ``ids`` and one column per
        entry of ``smiles.index``. The row index is named ``0``, as in the
        previous implementation, so the CSV header written from the result
        stays the same. An empty ``ids`` yields an empty frame with the full
        set of columns.

    Raises:
        KeyError: if an ID is not present in ``smiles.index``.
        ValueError: if a SMILES string cannot be parsed by RDKit.
    """
    ids = list(ids)  # allow any iterable; it is used twice below

    # Fingerprints of every reference molecule, computed once up front.
    reference_fps = [_morgan_fingerprint(smi, radius) for smi in smiles.smiles]

    rows = []
    for cid in ids:
        sample_fp = _morgan_fingerprint(smiles.loc[cid].smiles, radius)
        # One bulk call compares the sample against all reference fingerprints.
        rows.append(DataStructs.BulkDiceSimilarity(sample_fp, reference_fps))

    # Building the frame with explicit labels also works when ``rows`` is
    # empty, where DataFrame([]).set_index(0) used to raise KeyError.
    return pd.DataFrame(rows, index=pd.Index(ids, name=0), columns=smiles.index)


In [4]:
# Build the similarity feature table for all loaded CIDs (Morgan radius 5).
features_sim = calulate_similarities(ids=CIDs, radius=5)

# Show the table dimensions as (rows, columns).
print(features_sim.shape)

(476, 2437)


In [5]:
# Preview the first five rows of the similarity table.
features_sim.head(n=5)

CID,58,102,107,125,126,174,176,177,178,179,...,91305518,91411526,91541756,91552833,91563027,91595028,91614181,91617014,91617930,91618238
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
126,0.086957,0.37931,0.268657,0.586207,1.0,0.052632,0.108108,0.171429,0.054054,0.095238,...,0.003578,0.018182,0.118421,0.017192,0.145251,0.013652,0.056206,0.047945,0.01173,0.105263
176,0.48,0.054054,0.217391,0.054054,0.108108,0.117647,1.0,0.285714,0.625,0.571429,...,0.011152,0.01432,0.091603,0.030488,0.037975,0.009695,0.024631,0.0369,0.015129,0.064516
177,0.173913,0.0,0.045455,0.0,0.171429,0.0,0.285714,1.0,0.285714,0.210526,...,0.003731,0.009592,0.031008,0.018405,0.025641,0.002774,0.009901,0.01487,0.00607,0.043956
180,0.32,0.0,0.130435,0.0,0.054054,0.0,0.625,0.285714,0.625,0.571429,...,0.011152,0.01432,0.061069,0.030488,0.025316,0.009695,0.024631,0.04428,0.015129,0.043011
196,0.375,0.1,0.289855,0.1,0.066667,0.2,0.25641,0.054054,0.153846,0.181818,...,0.032086,0.027149,0.116883,0.096866,0.044199,0.02863,0.055944,0.07483,0.046784,0.103448


In [11]:
# Persist the similarity features as CSV for later use.
features_sim.to_csv(path_or_buf='morgan_sim.csv')
