In [None]:
import sys

sys.path.append("..")
import itertools

import pandas as pd
from rdkit import DataStructs
from rdkit import Chem
from rdkit.Chem.rdMolDescriptors import GetMorganFingerprintAsBitVect
from rdkit.Chem import PandasTools
from rdkit.Chem import AllChem
from rdkit.Chem import DataStructs
import numpy as np
from rdkit.Chem import MACCSkeys

# Read only the first 10,000 rows of the ChEMBL SMILES export; later cells
# use its 'chembl_id' and 'canonical_smiles' columns.
dataset = pd.read_csv(
    'chembl_smiles.csv',
    nrows=10000,
    low_memory=False,
)
# Preview the first five rows as the cell output.
dataset.head(5)

In [None]:

# Compute ECFP4 (Morgan radius 2, 2048 bits) and MACCS fingerprints for every
# molecule in `dataset` and export them to 'chembl_test.csv', with each
# fingerprint stored as a comma-separated string of '0'/'1' bits.
BATCH_SIZE = 5000  # a progress line is printed every BATCH_SIZE input rows

records = []      # one dict per molecule that was fingerprinted successfully
skipped_ids = []  # chembl_ids whose SMILES was missing or could not be parsed

id_smiles_pairs = zip(dataset['chembl_id'], dataset['canonical_smiles'])
for position, (chembl_id, smiles) in enumerate(id_smiles_pairs, start=1):
    # Missing values arrive as NaN floats, and MolFromSmiles returns None for
    # SMILES it cannot parse. Both cases are handled per row, so one bad
    # record no longer aborts the whole run.
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        skipped_ids.append(chembl_id)
    else:
        ecfp4_bits = AllChem.GetMorganFingerprintAsBitVect(
            mol, 2, nBits=2048).ToBitString()
        maccs_bits = MACCSkeys.GenMACCSKeys(mol).ToBitString()
        records.append({
            'chembl_id': chembl_id,
            'SMILES': smiles,
            # Joining the bit string character-by-character yields "0,1,0,...".
            'MACCS': ','.join(maccs_bits),
            'ECFP4': ','.join(ecfp4_bits),
        })
    if position % BATCH_SIZE == 0:
        print("Batch : ", position // BATCH_SIZE)

if skipped_ids:
    print("Skipped (missing or unparsable SMILES):", skipped_ids)

# Build the frame once from the collected records (row-by-row appends are
# quadratic and DataFrame.append no longer exists in pandas 2.x). Passing the
# columns explicitly keeps the CSV header intact even when no row succeeded.
dat = pd.DataFrame(records, columns=['chembl_id', 'SMILES', 'MACCS', 'ECFP4'])
dat.to_csv('chembl_test.csv', sep=',', index=False)

In [2]:
from neo4j import GraphDatabase


class Neo4jConnection:
    """Small convenience wrapper around a Neo4j driver.

    The driver is created eagerly in the constructor. Queries are executed in
    a short-lived session that is always closed, and failures are reported on
    stdout instead of being raised. The object can also be used as a context
    manager, in which case the driver is closed when the ``with`` block exits.
    """

    def __init__(self, uri, user, pwd):
        """Create the underlying driver.

        Args:
            uri: Connection URI handed to ``GraphDatabase.driver``.
            user: User name used for authentication.
            pwd: Password used for authentication.

        If the driver cannot be created the error is printed and the driver
        stays ``None``; a later ``query`` call then fails its assertion.
        """
        self.__uri = uri
        self.__user = user
        self.__pwd = pwd
        self.__driver = None
        try:
            self.__driver = GraphDatabase.driver(self.__uri,
                                                 auth=(self.__user,
                                                       self.__pwd))
        except Exception as e:
            print("Failed to create the driver:", e)

    def __enter__(self):
        """Return the connection itself so it can be used in a ``with`` block."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on leaving a ``with`` block; exceptions propagate."""
        self.close()

    def close(self):
        """Close the driver if one was successfully created."""
        if self.__driver is not None:
            self.__driver.close()

    def query(self, query, db=None, parameters=None):
        """Run a query and return all of its records.

        Args:
            query: Query text to execute.
            db: Optional database name; when ``None`` the session is opened
                without a ``database`` argument.
            parameters: Optional mapping of query parameters, forwarded to
                ``session.run``. Prefer this over formatting values into the
                query text.

        Returns:
            A list with every record produced by the query, or ``None`` when
            opening the session or running the query raised (the error is
            printed, not re-raised).

        Raises:
            AssertionError: If the driver was never initialised.
        """
        assert self.__driver is not None, "Driver not initialized!"
        session = None
        response = None
        try:
            if db is not None:
                session = self.__driver.session(database=db)
            else:
                session = self.__driver.session()
            # Materialise the result before the session is closed.
            response = list(session.run(query, parameters))
        except Exception as e:
            print("Query failed:", e)
        finally:
            if session is not None:
                session.close()
        return response

In [3]:
import os

# Connection settings come from the environment so real credentials do not
# have to live in the notebook. The fallbacks reproduce the previous local
# development values; set NEO4J_URI / NEO4J_USER / NEO4J_PASSWORD to override.
gph_conn = Neo4jConnection(
    uri=os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    user=os.environ.get("NEO4J_USER", "neo4j"),
    pwd=os.environ.get("NEO4J_PASSWORD", "pass1"),
)

In [None]:
# Remove every existing Mol node (and its relationships) so the import below
# does not create duplicates when the cell is re-run.
delete_mols_cypher = """MATCH (n:Mol)
DETACH DELETE n"""
gph_conn.query(delete_mols_cypher)

# Create one Mol node per CSV row; the comma-separated fingerprint strings
# are split and converted into lists of floats.
load_mols_cypher = """
               // USING PERIODIC COMMIT 5000
               LOAD CSV WITH HEADERS FROM 'file:///D:/Github/neo4j_book/chembl_test.csv' AS row
               CREATE (m:Mol {CHEMBL_ID: row.chembl_id, SMILES: row.SMILES, ECFP4:[s IN split(row.ECFP4,",") | toFloat(s)],
               MACCS:[m IN split(row.MACCS,",") | toFloat(m)]
               }
               )
               """
gph_conn.query(load_mols_cypher)

In [5]:
import pandas as pd
# Rank every Mol node by cosine similarity to CHEMBL6254, computed separately
# on the ECFP4 and MACCS fingerprints and ordered by the ECFP4 score.
results = gph_conn.query("""MATCH (n1:Mol {CHEMBL_ID: 'CHEMBL6254'})
                            MATCH (n2:Mol)
                            RETURN n2.CHEMBL_ID,gds.similarity.cosine(n1.ECFP4,n2.ECFP4) AS ECFP4,
                            gds.similarity.cosine(n1.MACCS,n2.MACCS) AS MACCS order by ECFP4 desc ;
                                           """)
# query() returns None when the query failed, so fall back to no rows. The
# columns are named at construction time because set_axis(..., inplace=...)
# is rejected by pandas 2.x, and renaming an empty frame would fail anyway.
rows = [record.values() for record in (results or [])]
result = pd.DataFrame(rows, columns=['CHEMBL_ID', 'ECFP4', 'MACCS'])
result.head(10)

Unnamed: 0,CHEMBL_ID,ECFP4,MACCS
0,CHEMBL6254,1.0,1.0
1,CHEMBL6240,0.904762,1.0
2,CHEMBL6363,0.833333,0.948908
3,CHEMBL268365,0.820347,0.959166
4,CHEMBL6237,0.811754,0.959166
5,CHEMBL6222,0.809524,1.0
6,CHEMBL6329,0.782074,0.990148
7,CHEMBL267864,0.776524,0.990148
8,CHEMBL6214,0.765254,0.944911
9,CHEMBL266457,0.75724,0.863868
