In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and echo the full path of every file found
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/leash-BELKA/sample_submission.csv
/kaggle/input/leash-BELKA/train.parquet
/kaggle/input/leash-BELKA/test.parquet
/kaggle/input/leash-BELKA/train.csv
/kaggle/input/leash-BELKA/test.csv


In [5]:
# Install Intel MKL (from the `intel` conda channel) and the gfortran compiler
# NOTE(review): presumably these are needed to compile qml's extensions — confirm
!conda install -y -c intel mkl mkl-devel mkl-static mkl-include
!apt-get install -y gfortran
# Point MKLROOT at the conda lib directory (presumably read by the qml build — confirm)
%env MKLROOT=/opt/conda/lib
# qml is installed from the `develop` branch of its GitHub repository
!pip3 install git+https://github.com/qmlcode/qml@develop --user -U
# duckdb is used below to query the parquet files; molSimplify to read SMILES and write XYZ text
!pip install duckdb
!pip install molSimplify

Channels:
 - intel
 - rapidsai
 - nvidia
 - conda-forge
 - defaults
Platform: linux-64
Collecting package metadata (repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /opt/conda

  added / updated specs:
    - mkl
    - mkl-devel
    - mkl-include
    - mkl-static


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    intel-openmp-2024.1.2      |        intel_995        20.3 MB  intel
    mkl-2024.1.0               |        intel_691       156.3 MB  intel
    mkl-devel-2024.1.0         |        intel_691          28 KB  intel
    mkl-include-2024.1.0       |        intel_691         781 KB  intel
    mkl-static-2024.1.0        |        intel_691       147.7 MB  intel
    ------------------------------------------------------------
                                           Total:       325.1 MB

The following NEW packages will be INSTALLED:

  intel-op

In [11]:
# import libraries
import duckdb
import pandas as pd
from qml.fchl import get_local_kernels, get_local_symmetric_kernels
from qml import Compound
from sklearn.base import BaseEstimator
from sklearn.kernel_ridge import KernelRidge
from io import StringIO
from tqdm import tqdm
import pandas as pd
import numpy as np
import os
from molSimplify.Classes.mol3D import mol3D

In [13]:
# Extract the unique building blocks of the training set (1145 of them) in a reproducible order
train_path = '/kaggle/input/leash-BELKA/train.parquet'
test_path = '/kaggle/input/leash-BELKA/test.parquet'

# Stack the three building-block columns, de-duplicate, and sort.
# The sort matters: the kernel matrix computed later is indexed by the row order of `mols`,
# and DISTINCT without ORDER BY gives no guarantee that the order is stable between runs.
building_block_query = f"""
SELECT DISTINCT building_blocks
FROM (
    SELECT buildingblock1_smiles AS building_blocks FROM parquet_scan('{train_path}')
    UNION ALL
    SELECT buildingblock2_smiles AS building_blocks FROM parquet_scan('{train_path}')
    UNION ALL
    SELECT buildingblock3_smiles AS building_blocks FROM parquet_scan('{train_path}')
)
ORDER BY building_blocks
"""

con = duckdb.connect()
try:
    mols = con.query(building_block_query).df()
finally:
    # Release the connection even if the query fails
    con.close()

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

In [14]:
# Preview the first few rows of the building-block table
mols.head()

Unnamed: 0,building_blocks
0,C#CC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
1,C=CCC[C@@H](NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O
2,CC(C)(C)OC(=O)CCC(NC(=O)OCC1c2ccccc2-c2ccccc21...
3,CC(C)(C)OC(=O)N1CCN(C(=O)OCC2c3ccccc3-c3ccccc3...
4,CCCCC(NC(=O)OCC1c2ccccc2-c2ccccc21)C(=O)O


In [15]:
# (rows, columns) of the table; each row is one distinct building-block SMILES
mols.shape

(1145, 1)

In [17]:
# generate the FCHL representations for those building blocks
def make_xyz(mol_smiles):
    """Turn a SMILES string into XYZ-format text using molSimplify's ``mol3D``.

    Args:
        mol_smiles: SMILES string describing the molecule.

    Returns:
        Whatever ``mol3D.writexyz`` returns when called with ``writestring=True``
        (presumably the XYZ file contents as a string — confirm against molSimplify).
    """
    structure = mol3D()
    structure.read_smiles(mol_smiles)
    # The empty filename is a placeholder; writestring=True requests the text back
    xyz_text = structure.writexyz('', writestring=True)
    return xyz_text

def extract_num_atoms(molecule):
    """Read the atom count from the first line of an XYZ-format string.

    Args:
        molecule: Text whose first line holds the number of atoms.

    Returns:
        The first line parsed as an ``int``.

    Raises:
        ValueError: If the first line is not a valid integer.
    """
    # Everything before the first newline is the header line
    header, _, _ = molecule.partition('\n')
    return int(header)

def make_representation(x, max_atoms):
    """Build the FCHL representation of one molecule from its XYZ text.

    Args:
        x: XYZ-format contents of the molecule, as a string.
        max_atoms: Forwarded to qml as ``max_size`` when generating the representation.

    Returns:
        The ``representation`` attribute of the qml ``Compound`` after
        ``generate_fchl_representation`` has been called on it.
    """
    # Wrap the text in a file-like object so Compound can read it without a file on disk
    compound = Compound(StringIO(x))
    compound.generate_fchl_representation(max_size=max_atoms)
    return compound.representation

class FCHLKernel(BaseEstimator):
    """Compute FCHL kernel matrices using the qml utility functions.

    The input `X` to every method is a list of FCHL representation arrays.

    Follows the scikit-learn "BaseEstimator" API (fit / fit_transform / transform)
    so that it can be used like other scikit-learn transformers.
    """

    def __init__(self):
        super().__init__()
        # Representations stored by fit(); stays None until fit() has been called
        self.train_points = None

    def fit(self, X, y=None):
        """Store the training representations.

        Args:
            X: List of FCHL representations for the training set.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            self
        """
        self.train_points = np.array(X)
        return self

    def fit_transform(self, X, y=None):
        """Store `X` as the training set and return its train-vs-train kernel matrix.

        Args:
            X: List of FCHL representations for the training set.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            Kernel matrix between the training points and themselves.
        """
        self.fit(X)
        # Uses get_local_symmetric_kernels to halve the computational cost
        # (the train-vs-train matrix is symmetric).
        # qml returns one kernel matrix per sigma, stacked along the first axis. Take the
        # first one explicitly, as transform() does: np.squeeze would also collapse the
        # sample axes when there is only a single training point.
        return get_local_symmetric_kernels(self.train_points)[0]

    def transform(self, X, y=None):
        """Return the kernel matrix between `X` and the stored training points.

        Args:
            X: List of FCHL representations to compare against the training set.
            y: Ignored; accepted for scikit-learn API compatibility.

        Returns:
            First kernel matrix returned by ``get_local_kernels`` for `X` versus the
            training points.
        """
        return get_local_kernels(np.array(X), self.train_points)[0]

# Build the XYZ text and FCHL representation for every building block. These steps create
# the 'fchl_rep' column read below; without them this cell fails on a fresh kernel.
# The guard skips the expensive featurization when the column already exists.
if 'fchl_rep' not in mols.columns:
    mols['xyz'] = mols['building_blocks'].apply(make_xyz)
    num_atoms = mols['xyz'].apply(extract_num_atoms)
    # All representations are generated with the same max_size (the largest molecule)
    max_atoms = num_atoms.max()
    mols['fchl_rep'] = mols['xyz'].apply(lambda x: make_representation(x, max_atoms))

# Kernel matrix between every pair of building blocks, in the row order of `mols`
fchl_kernel = FCHLKernel()
building_block_kernel = fchl_kernel.fit_transform(mols['fchl_rep'].tolist())

In [20]:
# Check which container type the kernel computation returned
type(building_block_kernel)

numpy.ndarray

In [21]:
# Persist the kernel matrix to 'building_block_kernel.npy' (relative to the working directory)
np.save('building_block_kernel.npy', building_block_kernel)

In [22]:
# Read the saved kernel matrix back from disk
load_kernel = np.load('building_block_kernel.npy')

In [25]:
# Verify that the kernel survived the save/load round trip unchanged, then show its shape
assert np.array_equal(load_kernel, building_block_kernel), "reloaded kernel differs from the in-memory kernel"
load_kernel.shape

(1145, 1145)

### Going from building block to molecular-level FCHL reps
### see pseudo code below

```python
collect N_train required molecules in training_set variable
building_block_kernel = 1145 x 1145 dimensional kernel matrix obtained from above
feature_matrix = [] which will contain the featurization of all the above N_train molecules
for molecule in training_set:
    b1_fchl = fchl_kernel.transform applied to the fchl_rep of building block 1 of that molecule, using the fchl_kernel fitted above on the 1145 building blocks, to get a (1,1145) dimensional vector
    b2_fchl = do the above for building block 2 for that molecule
    b3_fchl = do the above for building block 3 for that molecule
    molecule_fchl = normalize(b1_fchl + b2_fchl + b3_fchl); a normalized (1,1145) dimensional vector
    add molecule_fchl to feature_matrix

now, feature_matrix is of dimensionality N_train x 1145
now apply logistic + KRR to fit the above molecule_fchl with their corresponding binary binding labels

for molecule in test_set:
    generate_molecule_fchl as done above
    use the above logistic + KRR model + some probability threshold to assign a predicted binary binding label
```