<a href="https://www.kaggle.com/code/alirezaataei/feature-extraction-with-resource-utilization?scriptVersionId=185991394" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
!pip install duckdb rdkit mordred

In [2]:
import pandas as pd
import joblib
import duckdb

from rdkit import Chem
from rdkit.Chem import AllChem, Descriptors, Crippen, rdMolDescriptors
from sklearn.preprocessing import MinMaxScaler, MaxAbsScaler
import numpy as np
from mordred import Calculator, descriptors

import multiprocessing
import concurrent.futures


## PreProcessing Functions and their resource utilizations

#### mol_descriptors (run in parallel via parallelize_dataframe)

In [8]:
# Initialize Mordred calculator with all descriptors
calc = Calculator(descriptors, ignore_3D=True)


def _count_bonds(mol, predicate):
    """Return how many bonds of ``mol`` satisfy ``predicate``."""
    return sum(1 for bond in mol.GetBonds() if predicate(bond))


def _rdkit_features():
    """Return the ordered ``(column name, extractor)`` pairs for the RDKit features.

    This table is the single source of truth for the RDKit column names, so
    rows for valid and invalid molecules cannot drift apart. It is built
    inside a function so the RDKit names are only resolved when it is called.
    """
    bond_type = Chem.rdchem.BondType
    return (
        ('MolecularWeight', Descriptors.MolWt),
        ('LogP', Descriptors.MolLogP),
        ('TPSA', Descriptors.TPSA),
        ('NumRotatableBonds', Descriptors.NumRotatableBonds),
        ('NumHDonors', Descriptors.NumHDonors),
        ('NumHAcceptors', Descriptors.NumHAcceptors),
        ('NumRings', Descriptors.RingCount),
        ('NumAromaticRings', Descriptors.NumAromaticRings),
        ('ExactMass', Descriptors.ExactMolWt),
        ('HeavyAtomCount', Descriptors.HeavyAtomCount),
        ('NumValenceElectrons', Descriptors.NumValenceElectrons),
        ('FractionCSP3', Descriptors.FractionCSP3),
        ('MolMR', Descriptors.MolMR),
        ('FormalCharge', Chem.GetFormalCharge),
        ('NumAliphaticRings', Descriptors.NumAliphaticRings),
        ('NumSaturatedRings', Descriptors.NumSaturatedRings),
        ('NumHeteroatoms', Descriptors.NumHeteroatoms),
        ('NumSaturatedCarbocycles', Descriptors.NumSaturatedCarbocycles),
        ('NumAliphaticHeterocycles', Descriptors.NumAliphaticHeterocycles),
        ('NumAromaticHeterocycles', Descriptors.NumAromaticHeterocycles),
        ('AtomCount', lambda mol: mol.GetNumAtoms()),
        ('NumSingleBonds', lambda mol: _count_bonds(
            mol, lambda bond: bond.GetBondType() == bond_type.SINGLE)),
        ('NumDoubleBonds', lambda mol: _count_bonds(
            mol, lambda bond: bond.GetBondType() == bond_type.DOUBLE)),
        ('NumTripleBonds', lambda mol: _count_bonds(
            mol, lambda bond: bond.GetBondType() == bond_type.TRIPLE)),
        ('NumAromaticBonds', lambda mol: _count_bonds(
            mol, lambda bond: bond.GetIsAromatic())),
        # NOTE(review): the four entries below are stand-ins whose values do
        # not match their column names: 'MolecularConnectivityIndex' repeats
        # MolLogP, 'Kier_Hall_Alpha' is Kappa3, and 'HOMO'/'LUMO' are the
        # max/min absolute E-state indices. Names and values are kept as-is
        # so existing consumers of these columns keep working.
        ('MolecularConnectivityIndex', Descriptors.MolLogP),
        ('Kier_Hall_Alpha', Descriptors.Kappa3),
        ('HOMO', Descriptors.MaxAbsEStateIndex),
        ('LUMO', Descriptors.MinAbsEStateIndex),
    )


def mol_descriptors(smiles_list, pre=''):
    """Compute RDKit and Mordred descriptors for every SMILES in ``smiles_list``.

    Args:
        smiles_list: Iterable of SMILES strings.
        pre: Prefix prepended to every descriptor name (e.g. ``'b1'``).

    Returns:
        A list with one dict per input, in input order. Keys are the prefixed
        RDKit feature names followed by the prefixed Mordred descriptor names.
        Mordred values are stored exactly as ``calc(mol)`` yields them. An
        entry that is not a string, or that RDKit cannot parse, gets a dict
        with the same keys and ``None`` for every value.
    """
    features = _rdkit_features()
    blank_row = None  # built on first use; most inputs never need it
    moldescriptors = []

    for smile in smiles_list:
        # Non-string values (e.g. NaN from a DataFrame) are treated like
        # unparsable SMILES instead of being handed to RDKit.
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None

        if mol is None:
            if blank_row is None:
                blank_row = {pre + name: None for name, _ in features}
                blank_row.update(
                    {f"{pre}{str(desc)}": None for desc in calc.descriptors}
                )
            # Copy so callers never share one dict between rows
            moldescriptors.append(dict(blank_row))
            continue

        row = {pre + name: extract(mol) for name, extract in features}
        # Combine RDKit and Mordred descriptors
        row.update(
            {f"{pre}{str(key)}": value for key, value in calc(mol).items()}
        )
        moldescriptors.append(row)

    return moldescriptors

def parallelize_dataframe(df, molecule_smiles_cat, pre='', n_cores=10):
    """Append molecular descriptors for one SMILES column, computed in parallel.

    Descriptors depend only on the SMILES string, so each distinct SMILES is
    computed once and the result is expanded back to every row. Only the
    SMILES values are split across workers: calling ``np.array_split`` on the
    whole DataFrame copies every column and relies on ``DataFrame.swapaxes``,
    which pandas has deprecated.

    Args:
        df: Input DataFrame.
        molecule_smiles_cat: Name of the column holding SMILES strings.
        pre: Prefix passed through to ``mol_descriptors`` for column names.
        n_cores: Number of worker processes and number of chunks.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by one column
        per descriptor, row-aligned with ``df``.
    """
    smiles = df[molecule_smiles_cat].tolist()
    # dict.fromkeys keeps first-seen order while dropping repeats
    unique_smiles = list(dict.fromkeys(smiles))

    chunks = [
        part.tolist()
        for part in np.array_split(np.array(unique_smiles, dtype=object), n_cores)
    ]
    chunks = [chunk for chunk in chunks if chunk]  # no work for empty chunks

    with concurrent.futures.ProcessPoolExecutor(max_workers=n_cores) as executor:
        per_chunk = list(
            executor.map(mol_descriptors, chunks, [pre] * len(chunks))
        )

    # Chunks are contiguous slices of unique_smiles and map() preserves order,
    # so flattening yields one descriptor row per unique SMILES, in order.
    unique_rows = [row for chunk_rows in per_chunk for row in chunk_rows]
    unique_df = pd.DataFrame(unique_rows)

    # Expand the per-unique rows back to one row per input row
    position_of = {smile: pos for pos, smile in enumerate(unique_smiles)}
    row_positions = [position_of[smile] for smile in smiles]
    descriptors_df = unique_df.iloc[row_positions].reset_index(drop=True)

    return pd.concat([df.reset_index(drop=True), descriptors_df], axis=1)



#### ecfp (run in parallel via parallelize_ecfp)

In [9]:
def ecfp(smiles_list, pre=''):
    """Compute a Morgan (ECFP-style) bit fingerprint for every SMILES.

    Args:
        smiles_list: Iterable of SMILES strings.
        pre: Prefix for the output key, giving ``pre + 'ecfp'``.

    Returns:
        A list with one single-key dict per input, in input order. The value
        is the fingerprint as a list of 262 bits (radius 2), or ``None`` when
        the entry is not a string or RDKit cannot parse it. Returning ``None``
        mirrors how ``mol_descriptors`` treats invalid molecules, so one bad
        SMILES does not abort the whole batch.
    """
    key = pre + 'ecfp'
    fingerprints = []
    for smile in smiles_list:
        mol = Chem.MolFromSmiles(smile) if isinstance(smile, str) else None
        if mol is None:
            bits = None
        else:
            bits = list(AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=262))
        fingerprints.append({key: bits})
    return fingerprints

def parallelize_ecfp(df, molecule_smiles_cat, pre='', n_cores=5):
    """Append an ECFP column for one SMILES column, computed in parallel.

    Only the SMILES values are split across workers. Calling
    ``np.array_split`` on the whole DataFrame copies every column and relies
    on ``DataFrame.swapaxes``, which pandas has deprecated.

    Args:
        df: Input DataFrame.
        molecule_smiles_cat: Name of the column holding SMILES strings.
        pre: Prefix passed through to ``ecfp`` for the output column name.
        n_cores: Number of worker processes and number of chunks.

    Returns:
        A new DataFrame: ``df`` with its index reset, followed by the
        fingerprint column produced by ``ecfp``, row-aligned with ``df``.
    """
    smiles_values = df[molecule_smiles_cat].to_numpy()
    smiles_lists = [
        part.tolist() for part in np.array_split(smiles_values, n_cores)
    ]

    with concurrent.futures.ProcessPoolExecutor(max_workers=n_cores) as executor:
        # Collect results while the pool is still open; map() preserves order
        per_chunk = list(
            executor.map(ecfp, smiles_lists, [pre] * len(smiles_lists))
        )

    rows = [row for chunk_rows in per_chunk for row in chunk_rows]
    fingerprints_df = pd.DataFrame(rows)

    return pd.concat([df.reset_index(drop=True), fingerprints_df], axis=1)



#### Normalization

In [10]:
def normalize_df(df, columns_to_scale, scalers):
    """Scale the given columns of ``df`` in place, one scaler per column.

    Args:
        df: DataFrame whose columns are overwritten with scaled values.
        columns_to_scale: Names of the columns to scale.
        scalers: Dict mapping column name to scaler. A column already present
            is transformed with its stored scaler; otherwise a new
            ``MaxAbsScaler`` is stored under that name and fitted on the
            column. The dict is mutated so the caller can reuse the scalers.

    Returns:
        The same ``df`` object that was passed in.
    """
    for name in columns_to_scale:
        already_fitted = name in scalers
        if not already_fitted:
            # Register first so the caller's dict ends up holding this scaler
            scalers[name] = MaxAbsScaler()
        scaler = scalers[name]
        apply = scaler.transform if already_fitted else scaler.fit_transform
        df[name] = apply(df[[name]])
    return df


#### Get up to n binding records and up to n non-binding records (each class is capped separately, so the two counts can differ) and store them in a CSV

In [12]:
train_path = '/kaggle/input/leash-BELKA/train.parquet'
# test_path = '/kaggle/input/leash-BELKA/test.parquet'

# Upper bound on the rows sampled per class. Named `sample_size` rather than
# `range` so the builtin `range` stays usable in the rest of the notebook.
sample_size = 10000000

con = duckdb.connect()
try:
    # Each class is shuffled and capped independently. LIMIT is only an upper
    # bound, so a class with fewer than `sample_size` rows is returned whole.
    df = con.query(f"""(SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 0
                        ORDER BY random()
                        LIMIT {sample_size})
                        UNION ALL
                        (SELECT *
                        FROM parquet_scan('{train_path}')
                        WHERE binds = 1
                        ORDER BY random()
                        LIMIT {sample_size})""").df()
finally:
    # Release the DuckDB connection even if the query fails
    con.close()

# Persist the sample so later steps can start from it
df.to_csv('somepart.csv')

FloatProgress(value=0.0, layout=Layout(width='auto'), style=ProgressStyle(bar_color='black'))

## Processing Building Block one, two, three

In [16]:
# Class balance of the sample: binders (binds == 1) first, then non-binders (binds == 0)
print(int((df['binds'] == 1).sum()), int((df['binds'] == 0).sum()))

1589906 10000000


#### Block one

In [24]:
import tensorflow as tf

# Report how many GPU devices TensorFlow can see
gpu_devices = tf.config.experimental.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))


2024-06-29 02:16:25.576903: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-06-29 02:16:25.577067: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-06-29 02:16:25.706435: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


Num GPUs Available:  1


In [3]:
# CPU count reported by multiprocessing for this kernel. Compare it with the
# n_cores defaults of parallelize_dataframe (10) and parallelize_ecfp (5)
# before relying on those defaults.
multiprocessing.cpu_count()


4

In [18]:
# Block one: append 'b1'-prefixed descriptors computed from buildingblock1_smiles.
# The pool is sized to the machine rather than the default n_cores=10, which
# oversubscribes the CPUs (4 in the run recorded in this notebook).
b1processed = parallelize_dataframe(
    df, 'buildingblock1_smiles', 'b1', n_cores=multiprocessing.cpu_count()
)

KeyboardInterrupt: 

#### Block two

In [None]:
# Block two: append 'b2'-prefixed descriptors computed from buildingblock2_smiles.
# The pool is sized to the machine rather than the default n_cores=10, which
# oversubscribes the CPUs (4 in the run recorded in this notebook).
b2processed = parallelize_dataframe(
    b1processed, 'buildingblock2_smiles', 'b2', n_cores=multiprocessing.cpu_count()
)

#### Block three

In [None]:
# Block three: append 'b3'-prefixed descriptors computed from buildingblock3_smiles.
# The pool is sized to the machine rather than the default n_cores=10, which
# oversubscribes the CPUs (4 in the run recorded in this notebook).
b3processed = parallelize_dataframe(
    b2processed, 'buildingblock3_smiles', 'b3', n_cores=multiprocessing.cpu_count()
)

#### Store the extracted features beside the original data in another csv

In [None]:
# Write the original columns plus all extracted descriptors to disk
b3processed.to_csv(path_or_buf='somepart_processed.csv')

## Normalization of the extracted features listed below; the scaler is created inside the function
#### Each column is normalized separately, with its own scaler

In [None]:
scalers = {}

# Base names of the features to scale; each is requested once per
# building-block prefix, in the order b1, b2, b3.
feature_names = [
    'MolecularWeight',
    'LogP',
    'TPSA',
    'NumRotatableBonds',
    'NumHDonors',
    'NumHAcceptors',
    'NumRings',
    'NumAromaticRings',
    'ExactMass',
    'HeavyAtomCount',
    'NumValenceElectrons',
    'FractionCSP3',
    'MolMR',
    'FormalCharge',
    'NumAliphaticRings',
    'NumSaturatedRings',
    'NumHeteroatoms',
    'NumHeterocycles',
    'NumSaturatedCarbocycles',
    'NumAliphaticHeterocycles',
    'NumAromaticHeterocycles',
    'BondCount',
    'AtomCount',
    'NumSingleBonds',
    'NumDoubleBonds',
    'NumTripleBonds',
    'NumAromaticBonds',
]
requested_columns = [
    prefix + name for prefix in ('b1', 'b2', 'b3') for name in feature_names
]

# 'NumHeterocycles' and 'BondCount' are not among the RDKit keys built by
# mol_descriptors, so they are absent unless Mordred supplies columns with
# those names. Absent columns are skipped and reported, because
# normalize_df would otherwise raise KeyError on the first one.
missing_columns = [
    column for column in requested_columns if column not in b3processed.columns
]
if missing_columns:
    print('Skipping columns not present in b3processed:', missing_columns)
columns_to_scale = [
    column for column in requested_columns if column in b3processed.columns
]

# normalize_df overwrites the columns in place and returns the same object,
# so `normalized` and `b3processed` refer to the same DataFrame afterwards.
normalized = normalize_df(b3processed, columns_to_scale, scalers)

### Store the fitted scalers so the same scaling can later be applied to the test dataset

In [None]:
# Persist the per-column scalers collected in `scalers`
joblib.dump(value=scalers, filename='scalers.joblib')

#### Store the normalized features again beside the original data in another csv

In [None]:
# Write the original columns plus the normalized features to disk
normalized.to_csv(path_or_buf='somepart_normalized.csv')