In [1]:
import pandas as pd
import numpy as np
import os
import time
from rdkit import Chem
from fp_gen import KlekFPGenerator, MACCSFPGenerator, SubFPGenerator

In [2]:
def could_be_valid(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None
    except:
        return False

In [3]:
def sparse_to_dense(sparse):
    return np.nonzero(sparse)[0].tolist()

In [8]:
generator = KlekFPGenerator(n_jobs=os.cpu_count())
data = pd.read_csv('../smiles/h1_smiles.csv', chunksize=250, names=['SMILES', 'Ki'])

for i, chunk in enumerate(data):
    print(chunk)
    chunk = chunk[chunk['SMILES'].apply(could_be_valid)]
    mols = list(map(Chem.MolFromSmiles, chunk['SMILES']))
    fps = generator.transform(mols)
    chunk['fps'] = list(map(sparse_to_dense, fps))
    if i == 0:
        chunk.to_csv('../smiles/h1_klek.csv', index=False)
    else: # append if already exists, otherwise without else firts chunk will be written twice
        with open('../smiles/h1_klek.csv', 'a') as f:
            chunk.to_csv(f, header=True, index=False)

                                                SMILES       Ki
0                                               SMILES     "Ki"
1      OC(=O)CCCCCN1CCC(CNC(=O)c2c3OCCCn3c3ccccc23)CC1  10000.0
2    OC=O.O=C(N1CCN(C(=O)C1)c1ccc(OC2CCN(CC2)C2CCCC...  2511.89
3    C[C@@H]1C[C@@H](C)N1C(=O)[C@H]1CN(C)C2Cc3c[nH]...   1400.0
4                       CN(C)CCCNc1c2CCCCCc2nc2ccccc12   7696.0
..                                                 ...      ...
245  Cc1cn([C@H]2C[C@H](N=[N+]=[N-])[C@@H](CO)O2)c(...      NaN
246             COC1=C(OC)C(=O)C(CCCCCCCCCCO)=C(C)C1=O      NaN
247                     CCCOc1ccc(cc1N)C(=O)OCCN(CC)CC      NaN
248                  Cl.Fc1cccc(CN2C3CC4CC(C3)CC2C4)c1  10000.0
249        OC(=O)COCCN1CCN(CC1)C(c1ccccc1)c1ccc(Cl)cc1    50.12

[250 rows x 2 columns]


[09:19:24] SMILES Parse Error: syntax error while parsing: SMILES
[09:19:24] SMILES Parse Error: Failed parsing SMILES 'SMILES' for input: 'SMILES'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['fps'] = list(map(sparse_to_dense, fps))


                                                SMILES            Ki
250    CN(C1CCCCC1)C1CCN(CC1)c1nc2ccccc2n1Cc1ccc(F)cc1  2.800000e+00
251                     CN(C)CCSc1nc2cc(C)c(C)cc2[nH]1  1.000000e+04
252  CS(=O)(=O)c1ccc(cc1)[C@@H](O)[C@@H](CO)NC(=O)C...           NaN
253                        CC1NC(N)=Nc2ccc(Cl)c(Cl)c12  8.000000e+00
254                      O=C1C(C(=O)c2ccccc12)c1ccccc1           NaN
..                                                 ...           ...
495               CN(C)CCC1=C(Cc2cccnn2)c2ccc(Cl)cc2C1  2.100000e+00
496  [3H]C([3H])([3H])Oc1ccccc1CNCCc1cc(OC)c(I)cc1O...  1.890000e+02
497  CN(CCC=C1c2ccccc2Sc2ccc(Cl)cc12)Cc1ccc(OCCCN2C...  2.950000e+02
498   OC(=O)C1(O)CCN(CC1)C1CCC2(C1)Cc1ccccc1Cc1ccccc21  5.012000e+01
499                        CN(C)CCN(Cc1ccccc1)c1ccccn1  1.000000e+17

[250 rows x 2 columns]
                                                SMILES            Ki
500   CC1(CCN(CC1)C1CCC2(C1)Cc1ccccc1Cc1ccccc21)C(O)=O  5.012000e+01
501       

In [1]:
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem, DataStructs
import os
import sys

In [28]:
class DataProcessor():
    
    """
    This class loads molecular fingerprints into a DataFrame and performs clean-up on them.

    Parameters
    ----------
    protein : str
        The protein name, one of ['5ht1a', '5ht7', 'beta2', 'd2', 'h1'].
    y_col : str, optional
        The name of the column representing the dependent variable, default is 'Ki'.

    Attributes
    ----------
    data_paths_ : dict
        A dictionary containing the paths to the fingerprint files.
    proteins_ : list of str
        The list of valid protein names.
    fingerprints_ : list of str
        The list of valid fingerprint types.
    protein : str
        The protein name.
    fingerprint : str
        The fingerprint type.
    path : str
        The path to the fingerprint file.
    df : pandas.DataFrame
        The DataFrame containing the loaded fingerprint data.

    Methods
    -------
    remove_missing()
        Removes rows with missing values in the dependent variable column.
    remove_duplicates()
        Removes duplicate rows in the DataFrame.
    remove_redundant()
        Removes redundant columns in the DataFrame.
    convert_data()
        Converts the data types of the columns in the DataFrame.
    add_classification(threshold)
        Adds a Class column to the DataFrame based on the threshold parameter.
    write_cleaned()
        Writes the cleaned DataFrame to a csv file.
    """
    
    def __init__(self, protein, y_col='Ki'):
        sys.path.append('..')
        self.data_paths = {
    '5ht1a': '../protein_klek/5ht1a_klek.csv',
    '5ht7': '../protein_klek/5ht7_klek.csv',
    'beta2': '../protein_klek/beta2_klek.csv',
    'd2': '../protein_klek/d2_klek.csv',
    'h1': '../protein_klek/h1_klek.csv'
    }
            
        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
        self.y_col = y_col
        self.missing = None
        self.duplicated = None
        self.redundant = None
        
        self.protein = protein
        self.path = self.data_paths[protein]
        
        self.activities = {
    '5ht1a': 54,
    '5ht7': 89,
    'beta2': 270,
    'd2': 240.1,
    'h1': 501
    }
        
        self.threshold = self.activities[self.protein]
        
    def load_data(self):
        self.df = pd.read_csv(self.path)
        self.df[self.y_col] = self.df[self.y_col].astype('float')
        
        
    def remove_missing(self):
        print(f'The initial size of dataset: {len(self.df)}')
        missing = self.df[self.y_col].isnull()
        zero_or_neg = self.df[self.y_col] <= 0
        to_remove = pd.Series([a or b for a, b in zip(missing,zero_or_neg)])
        print(f'The percent of rows with missing {self.y_col} values: {to_remove.sum()/len(self.df)*100:.2f} %')
        self.df = self.df[~to_remove]
        print(f'New size of the dataset: {len(self.df)}')
        self.missing = int(to_remove.sum())
        
    
    def remove_duplicates(self):
        print(f'The initial size of dataset: {len(self.df)}')
        duplicates = self.df.duplicated(keep = 'first')
        print(f'The percent of duplicated rows: {duplicates.sum()/len(self.df)*100:.2f} %')
        self.df = self.df[~duplicates]
        print(f'New size of the dataset: {len(self.df)}')
        self.duplicated = int(duplicates.sum())
        
        
    def add_classification(self):
        classes = [1 if x < self.threshold else 0 for x in self.df[self.y_col]]
        self.df.insert(1, "Class", classes)
        print(f'The percent of compounds classified as active is {self.df["Class"].sum()/len(self.df)*100:.2f} %')
        
                
    def write_cleaned(self):
        write_path = './cleaned_datasets/' + self.path + '_clean.csv'
        self.df.to_csv(path_or_buf=write_path, sep=',', index=False)
        print(f'Cleaned file saved at {write_path}')
    
    
    def return_parameters(self): # zwraca listę list dotyczącą ile czego brakowało/usunięto w kolejności wczytania do klasy
        parameters = []
        parameters.append(self.missing)
        parameters.append(self.duplicated)
        parameters.append(self.redundant)
        return parameters
    
    def write_parquet(self):
        path = '..' + self.path.strip('.csv') + '_balanced.parquet'
        print(path)
        self.df.to_parquet(path)

In [29]:
proteins = ['5ht1a','5ht7','beta2','d2','h1']

In [30]:
for protein in proteins:
    data = DataProcessor(protein)
    data.load_data()
    data.remove_missing()
    data.remove_duplicates()
    data.add_classification()
    data.write_parquet()

The initial size of dataset: 5840
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5619
The initial size of dataset: 5619
The percent of duplicated rows: 6.57 %
New size of the dataset: 5250
The percent of compounds classified as active is 50.04 %
../protein_klek/5ht1a_klek_balanced.parquet
The initial size of dataset: 3262
The percent of rows with missing Ki values: 3.86 %
New size of the dataset: 3136
The initial size of dataset: 3136
The percent of duplicated rows: 5.52 %
New size of the dataset: 2963
The percent of compounds classified as active is 49.98 %
../protein_klek/5ht7_klek_balanced.parquet
The initial size of dataset: 1660
The percent of rows with missing Ki values: 51.51 %
New size of the dataset: 805
The initial size of dataset: 805
The percent of duplicated rows: 2.86 %
New size of the dataset: 782
The percent of compounds classified as active is 50.00 %
../protein_klek/beta2_klek_balanced.parquet
The initial size of dataset: 11816
The percent