In [11]:
import pandas as pd
import numpy as np
import os
import sys
sys.path.append('..')
import time
from rdkit import Chem
from fp_gen import KlekFPGenerator, MACCSFPGenerator, SubFPGenerator

In [12]:
def could_be_valid(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        return mol is not None
    except:
        return False

In [13]:
def sparse_to_dense(sparse):
    return np.nonzero(sparse)[0].tolist()

In [18]:
generator = KlekFPGenerator(n_jobs=os.cpu_count())
data = pd.read_csv('../original_datasets/smiles/5ht1a_smiles.csv', chunksize=1000, names=['SMILES', 'Ki'])

for i, chunk in enumerate(data):
    print(chunk)
    chunk = chunk[chunk['SMILES'].apply(could_be_valid)]
    mols = list(map(Chem.MolFromSmiles, chunk['SMILES']))
    fps = generator.transform(mols)
    chunk['fps'] = list(map(sparse_to_dense, fps))
    if i == 0:
        chunk.to_csv('../original_datasets/klek/5ht1a_klek.csv', index=False)
    else: # append if already exists, otherwise without else firts chunk will be written twice
        with open('../original_datasets/klek/5ht1a_klek.csv', 'a') as f:
            chunk.to_csv(f, header=False, index=False)

                                                SMILES      Ki
0                                               SMILES    "Ki"
1                        COc1ccc2occ3CC[C@@H](CN)c1c23   860.0
2                  CN1CCc2cccc-3c2[C@H]1Cc1cccc(C)c-31    14.4
3                    CN(C)[C@@H]1Cc2cccc3nc(O)n(C1)c23    92.0
4                  CCCN1CCC2[C@@H]1CCc1cccc(C(N)=O)c21    17.0
..                                                 ...     ...
995  Cl.COc1ccc2c(c1)oc1c(CN3CCN(CC3)c3ccccc3OC)ccc...    24.0
996   Fc1ccc2cccc(N3CCN(CCCOc4ccc5CNC(=O)c5c4)CC3)c2c1  0.0447
997  FC(F)(F)c1cccc(c1)N1CCN(CCN2C(=O)CC3(CCCC3)CC2...    8.22
998  Cl.COc1ccc2c(c1)oc1ccc(CN3CCN(CC3)c3ccccc3OC)c...   429.0
999  CNC(=O)c1ccc2[C@@H](CCN3CCN(CC3)c3ccc(OC)cc3)O...  3356.0

[1000 rows x 2 columns]


[11:17:38] SMILES Parse Error: syntax error while parsing: SMILES
[11:17:38] SMILES Parse Error: Failed parsing SMILES 'SMILES' for input: 'SMILES'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  chunk['fps'] = list(map(sparse_to_dense, fps))


                                                 SMILES       Ki
1000  O=S(=O)(N1CCC[C@H]1CCN1CCN(CC1)c1nsc2ccccc12)c...    37.00
1001         Fc1ccc(cc1)C(=O)CCCN1CCCN(CC1)c1ccc(Cl)cc1   117.40
1002       Clc1ccc(cc1)C1=CCN(CCN2C(=O)c3ccccc3C2=O)CC1   631.00
1003          C(CNCCOc1cccc2[nH]cnc12)Cc1c[nH]c2ccccc12     0.87
1004       OC(COCc1ccccc1)CN1CCC2(CC1)OCc1c2ccc2ccccc12     2.20
...                                                 ...      ...
1995  Clc1cccc(c1)S(=O)(=O)NCCN1CCC(=CC1)c1c[nH]c2cc...     7.40
1996       CC1CCN(CC[C@H]2CCCN2S(=O)(=O)c2cccc(C)c2)CC1  1000.00
1997  CCCCOC(=O)c1cc2c3OC(CN4CCC5(CC4)N(CNC5=O)c4ccc...     3.00
1998            COc1cccc2c(CCCN3CCN(CC3)c3ccccn3)cccc12     0.38
1999               COc1cccc2CC[C@@H]3[C@H](CCN3CC=C)c12     4.70

[1000 rows x 2 columns]
                                                 SMILES        Ki
2000      CN1C(=N)N(C)\\C(=C\\c2c[nH]c3c(Br)cccc23)C1=O  1812.000
2001       CC1(C)CC(=O)N(CCN2CCN(CC2)c2ccccc2Cl)C(=O)C1    65.8

In [20]:
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem, DataStructs
import os
import sys

In [28]:
class DataProcessor():
    
    """
    This class loads molecular fingerprints into a DataFrame and performs clean-up on them.

    Parameters
    ----------
    protein : str
        The protein name, one of ['5ht1a', '5ht7', 'beta2', 'd2', 'h1'].
    y_col : str, optional
        The name of the column representing the dependent variable, default is 'Ki'.

    Attributes
    ----------
    data_paths_ : dict
        A dictionary containing the paths to the fingerprint files.
    proteins_ : list of str
        The list of valid protein names.
    fingerprints_ : list of str
        The list of valid fingerprint types.
    protein : str
        The protein name.
    fingerprint : str
        The fingerprint type.
    path : str
        The path to the fingerprint file.
    df : pandas.DataFrame
        The DataFrame containing the loaded fingerprint data.

    Methods
    -------
    remove_missing()
        Removes rows with missing values in the dependent variable column.
    remove_duplicates()
        Removes duplicate rows in the DataFrame.
    remove_redundant()
        Removes redundant columns in the DataFrame.
    convert_data()
        Converts the data types of the columns in the DataFrame.
    add_classification(threshold)
        Adds a Class column to the DataFrame based on the threshold parameter.
    write_cleaned()
        Writes the cleaned DataFrame to a csv file.
    """
    
    def __init__(self, protein, y_col='Ki'):
        sys.path.append('..')
        self.data_paths = {
    '5ht1a': '../original_datasets/klek/5ht1a_klek.csv',
    '5ht7': '../original_datasets/klek/5ht7_klek.csv',
    'beta2': '../original_datasets/klek/beta2_klek.csv',
    'd2': '../original_datasets/klek/d2_klek.csv',
    'h1': '../original_datasets/klek/h1_klek.csv'
    }
            
        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
        self.y_col = y_col
        self.missing = None
        self.duplicated = None
        self.redundant = None
        
        self.protein = protein
        self.path = self.data_paths[protein]
        
        self.activities = {
    '5ht1a': 54,
    '5ht7': 89,
    'beta2': 270,
    'd2': 240.1,
    'h1': 501
    }
        
        self.threshold = self.activities[self.protein]
        
    def load_data(self):
        self.df = pd.read_csv(self.path)
        self.df[self.y_col] = self.df[self.y_col].astype('float')
        print(f"Loaded data for {self.protein} protein")
        
        
    def remove_missing(self):
        print(f'The initial size of dataset: {len(self.df)}')
        missing = self.df[self.y_col].isnull()
        zero_or_neg = self.df[self.y_col] <= 0
        to_remove = pd.Series([a or b for a, b in zip(missing,zero_or_neg)])
        print(f'The percent of rows with missing {self.y_col} values: {to_remove.sum()/len(self.df)*100:.2f} %')
        self.df = self.df[~to_remove]
        print(f'New size of the dataset: {len(self.df)}')
        self.missing = int(to_remove.sum())
        
    
    def remove_duplicates(self):
        print(f'The initial size of dataset: {len(self.df)}')
        duplicates = self.df.duplicated(keep = 'first')
        print(f'The percent of duplicated rows: {duplicates.sum()/len(self.df)*100:.2f} %')
        self.df = self.df[~duplicates]
        print(f'New size of the dataset: {len(self.df)}')
        self.duplicated = int(duplicates.sum())
        
        
    def add_classification(self):
        classes = [1 if x < 100 else 0 for x in self.df[self.y_col]]
        self.df.insert(1, "Class", classes)
        print(f'The percent of compounds classified as active is {self.df["Class"].sum()/len(self.df)*100:.2f} %')
        
                
    def write_cleaned(self):
        write_path = '../original_datasets/klek_clean/' + self.protein + '_klek_100nM.csv'
        self.df.to_csv(path_or_buf=write_path, sep=',', index=False)
        print(f'Cleaned file saved at {write_path}')
    
    
    def return_parameters(self): # zwraca listę list dotyczącą ile czego brakowało/usunięto w kolejności wczytania do klasy
        parameters = []
        parameters.append(self.missing)
        parameters.append(self.duplicated)
        parameters.append(self.redundant)
        return parameters
    
    def write_parquet(self):
        path = '..' + self.path.strip('.csv') + '_balanced.parquet'
        print(path)
        self.df.to_parquet(path)

In [24]:
proteins = ['5ht1a','5ht7','beta2','d2','h1']

In [29]:
for protein in proteins:
    data = DataProcessor(protein)
    data.load_data()
    data.remove_missing()
    data.remove_duplicates()
    data.add_classification()
    data.write_cleaned()
    #data.write_parquet()

Loaded data for 5ht1a protein
The initial size of dataset: 5840
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5619
The initial size of dataset: 5619
The percent of duplicated rows: 6.57 %
New size of the dataset: 5250
The percent of compounds classified as active is 57.96 %
Cleaned file saved at ../original_datasets/klek_clean/5ht1a_klek_100nM.csv
Loaded data for 5ht7 protein
The initial size of dataset: 3262
The percent of rows with missing Ki values: 3.86 %
New size of the dataset: 3136
The initial size of dataset: 3136
The percent of duplicated rows: 5.52 %
New size of the dataset: 2963
The percent of compounds classified as active is 51.50 %
Cleaned file saved at ../original_datasets/klek_clean/5ht7_klek_100nM.csv
Loaded data for beta2 protein
The initial size of dataset: 1660
The percent of rows with missing Ki values: 51.51 %
New size of the dataset: 805
The initial size of dataset: 805
The percent of duplicated rows: 2.86 %
New size of the dataset: 

In [37]:
proteins = ['5ht1a','5ht7','beta2','d2','h1']

read_paths = [f'../original_datasets/klek_clean/{x}_klek_balanced.csv' for x in proteins]
write_paths = [f'../original_datasets/parquet_clean/{x}_klek_balanced.parquet' for x in proteins]

def csv_to_parquet(read_path, write_path):
    data = pd.read_csv(read_path, sep=',')
    data.to_parquet(write_path)

In [38]:
for read_path, write_path in zip(read_paths, write_paths):
    print(read_path, write_path)
    csv_to_parquet(read_path, write_path)

../original_datasets/klek_clean/5ht1a_klek_balanced.csv ../original_datasets/parquet_clean/5ht1a_klek_balanced.parquet
../original_datasets/klek_clean/5ht7_klek_balanced.csv ../original_datasets/parquet_clean/5ht7_klek_balanced.parquet
../original_datasets/klek_clean/beta2_klek_balanced.csv ../original_datasets/parquet_clean/beta2_klek_balanced.parquet
../original_datasets/klek_clean/d2_klek_balanced.csv ../original_datasets/parquet_clean/d2_klek_balanced.parquet
../original_datasets/klek_clean/h1_klek_balanced.csv ../original_datasets/parquet_clean/h1_klek_balanced.parquet


In [185]:
class DataProcessor2():
    
    def __init__(self, protein, y_col='Ki', dataset='balanced'):
        sys.path.append('..')
        self.data_paths = {
            '5ht1a': f'../original_datasets/klek_clean/5ht1a_klek_{dataset}.csv',
            '5ht7': f'../original_datasets/klek_clean/5ht7_klek_{dataset}.csv',
            'beta2': f'../original_datasets/klek_clean/beta2_klek_{dataset}.csv',
            'd2': f'../original_datasets/klek_clean/d2_klek_{dataset}.csv',
            'h1': f'../original_datasets/klek_clean/h1_klek_{dataset}.csv'  
        }
        
        self.y_col = y_col        
        self.protein = protein
        
        self.path = self.data_paths[protein]
        
    def load_data(self):
        self.df = pd.read_csv(self.path)
        self.df[self.y_col] = self.df[self.y_col].astype('float')
        print(f"Loaded data for {self.protein} protein")
        
    def count_keys(self):
        keys = [f'KLEK_{x}' for x in range(0,4860)]
        values = [0 for x in range(0,4860)]
        self.dc = {key:value for key, value in zip(keys, values)}
        selection = self.df['Class'] == 1
        tenten = self.df[selection]['fps']
        for row in tenten:
            new = row.strip(']').strip('[').split()
            newest = [f"KLEK_{x.strip(',')}" for x in new]
            for key in newest:
                self.dc[key] += 1         
    
    def combine(self):
        Key = self.dc.keys()
        Value = self.dc.values()
        return Key, Value
        #self.df2 = pd.DataFrame({'KLEK':Key, 'COUNT':Value})
        #print(self.df2.head())
        
    def write_cleaned(self):
        write_path = '../original_datasets/klek_clean/' + self.protein + '_klek_100nM.csv'
        self.df.to_csv(path_or_buf=write_path, sep=',', index=False)
        print(f'Cleaned file saved at {write_path}')

In [186]:
proteins = ['5ht1a','5ht7','beta2','d2','h1']

In [187]:
data = DataProcessor2('5ht1a')
data.load_data()
data.count_keys()
data.combine()

Loaded data for 5ht1a protein


(dict_keys(['KLEK_0', 'KLEK_1', 'KLEK_2', 'KLEK_3', 'KLEK_4', 'KLEK_5', 'KLEK_6', 'KLEK_7', 'KLEK_8', 'KLEK_9', 'KLEK_10', 'KLEK_11', 'KLEK_12', 'KLEK_13', 'KLEK_14', 'KLEK_15', 'KLEK_16', 'KLEK_17', 'KLEK_18', 'KLEK_19', 'KLEK_20', 'KLEK_21', 'KLEK_22', 'KLEK_23', 'KLEK_24', 'KLEK_25', 'KLEK_26', 'KLEK_27', 'KLEK_28', 'KLEK_29', 'KLEK_30', 'KLEK_31', 'KLEK_32', 'KLEK_33', 'KLEK_34', 'KLEK_35', 'KLEK_36', 'KLEK_37', 'KLEK_38', 'KLEK_39', 'KLEK_40', 'KLEK_41', 'KLEK_42', 'KLEK_43', 'KLEK_44', 'KLEK_45', 'KLEK_46', 'KLEK_47', 'KLEK_48', 'KLEK_49', 'KLEK_50', 'KLEK_51', 'KLEK_52', 'KLEK_53', 'KLEK_54', 'KLEK_55', 'KLEK_56', 'KLEK_57', 'KLEK_58', 'KLEK_59', 'KLEK_60', 'KLEK_61', 'KLEK_62', 'KLEK_63', 'KLEK_64', 'KLEK_65', 'KLEK_66', 'KLEK_67', 'KLEK_68', 'KLEK_69', 'KLEK_70', 'KLEK_71', 'KLEK_72', 'KLEK_73', 'KLEK_74', 'KLEK_75', 'KLEK_76', 'KLEK_77', 'KLEK_78', 'KLEK_79', 'KLEK_80', 'KLEK_81', 'KLEK_82', 'KLEK_83', 'KLEK_84', 'KLEK_85', 'KLEK_86', 'KLEK_87', 'KLEK_88', 'KLEK_89', 'KLEK_90

In [188]:
prot = {}
Keys = None

for i, protein in enumerate(proteins):
    data = DataProcessor2(protein)
    data.load_data()
    data.count_keys()
    Keys, prot[protein] = data.combine()

Loaded data for 5ht1a protein
Loaded data for 5ht7 protein
Loaded data for beta2 protein
Loaded data for d2 protein
Loaded data for h1 protein


In [189]:
with open('../keys/KlekFP_keys.txt', 'r') as file:
    f = file.readlines()
    
smarts = [str(x).strip('\n') for x in f]

In [190]:
Combined_df = pd.DataFrame({
    'KEYS': Keys, 
    '5ht1a': prot['5ht1a'],
    '5ht7': prot['5ht7'],
    'beta2': prot['beta2'],
    'd2': prot['d2'],
    'h1': prot['h1'],
    'SMARTS': smarts
})

In [191]:
print(Combined_df.head())

     KEYS  5ht1a  5ht7  beta2    d2   h1   
0  KLEK_0   1423   703    380  2626  559  \
1  KLEK_1    186    40     50   345   54   
2  KLEK_2      0     0      0     0    0   
3  KLEK_3      0     0      0     0    0   
4  KLEK_4      0     0      0     0    0   

                                              SMARTS  
0                              [!#1][CH]([!#1])[!#1]  
1                   [!#1][CH]([!#1])[CH]([!#1])[!#1]  
2    [!#1][CH]([!#1])[CH]([!#1])C([CH3])([CH3])[CH3]  
3       [!#1][CH]([!#1])[CH](C(=O)O[CH3])C(=O)O[CH3]  
4  [!#1][CH]([!#1])[CH]1[CH2][CH]([CH]([!#1])[!#1...  


In [192]:
Combined_df.to_csv('../original_datasets/counts_active.csv', sep=',', header=True, index=False)