In [57]:
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit
from rdkit import Chem
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem import rdMolDescriptors
from rdkit.Chem import AllChem, DataStructs
import os
import sys

In [94]:
class FingerprintGenerator():
    
    def __init__(self, protein, fingerprint):
        sys.path.append('..')
        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']  # allowed proteins
        self.fingerprints_ = ['Klek', 'Sub', 'MACCS']  # allowed fingerprints
        
        self.protein = protein
        self.fingerprint = fingerprint
        
        self.df = None
        
        self.data_paths = {
            '5ht1a': './smiles/5ht1a_smiles.csv',
            '5ht7': './smiles/5ht7_smiles.csv',
            'beta2': './smiles/beta2_smiles2.csv',
            'd2': './smiles/d2_smiles.csv',
            'h1': './smiles/h1_smiles.csv'
        }
        
        self.keys_paths = {
            'Klek': './keys/KlekFP_keys.txt',
            'Sub': './keys/SubFP_keys.txt',
            'MACCS': './keys/MACCSFP_keys.txt'
        }
        
        self.load_data()
        
            
    def load_data(self):
        if self.protein in self.proteins_:
            self.df = pd.read_csv(self.data_paths[self.protein], sep=',', header=0, names=["SMILES", "Ki"])
            self.df['Ki'] = self.df['Ki'].astype('float')
        else:
            print('Protein not found, please check the spelling and allowed proteins')
        
    def generate_fingerprint(self, mol, smarts):
        
        fingerprint = DataStructs.ExplicitBitVect(len(smarts))
        
        for key, bit_position in smarts.items():
            pattern = Chem.MolFromSmarts(key)
            if mol.HasSubstructMatch(pattern):
                fingerprint.SetBit(bit_position)
        
        return fingerprint
    
    def smiles_to_fingerprint(self):
        
        with open(self.keys_paths[self.fingerprint], 'r') as f:
            smarts = f.readlines()
        
        smarts_dict = {}
        
        for index, key in enumerate(smarts):
            smarts_dict[key] = index
        
        fingerprints = []
        
        for index, row in self.df.iterrows():
            smiles = row['SMILES']
            mol = Chem.MolFromSmiles(smiles)
            fp = self.generate_fingerprint(mol, smarts_dict)
            fingerprints.append(fp)
        
        self.df[self.fingerprint] = fingerprints
    
    def move_fingerprints(self):
        unpacked_fp = {}
        for index, row in self.df.iterrows():
            fingerprint = row[self.fingerprint]
            unpacked_fingerprint = [x for x in fingerprint]
            
        unpacked_df = pd.DataFrame(unpacked_fp)
        print(unpacked_df)
        self.df = self.df.join(unpacked_df)
    
    def save_fingerprints(self):
        write_path = './data_new_FP/' + self.protein + '_' + self.fingerprint + '.csv'
        self.df.to_csv(write_path, sep=',', index=False)

In [95]:
protein = 'beta2'
data = FingerprintGenerator(protein, 'Klek')
data.smiles_to_fingerprint()
data.move_fingerprints()
print(data.df.head())

      0  1  2
0     0  1  1
1     0  0  0
2     0  0  0
3     0  0  0
4     0  0  0
...  .. .. ..
4855  0  0  0
4856  0  0  0
4857  0  0  0
4858  0  0  0
4859  0  0  0

[4860 rows x 3 columns]
                                              SMILES     Ki   
0                                  NCCCNCCSP(O)(O)=O    NaN  \
1  CCc1cc2CC(Cc2cc1CC)NC[C@H](O)c1ccc(O)c2NC(=O)C...   45.0   
2                  CC(C)(C)NCC(O)c1cc(Cl)c(N)c(Cl)c1  570.0   

                                                Klek  0  1  2  
0  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  0  1  1  
1  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  0  0  0  
2  [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  0  0  0  


In [61]:
print(len(data.df['Klek'][0]))

4860


In [2]:
class DataProcessor():
    
    """
    This class loads molecular fingerprints into a DataFrame and performs clean-up on them.

    Parameters
    ----------
    protein : str
        The protein name, one of ['5ht1a', '5ht7', 'beta2', 'd2', 'h1'].
    fingerprint : str
        The fingerprint type, one of ['Klek', 'Sub', 'MACCS'].
    y_col : str, optional
        The name of the column representing the dependent variable, default is 'Ki'.

    Attributes
    ----------
    data_paths_ : dict
        A dictionary containing the paths to the fingerprint files.
    proteins_ : list of str
        The list of valid protein names.
    fingerprints_ : list of str
        The list of valid fingerprint types.
    protein : str
        The protein name.
    fingerprint : str
        The fingerprint type.
    path : str
        The path to the fingerprint file.
    df : pandas.DataFrame
        The DataFrame containing the loaded fingerprint data.

    Methods
    -------
    remove_missing()
        Removes rows with missing values in the dependent variable column.
    remove_duplicates()
        Removes duplicate rows in the DataFrame.
    remove_redundant()
        Removes redundant columns in the DataFrame.
    convert_data()
        Converts the data types of the columns in the DataFrame.
    add_classification(threshold)
        Adds a Class column to the DataFrame based on the threshold parameter.
    write_cleaned()
        Writes the cleaned DataFrame to a csv file.
    """
    
    def __init__(self, protein, fingerprint, y_col='Ki'):
        sys.path.append('..')
        self.data_paths_ = {
    '5ht1a_Klek' : './datasets/5ht1a_KlekFP.csv',
    '5ht1a_MACCS' : './datasets/5ht1a_MACCSFP.csv',
    '5ht1a_Sub' : './datasets/5ht1a_SubFP.csv',
    '5ht7_Klek' : './datasets/5ht7_KlekFP.csv',
    '5ht7_MACCS' : './datasets/5ht7_MACCSFP.csv',
    '5ht7_Sub' : './datasets/5ht7_SubFP.csv',
    'beta2_Klek' : './datasets/beta2_KlekFP.csv',
    'beta2_MACCS' : './datasets/beta2_MACCSFP.csv',
    'beta2_Sub' : './datasets/beta2_SubFP.csv',
    'd2_Klek' : './datasets/d2_KlekFP.csv',
    'd2_MACCS' : './datasets/d2_MACCSFP.csv',
    'd2_Sub' : './datasets/d2_SubFP.csv',
    'h1_Klek' : './datasets/h1_KlekFP.csv',
    'h1_MACCS' : './datasets/h1_MACCSFP.csv',
    'h1_Sub' : './datasets/h1_SubFP.csv'
    }
            
        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
        self.fingerprints_ = ['Klek', 'Sub', 'MACCS']
        self.y_col = y_col
        self.missing = None
        self.duplicated = None
        self.redundant = None
        
        self.protein = protein
        self.fingerprint = fingerprint
        self.path = self.protein + '_' + self.fingerprint
        if self.path in self.data_paths_.keys():
            self.df = pd.read_csv(self.data_paths_[self.path])
            print(f'{self.fingerprint} FP for protein {self.protein} loaded')
        else:
            self.df = None
            print("Protein and fingerprint combination not found")
        
        
    def remove_missing(self):
        print(f'The initial size of dataset: {len(self.df)}')
        missing = self.df[self.y_col].isnull()
        zero_or_neg = self.df[self.y_col] <= 0
        to_remove = pd.Series([a or b for a, b in zip(missing,zero_or_neg)])
        print(f'The percent of rows with missing {self.y_col} values: {to_remove.sum()/len(self.df)*100:.2f} %')
        self.df = self.df[~to_remove]
        print(f'New size of the dataset: {len(self.df)}')
        self.missing = int(to_remove.sum())
        
    
    def remove_duplicates(self):
        print(f'The initial size of dataset: {len(self.df)}')
        duplicates = self.df.duplicated(keep = 'first')
        print(f'The percent of duplicated rows: {duplicates.sum()/len(self.df)*100:.2f} %')
        self.df = self.df[~duplicates]
        print(f'New size of the dataset: {len(self.df)}')
        self.duplicated = int(duplicates.sum())
        
        
    def remove_redundant(self):
        col1 = len(self.df.columns)
        selection = [True if sum > 0 else False for sum in self.df.sum(axis=0)]
        self.df = self.df.loc[:, selection]
        col2 = len(self.df.columns)
        print(f'There were {col1-col2} redundant columns in the dataset.')
        self.redundant = int(col1 - col2)
        
        
    def convert_data(self):
        self.df[self.y_col] = self.df[self.y_col].astype(float)
        columns = self.df.columns[1:]
        self.df[columns] = self.df[columns].astype(int)
        
        
    def add_classification(self, threshold = 100):
        classes = [1 if x < threshold else 0 for x in self.df[self.y_col]]
        self.df.insert(1, "Class", classes)
        print(f'The percent of compounds classified as active is {self.df["Class"].sum()/len(self.df)*100:.2f} %')
        
                
    def write_cleaned(self):
        write_path = './cleaned_datasets/' + self.path + '_clean.csv'
        self.df.to_csv(path_or_buf=write_path, sep=',', index=False)
        print(f'Cleaned file saved at {write_path}')
    
    
    def return_parameters(self): # zwraca listę list dotyczącą ile czego brakowało/usunięto w kolejności wczytania do klasy
        parameters = []
        parameters.append(self.missing)
        parameters.append(self.duplicated)
        parameters.append(self.redundant)
        return parameters

In [11]:
# test

data = DataProcessor(protein='5ht1a', fingerprint='MACCS')
data.remove_missing()
data.remove_duplicates()
data.remove_redundant()
data.add_classification()
data.convert_data()

# data.write_cleaned()

MACCS FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.79 %
New size of the dataset: 5629
The initial size of dataset: 5629
The percent of duplicated rows: 8.33 %
New size of the dataset: 5160
There were 18 redundant columns in the dataset.
The percent of compounds classified as active is 58.18 %


In [35]:
# przygotowanie danych

proteins = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
keys = ['Klek', 'Sub', 'MACCS']

deleted = [] # przechowywanie informacji o zmianach

for protein in proteins:
        for key in keys:
            data = DataProcessor(protein=protein, fingerprint=key)
            data.remove_missing()
            data.remove_duplicates()
            data.remove_redundant()
            data.add_classification()
            data.convert_data()
            # parameters = data.return_parameters()
            # deleted.append(parameters)
            data.write_cleaned()
            print('\n')
            del data

Klek FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.79 %
New size of the dataset: 5629
The initial size of dataset: 5629
The percent of duplicated rows: 7.66 %
New size of the dataset: 5198
There were 3263 redundant columns in the dataset.
The percent of compounds classified as active is 58.10 %
Cleaned file saved at ./cleaned_datasets/5ht1a_Klek_clean.csv


Sub FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.79 %
New size of the dataset: 5629
The initial size of dataset: 5629
The percent of duplicated rows: 10.23 %
New size of the dataset: 5053
There were 190 redundant columns in the dataset.
The percent of compounds classified as active is 58.60 %
Cleaned file saved at ./cleaned_datasets/5ht1a_Sub_clean.csv


MACCS FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.79 %
New size of the dataset: 5629
The initi

In [90]:
# Zliczenia dla MACCSÓW, jeżeli FP to powinno Ci wypluć normalnie wszystkie dane ile czego w nich jest; raczej nie próbuj 
# loopa bo się potem dalsze zepsują

proteins = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
data = []
new = pd.DataFrame()

for protein in proteins:
    df = DataProcessor(protein, "MACCS")
    df.remove_missing()
    df.remove_duplicates()
    x = list(df.df.columns[1:])
    y = list(df.df.iloc[:, 1:].sum().astype('int'))
    frame = pd.DataFrame({'Key' : x, 'Count' : y})
    freq = list(frame['Count'])
    leng = np.max(freq)
    new_values = []
    for value in freq:
        new_values.append(value/leng*100)
    frame['Freq'] = pd.Series(new_values).astype('float')
    data.append(frame)
    new[protein] = y
new['keys'] = x

MACCS FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.79 %
New size of the dataset: 5629
The initial size of dataset: 5629
The percent of duplicated rows: 8.33 %
New size of the dataset: 5160
MACCS FP for protein 5ht7 loaded
The initial size of dataset: 3266
The percent of rows with missing Ki values: 3.89 %
New size of the dataset: 3139
The initial size of dataset: 3139
The percent of duplicated rows: 7.45 %
New size of the dataset: 2905
MACCS FP for protein beta2 loaded
The initial size of dataset: 1667
The percent of rows with missing Ki values: 51.35 %
New size of the dataset: 811
The initial size of dataset: 811
The percent of duplicated rows: 4.44 %
New size of the dataset: 775
MACCS FP for protein d2 loaded
The initial size of dataset: 11821
The percent of rows with missing Ki values: 9.09 %
New size of the dataset: 10746
The initial size of dataset: 10746
The percent of duplicated rows: 7.57 %
New size of the dataset: 