In [1]:
import scipy 
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit
import os
import sys

In [41]:
class DataProcessor():
    """
    Load a molecular-fingerprint dataset for one protein into a DataFrame and
    clean it. The cleaned dataset can be written out as a csv file.

    Parameters
    ----------
    protein : str
        The protein name, one of ['5ht1a', '5ht7', 'beta2', 'd2', 'h1'].
    fingerprint : str
        The fingerprint type, one of ['Klek', 'Sub', 'MACCS'].
    y_col : str, optional
        The name of the column representing the dependent variable, default is 'Ki'.

    Attributes
    ----------
    data_paths_ : dict
        Maps '<protein>_<fingerprint>' keys to the csv file holding that dataset.
    proteins_ : list of str
        The list of valid protein names.
    fingerprints_ : list of str
        The list of valid fingerprint types.
    y_col : str
        The name of the dependent-variable column.
    protein : str
        The protein name.
    fingerprint : str
        The fingerprint type.
    path : str
        The '<protein>_<fingerprint>' key. It is used to look up the input file
        in ``data_paths_`` and to build the name of the cleaned output file.
    df : pandas.DataFrame or None
        The loaded fingerprint data, or None when the protein/fingerprint
        combination is not a key of ``data_paths_``.

    Methods
    -------
    remove_missing()
        Removes rows with missing values in the dependent variable column.
    remove_duplicates()
        Removes duplicate rows in the DataFrame (the first occurrence is kept).
    remove_redundant()
        Removes columns whose sum is not positive; the dependent variable
        column is always kept.
    convert_data()
        Casts the dependent variable column to float and all other columns to int.
    write_cleaned()
        Writes the cleaned DataFrame to a csv file.
    """

    def __init__(self, protein, fingerprint, y_col='Ki'):
        # Keep the parent directory on sys.path, but do not add a duplicate
        # entry every time a processor is created.
        if '..' not in sys.path:
            sys.path.append('..')

        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
        self.fingerprints_ = ['Klek', 'Sub', 'MACCS']
        # One csv per protein/fingerprint pair, e.g.
        # '5ht1a_Klek' -> './datasets/5ht1a_KlekFP.csv'.
        self.data_paths_ = {
            f'{prot}_{fp}': f'./datasets/{prot}_{fp}FP.csv'
            for prot in self.proteins_
            for fp in self.fingerprints_
        }
        self.y_col = y_col

        self.protein = protein
        self.fingerprint = fingerprint
        self.path = self.protein + '_' + self.fingerprint
        if self.path in self.data_paths_:
            self.df = pd.read_csv(self.data_paths_[self.path])
            print(f'{self.fingerprint} FP for protein {self.protein} loaded')
        else:
            # Unknown combination: report it and leave the processor without data.
            self.df = None
            print("Protein and fingerprint combination not found")

    @staticmethod
    def _percent(count, total):
        """Return ``count`` as a percentage of ``total``; 0.0 when ``total`` is 0."""
        return count / total * 100 if total else 0.0

    def remove_missing(self):
        """Drop the rows whose dependent variable is missing and report the share removed."""
        n_rows = len(self.df)
        print(f'The initial size of dataset: {n_rows}')
        missing = self.df[self.y_col].isnull()
        print(f'The percent of rows with missing {self.y_col} values: '
              f'{self._percent(missing.sum(), n_rows):.2f} %')
        self.df = self.df[~missing]
        print(f'New size of the dataset: {len(self.df)}')

    def remove_duplicates(self):
        """Drop fully duplicated rows (keeping the first one) and report the share removed."""
        n_rows = len(self.df)
        print(f'The initial size of dataset: {n_rows}')
        duplicates = self.df.duplicated(keep='first')
        print(f'The percent of duplicated rows: '
              f'{self._percent(duplicates.sum(), n_rows):.2f} %')
        self.df = self.df[~duplicates]
        print(f'New size of the dataset: {len(self.df)}')

    def remove_redundant(self):
        """Drop the columns whose sum is not positive and report how many were removed.

        The dependent variable column is never dropped, whatever its sum is.
        """
        n_before = len(self.df.columns)
        keep = self.df.sum(axis=0) > 0
        if self.y_col in keep.index:
            # The target must survive even if its values sum to zero or less.
            keep[self.y_col] = True
        # Positional boolean mask, one flag per column.
        self.df = self.df.loc[:, keep.to_numpy()]
        n_after = len(self.df.columns)
        print(f'There were {n_before-n_after} redundant columns in the dataset.')

    def convert_data(self):
        """Cast the dependent variable column to float and every other column to int."""
        # Select the target by name rather than by position, so it is never
        # truncated to int when it is not the first column.
        dtypes = {col: int for col in self.df.columns if col != self.y_col}
        dtypes[self.y_col] = float
        self.df = self.df.astype(dtypes)

    def write_cleaned(self):
        """Write the DataFrame to './cleaned_datasets/<protein>_<fingerprint>_clean.csv'."""
        write_path = './cleaned_datasets/' + self.path + '_clean.csv'
        # Create the output directory on first use instead of failing in to_csv.
        os.makedirs(os.path.dirname(write_path), exist_ok=True)
        self.df.to_csv(path_or_buf=write_path, sep=',', index=False)
        print(f'Cleaned file saved at {write_path}')

In [37]:
# Load the MACCS fingerprint dataset for protein 5ht1a.
data = DataProcessor(protein='5ht1a', fingerprint='MACCS')
# Cleaning steps; each call replaces data.df and prints a short report.
data.remove_missing()
data.remove_duplicates()
data.remove_redundant()
data.convert_data()

# Save the cleaned frame as a csv file.
data.write_cleaned()

MACCS FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 8.33 %
New size of the dataset: 5161
There were 18 redundant columns in the dataset.
Cleaned file saved at ./cleaned_datasets/5ht1a_MACCS_test.csv


In [27]:
# Flag the columns of the cleaned frame whose sum is positive, then show the
# per-column sums of exactly those columns.
column_sums = data.df.sum(axis=0)
selection = (column_sums > 0).tolist()
print(data.df.loc[:, selection].sum(axis=0))


Ki            5.279294e+08
MACCSFP3      4.000000e+00
MACCSFP8      3.000000e+01
MACCSFP11     8.400000e+01
MACCSFP13     4.000000e+00
                  ...     
MACCSFP162    5.142000e+03
MACCSFP163    5.158000e+03
MACCSFP164    4.678000e+03
MACCSFP165    5.160000e+03
MACCSFP166    3.110000e+02
Length: 149, dtype: float64


In [40]:
# Clean and save every protein / fingerprint dataset.
proteins = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
keys = ['Klek', 'Sub', 'MACCS']

# Same visiting order as nested loops: all fingerprints of one protein,
# then the next protein.
for protein, key in [(p, k) for p in proteins for k in keys]:
    data = DataProcessor(protein=protein, fingerprint=key)
    data.remove_missing()
    data.remove_duplicates()
    data.remove_redundant()
    data.convert_data()
    data.write_cleaned()
    print('\n')
    # Release the frame before the next dataset is loaded.
    del data

Klek FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 7.66 %
New size of the dataset: 5199
There were 3263 redundant columns in the dataset.
Cleaned file saved at ./cleaned_datasets/5ht1a_Klek_clean.csv


Sub FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 10.23 %
New size of the dataset: 5054
There were 190 redundant columns in the dataset.
Cleaned file saved at ./cleaned_datasets/5ht1a_Sub_clean.csv


MACCS FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 8.33 %
New size of the dataset: 5161
There were 18 redund