In [130]:
import scipy 
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit
import os
import sys

In [136]:
class DataProcessor():
    """Load one protein/fingerprint CSV and clean it step by step.

    The dataset is looked up by the key ``'<protein>_<fingerprint>'`` in
    ``data_paths_``. When the key is known the CSV is read into ``self.df``;
    otherwise a message is printed and ``self.df`` is left as ``None``.

    Parameters
    ----------
    protein : str
        Protein identifier, e.g. one of ``proteins_``.
    fingerprint : str
        Fingerprint identifier, e.g. one of ``fingerprints_``.
    y_col : str, default 'Ki'
        Name of the target column. Every other column is treated as an
        integer-valued feature by ``convert_data``.

    Attributes
    ----------
    data_paths_ : dict
        Maps ``'<protein>_<fingerprint>'`` to the relative CSV path.
    proteins_, fingerprints_ : list of str
        Known protein and fingerprint identifiers.
    path : str
        The lookup key built from ``protein`` and ``fingerprint``.
    df : pandas.DataFrame or None
        The loaded data, or ``None`` when the combination is unknown.
    """

    def __init__(self, protein, fingerprint, y_col='Ki'):
        # Keep the parent directory importable, but do not append a fresh
        # duplicate entry for every instance that gets created.
        if '..' not in sys.path:
            sys.path.append('..')

        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
        self.fingerprints_ = ['Klek', 'Sub', 'MACCS']
        # All paths follow one pattern, so generate them instead of listing
        # 15 literals. sorted() keeps the key order Klek, MACCS, Sub.
        self.data_paths_ = {
            f'{prot}_{fp}': f'./datasets/{prot}_{fp}FP.csv'
            for prot in self.proteins_
            for fp in sorted(self.fingerprints_)
        }
        self.y_col = y_col

        self.protein = protein
        self.fingerprint = fingerprint
        self.path = self.protein + '_' + self.fingerprint
        if self.path in self.data_paths_:
            self.df = pd.read_csv(self.data_paths_[self.path])
            print(f'{self.fingerprint} FP for protein {self.protein} loaded')
        else:
            self.df = None
            print("Protein and fingerprint combination not found")

    def _require_data(self):
        """Raise a clear error when no dataset is loaded (``self.df is None``)."""
        if self.df is None:
            raise RuntimeError(
                'No dataset is loaded for this DataProcessor; check the '
                'protein and fingerprint combination.'
            )

    @staticmethod
    def _percent(count, total):
        """Return ``count`` as a percentage of ``total`` (0.0 when ``total`` is 0)."""
        return count / total * 100 if total else 0.0

    def drop_missing(self):
        """Drop rows whose target (``y_col``) value is missing and report sizes.

        Raises
        ------
        RuntimeError
            If no dataset is loaded.
        """
        self._require_data()
        total = len(self.df)
        print(f'The initial size of dataset: {total}')
        missing = self.df[self.y_col].isnull()
        print(f'The percent of rows with missing {self.y_col} values: {self._percent(missing.sum(), total):.2f} %')
        # .copy() makes the filtered frame independent of the original so the
        # later column conversion does not trigger SettingWithCopyWarning.
        self.df = self.df[~missing].copy()
        print(f'New size of the dataset: {len(self.df)}')

    def remove_duplicates(self):
        """Drop fully duplicated rows (keeping the first) and reset the index.

        Raises
        ------
        RuntimeError
            If no dataset is loaded.
        """
        self._require_data()
        total = len(self.df)
        print(f'The initial size of dataset: {total}')
        duplicates = self.df.duplicated(keep='first')
        print(f'The percent of duplicated rows: {self._percent(duplicates.sum(), total):.2f} %')
        # reset_index returns a new frame; the result has to be assigned.
        # drop=True avoids inserting the old index as an extra column.
        self.df = self.df[~duplicates].reset_index(drop=True)
        print(f'New size of the dataset: {len(self.df)}')

    def convert_data(self):
        """Cast the target column to float and every other column to int.

        Columns are selected by name rather than by position, so the target
        is never truncated to int regardless of where it sits in the frame.

        Raises
        ------
        RuntimeError
            If no dataset is loaded.
        KeyError
            If ``y_col`` is not a column of the dataset.
        """
        self._require_data()
        dtype_map = {column: int for column in self.df.columns if column != self.y_col}
        dtype_map[self.y_col] = float
        self.df = self.df.astype(dtype_map)

    def write_cleaned(self, output_dir='./cleaned_datasets/'):
        """Write the current dataset to ``<output_dir>/<path>_clean.csv``.

        Parameters
        ----------
        output_dir : str, default './cleaned_datasets/'
            Target directory; it is created when it does not exist yet.

        Raises
        ------
        RuntimeError
            If no dataset is loaded.
        """
        self._require_data()
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)
        write_path = os.path.join(output_dir, self.path + '_clean.csv')
        self.df.to_csv(path_or_buf=write_path, sep=',')
        print(f'Cleaned file saved at {write_path}')

In [135]:
# Run the cleaning steps on one dataset (5ht1a protein, Klek fingerprint).
data = DataProcessor(fingerprint='Klek', protein='5ht1a')
# The steps return nothing and act on `data` in place, so call them in order.
for step in (data.drop_missing,
             data.convert_data,
             data.remove_duplicates,
             data.write_cleaned):
    step()

Klek FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
Index(['KRFP1', 'KRFP2', 'KRFP3', 'KRFP4', 'KRFP5', 'KRFP6', 'KRFP7', 'KRFP8',
       'KRFP9', 'KRFP10',
       ...
       'KRFP4851', 'KRFP4852', 'KRFP4853', 'KRFP4854', 'KRFP4855', 'KRFP4856',
       'KRFP4857', 'KRFP4858', 'KRFP4859', 'KRFP4860'],
      dtype='object', length=4860)
The initial size of dataset: 5630
The percent of duplicated rows: 7.66 %
New size of the dataset: 5199
Cleaned file saved at ./cleaned_datasets/5ht1a_Klek_clean.csv


In [138]:
proteins = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
keys = ['Klek', 'Sub', 'MACCS']

# Clean and export every protein/fingerprint combination, proteins in the
# outer position and fingerprints in the inner one.
for protein, key in [(p, k) for p in proteins for k in keys]:
    data = DataProcessor(protein=protein, fingerprint=key)
    data.drop_missing()
    data.remove_duplicates()
    data.convert_data()
    data.write_cleaned()
    print('\n')
    # Drop the reference so the frame can be freed before the next one loads.
    del data

Klek FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 7.66 %
New size of the dataset: 5199
Cleaned file saved at ./cleaned_datasets/5ht1a_Klek_clean.csv


Sub FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 10.23 %
New size of the dataset: 5054
Cleaned file saved at ./cleaned_datasets/5ht1a_Sub_clean.csv


MACCS FP for protein 5ht1a loaded
The initial size of dataset: 5851
The percent of rows with missing Ki values: 3.78 %
New size of the dataset: 5630
The initial size of dataset: 5630
The percent of duplicated rows: 8.33 %
New size of the dataset: 5161
Cleaned file saved at ./cleaned_datasets/5ht1a_MACCS_clean.csv


Klek FP for protein 5ht7 loaded
The initial size of da