In [1]:
import scipy 
import numpy as np
import pandas as pd
import pandas.api.types as ptypes
import matplotlib.pyplot as plt
import seaborn as sns
import rdkit
import os
import sys

In [142]:
class DataAnalysis():
    """Fingerprint-bit frequency analysis for one protein/fingerprint dataset.

    On construction the cleaned CSV registered for ``protein`` + ``fingerprint``
    is loaded into ``self.df``. Every column other than the target-value and
    activity-class columns is treated as a fingerprint bit column
    (presumably 0/1 indicators, so column sum / row count is an occurrence
    rate -- TODO confirm against the cleaned datasets).

    Parameters
    ----------
    protein : str
        Protein identifier; known values are listed in ``self.proteins_``.
    fingerprint : str
        Fingerprint identifier; known values are listed in ``self.fingerprints_``.
    target_value : str, default 'Ki'
        Name of the target-value column, excluded from the bit columns.
    activity_class : str, default 'Class'
        Name of the activity-class column, excluded from the bit columns.
        Rows where this column equals 1 are treated as active.

    Attributes
    ----------
    df : pandas.DataFrame or None
        The loaded dataset, or None when the combination is not registered.
    frequencies : pandas.DataFrame or None
        Set by ``calculate_frequency`` (columns ``Key``, ``Frequency``).
    differences : pandas.DataFrame or None
        Set by ``calculate_difference`` (columns ``Key``, ``Difference``).
    """

    def __init__(self, protein, fingerprint, target_value='Ki', activity_class='Class'):
        # Put the parent directory on the import path, but only once: the
        # class is instantiated in a loop and would otherwise keep appending
        # duplicate entries.
        if '..' not in sys.path:
            sys.path.append('..')

        self.proteins_ = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
        self.fingerprints_ = ['Klek', 'Sub', 'MACCS']
        # '<protein>_<fingerprint>' -> path of the cleaned CSV for that pair.
        self.data_paths_ = {
            f'{p}_{fp}': f'./cleaned_datasets/{p}_{fp}_clean.csv'
            for p in self.proteins_
            for fp in self.fingerprints_
        }

        self.y_col = target_value
        self.act_class = activity_class

        self.protein = protein
        self.fingerprint = fingerprint
        # Lookup key into data_paths_; also used as the output file stem.
        self.path = self.protein + '_' + self.fingerprint
        if self.path in self.data_paths_:
            self.df = pd.read_csv(self.data_paths_[self.path])
            print(f'{self.fingerprint} FP for protein {self.protein} loaded')
        else:
            self.df = None
            print("Protein and fingerprint combination not found")

        self.frequencies = None
        self.differences = None

    def _require_data(self):
        """Return the loaded DataFrame, or raise ValueError if none was loaded."""
        if self.df is None:
            raise ValueError(
                f"No dataset loaded for '{self.path}'; "
                "check the protein and fingerprint combination"
            )
        return self.df

    def _bit_columns(self, frame):
        """Return ``frame`` without the target-value and activity-class columns."""
        return frame.drop([self.y_col, self.act_class], axis=1)

    @staticmethod
    def _percentages(bits_df):
        """Return per-column ``sum / number of rows * 100`` as a NumPy array."""
        return np.array(bits_df.sum()) / len(bits_df) * 100

    # Task 1
    def calculate_frequency(self):
        """Compute how often each bit column is set, as a percentage of all rows.

        Stores the result in ``self.frequencies`` (columns ``Key`` and
        ``Frequency``, rounded to 4 decimals, sorted by descending frequency).

        Raises
        ------
        ValueError
            If no dataset was loaded.
        """
        bits_df = self._bit_columns(self._require_data())
        frequency = self._percentages(bits_df).round(4)
        self.frequencies = pd.DataFrame(
            {"Key": bits_df.columns, "Frequency": frequency}
        ).sort_values(by='Frequency', ascending=False)

    def save_frequency(self):
        """Write ``self.frequencies`` to ``./frequencies/<protein>_<fingerprint>_frequency.csv``.

        The output directory is created if it does not exist.

        Raises
        ------
        ValueError
            If ``calculate_frequency`` has not been called yet.
        """
        if self.frequencies is None:
            raise ValueError("Nothing to save; call calculate_frequency() first")
        path = './frequencies/' + self.path + '_frequency.csv'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.frequencies.to_csv(path_or_buf=path, sep=',', index=False, columns=['Key', 'Frequency'])

    # Task 2
    def calculate_difference(self):
        """Compute, per bit column, active-minus-inactive frequency in percentage points.

        Rows whose activity-class column (``self.act_class``) equals 1 are
        active; all other rows are inactive. Stores the result in
        ``self.differences`` (columns ``Key`` and ``Difference``, rounded to
        4 decimals, sorted by ascending difference).

        Raises
        ------
        ValueError
            If no dataset was loaded.
        """
        df = self._require_data()
        # Use the configured activity column rather than a hard-coded 'Class',
        # so a custom ``activity_class`` passed to __init__ is honoured.
        selection = df[self.act_class] == 1

        active_df = self._bit_columns(df[selection])
        inactive_df = self._bit_columns(df[~selection])

        active_frequency = self._percentages(active_df)
        inactive_frequency = self._percentages(inactive_df)
        differences = (active_frequency - inactive_frequency).round(4)
        self.differences = pd.DataFrame(
            {'Key': active_df.columns, 'Difference': differences}
        ).sort_values(by='Difference')

    def save_difference(self):
        """Write ``self.differences`` to ``./differences/<protein>_<fingerprint>_difference.csv``.

        The output directory is created if it does not exist.

        Raises
        ------
        ValueError
            If ``calculate_difference`` has not been called yet.
        """
        if self.differences is None:
            raise ValueError("Nothing to save; call calculate_difference() first")
        path = './differences/' + self.path + '_difference.csv'
        os.makedirs(os.path.dirname(path), exist_ok=True)
        self.differences.to_csv(path_or_buf=path, sep=',', index=False, columns=['Key', 'Difference'])
        

In [104]:
# Protein and fingerprint identifiers; each (protein, fingerprint) pair
# selects one dataset to analyse.
proteins = [
    '5ht1a',
    '5ht7',
    'beta2',
    'd2',
    'h1',
]
fingerprints = [
    'Klek',
    'Sub',
    'MACCS',
]

In [None]:
Task 1
Identify the most frequently occurring substructures

Task 2 
Identify the substructures with the biggest difference in occurrence frequency between active and inactive compounds

In [143]:
# For every protein/fingerprint pair: load the dataset, then compute and save
# the bit frequencies (task 1) and the active-vs-inactive differences (task 2).
for protein, fingerprint in ((p, fp) for p in proteins for fp in fingerprints):
    data = DataAnalysis(protein, fingerprint)

    # Task 1: overall frequency of each bit column.
    data.calculate_frequency()
    data.save_frequency()

    # Task 2: frequency difference between active and inactive compounds.
    data.calculate_difference()
    data.save_difference()

Klek FP for protein 5ht1a loaded
Sub FP for protein 5ht1a loaded
MACCS FP for protein 5ht1a loaded
Klek FP for protein 5ht7 loaded
Sub FP for protein 5ht7 loaded
MACCS FP for protein 5ht7 loaded
Klek FP for protein beta2 loaded
Sub FP for protein beta2 loaded
MACCS FP for protein beta2 loaded
Klek FP for protein d2 loaded
Sub FP for protein d2 loaded
MACCS FP for protein d2 loaded
Klek FP for protein h1 loaded
Sub FP for protein h1 loaded
MACCS FP for protein h1 loaded
