In [8]:
import numpy as np
import pandas as pd

In [12]:
proteins = ['5ht1a', '5ht7', 'beta2', 'd2', 'h1']
for protein in proteins:
    df = pd.read_csv(f"./klek_clean/{protein}_klek_100nM.csv")
    print(len(df))

5250
2963
782
10170
1691


In [13]:
class FP_Sampler:
    def __init__(self, protein, dtype: ['100nM', 'balanced'], magnitude=1, add_random=False):
        self.protein = protein
        self.dtype = dtype
        self.magnitude = magnitude
        self.add_random = add_random
        self.path = f"./fp_frequency_{self.dtype}/{self.protein}_frequency.csv"
        self.df = pd.read_csv(self.path, sep=',')
        self.sizes = {
            '5ht1a': 5250,
            '5ht7': 2963,
            'beta2': 782,
            'd2': 10170,
            'h1': 1691
        }
        print(f"{self.dtype} dataset for {self.protein.upper()} loaded")
        self.make_dummy_df()
        self.read_original()
        self.combine_df()
        self.convert_to_proba()

        #print(self.fp_df.head())
  
    def make_dummy_df(self):
        dummy_dict = {'KEYS': [f"KLEK_{n}" for n in range(4860)]}
        self.dummy_df = pd.DataFrame(dummy_dict)
        
    def read_original(self):
        self.org_df = pd.read_csv(f"./counts_full_{self.dtype}.csv", sep=',')
        self.org_df = self.org_df.loc[:,[f"{self.protein}", "KEYS"]]
        self.org_df['Freq'] = self.org_df[f'{self.protein}'] / self.sizes[f"{self.protein}"]
        #print(self.org_df.head())
        
    def combine_df(self):
        self.fp_df = self.dummy_df.merge(self.org_df, on='KEYS')
        self.fp_df = self.fp_df.merge(self.df, on='KEYS').drop(columns=['SMARTS'])
        #print(self.fp_df.head())
        
    def convert_to_proba(self):
        self.fp_df['Probability'] = self.fp_df['Freq'] * ((self.fp_df[f"{self.protein}_percentage"] * self.magnitude + 100)/100)
        self.fp_df['Probability'] = self.fp_df['Probability'] / self.fp_df['Probability'].sum()
        if self.add_random:
            self.fp_df['Probability'] = pd.Series([0.0001 if x < 0.0001 else x for x in self.fp_df['Probability']])
            self.fp_df['Probability'] = self.fp_df['Probability'] / self.fp_df['Probability'].sum()
        print(self.fp_df.head())
        
    def generate_fingerprints(self, av_bits=60, n=1000):
        fps = []
        leng = []
        for fp in range(n):
            vec1 = np.array(self.fp_df['Probability']) * av_bits
            vec2 = np.random.rand(4860)
            fp = (vec1 > vec2).astype('int')
            fps.append(fp)
            leng.append(np.sum(fp))
        print(f"Generated {n} vectors with mean length of {np.mean(leng):.3f} and SD of {np.std(leng):.3f}")
        return np.array(fps)

In [22]:
['5ht1a', '5ht7', 'beta2', 'd2', 'h1']

sampler_5ht1a = FP_Sampler('5ht1a', '100nM', 10, False)
sampler_5ht7 = FP_Sampler('5ht7', '100nM', 10, False)
sampler_beta2 = FP_Sampler('beta2', '100nM', 10, False)
sampler_d2 = FP_Sampler('d2', '100nM', 10, False)
sampler_h1 = FP_Sampler('h1', '100nM', 10, False)

100nM dataset for 5HT1A loaded
     KEYS  5ht1a      Freq  5ht1a_percentage  Probability
0  KLEK_0   2806  0.534476             9.829     0.009169
1  KLEK_1    338  0.064381             1.638     0.000648
2  KLEK_2      0  0.000000             0.000     0.000000
3  KLEK_3      0  0.000000             0.000     0.000000
4  KLEK_4      0  0.000000             0.000     0.000000
100nM dataset for 5HT7 loaded
     KEYS  5ht7      Freq  5ht7_percentage  Probability
0  KLEK_0  1298  0.438070            4.792     0.008679
1  KLEK_1   117  0.039487           -1.114     0.000470
2  KLEK_2     0  0.000000            0.000     0.000000
3  KLEK_3     0  0.000000            0.000     0.000000
4  KLEK_4     0  0.000000            0.000     0.000000
100nM dataset for BETA2 loaded
     KEYS  beta2      Freq  beta2_percentage  Probability
0  KLEK_0    705  0.901535            -8.312     0.003829
1  KLEK_1     71  0.090793             2.685     0.002898
2  KLEK_2      0  0.000000             0.000     0

In [24]:
samples_5ht1a = sampler_5ht1a.generate_fingerprints(av_bits=60, n=1000)
samples_5ht7 = sampler_5ht7.generate_fingerprints(av_bits=60, n=1000)
samples_beta2 = sampler_beta2.generate_fingerprints(av_bits=60, n=1000)
samples_d2 = sampler_d2.generate_fingerprints(av_bits=60, n=1000)
samples_h1 = sampler_h1.generate_fingerprints(av_bits=60, n=1000)

Generated 1000 vectors with mean length of 54.930 and SD of 5.021
Generated 1000 vectors with mean length of 58.970 and SD of 5.368
Generated 1000 vectors with mean length of 63.884 and SD of 5.698
Generated 1000 vectors with mean length of 1056.047 and SD of 11.145
Generated 1000 vectors with mean length of 504.356 and SD of 10.679


In [26]:
df['5ht1a'] = samples_5ht1a
df['5ht7'] = samples_5ht7
df['beta2'] = samples_beta2 
df['samples_d2'] = samples_d2
df['samples_h1'] = samples_h1 
df.head()

ValueError: Cannot set a frame with no defined index and a value that cannot be converted to a Series