In [3]:
import math
import keras
import csv
import json
import os
import time
from multiprocessing import Process

Using TensorFlow backend.


In [10]:
class Data():
    """Build per-directory fingerprint bit vectors and 0/1 labels.

    After construction, for every entry ``d`` of ``dir_list``:

    * ``self.x[d]`` is a list of bit lists (see :meth:`convert`): first the
      rows selected through the ID file, then every data row of
      ``<zinc><d>drug_3D.fp``.
    * ``self.y[d]`` is the parallel label list: ``0`` for the selected rows,
      ``1`` for the ``drug_3D.fp`` rows.

    Parameters
    ----------
    zinc : str
        Root directory of the fingerprint files. It is concatenated directly
        with the directory and file names, so it must end with a path
        separator.
    dir_list : iterable of str, optional
        Fingerprint sub-directories (each ending with ``/``). Defaults to
        ``['FP2/', 'FP3/', 'FP4/', 'MACCS/']``.
    selected_file : str
        Tab-separated file whose first column holds the IDs to select; lines
        starting with ``#`` are ignored.
    """

    def __init__(self,
                 zinc='/home3/jwang/druglikeness_ML/zinc/fp/',
                 dir_list=None,
                 selected_file='/home3/jwang/druglikeness_ML/zinc/zinc_decoy_10k/decoy_10k_selected.regid'):
        self.zinc = zinc
        if dir_list is None:
            self.dir_list = ['FP2/', 'FP3/', 'FP4/', 'MACCS/']
        else:
            self.dir_list = list(dir_list)
        self.selected_file = selected_file

        # Order matters: load_data needs both drug_list and drug_dic.
        self.load_drug_names()
        self.load_drug_dic()
        self.load_data()

    def load_data(self):
        """Populate ``self.x`` and ``self.y`` for every directory in ``dir_list``.

        Row numbers come from the index built by :meth:`load_drug_dic` (which
        scans ``FP2/`` only) and are applied unchanged to every directory.
        NOTE(review): this presumes the files in the other directories are
        row-aligned with their ``FP2/`` counterparts -- confirm.

        Raises
        ------
        KeyError
            If an ID in ``self.drug_list`` is missing from ``self.drug_dic``.
        OSError
            If a fingerprint file cannot be opened.
        """
        # Group the wanted row numbers by the file that holds them. A set makes
        # the per-row membership test below O(1) instead of a list scan.
        rows_by_file = {}
        for drug_id in self.drug_list:
            file_name, row_number = self.get_drug_location(drug_id)
            rows_by_file.setdefault(file_name, set()).add(row_number)
        total_files = len(rows_by_file)

        self.x = {}
        self.y = {}
        for directory in self.dir_list:
            features = []
            self.x[directory] = features
            print(directory)

            for file_index, (file_name, wanted) in enumerate(rows_by_file.items()):
                start = time.time()
                with open(self.zinc + directory + file_name, 'r') as handle:
                    reader = csv.reader(handle, delimiter='\t')
                    # Every row is counted (blank ones included) so numbering
                    # matches the index built in _build_drug_dic.
                    for row_number, row in enumerate(reader):
                        if row_number in wanted:
                            features.append(self.convert(row[0]))
                print('{}: {} of {} - {}'.format(
                    file_name, file_index, total_files,
                    round((time.time() - start), 2)))

            # Everything gathered so far gets label 0.
            selected_count = len(features)

            with open(self.zinc + directory + 'drug_3D.fp', 'r') as handle:
                for row in csv.reader(handle, delimiter='\t'):
                    # Skip blank lines, empty first fields and '#' comments.
                    if not row or not row[0] or row[0].startswith('#'):
                        continue
                    features.append(self.convert(row[0]))

            drug_count = len(features) - selected_count
            self.y[directory] = [0] * selected_count + [1] * drug_count

    def get_drug_location(self, drug_id):
        """Look up ``drug_id`` in the character trie ``self.drug_dic``.

        Returns the value stored under the final character: a
        ``(file_name, row_number)`` pair (a two-item list when the trie was
        loaded from JSON). Returns ``None`` for an empty ``drug_id``.

        Raises
        ------
        KeyError
            If some character of ``drug_id`` has no entry along the path.
        """
        if not drug_id:
            return None
        node = self.drug_dic
        for char in drug_id:
            node = node[char]
        return node

    def load_drug_dic(self):
        """Set ``self.drug_dic`` from the cheapest available source.

        Tried in order:

        1. a module-level ``drug_dic`` left over from a previous run in the
           same kernel (only ``NameError`` is treated as "not available");
        2. the JSON cache ``fp/drug_dic.json`` (relative to the working
           directory); a missing/unreadable file or invalid JSON falls through;
        3. a fresh scan of ``<zinc>FP2/``.
        """
        try:
            self.drug_dic = drug_dic
            return
        except NameError:
            pass

        try:
            with open('fp/drug_dic.json', 'r') as handle:
                self.drug_dic = json.load(handle)
            return
        except (OSError, ValueError):
            # json.JSONDecodeError is a ValueError subclass.
            pass

        self.drug_dic = self._build_drug_dic()

    def _build_drug_dic(self):
        """Scan every file in ``<zinc>FP2/`` and return the name trie.

        Each row with at least two tab-separated fields contributes its second
        field as a name. The trie is a dict nested one level per character;
        the last character maps to ``(file_name, row_number)``.
        NOTE(review): a name that is a strict prefix of another name cannot be
        represented in this layout -- presumably all IDs have equal length;
        confirm.
        """
        directory = self.zinc + 'FP2/'
        file_names = os.listdir(directory)
        total_files = len(file_names)
        # Started once, so the printed time is cumulative across files.
        start = time.time()
        trie = {}
        for file_index, file_name in enumerate(file_names):
            with open(directory + file_name, 'r') as handle:
                reader = csv.reader(handle, delimiter='\t')
                for row_number, row in enumerate(reader):
                    if len(row) < 2 or not row[1]:
                        continue
                    name = row[1]
                    node = trie
                    for char in name[:-1]:
                        node = node.setdefault(char, {})
                    node[name[-1]] = (file_name, row_number)
            print('{}: {} of {} - {}'.format(
                file_name, file_index, total_files,
                round((time.time() - start), 2)))
        return trie

    def load_drug_names(self):
        """Set ``self.drug_list`` to the IDs that should be selected.

        Reuses a module-level ``drug_list`` when one exists (only
        ``NameError`` is treated as "not available"); otherwise reads the
        first column of ``self.selected_file``, skipping blank lines, empty
        first fields and lines starting with ``#``. Each ID read from the file
        is printed.
        """
        try:
            self.drug_list = drug_list
            return
        except NameError:
            pass

        names = []
        with open(self.selected_file, 'r') as handle:
            for row in csv.reader(handle, delimiter='\t'):
                if not row or not row[0] or row[0].startswith('#'):
                    continue
                print(row[0])
                names.append(row[0])
        self.drug_list = names

    def convert(self, string):
        """Expand a hexadecimal string into a flat list of bits.

        Each character yields four bits, least-significant bit first.
        Characters are read in swapped pairs: output slot ``i`` is filled from
        ``string[i ^ 1]`` (slot 0 from character 1, slot 1 from character 0,
        and so on). Upper- and lower-case hex digits are equivalent.

        Returns
        -------
        list of int
            ``4 * len(string)`` values, each 0 or 1.

        Raises
        ------
        IndexError
            If ``string`` has odd length (the last character has no partner).
        ValueError
            If a character is not a hexadecimal digit.
        """
        length = len(string)
        code = [0] * (length * 4)
        for position in range(length):
            # XOR with 1 maps 0<->1, 2<->3, ...: the other member of the pair.
            value = int(string[position ^ 1], 16)
            base = position * 4
            for bit in range(4):
                code[base + bit] = (value >> bit) & 1
        return code

In [11]:
# Build the dataset, then keep the ID list and the name trie as notebook
# globals: Data.load_drug_names / Data.load_drug_dic look for these names
# and reuse them when the class is instantiated again in the same kernel.
data = Data()
drug_list, drug_dic = data.drug_list, data.drug_dic

16_p0.45.fp: 0 of 151 - 0.95
16_p0.24.fp: 1 of 151 - 1.91
16_p1.11.fp: 2 of 151 - 2.8
16_p0.56.fp: 3 of 151 - 3.77
16_p0.83.fp: 4 of 151 - 4.77
16_p0.97.fp: 5 of 151 - 5.76
16_p0.101.fp: 6 of 151 - 6.77
16_p0.116.fp: 7 of 151 - 7.77
16_p0.49.fp: 8 of 151 - 8.75
16_p0.62.fp: 9 of 151 - 9.76
16_p0.4.fp: 10 of 151 - 10.76
16_p0.3.fp: 11 of 151 - 11.79
16_p0.82.fp: 12 of 151 - 12.82
drug_3D.fp: 13 of 151 - 12.85
16_p1.14.fp: 14 of 151 - 13.78
16_p0.18.fp: 15 of 151 - 14.78
16_p0.80.fp: 16 of 151 - 15.78
16_p0.111.fp: 17 of 151 - 16.86
16_p0.123.fp: 18 of 151 - 17.88
16_p1.16.fp: 19 of 151 - 18.76
16_p1.0.fp: 20 of 151 - 19.68
16_p0.22.fp: 21 of 151 - 20.7
16_p0.108.fp: 22 of 151 - 21.72
16_p1.15.fp: 23 of 151 - 22.62
16_p0.114.fp: 24 of 151 - 23.65
16_p1.17.fp: 25 of 151 - 24.57
16_p0.86.fp: 26 of 151 - 25.61
16_p0.119.fp: 27 of 151 - 26.63
16_p0.76.fp: 28 of 151 - 27.65
16_p0.2.fp: 29 of 151 - 28.68
16_p0.65.fp: 30 of 151 - 29.72
16_p0.84.fp: 31 of 151 - 30.75
16_p0.98.fp: 32 of 151 - 31.

16_p0.109.fp: 115 of 150 - 0.54
16_p0.23.fp: 116 of 150 - 0.54
16_p1.0.fp: 117 of 150 - 0.37
16_p0.41.fp: 118 of 150 - 0.48
16_p0.84.fp: 119 of 150 - 0.49
16_p0.124.fp: 120 of 150 - 0.34
16_p0.106.fp: 121 of 150 - 0.53
16_p1.13.fp: 122 of 150 - 0.56
16_p0.2.fp: 123 of 150 - 0.51
16_p1.17.fp: 124 of 150 - 0.35
16_p0.12.fp: 125 of 150 - 0.48
16_p0.101.fp: 126 of 150 - 0.44
16_p0.78.fp: 127 of 150 - 0.55
16_p0.68.fp: 128 of 150 - 0.49
16_p0.71.fp: 129 of 150 - 0.5
16_p0.108.fp: 130 of 150 - 0.46
16_p0.4.fp: 131 of 150 - 0.46
16_p0.20.fp: 132 of 150 - 0.53
16_p0.19.fp: 133 of 150 - 0.51
16_p0.13.fp: 134 of 150 - 0.52
16_p0.70.fp: 135 of 150 - 0.47
16_p0.3.fp: 136 of 150 - 0.51
16_p1.16.fp: 137 of 150 - 0.34
16_p0.24.fp: 138 of 150 - 0.49
16_p0.7.fp: 139 of 150 - 0.5
16_p0.122.fp: 140 of 150 - 0.4
16_p0.82.fp: 141 of 150 - 0.43
16_p1.12.fp: 142 of 150 - 0.41
16_p0.120.fp: 143 of 150 - 0.35
16_p0.121.fp: 144 of 150 - 0.39
16_p0.123.fp: 145 of 150 - 0.33
16_p0.125.fp: 146 of 150 - 0.35
16_p1.

16_p0.49.fp: 87 of 150 - 0.26
16_p0.91.fp: 88 of 150 - 0.29
16_p0.27.fp: 89 of 150 - 0.3
16_p0.85.fp: 90 of 150 - 0.27
16_p0.115.fp: 91 of 150 - 0.27
16_p1.1.fp: 92 of 150 - 0.32
16_p0.56.fp: 93 of 150 - 0.26
16_p0.118.fp: 94 of 150 - 0.32
16_p0.42.fp: 95 of 150 - 0.33
16_p0.57.fp: 96 of 150 - 0.31
16_p1.21.fp: 97 of 150 - 0.37
16_p0.30.fp: 98 of 150 - 0.3
16_p0.90.fp: 99 of 150 - 0.23
16_p0.51.fp: 100 of 150 - 0.28
16_p0.43.fp: 101 of 150 - 0.28
16_p0.36.fp: 102 of 150 - 0.29
16_p0.67.fp: 103 of 150 - 0.3
16_p0.114.fp: 104 of 150 - 0.27
16_p1.18.fp: 105 of 150 - 0.26
16_p0.44.fp: 106 of 150 - 0.28
16_p0.59.fp: 107 of 150 - 0.3
16_p0.47.fp: 108 of 150 - 0.32
16_p0.55.fp: 109 of 150 - 0.3
16_p0.33.fp: 110 of 150 - 0.28
16_p0.0.fp: 111 of 150 - 0.3
16_p0.28.fp: 112 of 150 - 0.28
16_p0.10.fp: 113 of 150 - 0.33
16_p0.105.fp: 114 of 150 - 0.33
16_p0.109.fp: 115 of 150 - 0.33
16_p0.23.fp: 116 of 150 - 0.32
16_p1.0.fp: 117 of 150 - 0.18
16_p0.41.fp: 118 of 150 - 0.27
16_p0.84.fp: 119 of 150 -

In [46]:
# Number of fingerprint rows stored for the first directory key of data.x.
print(len(data.x[list(data.x)[0]]))

7201


In [29]:
# Write one JSON file per fingerprint directory: 'FP2/' -> fp/fp2.json, etc.
# Each file holds the feature rows under 'x' and the labels under 'y'.
for directory, features in data.x.items():
    target = 'fp/{}.json'.format(directory.replace('/', '').lower())
    with open(target, 'w') as handle:
        json.dump({'x': features, 'y': data.y[directory]}, handle)

In [58]:
# Read the MACCS dump back in as a round-trip check of the JSON written above.
with open('fp/maccs.json') as handle:
    test = json.load(handle)

In [17]:
# For each fingerprint directory: total number of rows, then the number
# labelled 1 (the labels are 0/1, so their sum is the count of ones).
for labels in data.y.values():
    print(len(labels))
    print(sum(labels))

17201
7201
17201
7201
17201
7201
17201
7201


In [28]:
# Show the first MACCS bit vector as a sanity check of Data.convert's output.
maccs_rows = data.x['MACCS/']
print(maccs_rows[0])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0]
