In [2]:
import psutil

def get_free_memory():
    memory = psutil.virtual_memory()
    return memory.available / (1024.0 ** 3)  # Convert bytes to gigabytes

print(f"Free CPU Memory: {get_free_memory():.2f} GB")

Free CPU Memory: 328.99 GB


In [3]:
import torch
torch.backends.cuda.matmul.allow_tf32 = True
from datasets import Dataset
import os
from deepgo.utils import Ontology


In [4]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()

In [5]:
tokenizer_path = "WordLevel_tokenizer_trained_InterPro.json"
tokenizer = tokenizer.from_file(tokenizer_path)
tokenizer.enable_truncation(512)

In [6]:
tokenizer.get_vocab_size()


544998

In [7]:
test = tokenizer.encode("WP_265490204 WP_206642677 WP_053312998 WP_251959347 WP_000076573 WP_227526754 WP_218401808 WP_106925592")
test.ids

[1, 2966, 754, 2545, 196, 9231, 2817, 7418, 2526, 2]

In [8]:
import json
with open("/home/toibazd/Most_frequent_IPs.json", "r") as f:
    ips = json.load(f)

sorted_dict = sorted(ips.items(), key=lambda x: x[1], reverse=True)
most_frequent_ips = [item[0] for item in sorted_dict[:100]]


In [9]:
print(most_frequent_ips[:10])

['IPR004090', 'IPR011701', 'IPR002514', 'IPR003719', 'IPR002155', 'IPR005750', 'IPR001001', 'IPR004604', 'IPR011603', 'IPR005252']


In [11]:
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer
import csv
from tqdm.auto import tqdm
import numpy as np



ip_to_go = defaultdict(list)
data_dict = defaultdict(list)
enc = MultiLabelBinarizer()
new_tsv_filename = "/home/toibazd/Family_IPs_with_GO.tsv"
go = Ontology('data/go.obo')


with open(new_tsv_filename, "r") as new_tsvfile:
    reader = csv.reader(new_tsvfile, delimiter="\t")
    next(reader)
    for row in tqdm(reader):
        ip = row[0]  # Assuming the IP is in the first column
        go_terms = row[6]  # Assuming the GO terms are in the second column

        # Add IP and corresponding GO terms to data_dict
        ip_to_go[ip] = go_terms.split(',')


with open("/home/toibazd/Prot2IP_GO_filtered_MF.tsv", "r") as tsvfile:
    reader = csv.reader(tsvfile, delimiter = "\t")
    for row in tqdm(reader):
        key = row[0].split("prot_")[1].split(".")[0]
        iprs = eval(row[1])
        
        # Filter InterPro IDs that are in the words list
#         filtered_iprs = [ipr for ipr in iprs if ipr in most_frequent_ips]
        filtered_iprs = iprs
        # Save only if there are filtered InterPro IDs
        for ip in iprs:
            if ip in most_frequent_ips:
                for GO in ip_to_go[ip]:
                    data_dict[key].extend(go.get_ancestors(GO))
                    print(go.get_ancestors(GO))

one_hot_encoded = enc.fit_transform(data_dict.values())
one_hot_encoded_dict = {key: value for key, value in zip(data_dict.keys(), one_hot_encoded)}

print(len(one_hot_encoded_dict.keys()))

0it [00:00, ?it/s]

0it [00:00, ?it/s]

{'GO:0005215', 'GO:0043225', 'GO:0003674', 'GO:0022804', 'GO:0015318', 'GO:0015399', 'GO:0015419', 'GO:0022857', 'GO:0042626', 'GO:0015116', 'GO:0140359', 'GO:0140657', 'GO:1901682', 'GO:0015103'}
{'GO:0051234', 'GO:0051179', 'GO:0072348', 'GO:0008150', 'GO:0006810', 'GO:0008272', 'GO:0015698'}
{'GO:0110165', 'GO:0016020', 'GO:0005575'}
{'GO:0005215', 'GO:0022857', 'GO:0003674'}
{'GO:0051234', 'GO:0051179', 'GO:0008150', 'GO:0055085', 'GO:0006810', 'GO:0009987'}
{'GO:0003674', 'GO:0016614', 'GO:0003824', 'GO:0016616', 'GO:0016491'}
{'GO:0003979', 'GO:0003674', 'GO:0016614', 'GO:0003824', 'GO:0016616', 'GO:0016491'}
{'GO:1901576', 'GO:0009058', 'GO:0044237', 'GO:0016051', 'GO:0005975', 'GO:0044249', 'GO:0008150', 'GO:0005976', 'GO:0044238', 'GO:0071704', 'GO:0009059', 'GO:0000271', 'GO:0043170', 'GO:0009987', 'GO:0008152'}
{'GO:0140640', 'GO:0003674', 'GO:0003824', 'GO:0008452', 'GO:0016874', 'GO:0016886', 'GO:0140098'}
{'GO:0034641', 'GO:0044237', 'GO:0008152', 'GO:0008150', 'GO:004423

{'GO:0140640', 'GO:0004518', 'GO:0003674', 'GO:0004540', 'GO:0016788', 'GO:0016787', 'GO:0003824', 'GO:0140098'}
{'GO:0005215', 'GO:0003674'}
{'GO:0043167', 'GO:0032553', 'GO:0005524', 'GO:0030554', 'GO:0017076', 'GO:0005488', 'GO:0036094', 'GO:0003674', 'GO:1901363', 'GO:0097159', 'GO:0032555', 'GO:0043168', 'GO:0000166', 'GO:0035639', 'GO:0032559', 'GO:1901265', 'GO:0097367'}
{'GO:0016462', 'GO:0003674', 'GO:0016817', 'GO:0017111', 'GO:0016818', 'GO:0016887', 'GO:0016787', 'GO:0003824'}
{'GO:0110165', 'GO:0016020', 'GO:0005575'}
{'GO:0003674', 'GO:0005488', 'GO:0003677', 'GO:0003676', 'GO:0097159'}
{'GO:0140640', 'GO:0008094', 'GO:0003674', 'GO:0003916', 'GO:0003918', 'GO:0016853', 'GO:0003824', 'GO:0140657', 'GO:0140097'}
{'GO:0043167', 'GO:0032553', 'GO:0005524', 'GO:0030554', 'GO:0017076', 'GO:0005488', 'GO:0036094', 'GO:0003674', 'GO:1901363', 'GO:0097159', 'GO:0032555', 'GO:0043168', 'GO:0000166', 'GO:0035639', 'GO:0032559', 'GO:1901265', 'GO:0097367'}
{'GO:0034641', 'GO:0016043

{'GO:0005215', 'GO:0022857', 'GO:0003674'}
{'GO:0051234', 'GO:0051179', 'GO:0008150', 'GO:0055085', 'GO:0006810', 'GO:0009987'}
{'GO:0003674', 'GO:0005488', 'GO:0003677', 'GO:0003676', 'GO:0097159'}
{'GO:0140640', 'GO:0003674', 'GO:0004803', 'GO:0003824', 'GO:0140097'}
{'GO:0034641', 'GO:0006313', 'GO:0044237', 'GO:0006725', 'GO:0006259', 'GO:0008150', 'GO:0044238', 'GO:0071704', 'GO:1901360', 'GO:0009987', 'GO:0006310', 'GO:0090304', 'GO:0006139', 'GO:0043170', 'GO:0046483', 'GO:0032196', 'GO:0006807', 'GO:0008152'}
{'GO:0005215', 'GO:0022857', 'GO:0003674'}
{'GO:0051234', 'GO:0051179', 'GO:0008150', 'GO:0055085', 'GO:0006810', 'GO:0009987'}
{'GO:0003674', 'GO:0005488', 'GO:0036094', 'GO:1901363', 'GO:0097159', 'GO:0000166', 'GO:1901265'}
{'GO:0043167', 'GO:0032553', 'GO:0005524', 'GO:0030554', 'GO:0017076', 'GO:0005488', 'GO:0036094', 'GO:0003674', 'GO:1901363', 'GO:0097159', 'GO:0032555', 'GO:0043168', 'GO:0000166', 'GO:0035639', 'GO:0032559', 'GO:1901265', 'GO:0097367'}
{'GO:001687

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [12]:
len(one_hot_encoded_dict)

21791

In [13]:
len(one_hot_encoded)

21791

In [14]:
# Find unique numbers and their counts
unique_numbers, counts = np.unique(one_hot_encoded, return_counts=True)
all_count = 0
# Print the count of each number
for number, count in zip(unique_numbers, counts):
    all_count+=count
    print(f"Number {number}: Count {count}")
print(all_count)

Number 0: Count 11496627
Number 1: Count 401259
11897886


In [15]:
import json
with open('BERT_DNN_senteces.json', "r") as f:
    one_hot_encoded_sentences = json.load(f)

In [16]:
one_hot_encoded_sentences = {key: value for key, value in one_hot_encoded_sentences.items() if value}
len(one_hot_encoded_sentences)

100

In [17]:
matching_string = one_hot_encoded_sentences.values()
len(matching_string)

100

In [18]:
matching_string = [item for sublist in matching_string for item in sublist]
len(matching_string)

20155

In [19]:
from transformers import BertModel

# Check if CUDA is available and choose device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = "/home/toibazd/Data/BERT/BERT_context_pretrained_InterPro_final"

model = BertModel.from_pretrained(model_path).cuda()
model.pooler = None
model.eval()

[2024-03-05 22:05:49,648] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of BertModel were not initialized from the model checkpoint at /home/toibazd/Data/BERT/BERT_context_pretrained_InterPro_final and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(544998, 256, padding_idx=0)
    (position_embeddings): Embedding(512, 256)
    (token_type_embeddings): Embedding(2, 256)
    (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [20]:
print(len(one_hot_encoded[1]))

546


In [21]:
batch_size = 128 # Define your batch size

embeddings = []
labels = []

# Define your data iterator in batches
for i in tqdm(range(0, len(matching_string), batch_size)):
    batch_sentences = matching_string[i:i+batch_size]

    batch_inputs = tokenizer.encode_batch(batch_sentences)
    input_ids_list = []
    attention_mask_list = []

    for encoding in batch_inputs:
        input_ids_list.append(encoding.ids)
        attention_mask_list.append(encoding.attention_mask)

    # Convert lists to tensors and move to device
    input_ids = torch.tensor(input_ids_list).cuda()
    attention_mask = torch.tensor(attention_mask_list).cuda()

    with torch.inference_mode():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.last_hidden_state
    relevant_hidden_states = hidden_states[:, 20, :]  # Extract the 20th token's hidden state
    
    for j in range(len(batch_sentences)):
        # Move embeddings to CPU
        embeddings.append(relevant_hidden_states[j].cpu())
        indicator = batch_sentences[j].split()[19]
        labels.append(one_hot_encoded_dict[indicator])

# Ensure order in embeddings matches order in labels

# Now embeddings and labels are stored on the CPU


  0%|          | 0/158 [00:00<?, ?it/s]

In [22]:
def calculate_pos_weights(class_counts):
    pos_weights = np.ones_like(class_counts)
    
    neg_counts = [len(embeddings)-pos_count for pos_count in class_counts]
    for cdx, (pos_count, neg_count) in enumerate(zip(class_counts,  neg_counts)):
      pos_weights[cdx] = neg_count / (pos_count + 1e-5)

    return torch.as_tensor(pos_weights, dtype=torch.float)
class_counts = np.array(labels).sum(axis=0)
pos_weights = calculate_pos_weights(class_counts) 

In [23]:
pos_weights

tensor([25., 34., 98.,  3., 66., 62., 54., 49.,  0.,  4.,  4., 43., 95., 89.,
        37.,  0., 93., 98., 96., 49., 32., 99., 49., 49., 48., 48., 91., 91.,
        62., 98., 51., 47., 97., 99., 97., 82., 51., 43., 19., 25., 38., 38.,
        91., 28., 20., 21., 21., 53., 99., 95., 23., 94., 94., 90., 18., 37.,
         2., 83., 83., 91., 43., 83., 83., 69.,  1., 49.,  4., 31.,  1., 98.,
        37., 25., 62., 98., 20., 95.,  9., 46., 20.,  2., 14., 14., 53., 53.,
         4., 13., 96., 32., 90., 13., 87., 10., 23., 49., 89., 66., 26., 47.,
        37., 94., 94., 32., 37., 13., 55., 98., 53., 53., 14., 99., 99., 99.,
         2., 12., 51., 17., 17., 34.,  9., 10.,  1.,  3., 15., 15., 83., 69.,
        25., 83., 18., 13., 13., 32., 18., 95., 22., 91.,  0.,  0., 27., 66.,
        98., 66., 99., 32., 99., 99., 97., 99., 98., 94., 20., 76., 92., 95.,
        93., 43., 96., 66., 34., 90., 66., 20., 70., 87., 18., 14., 87.,  4.,
        12., 95., 98., 98., 26., 53., 98., 55., 95., 21., 12., 5

In [24]:
len(embeddings)

20155

In [25]:
len(labels)

20155

In [26]:
import random


# Zip the lists together
combined = list(zip(embeddings, labels))

# Shuffle the combined list
random.shuffle(combined)

# Unzip the shuffled list
embeddings, labels = zip(*combined)



In [27]:
model.to("cpu")
torch.cuda.empty_cache()

In [29]:
import torch.nn as nn

class Classification_V0(nn.Module):
    def __init__(self, input_dim, first_hidden, second_hidden, last_hidden, output_dim, dropout_prob):
        super(Classification_V0, self).__init__()
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(second_hidden, last_hidden)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(last_hidden, output_dim)
        
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc4(x)
        return x

input_dim = 256
first_hidden = 128
second_hidden = 64
last_hidden = 32
output_dim = 546
dropout_prob = 0.25

clf_model = Classification_V0(input_dim, first_hidden, second_hidden, last_hidden, output_dim, dropout_prob)


In [32]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim.lr_scheduler as lr_scheduler

batch_size = 128
def data_generator(embeddings, labels, batch_size):
    num_samples = len(embeddings)
    for i in range(0, num_samples, batch_size):
        batch_embeddings = embeddings[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        yield batch_embeddings, batch_labels

# Define optimizer and loss function
optimizer = torch.optim.Adam(clf_model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.05)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights)

In [33]:
import numpy as np

num_epochs = 20
epoch_loss = []
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}:")
    
    # Initialize data generator
    generator = data_generator(embeddings, labels, batch_size)
    train_loss = 0
    # Iterate over batches
    for batch_embeddings, batch_labels in tqdm(generator, desc="Training Batches", leave=False):
        
        optimizer.zero_grad()
        
        # Convert data to tensors

        batch_embeddings_tensor = torch.stack(batch_embeddings)
        batch_labels = np.array(batch_labels)
        batch_labels_tensor = torch.tensor(batch_labels, dtype = torch.float32)
        batch_labels_tensor = batch_labels_tensor.squeeze()

        
        outputs = clf_model(batch_embeddings_tensor)

        
        loss = criterion(outputs, batch_labels_tensor)

        
        train_loss+=loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    scheduler.step()
    epoch_loss.append(train_loss/(len(embeddings)/batch_size))
    print(train_loss/(len(embeddings)/batch_size))
print("Training finished.")


Epoch 1/20:


Training Batches: 0it [00:00, ?it/s]

1.133167728470443
Epoch 2/20:


Training Batches: 0it [00:00, ?it/s]

0.8927634661027912
Epoch 3/20:


Training Batches: 0it [00:00, ?it/s]

0.783643140251067
Epoch 4/20:


Training Batches: 0it [00:00, ?it/s]

0.7075368810723243
Epoch 5/20:


Training Batches: 0it [00:00, ?it/s]

0.6530710563385229
Epoch 6/20:


Training Batches: 0it [00:00, ?it/s]

0.6078497929952779
Epoch 7/20:


Training Batches: 0it [00:00, ?it/s]

0.5752853772090759
Epoch 8/20:


Training Batches: 0it [00:00, ?it/s]

0.5510068889174087
Epoch 9/20:


Training Batches: 0it [00:00, ?it/s]

0.5246719460663621
Epoch 10/20:


Training Batches: 0it [00:00, ?it/s]

0.50443921648696
Epoch 11/20:


Training Batches: 0it [00:00, ?it/s]

0.48552652755052156
Epoch 12/20:


Training Batches: 0it [00:00, ?it/s]

0.48271354858882964
Epoch 13/20:


Training Batches: 0it [00:00, ?it/s]

0.47943544399646637
Epoch 14/20:


Training Batches: 0it [00:00, ?it/s]

0.4798208720747338
Epoch 15/20:


Training Batches: 0it [00:00, ?it/s]

0.47444674360104455
Epoch 16/20:


Training Batches: 0it [00:00, ?it/s]

0.4762639328907631
Epoch 17/20:


Training Batches: 0it [00:00, ?it/s]

0.47296012967373946
Epoch 18/20:


Training Batches: 0it [00:00, ?it/s]

0.4735499806689199
Epoch 19/20:


Training Batches: 0it [00:00, ?it/s]

0.47224997310122674
Epoch 20/20:


Training Batches: 0it [00:00, ?it/s]

0.47106711563416964
Training finished.


In [34]:
with open('BERT_DNN_senteces_test.json', "r") as f:
    test_sentences = json.load(f)

In [35]:
test_sentences = {key: value for key, value in test_sentences.items() if value}
len(test_sentences)

100

In [36]:
matching_string = test_sentences.values()
len(matching_string)

100

In [37]:
matching_string = [item for sublist in matching_string for item in sublist]
len(matching_string)

5000

In [38]:
batch_size = 128 # Define your batch size
# model.cuda()
test_embeddings = []
test_labels = []

# Define your data iterator in batches
for i in tqdm(range(0, len(matching_string), batch_size)):
    batch_sentences = matching_string[i:i+batch_size]

    batch_inputs = tokenizer.encode_batch(batch_sentences)
    input_ids_list = []
    attention_mask_list = []

    for encoding in batch_inputs:
        input_ids_list.append(encoding.ids)
        attention_mask_list.append(encoding.attention_mask)
    # Convert lists to tensors and move to device
    try:
        input_ids = torch.tensor(input_ids_list)
    except:
        for ins in input_ids_list:
            if len(ins)!=42:
                print(len(ins))
                print(ins)
    attention_mask = torch.tensor(attention_mask_list)

    with torch.inference_mode():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.last_hidden_state
    relevant_hidden_states = hidden_states[:, 20, :]  # Extract the 20th token's hidden state
    
    for j in range(len(batch_sentences)):
        # Move embeddings to CPU
        test_embeddings.append(relevant_hidden_states[j].cpu())
        indicator = batch_sentences[j].split()[19]
        test_labels.append(one_hot_encoded_dict[indicator])


  0%|          | 0/40 [00:00<?, ?it/s]

In [39]:
model.to("cpu")
torch.cuda.empty_cache()

In [40]:
clf_model.eval()

Classification_V0(
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=32, out_features=546, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [96]:
generator = data_generator(test_embeddings, test_labels, batch_size)
# Iterate over batches
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
all_predictions = []
all_labels = []
# Initialize lists to store predictions and labels across all batches
# Iterate over batches
count = 0
for batch_embeddings, batch_labels in tqdm(generator, desc="Evaluation Batches", leave=False):
    batch_embeddings_tensor = torch.stack(batch_embeddings)
    batch_labels = np.array(batch_labels)
    
    logits = clf_model(batch_embeddings_tensor)
    predictions = torch.sigmoid(logits)
    thresholded_predictions = (predictions > 0.80).float()
    all_predictions.append(thresholded_predictions.detach().numpy())
    all_labels.append(batch_labels)


Evaluation Batches: 0it [00:00, ?it/s]

In [97]:
len(all_predictions)

40

In [98]:
len(all_labels[0])
    

128

In [99]:
print(all_predictions[0].shape)

(128, 546)


In [100]:

# Concatenate predictions and labels across all batches
all_predictions = np.concatenate(all_predictions)
all_labels = np.concatenate(all_labels)


In [101]:
all_labels.shape

(5000, 546)

In [102]:
cl_report = multilabel_confusion_matrix(all_labels, all_predictions)
print("Classification report:")
print(len(cl_report))

Classification report:
546


In [103]:
cl_report[0]

array([[4591,  225],
       [  60,  124]])

In [104]:
from sklearn.metrics import classification_report

report = classification_report(all_labels, all_predictions, zero_division=np.nan)

In [105]:
all_predictions.sum(axis=0)

array([349., 144.,  90., 502., 168., 198., 147.,  97., 310., 257., 259.,
        98., 115., 166., 232.,   0.,  93.,  87.,  32.,  98., 147.,  36.,
       163., 146., 135., 139., 202., 199., 197.,  44., 109., 197.,  63.,
       122.,  71., 135., 107.,  98.,  96., 157., 174., 168., 158.,  61.,
       204., 207., 206., 109., 142.,  19.,  46., 132., 133., 112., 203.,
       228., 193.,  85.,  85.,  65., 138.,  87.,  85., 168., 319., 102.,
       435., 226., 292., 185., 232.,  68., 202.,  45., 214.,  59., 415.,
       220., 204., 423., 315., 312., 104., 111., 298., 198.,  34., 144.,
       115., 283., 100., 193.,  44.,  99., 167., 171.,  74.,  85., 232.,
       138., 132., 111., 224., 303., 159.,  90., 112., 113., 311., 143.,
       150., 144., 502., 298., 106., 268., 264., 143., 384., 354., 436.,
       160., 270., 274.,  86., 167., 356.,  85., 203., 272., 272., 149.,
       203., 143., 198., 199.,   0.,   0., 196., 171.,  89., 174.,  64.,
        95., 117., 129.,  54., 123.,  90.,  63., 25

In [106]:
print(report)

              precision    recall  f1-score   support

           0       0.36      0.67      0.47       184
           1       0.69      0.74      0.72       135
           2       0.50      0.90      0.64        50
           3       0.93      0.46      0.62      1008
           4       0.37      0.81      0.51        77
           5       0.30      0.73      0.42        81
           6       0.44      0.70      0.54        92
           7       0.95      0.92      0.93       100
           8       1.00      0.06      0.12      5000
           9       0.83      0.24      0.37       907
          10       0.83      0.24      0.37       907
          11       0.49      0.42      0.45       113
          12       0.25      0.56      0.35        52
          13       0.16      0.53      0.25        51
          14       0.47      0.81      0.59       133
          15        nan      0.00      0.00      3414
          16       0.34      0.62      0.44        52
          17       0.52    

In [107]:
enc.classes_[1]

'GO:0000096'