In [1]:
import psutil

def get_free_memory():
    memory = psutil.virtual_memory()
    return memory.available / (1024.0 ** 3)  # Convert bytes to gigabytes

print(f"Free CPU Memory: {get_free_memory():.2f} GB")

Free CPU Memory: 317.79 GB


In [2]:
import torch
torch.backends.cuda.matmul.allow_tf32 = True
from datasets import Dataset
import os
from deepgo.utils import Ontology

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()

In [4]:
tokenizer_path = "WordLevel_tokenizer_trained_InterPro.json"
tokenizer = tokenizer.from_file(tokenizer_path)
tokenizer.enable_truncation(512)

In [5]:
tokenizer.get_vocab_size()


544998

In [6]:
test = tokenizer.encode("WP_265490204 WP_206642677 WP_053312998 WP_251959347 WP_000076573 WP_227526754 WP_218401808 WP_106925592")
test.ids

[1, 2966, 754, 2545, 196, 9231, 2817, 7418, 2526, 2]

In [7]:
import csv

defense_ips = []
directory = "/home/toibazd/Defense_InterPros/"
file_list = os.listdir(directory)

for file in file_list:
    print(file)
    file_path = os.path.join(directory, file)
    with open(file_path, 'r', newline='') as infile:
        reader = csv.reader(infile, delimiter = "\t")
        next(reader)
        for row in reader:
            ip = row[0]
            defense_ips.append(ip)

LIT_IP.tsv
CRISPR_IP.tsv
BREX_IP.tsv
DISARM_IP.tsv
AbiH_IP.tsv
Kiwa_IP.tsv


In [8]:
print("Defense IPs number: ", len(defense_ips))

Defense IPs number:  120


In [9]:
from collections import defaultdict
import csv
from tqdm.auto import tqdm
import numpy as np



new_tsv_filename = "/home/toibazd/Family_IPs_with_GO.tsv"
go = Ontology('data/go.obo')
data_dict = defaultdict(set)
ip_to_go = defaultdict(list)


with open(new_tsv_filename, "r") as new_tsvfile:
    reader = csv.reader(new_tsvfile, delimiter="\t")
    next(reader)
    for row in tqdm(reader):
        ip = row[0]  # Assuming the IP is in the first column
        go_terms = row[6]  # Assuming the GO terms are in the second column

        # Add IP and corresponding GO terms to data_dict
        ip_to_go[ip] = go_terms.split(',')



with open("/home/toibazd/Prot2IP.tsv", "r") as tsvfile:
    reader = csv.reader(tsvfile, delimiter = "\t")
    for row in tqdm(reader):
        key = row[0].split("prot_")[1].split(".")[0]
        iprs = eval(row[1])

        # Save only if there are filtered InterPro IDs
        for ip in iprs:
            if ip in defense_ips:
                for GO in ip_to_go[ip]:
                    data_dict[key].update(go.get_ancestors(GO))


0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [10]:
unique_defense = set()

# Iterate through each value list in the dictionary and add its elements to the set
for value_list in data_dict.values():
    unique_defense.update(value_list)

# Convert the set to a list if needed
unique_defense = list(unique_defense)

In [11]:
len(unique_defense)

54

In [12]:
from sklearn.preprocessing import MultiLabelBinarizer
enc = MultiLabelBinarizer()
one_hot_encoded = enc.fit_transform(data_dict.values())
one_hot_encoded_dict = {key: value for key, value in zip(data_dict.keys(), one_hot_encoded)}

print(len(one_hot_encoded_dict.keys()))

266


In [13]:
one_hot_encoded.shape

(266, 54)

In [14]:
# import os
# import random
# from tqdm import tqdm
# from concurrent.futures import ThreadPoolExecutor

# directory = '/ibex/user/toibazd/InterPro_annotated_genomes/'
# one_hot_encoded_sentences = {}

# sentence_length = 40
# sentences_per_IP = 100

# # Set random seed for reproducibility
# random.seed(42)

# # Randomly choose 1000 files with seed 42
# selected_files = os.listdir(directory)


# # Define a function to process a file
# def process_file(filename, IP):
#     sentences = []

#     filepath = os.path.join(directory, filename)

#     with open(filepath, 'r') as file:
#         content = file.read()
#         words = content.strip().split()

#         # Check if the key is in the file
#         for i in range(19, len(words)-20):
#             # Shuffle the indices of the words containing the key
#             if IP in data_dict[words[i]]:
#                 if len(words) - i >= 21:
#                     sentence = " ".join(words[i - 19:i + sentence_length - 19])
#                     sentences.append(sentence)
#     return sentences


# # Iterate over keys
# for IP in tqdm(unique_defense):
#     one_hot_encoded_sentences[IP] = []
#     sentences_count = 0

#     # Use ThreadPoolExecutor for concurrent processing
#     with ThreadPoolExecutor(max_workers=4) as executor:
#         futures = [executor.submit(process_file, filename, IP) for filename in selected_files]
#         for future in futures:
#             sentences = future.result()
#             one_hot_encoded_sentences[IP].extend(sentences)
#             sentences_count += len(sentences)
#             if sentences_count >= sentences_per_IP:
#                 break

#     # Break if the required number of sentences per key is reached


In [15]:
# for key, values in one_hot_encoded_sentences.items():
#     print(len(values))

In [16]:
# import json

# with open('BERT_sanity_check_defense_DNN_senteces.json', 'w') as f:
#     json.dump(one_hot_encoded_sentences, f)

In [14]:
import json
with open('BERT_sanity_check_defense_DNN_senteces.json', "r") as f:
    one_hot_encoded_sentences = json.load(f)

In [15]:
for key, value in one_hot_encoded_sentences.items():
    print(key, end=" ")
    print(len(value))

IPR013381 100
IPR030955 19
IPR013487 18
IPR028629 106
IPR047679 100
IPR013410 51
IPR048067 71
IPR010154 100
IPR047721 100
IPR047939 100
IPR010147 100
IPR010144 100
IPR017576 19
IPR013421 59
IPR013489 100
IPR017575 33
IPR010160 14
IPR027620 25
IPR019089 77
IPR014174 100
IPR021124 100
IPR010152 100
IPR010172 48
IPR049832 42
IPR005537 101
IPR013413 57
IPR010180 100
IPR049889 36
IPR019199 100
IPR013414 100
IPR010156 100
IPR013403 73
IPR013408 100
IPR002729 102
IPR013415 28
IPR017574 56
IPR013419 65
IPR010179 100
IPR047583 26
IPR027616 34
IPR049794 17
IPR017589 33
IPR010149 100
IPR013490 19
IPR019857 100
IPR013492 100
IPR013396 100
IPR019858 100
IPR021127 100
IPR006482 100
IPR033641 100
IPR032359 100
IPR013418 100
IPR010148 101
IPR019855 100
IPR019851 100
IPR013444 20
IPR013343 100
IPR013395 100
IPR013398 100
IPR031820 84
IPR013397 100
IPR019504 100
IPR013412 100
IPR010155 100
IPR013382 100
IPR014858 100
IPR019092 57
IPR023843 69
IPR013399 100
IPR021228 100
IPR005510 100
IPR010146 100
IPR02

In [16]:
one_hot_encoded_sentences = {key: value for key, value in one_hot_encoded_sentences.items() if value}
len(one_hot_encoded_sentences)

83

In [17]:
matching_string = one_hot_encoded_sentences.values()
len(matching_string)

83

In [18]:
matching_string = [item for sublist in matching_string for item in sublist]
len(matching_string)

6354

In [19]:
count= 0
for string in matching_string:
    words = string.split(" ")
    if not words[19] in data_dict.keys():
        print("False")
    else:
        count+=1
print(count)

False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
False
Fals

In [20]:
from transformers import BertModel

# Check if CUDA is available and choose device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = "/home/toibazd/Data/BERT/BERT_context_pretrained_InterPro_final"

model = BertModel.from_pretrained(model_path).cuda()
model.pooler = None
model.eval()

[2024-03-05 22:19:34,463] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of BertModel were not initialized from the model checkpoint at /home/toibazd/Data/BERT/BERT_context_pretrained_InterPro_final and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(544998, 256, padding_idx=0)
    (position_embeddings): Embedding(512, 256)
    (token_type_embeddings): Embedding(2, 256)
    (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [21]:
print(one_hot_encoded[1].shape)

(54,)


In [22]:
batch_size = 128 # Define your batch size
model.cuda()
embeddings = []
labels = []

# Define your data iterator in batches
for i in tqdm(range(0, len(matching_string), batch_size)):
    batch_sentences = matching_string[i:i+batch_size]

    batch_inputs = tokenizer.encode_batch(batch_sentences)
    input_ids_list = []
    attention_mask_list = []

    for encoding in batch_inputs:
        input_ids_list.append(encoding.ids)
        attention_mask_list.append(encoding.attention_mask)

    # Convert lists to tensors and move to device
    input_ids = torch.tensor(input_ids_list).cuda()
    attention_mask = torch.tensor(attention_mask_list).cuda()

    with torch.inference_mode():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.last_hidden_state
    relevant_hidden_states = hidden_states[:, 20, :]  # Extract the 20th token's hidden state
    
    for j in range(len(batch_sentences)):
        embeddings.append(relevant_hidden_states[j].cpu())
        indicator = batch_sentences[j].split()[19]
        labels.append(one_hot_encoded_dict[indicator])

# Ensure order in embeddings matches order in labels

# Now embeddings and labels are stored on the CPU


  0%|          | 0/50 [00:00<?, ?it/s]

KeyError: 'WP_001084084'

In [26]:
def calculate_pos_weights(class_counts):
    pos_weights = np.ones_like(class_counts)
    
    neg_counts = [len(embeddings)-pos_count for pos_count in class_counts]
    for cdx, (pos_count, neg_count) in enumerate(zip(class_counts,  neg_counts)):
      pos_weights[cdx] = neg_count / (pos_count + 1e-5)

    return torch.as_tensor(pos_weights, dtype=torch.float)
class_counts = np.array(labels).sum(axis=0)
pos_weights = calculate_pos_weights(class_counts) 

In [27]:
pos_weights

tensor([  6.,  55.,  13.,  22.,  62.,  62.,  32.,  38.,  62.,  62.,  26.,  54.,
         62., 452., 116.,  47.,  62.,  62.,  61.,  41.,  62.,  50.,  62.,  62.,
         62.,  62.,  62.,  86., 185.,  62., 118.,  43.,  54.,  30., 225.,  43.,
         93., 106., 316., 351.,  62., 301.,  62.,  62.,  62., 146., 112., 191.,
        333., 191.,  81.,  54., 396.,  30.,  62.,  28.,  58.,  60.,  58.,  59.,
         14.,  46.,  62.,  91., 121.,  62., 185., 180., 253.,  58., 333.,  74.,
         62.,  29., 243.,  62.,  62.,  62.,  88., 191., 372., 150., 175.])

In [28]:
import random

random.seed(42)
# Zip the lists together
combined = list(zip(embeddings, labels))

# Shuffle the combined list
random.shuffle(combined)

# Unzip the shuffled list
embeddings, labels = zip(*combined)



In [33]:
model.to("cpu")
torch.cuda.empty_cache()

In [34]:
import torch.nn as nn

class Classification_V0(nn.Module):
    def __init__(self, input_dim, first_hidden, second_hidden, last_hidden, output_dim, dropout_prob):
        super(Classification_V0, self).__init__()
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(second_hidden, last_hidden)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(last_hidden, output_dim)
        
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc4(x)
        return x

input_dim = 256
first_hidden = 128
second_hidden = 64
last_hidden = 32
output_dim = 83
dropout_prob = 0.25

clf_model = Classification_V0(input_dim, first_hidden, second_hidden, last_hidden, output_dim, dropout_prob)


In [35]:
from torch.utils.data import DataLoader, TensorDataset
import torch.optim.lr_scheduler as lr_scheduler

batch_size = 128
def data_generator(embeddings, labels, batch_size):
    num_samples = len(embeddings)
    for i in range(0, num_samples, batch_size):
        batch_embeddings = embeddings[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        yield batch_embeddings, batch_labels


optimizer = torch.optim.Adam(clf_model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.1)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights)

In [36]:
import numpy as np

num_epochs = 20
epoch_loss = []
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}:")
    
    # Initialize data generator
    generator = data_generator(embeddings, labels, batch_size)
    train_loss = 0
    # Iterate over batches
    for batch_embeddings, batch_labels in tqdm(generator, desc="Training Batches", leave=False):
        
        optimizer.zero_grad()
        
        # Convert data to tensors

        batch_embeddings_tensor = torch.stack(batch_embeddings)
        batch_labels = np.array(batch_labels)
        batch_labels_tensor = torch.tensor(batch_labels, dtype = torch.float32)
        outputs = clf_model(batch_embeddings_tensor)
        loss = criterion(outputs, batch_labels_tensor)
#         print(loss)
        train_loss+=loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    scheduler.step()
    epoch_loss.append(train_loss/(len(embeddings)/batch_size))
    print(train_loss/(len(embeddings)/batch_size))
print("Training finished.")

Epoch 1/20:


Training Batches: 0it [00:00, ?it/s]

1.3371982064155576
Epoch 2/20:


Training Batches: 0it [00:00, ?it/s]

1.1072264513130416
Epoch 3/20:


Training Batches: 0it [00:00, ?it/s]

0.7983504821566016
Epoch 4/20:


Training Batches: 0it [00:00, ?it/s]

0.6445601538062884
Epoch 5/20:


Training Batches: 0it [00:00, ?it/s]

0.5415031329843283
Epoch 6/20:


Training Batches: 0it [00:00, ?it/s]

0.48084804341595694
Epoch 7/20:


Training Batches: 0it [00:00, ?it/s]

0.44218807262484294
Epoch 8/20:


Training Batches: 0it [00:00, ?it/s]

0.39668831241404745
Epoch 9/20:


Training Batches: 0it [00:00, ?it/s]

0.36394016120581646
Epoch 10/20:


Training Batches: 0it [00:00, ?it/s]

0.3326097968092196
Epoch 11/20:


Training Batches: 0it [00:00, ?it/s]

0.3255219067797782
Epoch 12/20:


Training Batches: 0it [00:00, ?it/s]

0.32120946356437474
Epoch 13/20:


Training Batches: 0it [00:00, ?it/s]

0.31584005983212776
Epoch 14/20:


Training Batches: 0it [00:00, ?it/s]

0.31166888709183865
Epoch 15/20:


Training Batches: 0it [00:00, ?it/s]

0.30640818181936946
Epoch 16/20:


Training Batches: 0it [00:00, ?it/s]

0.3090356142970425
Epoch 17/20:


Training Batches: 0it [00:00, ?it/s]

0.3103457040519582
Epoch 18/20:


Training Batches: 0it [00:00, ?it/s]

0.3009302693415934
Epoch 19/20:


Training Batches: 0it [00:00, ?it/s]

0.2986715909329307
Epoch 20/20:


Training Batches: 0it [00:00, ?it/s]

0.3027071690161648
Training finished.


In [37]:
# directory = '/ibex/user/toibazd/InterPro_annotated_genomes_for_testing/'
# one_hot_encoded_sentences_2 = {}

# sentence_length = 40
# sentence_per_IP = 50
# selected_files = os.listdir(directory)

# total_sentences = sum(len(sentences) for sentences in one_hot_encoded_sentences.values())

# # Iterate over keys
# for IP in tqdm(unique_defense):
#     one_hot_encoded_sentences_2[IP] = []
#     sentences_count=0
    
#     # Iterate over selected files
#     for filename in selected_files:
#         filepath = os.path.join(directory, filename)

#         with open(filepath, 'r') as file:
#             content = file.read()
#             words = content.strip().split()

#             # Check if the key is in the file
#             for i in range(19, len(words)-20):
#                 # Shuffle the indices of the words containing the key
#                 if IP in data_dict[words[i]]:
#                     if len(words) - i >= 21:
#                         sentence = " ".join(words[i - 19:i + sentence_length - 19])
#                         one_hot_encoded_sentences_2[IP].append(sentence)
#                         sentences_count += 1
#                         if sentences_count>=sentence_per_IP:
#                             break
#         if sentences_count>=sentence_per_IP:
#             break
#     print(sentences_count)
            


In [38]:
# for key, value in one_hot_encoded_sentences_2.items():
#     print(len(value))

In [39]:
# import json

# with open('BERT_sanity_check_defense_DNN_senteces_testing.json', 'w') as f:
#     json.dump(one_hot_encoded_sentences_2, f)

In [40]:
with open('BERT_sanity_check_defense_DNN_senteces_testing.json', "r") as f:
    test_sentences = json.load(f)

In [41]:
test_sentences = {key: value for key, value in test_sentences.items() if value}
len(test_sentences)

83

In [42]:
matching_string = test_sentences.values()
len(matching_string)

83

In [43]:
matching_string = [item for sublist in matching_string for item in sublist]
len(matching_string)

2659

In [44]:
from bertviz import model_view

batch_size = 128 # Define your batch size
# model.cuda()
test_embeddings = []
test_labels = []

# Define your data iterator in batches
for i in tqdm(range(0, len(matching_string), batch_size)):
    batch_sentences = matching_string[i:i+batch_size]
    batch_inputs = tokenizer.encode_batch(batch_sentences)
    input_ids_list = []
    attention_mask_list = []

    for encoding in batch_inputs:
        input_ids_list.append(encoding.ids)
        attention_mask_list.append(encoding.attention_mask)
    # Convert lists to tensors and move to device
    try:
        input_ids = torch.tensor(input_ids_list)
    except:
        for ins in input_ids_list:
            if len(ins)!=42:
                print(len(ins))
                print(ins)
    attention_mask = torch.tensor(attention_mask_list)

    with torch.inference_mode():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True, output_attentions = True)
    hidden_states = outputs.last_hidden_state
    attentions = outputs.attentions[-1]
    relevant_hidden_states = hidden_states[:, 20, :]  # Extract the 20th token's hidden state
#     for i in range(len(batch_sentences)):
#         if i < 20:
#             att = []
#             extracted_tensor = attentions[i, :, :, :]
#             extracted_tensor = extracted_tensor.unsqueeze(0)
#             print(extracted_tensor.shape)
#             att.append(extracted_tensor)

#             tokens = "[CLS] "+tokenizer.decode(input_ids_list[i])+" [SEP]"
#             tokens = tokens.split(" ")
#             model_view(att, tokens)
    for j in range(len(batch_sentences)):
        # Move embeddings to CPU
        test_embeddings.append(relevant_hidden_states[j].cpu())
        indicator = batch_sentences[j].split()[19]
        test_labels.append(one_hot_encoded_dict[indicator])


  0%|          | 0/21 [00:00<?, ?it/s]

In [45]:
tokenizer.decode([0,12,12,12,12,
                 ])

'WP_072018053 WP_072018053 WP_072018053 WP_072018053'

In [46]:
model.to("cpu")
torch.cuda.empty_cache()

In [47]:
clf_model.eval()

Classification_V0(
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=32, out_features=83, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [60]:
generator = data_generator(test_embeddings, test_labels, batch_size)
# Iterate over batches
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
all_predictions = []
all_labels = []
# Initialize lists to store predictions and labels across all batches
# Iterate over batches
count = 0
for batch_embeddings, batch_labels in tqdm(generator, desc="Evaluation Batches", leave=False):
    batch_embeddings_tensor = torch.stack(batch_embeddings)
    batch_labels = np.array(batch_labels)
    logits = clf_model(batch_embeddings_tensor)
    predictions = torch.sigmoid(logits)
    thresholded_predictions = (predictions > 0.85).float()
    all_predictions.append(thresholded_predictions.detach().numpy())
    all_labels.append(batch_labels)


Evaluation Batches: 0it [00:00, ?it/s]

In [61]:
len(all_predictions)

21

In [62]:
len(all_labels[0])
    

128

In [63]:
print(all_predictions[0].shape)

(128, 83)


In [64]:
print(all_labels[0].shape)

(128, 83)


In [65]:

# Concatenate predictions and labels across all batches
all_predictions = np.concatenate(all_predictions)
all_labels = np.concatenate(all_labels)


In [66]:
all_labels.shape

(2659, 83)

In [67]:
cl_report = multilabel_confusion_matrix(all_labels, all_predictions)
print("Classification report:")
print(len(cl_report))

Classification report:
83


In [68]:
print(cl_report[0])

[[2256   13]
 [ 243  147]]


In [69]:
from sklearn.metrics import classification_report

report = classification_report(all_labels, all_predictions, zero_division=np.nan)

In [70]:
all_predictions.sum(axis=0)

array([160.,  65., 131.,  85.,  59.,  95., 106.,  72.,  55.,  42., 106.,
        74., 117.,   8.,  13.,  66.,  37.,  28.,  31.,  57.,  40.,  75.,
        55.,  66.,  78.,  82.,  59.,  19.,   9.,  61.,  14.,  92.,  18.,
        90.,   5.,  64.,  14.,  23.,   7.,   2.,  53.,   4.,  59.,  41.,
        48.,  11.,  11.,   4.,   5.,   4.,  44.,  18.,   9.,  66.,  40.,
       106.,  39.,  49.,  50.,  42., 132.,  68.,  54.,  14.,  16.,  81.,
        31.,  11.,  20.,  30.,   3.,  29.,  77., 106.,   3.,  50.,  53.,
        40.,  18.,   9.,   4.,   7.,   1.], dtype=float32)

In [71]:
print(report)

              precision    recall  f1-score   support

           0       0.92      0.38      0.53       390
           1       0.89      1.00      0.94        58
           2       0.91      0.63      0.74       190
           3       0.88      0.71      0.79       105
           4       0.59      0.70      0.64        50
           5       0.47      0.90      0.62        50
           6       0.71      0.85      0.77        88
           7       0.99      0.95      0.97        75
           8       0.91      1.00      0.95        50
           9       0.95      0.80      0.87        50
          10       0.88      0.99      0.93        94
          11       0.66      0.84      0.74        58
          12       0.34      0.80      0.48        50
          13       0.62      1.00      0.77         5
          14       0.69      1.00      0.82         9
          15       1.00      1.00      1.00        66
          16       1.00      0.74      0.85        50
          17       0.75    

In [1]:
list(enc.classes_)[15]

NameError: name 'enc' is not defined

In [None]:
from sklearn.metrics import precision_recall_curve, average_precision_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt
num_classes = 83
# Compute precision, recall, and thresholds for each class
precision = dict()
recall = dict()
thresholds = dict()
average_precision = dict()

for i in range(num_classes):  # num_classes is the number of classes
    precision[i], recall[i], thresholds[i] = precision_recall_curve(all_labels[:, i], all_predictions[:, i])
    average_precision[i] = average_precision_score(all_labels[:, i], all_predictions[:, i])

# Compute micro-average precision-recall curve and AUC
precision["micro"], recall["micro"], _ = precision_recall_curve(all_labels.ravel(), all_predictions.ravel())
average_precision["micro"] = average_precision_score(all_labels, all_predictions, average="micro")

# Compute ROC curve and AUC
fpr = dict()
tpr = dict()
roc_auc = dict()

for i in range(num_classes):
    fpr[i], tpr[i], _ = roc_curve(all_labels[:, i], all_predictions[:, i])
    roc_auc[i] = roc_auc_score(all_labels[:, i], all_predictions[:, i])

# Compute micro-average ROC curve and AUC
fpr["micro"], tpr["micro"], _ = roc_curve(all_labels.ravel(), all_predictions.ravel())
roc_auc["micro"] = roc_auc_score(all_labels, all_predictions, average="micro")

# Split classes into groups of 10
class_groups = [list(range(i, min(i + 10, num_classes))) for i in range(0, num_classes, 10)]

# Plot Precision-Recall and ROC curves for each group
for group in class_groups:
    plt.figure(figsize=(12, 6))

    # Plot Precision-Recall curve
    plt.subplot(1, 2, 1)
    plt.step(recall['micro'], precision['micro'], where='post', label='Micro-average Precision-Recall curve (AUPR = {0:0.2f})'
                 ''.format(average_precision["micro"]))
    for i in group:
        plt.step(recall[i], precision[i], where='post', label='Precision-recall curve of class {0} (AUPR = {1:0.2f})'
                 ''.format(i, average_precision[i]))

    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('Precision-Recall curve (Classes {})'.format(group))
    plt.legend(loc="best")

    # Plot ROC curve
    plt.subplot(1, 2, 2)
    plt.plot(fpr['micro'], tpr['micro'], label='Micro-average ROC curve (AUC = {0:0.2f})'
                 ''.format(roc_auc["micro"]))
    for i in group:
        plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (AUC = {1:0.2f})'
                 ''.format(i, roc_auc[i]))

    plt.plot([0, 1], [0, 1], 'k--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC curve (Classes {})'.format(group))
    plt.legend(loc="best")

    plt.tight_layout()
    plt.show()


In [1]:
import nltk
import sklearn

print('The nltk version is {}.'.format(nltk.__version__))
print('The scikit-learn version is {}.'.format(sklearn.__version__))

The nltk version is 3.5.
The scikit-learn version is 1.4.1.post1.
