In [1]:
import psutil

def get_free_memory():
    memory = psutil.virtual_memory()
    return memory.available / (1024.0 ** 3)  # Convert bytes to gigabytes

print(f"Free CPU Memory: {get_free_memory():.2f} GB")

Free CPU Memory: 331.89 GB


In [2]:
import torch
torch.backends.cuda.matmul.allow_tf32 = True
from datasets import Dataset
import os

In [3]:
from tokenizers import Tokenizer
from tokenizers.models import WordLevel
from tokenizers.pre_tokenizers import WhitespaceSplit

tokenizer = Tokenizer(WordLevel(unk_token="[UNK]"))
tokenizer.pre_tokenizer = WhitespaceSplit()

In [4]:
tokenizer_path = "WordLevel_tokenizer_trained_InterPro.json"
tokenizer = tokenizer.from_file(tokenizer_path)
tokenizer.enable_truncation(512)

In [5]:
tokenizer.get_vocab_size()


544998

In [6]:
test = tokenizer.encode("WP_265490204 WP_206642677 WP_053312998 WP_251959347 WP_000076573 WP_227526754 WP_218401808 WP_106925592")
test.ids

[1, 2966, 754, 2545, 196, 9231, 2817, 7418, 2526, 2]

In [7]:
import json
with open("/home/toibazd/Most_frequent_IPs.json", "r") as f:
    ips = json.load(f)

sorted_dict = sorted(ips.items(), key=lambda x: x[1], reverse=True)
most_frequent_ips = [item[0] for item in sorted_dict[:100]]


In [8]:
print(most_frequent_ips[:10])

['IPR004090', 'IPR011701', 'IPR002514', 'IPR003719', 'IPR002155', 'IPR005750', 'IPR001001', 'IPR004604', 'IPR011603', 'IPR005252']


In [9]:
from collections import defaultdict
from sklearn.preprocessing import MultiLabelBinarizer
import csv
from tqdm.auto import tqdm
import numpy as np

data_dict = defaultdict(list)
enc = MultiLabelBinarizer()

with open("/home/toibazd/Prot2IP_GO_filtered_MF.tsv", "r") as tsvfile:
    reader = csv.reader(tsvfile, delimiter = "\t")
    for row in tqdm(reader):
        key = row[0].split("prot_")[1].split(".")[0]
        iprs = eval(row[1])
        
        # Filter InterPro IDs that are in the words list
#         filtered_iprs = [ipr for ipr in iprs if ipr in most_frequent_ips]
        filtered_iprs = iprs
        # Save only if there are filtered InterPro IDs
        for ip in iprs:
            if ip in most_frequent_ips:
                data_dict[key].append(ip)

one_hot_encoded = enc.fit_transform(data_dict.values())
one_hot_encoded_dict = {key: value for key, value in zip(data_dict.keys(), one_hot_encoded)}

print(len(one_hot_encoded_dict.keys()))

0it [00:00, ?it/s]

21791


In [10]:
len(one_hot_encoded_dict)

21791

In [11]:
len(one_hot_encoded)

21791

In [12]:
# Find unique numbers and their counts
unique_numbers, counts = np.unique(one_hot_encoded, return_counts=True)
all_count = 0
# Print the count of each number
for number, count in zip(unique_numbers, counts):
    all_count+=count
    print(f"Number {number}: Count {count}")
print(all_count)

Number 0: Count 2156132
Number 1: Count 22968
2179100


In [13]:
import json
with open('BERT_DNN_senteces.json', "r") as f:
    one_hot_encoded_sentences = json.load(f)

In [14]:
one_hot_encoded_sentences = {key: value for key, value in one_hot_encoded_sentences.items() if value}
len(one_hot_encoded_sentences)

100

In [15]:
matching_string = one_hot_encoded_sentences.values()
len(matching_string)

100

In [16]:
matching_string = [item for sublist in matching_string for item in sublist]
len(matching_string)

20155

In [17]:
from transformers import BertModel

# Check if CUDA is available and choose device accordingly
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_path = "/home/toibazd/Data/BERT/BERT_context_pretrained_InterPro_final"

model = BertModel.from_pretrained(model_path).cuda()
model.pooler = None
model.eval()

[2024-02-26 12:07:25,158] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Some weights of BertModel were not initialized from the model checkpoint at /home/toibazd/Data/BERT/BERT_context_pretrained_InterPro_final and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(544998, 256, padding_idx=0)
    (position_embeddings): Embedding(512, 256)
    (token_type_embeddings): Embedding(2, 256)
    (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=256, out_features=256, bias=True)
            (key): Linear(in_features=256, out_features=256, bias=True)
            (value): Linear(in_features=256, out_features=256, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=256, out_features=256, bias=True)
            (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
         

In [18]:
print(one_hot_encoded[1])

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [19]:
batch_size = 128 # Define your batch size

embeddings = []
labels = []

# Define your data iterator in batches
for i in tqdm(range(0, len(matching_string), batch_size)):
    batch_sentences = matching_string[i:i+batch_size]

    batch_inputs = tokenizer.encode_batch(batch_sentences)
    input_ids_list = []
    attention_mask_list = []

    for encoding in batch_inputs:
        input_ids_list.append(encoding.ids)
        attention_mask_list.append(encoding.attention_mask)

    # Convert lists to tensors and move to device
    input_ids = torch.tensor(input_ids_list).cuda()
    attention_mask = torch.tensor(attention_mask_list).cuda()

    with torch.inference_mode():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.last_hidden_state
    relevant_hidden_states = hidden_states[:, 20, :]  # Extract the 20th token's hidden state
    
    for j in range(len(batch_sentences)):
        # Move embeddings to CPU
        embeddings.append(relevant_hidden_states[j].cpu())
        indicator = batch_sentences[j].split()[19]
        labels.append(one_hot_encoded_dict[indicator])

# Ensure order in embeddings matches order in labels

# Now embeddings and labels are stored on the CPU


  0%|          | 0/158 [00:00<?, ?it/s]

In [20]:
def calculate_pos_weights(class_counts):
    pos_weights = np.ones_like(class_counts)
    
    neg_counts = [len(embeddings)-pos_count for pos_count in class_counts]
    for cdx, (pos_count, neg_count) in enumerate(zip(class_counts,  neg_counts)):
      pos_weights[cdx] = neg_count / (pos_count + 1e-5)

    return torch.as_tensor(pos_weights, dtype=torch.float)
class_counts = np.array(labels).sum(axis=0)
pos_weights = calculate_pos_weights(class_counts) 

In [23]:
pos_weights

tensor([72., 43., 99., 55., 99., 60., 87., 49., 21., 78., 54., 92., 49., 94.,
        82., 93., 94., 51., 94., 48., 88., 63., 62., 18., 67., 96., 80., 93.,
        41., 97., 95., 91., 90., 99., 99., 99., 92., 96., 99., 97., 20., 87.,
        48., 69., 18., 99., 92., 53., 95., 99., 70., 20., 95., 53., 76., 21.,
        94., 20., 98., 93., 44., 97., 99., 80., 51., 98., 64., 91., 94., 91.,
        99., 91., 76., 21., 51., 48., 98., 63., 66., 20., 24., 83., 95., 54.,
        49., 49., 92., 50., 95., 91., 99., 82., 62., 99., 96., 94., 63., 49.,
        47., 89.])

In [24]:
len(embeddings)

20155

In [25]:
len(labels)

20155

In [26]:
import random


# Zip the lists together
combined = list(zip(embeddings, labels))

# Shuffle the combined list
random.shuffle(combined)

# Unzip the shuffled list
embeddings, labels = zip(*combined)



In [27]:
model.to("cpu")
torch.cuda.empty_cache()

In [28]:
import torch.nn as nn

class Classification_V0(nn.Module):
    def __init__(self, input_dim, first_hidden, second_hidden, last_hidden, output_dim, dropout_prob):
        super(Classification_V0, self).__init__()
        self.fc1 = nn.Linear(input_dim, first_hidden)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(first_hidden, second_hidden)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(second_hidden, last_hidden)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(last_hidden, output_dim)
        
        self.dropout = nn.Dropout(dropout_prob)
    
    def forward(self, x):
        x = self.fc1(x)
        x = self.relu1(x)
        x = self.dropout(x)
        x = self.fc2(x)
        x = self.relu2(x)
        x = self.dropout(x)
        x = self.fc3(x)
        x = self.relu3(x)
        x = self.dropout(x)
        x = self.fc4(x)
        return x

input_dim = 256
first_hidden = 128
second_hidden = 64
last_hidden = 32
output_dim = 100
dropout_prob = 0.25

clf_model = Classification_V0(input_dim, first_hidden, second_hidden, last_hidden, output_dim, dropout_prob)


In [32]:
from torch.utils.data import DataLoader, TensorDataset


batch_size = 128
def data_generator(embeddings, labels, batch_size):
    num_samples = len(embeddings)
    for i in range(0, num_samples, batch_size):
        batch_embeddings = embeddings[i:i+batch_size]
        batch_labels = labels[i:i+batch_size]
        yield batch_embeddings, batch_labels

# Define optimizer and loss function
optimizer = torch.optim.Adam(clf_model.parameters(), lr=0.001)
scheduler = lr_scheduler.StepLR(optimizer=optimizer, step_size=10, gamma=0.05)
criterion = torch.nn.BCEWithLogitsLoss(pos_weight=pos_weights)

In [38]:
import numpy as np

num_epochs = 20
epoch_loss = []
for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}:")
    
    # Initialize data generator
    generator = data_generator(embeddings, labels, batch_size)
    train_loss = 0
    # Iterate over batches
    for batch_embeddings, batch_labels in tqdm(generator, desc="Training Batches", leave=False):
        
        optimizer.zero_grad()
        
        # Convert data to tensors

        batch_embeddings_tensor = torch.stack(batch_embeddings)
        batch_labels = np.array(batch_labels)
        batch_labels_tensor = torch.tensor(batch_labels, dtype = torch.float32)
        batch_labels_tensor = batch_labels_tensor.squeeze()

        
        outputs = clf_model(batch_embeddings_tensor)

        
        loss = criterion(outputs, batch_labels_tensor)

        
        train_loss+=loss.item()
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
    scheduler.step()
    epoch_loss.append(train_loss/(len(embeddings)/batch_size))
    print(train_loss/(len(embeddings)/batch_size))
print("Training finished.")


Epoch 1/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 2/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 3/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 4/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 5/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 6/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 7/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 8/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 9/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 10/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 11/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 12/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 13/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 14/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 15/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 16/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 17/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 18/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 19/20:


Training Batches: 0it [00:00, ?it/s]

Epoch 20/20:


Training Batches: 0it [00:00, ?it/s]

Training finished.
[1.0997934411933008, 1.073004124271636, 1.050019545331542, 1.0292624335169527, 1.009573803738192, 0.9934859162020464, 0.9764294392711854, 0.9623554128477929, 0.9481445021027225, 0.9353924061636332, 0.9230609758472182, 0.9101702081391191, 0.8997290329021131, 0.8891753241904052, 0.8776656150581286, 0.8686457526799143, 0.8550953090235247, 0.8499339840661145, 0.8433922012696281, 0.8347509188736026]


In [43]:
with open('BERT_DNN_senteces_test.json', "r") as f:
    test_sentences = json.load(f)

In [44]:
test_sentences = {key: value for key, value in test_sentences.items() if value}
len(test_sentences)

100

In [45]:
matching_string = test_sentences.values()
len(matching_string)

100

In [46]:
matching_string = [item for sublist in matching_string for item in sublist]
len(matching_string)

5000

In [52]:
batch_size = 128 # Define your batch size
# model.cuda()
test_embeddings = []
test_labels = []

# Define your data iterator in batches
for i in tqdm(range(0, len(matching_string), batch_size)):
    batch_sentences = matching_string[i:i+batch_size]

    batch_inputs = tokenizer.encode_batch(batch_sentences)
    input_ids_list = []
    attention_mask_list = []

    for encoding in batch_inputs:
        input_ids_list.append(encoding.ids)
        attention_mask_list.append(encoding.attention_mask)
    # Convert lists to tensors and move to device
    try:
        input_ids = torch.tensor(input_ids_list)
    except:
        for ins in input_ids_list:
            if len(ins)!=42:
                print(len(ins))
                print(ins)
    attention_mask = torch.tensor(attention_mask_list)

    with torch.inference_mode():
        outputs = model(input_ids, attention_mask=attention_mask, output_hidden_states=True)
    hidden_states = outputs.last_hidden_state
    relevant_hidden_states = hidden_states[:, 20, :]  # Extract the 20th token's hidden state
    
    for j in range(len(batch_sentences)):
        # Move embeddings to CPU
        test_embeddings.append(relevant_hidden_states[j].cpu())
        indicator = batch_sentences[j].split()[19]
        test_labels.append(one_hot_encoded_dict[indicator])


  0%|          | 0/40 [00:00<?, ?it/s]

In [53]:
model.to("cpu")
torch.cuda.empty_cache()

In [54]:
clf_model.eval()

Classification_V0(
  (fc1): Linear(in_features=256, out_features=128, bias=True)
  (relu1): ReLU()
  (fc2): Linear(in_features=128, out_features=64, bias=True)
  (relu2): ReLU()
  (fc3): Linear(in_features=64, out_features=32, bias=True)
  (relu3): ReLU()
  (fc4): Linear(in_features=32, out_features=100, bias=True)
  (dropout): Dropout(p=0.25, inplace=False)
)

In [89]:
generator = data_generator(test_embeddings, test_labels, batch_size)
# Iterate over batches
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.metrics import precision_score, recall_score, f1_score
import numpy as np
all_predictions = []
all_labels = []
# Initialize lists to store predictions and labels across all batches
# Iterate over batches
count = 0
for batch_embeddings, batch_labels in tqdm(generator, desc="Evaluation Batches", leave=False):
    batch_embeddings_tensor = torch.stack(batch_embeddings)
    batch_labels = np.array(batch_labels)
    
    logits = clf_model(batch_embeddings_tensor)
    predictions = torch.sigmoid(logits)
    thresholded_predictions = (predictions > 0.80).float()
    all_predictions.append(thresholded_predictions.detach().numpy())
    all_labels.append(batch_labels)


Evaluation Batches: 0it [00:00, ?it/s]

In [90]:
len(all_predictions)

40

In [91]:
len(all_labels[0])
    

128

In [92]:
print(all_predictions[0].shape)

(128, 100)


In [93]:

# Concatenate predictions and labels across all batches
all_predictions = np.concatenate(all_predictions)
all_labels = np.concatenate(all_labels)


In [94]:
all_labels.shape

(5000, 100)

In [95]:
cl_report = multilabel_confusion_matrix(all_labels, all_predictions)
print("Classification report:")
print(len(cl_report))

Classification report:
100


In [96]:
cl_report[0]

array([[4921,    5],
       [  69,    5]])

In [97]:
from sklearn.metrics import classification_report

report = classification_report(all_labels, all_predictions, zero_division=np.nan)

In [98]:
all_predictions.sum(axis=0)

array([ 10.,  49.,  20.,  66.,   0., 205., 146., 128., 186.,  17.,  70.,
        99., 170., 136.,   6.,  67.,  12.,  73.,  14., 147.,   0.,  42.,
        92., 191.,  55.,   8.,  24.,  81.,  54.,   0.,  41.,  24.,  65.,
       113.,  15.,  89.,  52.,   0.,  39.,   0., 186.,  49.,  87.,  92.,
       186.,  14.,  71.,  72., 122.,  90.,  47., 196.,  53.,  73.,  49.,
       186.,  86., 186.,  65.,  39.,  98.,  17.,   0., 164., 103.,  30.,
       106.,  19.,   2.,  20.,  19.,  17.,  35., 193., 130., 107., 107.,
        57.,  55., 193.,  66., 160., 117.,  61., 134., 170.,  98.,  87.,
        40.,  73.,  64.,  64., 165.,  19., 126.,  17.,  92.,  89.,  84.,
        49.], dtype=float32)

In [99]:
print(report)

              precision    recall  f1-score   support

           0       0.50      0.07      0.12        74
           1       0.88      0.38      0.53       113
           2       0.30      0.12      0.17        50
           3       0.39      0.31      0.34        85
           4        nan      0.00      0.00        50
           5       0.23      0.57      0.33        82
           6       0.12      0.33      0.17        52
           7       0.52      0.66      0.58       100
           8       1.00      0.92      0.96       202
           9       0.18      0.05      0.07        65
          10       0.10      0.08      0.09        83
          11       0.15      0.29      0.20        51
          12       0.28      0.47      0.35       100
          13       0.07      0.19      0.11        53
          14       0.67      0.07      0.13        57
          15       0.34      0.44      0.39        52
          16       0.00      0.00      0.00        56
          17       0.82    

In [88]:
enc.classes_[1]

'IPR000212'