In [70]:
import os
import time
from tqdm import tqdm
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')

import torch
from torch.utils.data import Dataset
from torch import nn
from torch.utils.data import random_split
from torch.optim.lr_scheduler import ReduceLROnPlateau
from torchmetrics.classification import MultilabelF1Score
from torchmetrics.classification import MultilabelAccuracy

from transformers import BertModel, BertTokenizer

# Report the active GPU. get_device_name() raises when no CUDA device exists,
# so fall back to a plain message on CPU-only machines.
torch.cuda.get_device_name() if torch.cuda.is_available() else "No CUDA device available (running on CPU)"

"""
1. Test different loss functions (ambrose, better weights)
2. Test different models (like a Temporal CNN or a bigger linear model), keeping track of hyperparameters
3. Implement CAFA-Evaluator for better metrics
4. Use more GOs in predictions
5. Read Kaggle notebooks online to gain intuition
6. Use new data!
"""

  plt.style.use('seaborn')


'NVIDIA GeForce GTX 1650 Ti'

In [71]:
# Root folders: inputs are read from MAIN_DIR, generated artefacts go to WORK_DIR.
MAIN_DIR = "data/"
WORK_DIR = "working/"
DATA_DIR = f"{MAIN_DIR}cafa-5-protein-function-prediction"
PROTBERT_DIR = f"{MAIN_DIR}protbert-embeddings-for-cafa5"

# Print the path of every file found below MAIN_DIR.
for dirname, _, filenames in os.walk(MAIN_DIR):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

data/cafa-5-protein-function-prediction.rar
data/cafa-5-ems-2-embeddings-numpy\test_embeddings.npy
data/cafa-5-ems-2-embeddings-numpy\test_ids.npy
data/cafa-5-ems-2-embeddings-numpy\train_embeddings.npy
data/cafa-5-ems-2-embeddings-numpy\train_ids.npy
data/cafa-5-protein-function-prediction\IA.txt
data/cafa-5-protein-function-prediction\sample_submission.tsv
data/cafa-5-protein-function-prediction\Test (Targets)\testsuperset-taxon-list.tsv
data/cafa-5-protein-function-prediction\Test (Targets)\testsuperset.fasta
data/cafa-5-protein-function-prediction\Train\go-basic.obo
data/cafa-5-protein-function-prediction\Train\train_sequences.fasta
data/cafa-5-protein-function-prediction\Train\train_taxonomy.tsv
data/cafa-5-protein-function-prediction\Train\train_terms.tsv
data/protbert-embeddings-for-cafa5\test_embeddings.npy
data/protbert-embeddings-for-cafa5\test_ids.npy
data/protbert-embeddings-for-cafa5\train_embeddings.npy
data/protbert-embeddings-for-cafa5\train_ids.npy
data/t5embeds\test_e

In [72]:
# The sample submission is a header-less TSV; name its three columns on load.
submission = pd.read_csv(
    f'{DATA_DIR}/sample_submission.tsv',
    sep='\t',
    header=None,
).set_axis(["ProteinID", "GO_ID", "Probability"], axis=1)
submission.head(10)

Unnamed: 0,ProteinID,GO_ID,Probability
0,A0A0A0MRZ7,GO:0000001,0.123
1,A0A0A0MRZ7,GO:0000002,0.123
2,A0A0A0MRZ8,GO:0000001,0.123
3,A0A0A0MRZ8,GO:0000002,0.123
4,A0A0A0MRZ9,GO:0000001,0.123
5,A0A0A0MRZ9,GO:0000002,0.123
6,A0A0A0MS00,GO:0000001,0.123
7,A0A0A0MS00,GO:0000002,0.123
8,A0A0A0MS01,GO:0000001,0.123
9,A0A0A0MS01,GO:0000002,0.123


In [73]:
class config:
    """Central place for file locations and training hyper-parameters.

    The class is used as a plain namespace (never instantiated); its body runs
    once at definition time and prints the selected compute device.
    """

    # Input files, all relative to DATA_DIR.
    train_sequences_path = DATA_DIR  + "/Train/train_sequences.fasta"
    train_labels_path = DATA_DIR + "/Train/train_terms.tsv"
    test_sequences_path = DATA_DIR + "/Test (Targets)/testsuperset.fasta"

    # Number of most frequent GO terms used as prediction targets.
    num_labels = 500
    n_epochs = 25
    batch_size = 128
    lr = 0.002

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # torch.cuda.get_device_name() only accepts CUDA devices, so it is queried
    # only when CUDA was actually selected; otherwise the cell would crash on
    # CPU-only machines.
    device_name = torch.cuda.get_device_name(device) if device.type == 'cuda' else 'CPU'
    print(f'Device: {device} - {device_name}')

Device: cuda - NVIDIA GeForce GTX 1650 Ti


In [74]:
# # ______________________ GET PROT BERT EMBEDDINGS WITH HUGGING FACE __________________________________
#
# # PROT BERT LOADING :
# tokenizer = BertTokenizer.from_pretrained("Rostlab/prot_bert", do_lower_case=False)
# model = BertModel.from_pretrained("Rostlab/prot_bert").to(config.device)
#
# def get_bert_embedding(
#     sequence : str,
#     len_seq_limit : int
# ):
#     """
#     Function to collect last hidden state embedding vector from pre-trained ProtBERT Model
#
#     INPUTS:
#     - sequence (str) : protein sequence (ex : AAABBB) from fasta file
#     - len_seq_limit (int) : maximum sequence length (i.e. number of letters) for truncation
#
#     OUTPUTS:
#     - output_hidden : last hidden state embedding vector for input sequence of length 1024
#     """
#     sequence_w_spaces = ' '.join(list(sequence))
#     encoded_input = tokenizer(
#         sequence_w_spaces,
#         truncation=True,
#         max_length=len_seq_limit,
#         padding='max_length',
#         return_tensors='pt').to(config.device)
#     output = model(**encoded_input)
#     output_hidden = output['last_hidden_state'][:,0][0].detach().cpu().numpy()
#     assert len(output_hidden)==1024
#     return output_hidden
#
# ### COLLECTING FOR TRAIN SAMPLES :
# print("Loading train set ProtBERT Embeddings...")
# fasta_train = SeqIO.parse(config.train_sequences_path, "fasta")
# print("Total Nb of Elements : ", len(list(fasta_train)))
# fasta_train = SeqIO.parse(config.train_sequences_path, "fasta")
# ids_list = []
# embed_vects_list = []
# t0 = time.time()
# checkpoint = 0
# for item in tqdm(fasta_train):
#     ids_list.append(item.id)
#     embed_vects_list.append(
#         get_bert_embedding(sequence = item.seq, len_seq_limit = 1200))
#     checkpoint+=1
#     if checkpoint>=100:
#         df_res = pd.DataFrame(data={"id" : ids_list, "embed_vect" : embed_vects_list})
#         np.save('/kaggle/working/train_ids.npy',np.array(ids_list))
#         np.save('/kaggle/working/train_embeddings.npy',np.array(embed_vects_list))
#         checkpoint=0
#
# np.save('/kaggle/working/train_ids.npy',np.array(ids_list))
# np.save('/kaggle/working/train_embeddings.npy',np.array(embed_vects_list))
# print('Total Elapsed Time:',time.time()-t0)
#
# ### COLLECTING FOR TEST SAMPLES :
# print("Loading test set ProtBERT Embeddings...")
# fasta_test = SeqIO.parse(config.test_sequences_path, "fasta")
# print("Total Nb of Elements : ", len(list(fasta_test)))
# fasta_test = SeqIO.parse(config.test_sequences_path, "fasta")
# ids_list = []
# embed_vects_list = []
# t0 = time.time()
# checkpoint=0
# for item in tqdm(fasta_test):
#     ids_list.append(item.id)
#     embed_vects_list.append(
#         get_bert_embedding(sequence = item.seq, len_seq_limit = 1200))
#     checkpoint+=1
#     if checkpoint>=100:
#         np.save('/kaggle/working/test_ids.npy',np.array(ids_list))
#         np.save('/kaggle/working/test_embeddings.npy',np.array(embed_vects_list))
#         checkpoint=0
#
# np.save('/kaggle/working/test_ids.npy',np.array(ids_list))
# np.save('/kaggle/working/test_embeddings.npy',np.array(embed_vects_list))
# print('Total Elasped Time:',time.time()-t0)

In [75]:
##### SCRIPT FOR LABELS (TARGETS) VECTORS COLLECTING #####
# Builds one multi-hot target vector per training protein over the
# `config.num_labels` most frequent GO terms and pickles the result to WORK_DIR.

print("GENERATE TARGETS FOR ENTRY IDS ("+str(config.num_labels)+" MOST COMMON GO TERMS)")
ids = np.load(f"{PROTBERT_DIR}/train_ids.npy")
labels = pd.read_csv(config.train_labels_path, sep = "\t")

# Rank GO terms by the number of annotated entries and keep the most frequent ones.
top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
labels_names = top_terms[:config.num_labels].index.values
train_labels_sub = labels[(labels.term.isin(labels_names)) & (labels.EntryID.isin(ids))]
id_labels = train_labels_sub.groupby('EntryID')['term'].apply(list).to_dict()

# Column index of every retained GO term inside the target matrix.
go_terms_map = {label: i for i, label in enumerate(labels_names)}
# np.zeros, not np.empty: np.empty returns uninitialised memory, so every cell
# that is not explicitly set to 1 would contain arbitrary garbage.
labels_matrix = np.zeros((len(ids), len(labels_names)))

for index, entry_id in tqdm(enumerate(ids), total=len(ids)):
    # Entries without any of the retained terms are absent from id_labels and
    # simply keep an all-zero row instead of raising KeyError.
    term_columns = [go_terms_map[go] for go in id_labels.get(entry_id, [])]
    labels_matrix[index, term_columns] = 1

# One 1-D row per protein so that each DataFrame cell holds a full target vector.
labels_list = list(labels_matrix)

labels_df = pd.DataFrame(data={"EntryID":ids, "labels_vect":labels_list})
labels_df.to_pickle(f"{WORK_DIR}/train_targets_top"+str(config.num_labels)+".pkl")
print("GENERATION FINISHED!")
labels_df.head(5)

GENERATE TARGETS FOR ENTRY IDS (500 MOST COMMON GO TERMS)


142246it [00:50, 2822.86it/s]


GENERATION FINISHED!


Unnamed: 0,EntryID,labels_vect
0,P20536,"[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
1,O73864,"[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
2,O95231,"[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
3,A0A0B4J1F4,"[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
4,P54366,"[1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."


In [106]:
# Display the per-term entry counts (sorted in descending order).
top_terms

term
GO:0005575    92912
GO:0008150    92210
GO:0110165    91286
GO:0003674    78637
GO:0005622    70785
              ...  
GO:1900010        1
GO:0039705        1
GO:0039709        1
GO:0039710        1
GO:0044717        1
Name: EntryID, Length: 31466, dtype: int64

In [104]:
# Load the per-term weights from IA.txt: a header-less, tab-separated file
# read here as two columns (GO term, weight).
GO_weight_dataset = pd.read_csv(
    f'{DATA_DIR}/IA.txt',
    sep='\t',
    header=None,
    names=['GO', 'weight'],
)
GO_weight_dataset

Unnamed: 0,GO,weight
0,GO:0000001,0.000000
1,GO:0000002,3.103836
2,GO:0000003,3.439404
3,GO:0000011,0.056584
4,GO:0000012,6.400377
...,...,...
43243,GO:2001083,7.159871
43244,GO:2001084,7.592457
43245,GO:2001085,7.159871
43246,GO:2001147,5.554589


In [90]:
# Collect the weight of every retained GO term, in the same order as
# `labels_names`, so the tensor lines up with the columns of the target vectors.
# (The original cell read from `GO_with_weights`, a name that is never defined;
# the table loaded from IA.txt is `GO_weight_dataset`.)
weight_by_term = GO_weight_dataset.drop_duplicates(subset='GO').set_index('GO')['weight']

missing_terms = [term for term in labels_names if term not in weight_by_term.index]
if missing_terms:
    raise KeyError(f"No weight found in IA.txt for GO terms: {missing_terms}")

GO_weights = torch.tensor(weight_by_term.loc[labels_names].to_numpy(), dtype=torch.float32)
GO_weights

tensor([0.0000e+00, 0.0000e+00, 2.5471e-02, 0.0000e+00, 3.6695e-01, 5.8919e-01,
        5.8435e-01, 1.7165e-02, 4.5465e-01, 1.3480e-01, 4.1220e-01, 2.9561e-02,
        2.5790e-01, 1.1533e+00, 7.8701e-02, 2.1213e-01, 1.5681e+00, 1.5985e+00,
        1.6553e+00, 8.5186e-01, 1.6848e+00, 1.0687e-01, 6.7658e-02, 1.8248e+00,
        1.6347e+00, 1.0367e-01, 3.1833e-01, 1.2295e-01, 4.9670e-01, 2.1570e+00,
        1.6553e+00, 1.0337e+00, 7.3926e-02, 1.0960e+00, 1.5460e+00, 6.9622e-01,
        3.2499e-01, 1.4971e-01, 4.3930e-01, 3.5163e-02, 2.4797e+00, 1.9149e+00,
        0.0000e+00, 2.5399e+00, 1.8383e-04, 3.6778e-04, 1.3485e+00, 1.2932e-01,
        3.5321e-01, 1.6135e-01, 1.0754e+00, 5.5739e-01, 4.0606e-01, 1.1151e+00,
        3.0564e-01, 2.7477e+00, 5.9920e-01, 2.0041e-02, 3.4285e-02, 2.8116e+00,
        1.0779e+00, 2.1543e+00, 1.0962e+00, 2.1763e+00, 1.2565e-01, 1.2376e+00,
        2.5171e-01, 6.7157e-01, 2.4477e+00, 1.4801e+00, 7.9989e-01, 1.2116e+00,
        8.1851e-02, 1.4936e+00, 1.7235e-

In [77]:
# Folder (below MAIN_DIR) that stores each family of embedding vectors.
embeds_map = dict(
    T5="t5embeds",
    ProtBERT="protbert-embeddings-for-cafa5",
    EMS2="cafa-5-ems-2-embeddings-numpy",
)

# Size of a single embedding vector for each family.
embeds_dim = dict(T5=1024, ProtBERT=1024, EMS2=1280)

In [78]:
class ProteinSequenceDataset(Dataset):
    """Dataset pairing pre-computed protein embeddings with targets or entry IDs.

    Args:
        datatype: "train" or "test". Training items are ``(embedding, targets)``
            pairs, test items are ``(embedding, entry_id)`` pairs.
        embeddings_source: one of "T5", "ProtBERT" or "EMS2"; selects the folder
            (via ``embeds_map``) the ``.npy`` files are loaded from.

    Raises:
        ValueError: if ``datatype`` or ``embeddings_source`` is not supported.

    The loaded data is exposed as ``self.df`` with columns ``EntryID``, ``embed``
    and, for the training split, ``labels_vect``.
    """

    # The embeddings file is named "<datatype>_<suffix>.npy"; the suffix differs
    # between the embedding sources.
    _EMBEDS_FILE_SUFFIX = {"ProtBERT": "embeddings", "EMS2": "embeddings", "T5": "embeds"}
    _DATATYPES = ("train", "test")

    def __init__(self, datatype, embeddings_source):
        super().__init__()

        # Fail early with a clear message instead of an UnboundLocalError later on.
        if datatype not in self._DATATYPES:
            raise ValueError(
                f"Unknown datatype {datatype!r}; expected one of {self._DATATYPES}")
        if embeddings_source not in self._EMBEDS_FILE_SUFFIX:
            raise ValueError(
                f"Unknown embeddings_source {embeddings_source!r}; "
                f"expected one of {sorted(self._EMBEDS_FILE_SUFFIX)}")
        self.datatype = datatype

        source_dir = os.path.join(MAIN_DIR, embeds_map[embeddings_source])
        file_suffix = self._EMBEDS_FILE_SUFFIX[embeddings_source]
        embeds = np.load(os.path.join(source_dir, f"{datatype}_{file_suffix}.npy"))
        ids = np.load(os.path.join(source_dir, f"{datatype}_ids.npy"))

        # One embedding vector (row of `embeds`) per DataFrame cell.
        self.df = pd.DataFrame(data={"EntryID": ids, "embed": list(embeds)})

        if datatype == "train":
            # The pickle is produced by the target-generation cell of this
            # notebook; do not point this at files from untrusted sources.
            df_labels = pd.read_pickle(
                os.path.join(WORK_DIR, f"train_targets_top{config.num_labels}.pkl"))
            self.df = self.df.merge(df_labels, on="EntryID")

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        row = self.df.iloc[index]
        embed = torch.tensor(row["embed"], dtype=torch.float32)

        if self.datatype == "train":
            targets = torch.tensor(row["labels_vect"], dtype=torch.float32)
            return embed, targets

        # Test split: return the entry ID so predictions can be matched to proteins.
        return embed, row["EntryID"]


# Build the training dataset from the T5 embeddings and preview its first rows.
dataset = ProteinSequenceDataset(datatype="train", embeddings_source="T5")
dataset.df.head(10)

Unnamed: 0,EntryID,embed,labels_vect
0,P20536,"[0.04948842525482178, -0.03293515741825104, 0....","[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
1,O73864,"[-0.04461636394262314, 0.06492499262094498, -0...","[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 0.0, 0.0, 1.0, ..."
2,O95231,"[-0.02012803591787815, -0.04977943375706673, 0...","[1.0, 1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
3,A0A0B4J1F4,"[-0.00751461973413825, 0.06062775477766991, 0....","[1.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 0.0, 0.0, ..."
4,P54366,"[0.013468174263834953, 0.04151567816734314, 0....","[1.0, 0.0, 1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, ..."
5,P33681,"[0.001116646104492247, -0.01536268275231123, 0...","[1.0, 1.0, 1.0, 1.0, 0.0, 0.0, 0.0, 0.0, 1.0, ..."
6,P77596,"[0.03678780049085617, 0.052980050444602966, 0....","[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
7,Q16787,"[0.007108339574187994, 0.01562744379043579, 0....","[1.0, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 1.0, 0.0, ..."
8,Q59VP0,"[-0.006104866974055767, -0.026720179244875908,...","[0.0, 1.0, 0.0, 1.0, 0.0, 1.0, 0.0, 0.0, 0.0, ..."
9,P13508,"[-0.0071898759342730045, -0.02323203906416893,...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ..."


In [79]:
# Fetch the first (embedding, targets) pair through normal indexing and print both parts.
embeddings, labels = dataset[0]
print("COMPONENTS FOR FIRST PROTEIN:  ")
print("EMBEDDINGS VECTOR: \n ", embeddings, "\n")
print("TARGETS LABELS VECTOR: \n ", labels, "\n")

COMPONENTS FOR FIRST PROTEIN:  
EMBEDDINGS VECTOR: 
  tensor([ 0.0495, -0.0329,  0.0325,  ..., -0.0435,  0.0965,  0.0731]) 

TARGETS LABELS VECTOR: 
  tensor([0., 1., 0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 1., 0., 0., 0., 0., 1.,
        0., 0., 0., 1., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 1.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 1., 0., 0., 0., 1.,
        0., 1., 0., 1., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0

In [80]:
class MultiLayerPerceptron(torch.nn.Module):
    """Three-layer fully connected network returning one raw logit per class.

    Args:
        input_dim: size of the input embedding vector.
        num_classes: number of output logits.
        hidden_dims: widths of the two hidden layers; defaults to the sizes
            originally hard-coded in this notebook.

    The attribute names ``linear1``..``linear3`` are kept so that existing
    state dicts remain loadable.
    """

    def __init__(self, input_dim, num_classes, hidden_dims=(1000, 800)):
        super().__init__()
        first_hidden, second_hidden = hidden_dims

        # The original code first built a Linear(input_dim, input_dim) layer
        # under the same attribute names and immediately overwrote it; that
        # dead layer was never used and has been removed.
        self.linear1 = torch.nn.Linear(input_dim, first_hidden)
        self.activation1 = torch.nn.ReLU()
        self.linear2 = torch.nn.Linear(first_hidden, second_hidden)
        self.activation2 = torch.nn.ReLU()
        self.linear3 = torch.nn.Linear(second_hidden, num_classes)

    def forward(self, x):
        """Return the logits for a batch ``x`` whose last dimension is ``input_dim``."""
        x = self.linear1(x)
        x = self.activation1(x)
        x = self.linear2(x)
        x = self.activation2(x)
        x = self.linear3(x)
        return x

In [81]:
class CNN1D(nn.Module):
    """Small 1-D CNN that treats an embedding vector as a single-channel signal.

    Args:
        input_dim: length of the input embedding vector.
        num_classes: number of output logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        # (batch_size, channels, embed_size)
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=3, kernel_size=3, dilation=1, padding=1, stride=1)
        # (batch_size, 3, embed_size)
        self.pool1 = nn.MaxPool1d(kernel_size=2, stride=2)
        # (batch_size, 3, embed_size // 2)
        self.conv2 = nn.Conv1d(in_channels=3, out_channels=8, kernel_size=3, dilation=1, padding=1, stride=1)
        # (batch_size, 8, embed_size // 2)
        self.pool2 = nn.MaxPool1d(kernel_size=2, stride=2)
        # (batch_size, 8, (embed_size // 2) // 2)

        # Each pooling layer floors the halved length, so the flattened size must
        # be computed with two integer divisions; `int(8 * input_dim / 4)` is
        # wrong whenever input_dim is not a multiple of 4.
        pooled_length = (input_dim // 2) // 2
        self.fc1 = nn.Linear(in_features=8 * pooled_length, out_features=1024)       # 1024 is better
        self.fc2 = nn.Linear(in_features=1024, out_features=num_classes)                # 1024 is better

    def forward(self, x):
        """Return logits for a batch ``x`` of shape (batch_size, input_dim)."""
        # Add the channel dimension expected by Conv1d.
        x = x.reshape(x.shape[0], 1, x.shape[1])
        x = self.pool1(nn.functional.relu(self.conv1(x)))
        x = self.pool2(nn.functional.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        x = nn.functional.relu(self.fc1(x))
        x = self.fc2(x)
        return x

In [93]:
def _run_epoch(model, dataloader, loss_fn, metric, optimizer=None):
    """Run one pass over ``dataloader`` and return (mean batch loss, mean batch score).

    When ``optimizer`` is given the model is put in training mode and updated
    after every batch; otherwise the model is put in eval mode and the pass
    runs without building autograd graphs.
    """
    is_training = optimizer is not None
    model.train(is_training)
    losses, scores = [], []

    # Only the training pass shows a progress bar, as in the original loop.
    batches = tqdm(dataloader) if is_training else dataloader
    with torch.set_grad_enabled(is_training):
        for embed, targets in batches:
            embed, targets = embed.to(config.device), targets.to(config.device)
            preds = model(embed)
            loss = loss_fn(preds, targets)
            score = metric(preds, targets)
            losses.append(loss.item())
            scores.append(score.item())

            if is_training:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

    return np.mean(losses), np.mean(scores)


def train_model(embeddings_source, model_type="linear", train_size=0.9):
    """Train a classifier on pre-computed embeddings and return it with its history.

    Args:
        embeddings_source: key of ``embeds_map`` / ``embeds_dim`` selecting the embeddings.
        model_type: "linear" (MultiLayerPerceptron) or "conv" (CNN1D).
        train_size: fraction of the training data used for fitting; the
            remainder is used for validation.

    Returns:
        ``(model, losses_history, scores_history)`` where both histories are
        dicts with "train" and "val" lists holding one value per epoch.

    Raises:
        ValueError: if ``model_type`` is not supported.
    """
    # Validate before any expensive data loading happens.
    if model_type not in ("linear", "conv"):
        raise ValueError(f"Unknown model_type {model_type!r}; expected 'linear' or 'conv'")

    train_dataset = ProteinSequenceDataset(datatype="train", embeddings_source = embeddings_source)

    n_train = int(len(train_dataset)*train_size)
    train_set, val_set = random_split(train_dataset, lengths = [n_train, len(train_dataset)-n_train])
    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=config.batch_size, shuffle=True)
    # Shuffling brings nothing for evaluation, so the validation order is kept fixed.
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=config.batch_size, shuffle=False)

    model_class = MultiLayerPerceptron if model_type == "linear" else CNN1D
    model = model_class(input_dim=embeds_dim[embeddings_source], num_classes=config.num_labels).to(config.device)

    optimizer = torch.optim.Adam(model.parameters(), lr = config.lr)
    scheduler = ReduceLROnPlateau(optimizer, factor=0.1, patience=1)
    MultiLabelLoss = torch.nn.BCEWithLogitsLoss(weight=GO_weights.to(config.device))
    f1_score = MultilabelF1Score(num_labels=config.num_labels).to(config.device)
    n_epochs = config.n_epochs

    print("BEGIN TRAINING...")
    train_loss_history=[]
    val_loss_history=[]

    train_f1score_history=[]
    val_f1score_history=[]

    for epoch in range(n_epochs):
        print("EPOCH ", epoch+1)

        ## TRAIN PHASE :
        avg_loss, avg_score = _run_epoch(model, train_dataloader, MultiLabelLoss, f1_score, optimizer)
        print("Running Average TRAIN Loss : ", avg_loss)
        print("Running Average TRAIN F1-Score : ", avg_score)
        train_loss_history.append(avg_loss)
        train_f1score_history.append(avg_score)

        ## VALIDATION PHASE :
        avg_loss, avg_score = _run_epoch(model, val_dataloader, MultiLabelLoss, f1_score)
        print("Running Average VAL Loss : ", avg_loss)
        print("Running Average VAL F1-Score : ", avg_score)
        val_loss_history.append(avg_loss)
        val_f1score_history.append(avg_score)

        # The learning rate is reduced when the validation loss stops improving.
        scheduler.step(avg_loss)
        print("\n")

    print("TRAINING FINISHED")
    print("FINAL TRAINING SCORE : ", train_f1score_history[-1])
    print("FINAL VALIDATION SCORE : ", val_f1score_history[-1])

    losses_history = {"train" : train_loss_history, "val" : val_loss_history}
    scores_history = {"train" : train_f1score_history, "val" : val_f1score_history}

    return model, losses_history, scores_history

In [94]:
# Train the "linear" model on the T5 embeddings; keep the model and its loss/score histories.
t5_model, t5_losses, t5_scores = train_model(embeddings_source="T5", model_type="linear")

BEGIN TRAINING...
EPOCH  1


100%|██████████| 1001/1001 [00:38<00:00, 26.16it/s]


Running Average TRAIN Loss :  0.14626838810198553
Running Average TRAIN F1-Score :  0.04428508663391852
Running Average VAL Loss :  0.13415391696617007
Running Average VAL F1-Score :  0.07087949490440744


EPOCH  2


100%|██████████| 1001/1001 [00:36<00:00, 27.61it/s]


Running Average TRAIN Loss :  0.12974025100677997
Running Average TRAIN F1-Score :  0.09059316988338481
Running Average VAL Loss :  0.12943011076588715
Running Average VAL F1-Score :  0.10814577939787082


EPOCH  3


100%|██████████| 1001/1001 [00:36<00:00, 27.45it/s]


Running Average TRAIN Loss :  0.12492420246729723
Running Average TRAIN F1-Score :  0.11745217273553292
Running Average VAL Loss :  0.1263012649225337
Running Average VAL F1-Score :  0.11495284396888954


EPOCH  4


100%|██████████| 1001/1001 [00:36<00:00, 27.71it/s]


Running Average TRAIN Loss :  0.12081461632019513
Running Average TRAIN F1-Score :  0.14048802414974132
Running Average VAL Loss :  0.12476456152009112
Running Average VAL F1-Score :  0.1318848884797522


EPOCH  5


100%|██████████| 1001/1001 [00:35<00:00, 28.02it/s]


Running Average TRAIN Loss :  0.11720933650548641
Running Average TRAIN F1-Score :  0.1617087425959932
Running Average VAL Loss :  0.1229143424092659
Running Average VAL F1-Score :  0.15749583007501705


EPOCH  6


100%|██████████| 1001/1001 [00:36<00:00, 27.44it/s]


Running Average TRAIN Loss :  0.11366677759112892
Running Average TRAIN F1-Score :  0.1833227827370941
Running Average VAL Loss :  0.1224772527015635
Running Average VAL F1-Score :  0.17120778261284744


EPOCH  7


100%|██████████| 1001/1001 [00:36<00:00, 27.60it/s]


Running Average TRAIN Loss :  0.11034545358958897
Running Average TRAIN F1-Score :  0.2050083539300746
Running Average VAL Loss :  0.12262508133426309
Running Average VAL F1-Score :  0.16778317955322564


EPOCH  8


100%|██████████| 1001/1001 [00:36<00:00, 27.40it/s]


Running Average TRAIN Loss :  0.10707097929287386
Running Average TRAIN F1-Score :  0.2260847698737096
Running Average VAL Loss :  0.12145680661446281
Running Average VAL F1-Score :  0.1962526811153761


EPOCH  9


100%|██████████| 1001/1001 [00:36<00:00, 27.80it/s]


Running Average TRAIN Loss :  0.10407097927935711
Running Average TRAIN F1-Score :  0.24598436297236623
Running Average VAL Loss :  0.12215054939900126
Running Average VAL F1-Score :  0.20112482930666634


EPOCH  10


100%|██████████| 1001/1001 [00:35<00:00, 28.03it/s]


Running Average TRAIN Loss :  0.10128915519802482
Running Average TRAIN F1-Score :  0.2653060683480033
Running Average VAL Loss :  0.12265120932300176
Running Average VAL F1-Score :  0.22389395801084383


EPOCH  11


100%|██████████| 1001/1001 [00:36<00:00, 27.71it/s]


Running Average TRAIN Loss :  0.09221877809706983
Running Average TRAIN F1-Score :  0.31032411070106986
Running Average VAL Loss :  0.12143167327823383
Running Average VAL F1-Score :  0.23710714892617293


EPOCH  12


100%|██████████| 1001/1001 [00:35<00:00, 27.99it/s]


Running Average TRAIN Loss :  0.09007727240140621
Running Average TRAIN F1-Score :  0.3266248246589741
Running Average VAL Loss :  0.12149704567023686
Running Average VAL F1-Score :  0.2347098027489015


EPOCH  13


100%|██████████| 1001/1001 [00:36<00:00, 27.57it/s]


Running Average TRAIN Loss :  0.0890112217817154
Running Average TRAIN F1-Score :  0.3330147443534611
Running Average VAL Loss :  0.12251759094319173
Running Average VAL F1-Score :  0.23861480357923678


EPOCH  14


100%|██████████| 1001/1001 [00:37<00:00, 26.73it/s]


Running Average TRAIN Loss :  0.08748150450157953
Running Average TRAIN F1-Score :  0.34153592050611437
Running Average VAL Loss :  0.12241831455113632
Running Average VAL F1-Score :  0.2417687845549413


EPOCH  15


100%|██████████| 1001/1001 [00:36<00:00, 27.35it/s]


Running Average TRAIN Loss :  0.08728252749491881
Running Average TRAIN F1-Score :  0.3419895744496411
Running Average VAL Loss :  0.12197084724903107
Running Average VAL F1-Score :  0.2408410553554339


EPOCH  16


100%|██████████| 1001/1001 [00:36<00:00, 27.23it/s]


Running Average TRAIN Loss :  0.08706715705987814
Running Average TRAIN F1-Score :  0.34177587355499145
Running Average VAL Loss :  0.1221739320483591
Running Average VAL F1-Score :  0.24383732583373785


EPOCH  17


100%|██████████| 1001/1001 [00:36<00:00, 27.66it/s]


Running Average TRAIN Loss :  0.0870818231221322
Running Average TRAIN F1-Score :  0.34352206204321
Running Average VAL Loss :  0.12203066310446177
Running Average VAL F1-Score :  0.24273820527430093


EPOCH  18


100%|██████████| 1001/1001 [00:36<00:00, 27.30it/s]


Running Average TRAIN Loss :  0.08709761914197024
Running Average TRAIN F1-Score :  0.343285358571387
Running Average VAL Loss :  0.12192961181114827
Running Average VAL F1-Score :  0.2433486438489386


EPOCH  19


100%|██████████| 1001/1001 [00:36<00:00, 27.36it/s]


Running Average TRAIN Loss :  0.08703718555497598
Running Average TRAIN F1-Score :  0.34419171321582603
Running Average VAL Loss :  0.122039580983775
Running Average VAL F1-Score :  0.24392280481489642


EPOCH  20


100%|██████████| 1001/1001 [00:37<00:00, 26.90it/s]


Running Average TRAIN Loss :  0.08703083844927999
Running Average TRAIN F1-Score :  0.34290862623062524
Running Average VAL Loss :  0.12235397905377406
Running Average VAL F1-Score :  0.2425440797981407


EPOCH  21


100%|██████████| 1001/1001 [00:36<00:00, 27.28it/s]


Running Average TRAIN Loss :  0.0870434294169123
Running Average TRAIN F1-Score :  0.3441138860884008
Running Average VAL Loss :  0.12189124105498195
Running Average VAL F1-Score :  0.24299195116119726


EPOCH  22


100%|██████████| 1001/1001 [00:35<00:00, 27.98it/s]


Running Average TRAIN Loss :  0.08704494079435383
Running Average TRAIN F1-Score :  0.34393068475204036
Running Average VAL Loss :  0.12242660491860338
Running Average VAL F1-Score :  0.24439877777227334


EPOCH  23


100%|██████████| 1001/1001 [00:36<00:00, 27.78it/s]


Running Average TRAIN Loss :  0.08702752962857337
Running Average TRAIN F1-Score :  0.34342328197770305
Running Average VAL Loss :  0.12187290896794625
Running Average VAL F1-Score :  0.24500832839735917


EPOCH  24


100%|██████████| 1001/1001 [00:37<00:00, 26.48it/s]


Running Average TRAIN Loss :  0.08707654158343801
Running Average TRAIN F1-Score :  0.34350915567262785
Running Average VAL Loss :  0.12277868529781699
Running Average VAL F1-Score :  0.24247935261311276


EPOCH  25


100%|██████████| 1001/1001 [00:36<00:00, 27.75it/s]


Running Average TRAIN Loss :  0.08704605338337658
Running Average TRAIN F1-Score :  0.34369529249308467
Running Average VAL Loss :  0.12216846997450505
Running Average VAL F1-Score :  0.24323299606995924


TRAINING FINISHED
FINAL TRAINING SCORE :  0.34369529249308467
FINAL VALIDATION SCORE :  0.24323299606995924


In [None]:
# Run the trained model on the first training embedding, reshaped to a batch of one.
t5_model(dataset[0][0].reshape(1, -1).to(config.device))

In [None]:
def predict(embeddings_source, model=None, batch_size=None):
    """Score every test protein against the retained GO terms.

    Args:
        embeddings_source: "T5", "ProtBERT" or "EMS2".
        model: trained model to use. When omitted, the notebook-level variable
            matching the source (``t5_model``, ``protbert_model`` or
            ``ems2_model``) is used, as before.
        batch_size: inference batch size; defaults to ``config.batch_size``.

    Returns:
        DataFrame with columns "Id", "GO term" and "Confidence", holding
        ``config.num_labels`` consecutive rows per test protein.

    Raises:
        ValueError: if ``embeddings_source`` is not supported.
        NameError: if no model is passed and the matching model variable does not exist.
    """
    trained_model_names = {"T5": "t5_model", "ProtBERT": "protbert_model", "EMS2": "ems2_model"}
    if embeddings_source not in trained_model_names:
        raise ValueError(
            f"Unknown embeddings_source {embeddings_source!r}; "
            f"expected one of {sorted(trained_model_names)}")

    if model is None:
        model_name = trained_model_names[embeddings_source]
        if model_name not in globals():
            raise NameError(
                f"No trained model found: run train_model for {embeddings_source!r} "
                f"and store it in `{model_name}`, or pass `model=` explicitly")
        model = globals()[model_name]
    if batch_size is None:
        batch_size = config.batch_size

    test_dataset = ProteinSequenceDataset(datatype="test", embeddings_source = embeddings_source)
    test_dataloader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    model.eval()

    # Rebuild the list of retained GO terms exactly as the target-generation cell does.
    labels = pd.read_csv(config.train_labels_path, sep = "\t")
    top_terms = labels.groupby("term")["EntryID"].count().sort_values(ascending=False)
    labels_names = top_terms[:config.num_labels].index.values
    print("GENERATE PREDICTION FOR TEST SET...")

    n_rows = len(test_dataset)*config.num_labels
    ids_ = np.empty(shape=(n_rows,), dtype=object)
    go_terms_ = np.empty(shape=(n_rows,), dtype=object)
    confs_ = np.empty(shape=(n_rows,), dtype=np.float32)

    offset = 0
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for embed, entry_ids in tqdm(test_dataloader):
            probs = torch.sigmoid(model(embed.to(config.device))).cpu().numpy()
            stop = offset + probs.size
            # Row-major flattening keeps all terms of one protein contiguous.
            confs_[offset:stop] = probs.reshape(-1)
            ids_[offset:stop] = np.repeat(np.asarray(entry_ids, dtype=object), config.num_labels)
            go_terms_[offset:stop] = np.tile(labels_names, len(entry_ids))
            offset = stop

    submission_df = pd.DataFrame(data={"Id" : ids_, "GO term" : go_terms_, "Confidence" : confs_})
    print("PREDICTIONS DONE")
    return submission_df

In [None]:
# Generate the test-set predictions for the T5 embeddings and preview the first 50 rows.
submission_df = predict("T5")
submission_df.head(50)

In [None]:
class Linear_Lightning(pl.LightningModule):
    """Lightning wrapper around MultiLayerPerceptron with its own train/val split.

    Args:
        input_dim: size of the input embedding vector.
        num_classes: number of GO terms predicted.
        train_size: fraction of the training data used for fitting.
        **hparams: optional overrides:
            ``embeddings_source`` (default "T5"),
            ``batch_size`` (default ``config.batch_size``),
            ``lr`` (default ``config.lr``).

    NOTE(review): `pl` is not imported in the visible cells; this class needs
    `import pytorch_lightning as pl` (or equivalent) to be defined - confirm
    which Lightning package/version the project uses.
    """

    def __init__(self, input_dim, num_classes, train_size, **hparams):
        super().__init__()

        # These three names were previously read from undefined variables.
        embeddings_source = hparams.get("embeddings_source", "T5")
        self.batch_size = hparams.get("batch_size", config.batch_size)
        self.lr = hparams.get("lr", config.lr)

        # Use the constructor arguments rather than notebook-level globals.
        # Device placement is presumably handled by the Trainer - TODO confirm.
        self.model = MultiLayerPerceptron(input_dim=input_dim, num_classes=num_classes)

        train_dataset = ProteinSequenceDataset(datatype="train", embeddings_source = embeddings_source)
        n_train = int(len(train_dataset)*train_size)
        self.train_set, self.val_set = random_split(train_dataset, lengths = [n_train, len(train_dataset)-n_train])

        self.loss_fn = torch.nn.BCEWithLogitsLoss()

        self.f1_score = MultilabelF1Score(num_labels=num_classes)
        self.accuracy = MultilabelAccuracy(num_labels=num_classes)

    def forward(self, x):
        return self.model(x)

    def training_step(self, batch, batch_idx):
        embed, targets = batch
        preds = self(embed)
        loss = self.loss_fn(preds, targets)
        f1_score = self.f1_score(preds, targets)
        acc_score = self.accuracy(preds, targets)

        logs = {"train_loss" : loss, "f1_score" : f1_score, "accuracy_score" : acc_score}
        self.log_dict(
            logs,
            on_step=True, on_epoch=True, prog_bar=True, logger=True
        )
        return {"loss": loss, "log": logs}

    def validation_step(self, batch, batch_idx):
        embed, targets = batch
        preds = self(embed)
        loss= self.loss_fn(preds, targets)
        f1_score = self.f1_score(preds, targets)
        acc_score = self.accuracy(preds, targets)

        # Log epoch-level validation values here, so they are recorded even if
        # the separate aggregation hook below is never invoked.
        self.log_dict(
            {"val_loss": loss, "val_f1_score": f1_score, "val_accuracy_score": acc_score},
            on_step=False, on_epoch=True, prog_bar=True, logger=True
        )
        return {"val_loss": loss, "f1_score": f1_score, "accuracy_score": acc_score}

    def validation_end(self, outputs):
        # NOTE(review): recent Lightning releases do not appear to call a hook
        # with this name - confirm against the installed version. The method is
        # kept as an aggregation helper with the `ouputs` typo fixed.
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        logs = {"val_loss" : avg_loss}
        return {"avg_val_loss": avg_loss, "log": logs}

    def val_dataloader(self):
        val_dataloader = torch.utils.data.DataLoader(self.val_set, batch_size=self.batch_size, shuffle=False,)
        return val_dataloader

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        return optimizer

    def train_dataloader(self):
        # Shuffle the training batches, consistent with train_model.
        train_dataloader = torch.utils.data.DataLoader(self.train_set, batch_size=self.batch_size, shuffle=True)
        return train_dataloader

In [None]:
# Fit the Lightning module for config.n_epochs epochs.
# NOTE(review): `Trainer`, `logger` and `embeddings_source` are not defined in
# any cell visible here; this cell presumably relies on imports / variables
# from elsewhere - confirm before a Restart & Run All.
trainer = Trainer(
    max_epochs=config.n_epochs,
    limit_train_batches=5000,
    logger=logger)

# 80 % of the training data is used for fitting, the rest for validation.
model = Linear_Lightning(
    input_dim=embeds_dim[embeddings_source],
    num_classes=config.num_labels,
    train_size=0.8
)

trainer.fit(model)