<a href="https://www.kaggle.com/code/averma111/cafa5lightning?scriptVersionId=131159495" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [1]:
%%capture 
!pip install torchmetrics

In [2]:
%%capture
!pip install torchsummary

In [4]:
## https://www.kaggle.com/code/alexandervc/baseline-multilabel-to-multitarget-binary#Load-train-features---precalculated-embeddings-for-the-proteins
import os
import gc
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()
import torch
import warnings
warnings.filterwarnings('ignore')
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchmetrics import AUROC
from torchsummary import summary as torchsummary
import pytorch_lightning as pl


In [None]:
def get_dataframe(path):
    """Load a tab-separated file into a pandas DataFrame.

    Parameters
    ----------
    path : str or file-like
        Anything ``pd.read_csv`` accepts as its first argument.

    Returns
    -------
    pd.DataFrame
        The parsed table, with the first line used as the header.
    """
    frame = pd.read_csv(path, delimiter='\t')
    return frame

In [None]:
# Paths to the training TSV files under the Kaggle input directory.
# NOTE: `train_terms` starts out as a path string here; a later cell rebinds
# the same name to the loaded DataFrame.
train_terms = '/kaggle/input/cafa-5-protein-function-prediction/Train/train_terms.tsv'
train_taxonomy ='/kaggle/input/cafa-5-protein-function-prediction/Train/train_taxonomy.tsv'

In [None]:
# Read the TSV at `train_terms` and display its first rows.
get_dataframe(train_terms).head()

In [None]:
# Read the TSV at `train_taxonomy` and display its first rows.
get_dataframe(train_taxonomy).head()

In [None]:
def _column_extremes(df, how):
    """Return ``Series.<how>()`` ('min' or 'max') for every column, NaN where undefined."""
    results = {}
    for col in df.columns:
        try:
            results[col] = getattr(df[col], how)()
        except TypeError:
            # e.g. unordered categoricals or mixed-type object columns have
            # no meaningful ordering; report "not available" instead of failing.
            results[col] = np.nan
    # Build as object first (safe for an empty frame), then let pandas pick a
    # numeric dtype when every entry is numeric.
    return pd.Series(results, dtype=object).infer_objects()


def summary(text, df):
    """Print the frame's shape and return a per-column summary table.

    Parameters
    ----------
    text : str
        Label printed in front of the shape.
    df : pd.DataFrame
        Frame to describe.

    Returns
    -------
    pd.DataFrame
        One row per column of ``df`` with the columns ``dtypes``, ``null``,
        ``unique``, ``min``, ``median``, ``max``, ``mean``, ``std`` and
        ``duplicate``. Statistics that are undefined for a column (for
        example the mean of a string/categorical column) are NaN.
        ``duplicate`` is the number of fully duplicated rows in ``df`` and is
        therefore the same value on every row of the summary.
    """
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()
    # min/max are evaluated column by column so one non-orderable column
    # (such as an unordered categorical) cannot abort the whole summary.
    summ['min'] = _column_extremes(df, 'min')
    # numeric_only=True: recent pandas raises TypeError on non-numeric columns
    # instead of silently dropping them; excluded columns align to NaN.
    summ['median'] = df.median(numeric_only=True)
    summ['max'] = _column_extremes(df, 'max')
    summ['mean'] = df.mean(numeric_only=True)
    summ['std'] = df.std(numeric_only=True)
    summ['duplicate'] = df.duplicated().sum()
    return summ

In [None]:
def reduce_mem_usage(df, use_float16=True):
    """Shrink a DataFrame's memory footprint by narrowing column dtypes.

    The frame is modified IN PLACE and also returned.

    * ``object`` columns are converted to ``category``.
    * NumPy integer columns are narrowed to the smallest signed integer type
      whose range strictly contains the column's min and max.
    * NumPy floating columns are narrowed to ``float16`` / ``float32`` when
      their value range allows it.
    * Every other dtype (bool, datetime, category, pandas extension dtypes,
      ...) is left untouched, so the function can safely be applied to its
      own output.
    * A column is never widened, and empty / all-NaN columns are skipped.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to compact (mutated).
    use_float16 : bool, default True
        Allow narrowing floats to ``float16``. The decision is based on value
        RANGE only, so ``float16`` can lose precision; pass ``False`` to stop
        at ``float32``.

    Returns
    -------
    pd.DataFrame
        The same ``df`` object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if use_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object:
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer/float dtypes are downcast. Unsigned ints and
        # bools must not be routed through the float branch (float16 cannot
        # represent integers above 2048 exactly), and categorical/datetime
        # columns have no numeric min/max to compare against.
        if not isinstance(col_type, np.dtype):
            continue
        if np.issubdtype(col_type, np.integer):
            candidates, limits_of, is_integer = int_candidates, np.iinfo, True
        elif np.issubdtype(col_type, np.floating):
            candidates, limits_of, is_integer = float_candidates, np.finfo, False
        else:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: nothing to base a decision on.
            continue
        if is_integer:
            # Python ints compare exactly against iinfo bounds, including for
            # uint64 values beyond the int64 range.
            c_min, c_max = int(c_min), int(c_max)

        for candidate in candidates:
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                # Candidates are ordered by size; from here on a cast would
                # not save memory (or would widen the column).
                break
            bounds = limits_of(candidate)
            if c_min > bounds.min and c_max < bounds.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

In [None]:
# Load the terms table from disk, compact its dtypes, and show per-column statistics.
summary('train_terms',reduce_mem_usage(get_dataframe(train_terms)))

In [None]:
# Bar chart of row counts per 'aspect' value; the table is re-read and re-compacted just for this plot.
sns.countplot(data=reduce_mem_usage(get_dataframe(train_terms)),x='aspect',color='r')

In [None]:
# NOTE: this rebinds `train_terms` from a path string to the loaded, compacted
# DataFrame. After it runs, this cell and any earlier cell that calls
# get_dataframe(train_terms) can no longer be re-run without first re-running
# the cell that assigns the path.
train_terms=reduce_mem_usage(get_dataframe(train_terms))

In [None]:
def get_train_dataset(ids_path='/kaggle/input/t5embeds/train_ids.npy',
                      embeds_path='/kaggle/input/t5embeds/train_embeds.npy'):
    """Load the saved id array and embedding matrix for the training set.

    Parameters
    ----------
    ids_path : str, optional
        ``.npy`` file holding one id per embedding row.
    embeds_path : str, optional
        ``.npy`` file holding a 2-D array with one row per id.

    Returns
    -------
    (pd.DataFrame, np.ndarray)
        The embeddings as a DataFrame whose columns are named ``Column_1`` ..
        ``Column_<width>``, and the id array exactly as loaded.

    Raises
    ------
    ValueError
        If the number of ids differs from the number of embedding rows; the
        two arrays are matched by position, so a mismatch would silently
        misalign features and ids.
    """
    train_protein_ids = np.load(ids_path)
    train_embeddings = np.load(embeds_path)

    if train_protein_ids.shape[0] != train_embeddings.shape[0]:
        raise ValueError(
            'ids/embeddings row mismatch: {} ids vs {} embedding rows'.format(
                train_protein_ids.shape[0], train_embeddings.shape[0]))

    column_num = train_embeddings.shape[1]
    # Column names are 1-based.
    column_names = ["Column_" + str(i) for i in range(1, column_num + 1)]
    train = pd.DataFrame(train_embeddings, columns=column_names)
    return train,train_protein_ids

# Load the embedding frame and the id array, then report both shapes.
train,train_protein_ids = get_train_dataset()
print(train.shape,train_protein_ids.shape)

In [None]:
# Upper bound on how many of the most frequent terms are kept as labels.
num_of_labels = 1500


def get_label_train_terms(df):
    """Select the most frequent terms and the rows that use them.

    Parameters
    ----------
    df : pd.DataFrame
        Table with a ``term`` column.

    Returns
    -------
    (list, pd.DataFrame)
        The first ``num_of_labels`` entries of ``df['term'].value_counts()``
        (most frequent first), and the rows of ``df`` whose ``term`` is one of
        them.
    """
    term_frequencies = df['term'].value_counts()
    top_terms = term_frequencies.index[:num_of_labels].tolist()
    keep_mask = df['term'].isin(top_terms)
    return top_terms, df.loc[keep_mask]

# Keep the most frequent terms (at most `num_of_labels`) and the rows of the terms table that mention them.
labels_count,train_terms_updated=get_label_train_terms(train_terms)

In [None]:
def show_pit_aspects():
    """Draw a pie chart of how often each 'aspect' value occurs.

    Reads the module-level ``train_terms_updated`` frame; each slice is
    labelled with the aspect value and its share as a whole-number percentage.
    """
    aspect_counts = train_terms_updated['aspect'].value_counts()
    plt.pie(
        aspect_counts.values,
        labels=np.array(aspect_counts.index),
        colors=sns.color_palette('pastel'),
        autopct='%.0f%%',
    )
    plt.show()


show_pit_aspects()

In [None]:
def get_labels(train_protein_ids, terms_df=None, label_names=None):
    """Build a multi-hot label matrix: one row per id, one column per term.

    Parameters
    ----------
    train_protein_ids : np.ndarray
        1-D array of ids; row ``r`` of the result describes
        ``train_protein_ids[r]``.
    terms_df : pd.DataFrame, optional
        Table with ``EntryID`` and ``term`` columns. Defaults to the
        module-level ``train_terms_updated``.
    label_names : sequence, optional
        Terms to encode, in column order. Defaults to the first
        ``num_of_labels`` entries of the module-level ``labels_count``.

    Returns
    -------
    np.ndarray
        Float array of shape ``(len(train_protein_ids), len(label_names))``
        where entry ``[r, c]`` is 1.0 if ``terms_df`` has a row pairing id
        ``r`` with term ``c`` and 0.0 otherwise.
    """
    if terms_df is None:
        terms_df = train_terms_updated
    if label_names is None:
        label_names = labels_count[:num_of_labels]
    label_names = list(label_names)

    train_size = train_protein_ids.shape[0] # len(X)
    train_labels = np.zeros((train_size, len(label_names)))
    series_train_protein_ids = pd.Series(train_protein_ids)

    # Collect the ids of every requested term in ONE pass over the table,
    # rather than re-filtering the whole frame once per label.
    wanted = set(label_names)
    grouped_ids = terms_df.groupby('term', observed=True, sort=False)['EntryID']
    proteins_by_term = {
        term: entry_ids.unique()
        for term, entry_ids in grouped_ids
        if term in wanted
    }

    for col_idx, term in enumerate(label_names):
        label_related_proteins = proteins_by_term.get(term)
        if label_related_proteins is None:
            # Term never occurs in the table: its column stays all zeros.
            continue
        train_labels[:, col_idx] = series_train_protein_ids.isin(label_related_proteins).astype(float)
    return train_labels


# Build the multi-hot target matrix, one row per entry of `train_protein_ids`.
train_labels=get_labels(train_protein_ids)

# Wrap the matrix in a DataFrame whose columns are the selected term names, then report its shape.
labels = pd.DataFrame(data = train_labels, columns = labels_count)
print(labels.shape)

In [None]:
def train_test_dataset(features,labels):
    """Split features and labels into two aligned partitions.

    Delegates to ``train_test_split`` with shuffling enabled and a fixed
    ``random_state`` of 42, and returns that function's result unchanged.
    """
    split_options = {'shuffle': True, 'random_state': 42}
    return train_test_split(features, labels, **split_options)

# Split the embedding frame and the label frame into train/validation parts and report the four shapes.
X_train,X_val,y_train,y_val = train_test_dataset(train,labels)
print(X_train.shape,X_val.shape,y_train.shape,y_val.shape)