<a href="https://www.kaggle.com/code/averma111/pytorch-cafa-5-prediction?scriptVersionId=130469247" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
%%capture 
!pip install torchmetrics

In [None]:
import os
import gc
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
tqdm.pandas()
import torch
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torchmetrics import AUROC


In [None]:
def get_dataframe(path):
    """Read the tab-separated file at ``path`` and return it as a DataFrame."""
    frame = pd.read_csv(filepath_or_buffer=path, sep='\t')
    return frame

In [None]:
# Paths of the two training tables; both live in the same Train directory.
_CAFA_TRAIN_DIR = '/kaggle/input/cafa-5-protein-function-prediction/Train'
train_terms = _CAFA_TRAIN_DIR + '/train_terms.tsv'
train_taxonomy = _CAFA_TRAIN_DIR + '/train_taxonomy.tsv'

In [None]:
# Preview the first rows of the train_terms table (the file is re-read from disk here).
get_dataframe(train_terms).head()

In [None]:
# Preview the first rows of the train_taxonomy table (the file is re-read from disk here).
get_dataframe(train_taxonomy).head()


In [None]:
def summary(text, df):
    """Print the shape of ``df`` and return a per-column summary table.

    Args:
        text: Label printed in front of the shape.
        df: DataFrame to describe.

    Returns:
        DataFrame indexed by the columns of ``df`` with the columns
        ``dtypes``, ``null``, ``unique``, ``min``, ``median``, ``max``,
        ``mean``, ``std`` and ``duplicate``. The five numeric statistics are
        NaN for non-numeric columns. ``duplicate`` is the number of fully
        duplicated rows, repeated on every line of the table.
    """
    print(f'{text} shape: {df.shape}')
    summ = pd.DataFrame(df.dtypes, columns=['dtypes'])
    summ['null'] = df.isnull().sum()
    summ['unique'] = df.nunique()

    # Reductions such as median/mean/min raise TypeError on categorical or
    # string columns, so compute them on the numeric columns only; index
    # alignment on assignment leaves NaN for every other column.
    numeric = df.select_dtypes(include='number')
    for stat in ('min', 'median', 'max', 'mean', 'std'):
        summ[stat] = getattr(numeric, stat)()

    summ['duplicate'] = df.duplicated().sum()
    return summ

In [None]:
def reduce_mem_usage(df):
    """Downcast the columns of ``df`` in place to reduce memory usage.

    String/object columns become ``category``; signed and unsigned integer
    columns are cast to the narrowest integer type of the same signedness
    that holds their min and max; float columns are cast to float16, float32
    or float64 by the same range test. Every other dtype (bool, datetime,
    already-categorical, nullable extension types, ...) is left untouched,
    which also makes the function safe to call twice on the same frame.

    Args:
        df: DataFrame to optimise. It is modified in place.

    Returns:
        The same DataFrame object, after conversion.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    signed_types = (np.int8, np.int16, np.int32, np.int64)
    unsigned_types = (np.uint8, np.uint16, np.uint32, np.uint64)
    float_types = (np.float16, np.float32)

    for col in df.columns:
        col_type = df[col].dtype

        if pd.api.types.is_string_dtype(col_type):
            df[col] = df[col].astype('category')
            continue
        if not isinstance(col_type, np.dtype):
            # Extension dtypes (category, nullable ints, ...) have no safe
            # NumPy downcast here.
            continue

        kind = col_type.kind
        if kind in 'iu':
            c_min = df[col].min()
            c_max = df[col].max()
            candidates = signed_types if kind == 'i' else unsigned_types
            for candidate in candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose max is exactly 127 fits int8.
                # For an empty column min/max are NaN, every comparison is
                # False and the dtype is kept.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif kind == 'f':
            c_min = df[col].min()
            c_max = df[col].max()
            # NOTE(review): the range test ignores precision; float16/float32
            # round the stored values. Kept for compatibility with the
            # original behaviour.
            target = np.float64
            for candidate in float_types:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    target = candidate
                    break
            df[col] = df[col].astype(target)

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard the percentage against a zero-sized starting frame.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

In [None]:
# Load and downcast the train_terms table, then tabulate its per-column statistics.
_terms_table = reduce_mem_usage(get_dataframe(train_terms))
summary('train_terms', _terms_table)

In [None]:
# Summarise the taxonomy table. The printed label now names the table that is
# actually loaded (it previously said 'train_terms').
summary('train_taxonomy',reduce_mem_usage(get_dataframe(train_taxonomy)))

In [None]:
# Bar chart of the number of annotation rows per value of the 'aspect' column.
_aspect_source = reduce_mem_usage(get_dataframe(train_terms))
sns.countplot(x='aspect', data=_aspect_source, color='r')

In [None]:
# Count how often each (aspect, term) pair occurs in the annotation table.
# reduce_mem_usage turns both key columns into categoricals; without
# observed=True the groupby would emit every aspect x term combination,
# including pairs that never occur (frequency 0), and the per-aspect
# distinct-term counts below would all equal the total number of terms.
_annotations = reduce_mem_usage(get_dataframe(train_terms))
terms = (
    _annotations
    .groupby(['aspect', 'term'], observed=True)['term']
    .count()
    .reset_index(name='frequency')
)
# Number of distinct terms seen in each aspect (displayed as the cell output).
terms.groupby('aspect', observed=True)['term'].nunique()

In [None]:
# Split a budget of 1500 terms between the aspects, proportionally to the
# number of distinct terms each aspect contributes.
_terms_per_aspect = terms.groupby('aspect')['term'].nunique()
_aspect_share = _terms_per_aspect / terms['term'].nunique()
fractions = (_aspect_share * 1500).apply(round)
print(fractions)

In [None]:
# Collect, for every aspect, its `number` most frequent terms into one set.
# The per-aspect frames get their own names so the accumulator set is never
# overwritten, and set.update (not DataFrame.update) receives the term list.
final_terms = set()
for aspect, number in fractions.items():
    aspect_terms = terms.loc[terms.aspect == aspect]
    top_terms = aspect_terms.nlargest(number, columns='frequency', keep='first')
    final_terms.update(top_terms.term.to_list())
    


In [None]:
# Snapshot the selected terms once and freeze a sorted order, so the position
# of every term in the label vector is reproducible and is not recomputed for
# each protein.
_default_terms = set(final_terms)
_default_term_order = sorted(_default_terms)


def assign_labels(annotate, final_terms=_default_terms):
    """Encode one protein's annotations as a 0/1 vector over the selected terms.

    Args:
        annotate: Collection of term identifiers attached to the protein
            (a set gives constant-time membership tests).
        final_terms: Terms that define the vector positions. Defaults to the
            snapshot of the module-level ``final_terms`` taken when this
            function was defined.

    Returns:
        List of ints with one entry per term of ``sorted(final_terms)``:
        1 when the term is in ``annotate``, otherwise 0.
    """
    if final_terms is _default_terms:
        ordered_terms = _default_term_order
    else:
        ordered_terms = sorted(final_terms)
    return [int(term in annotate) for term in ordered_terms]

# Gather the set of terms attached to each EntryID, then turn every set into
# a label vector with assign_labels (progress bar provided by tqdm.pandas()).
_protein_terms = reduce_mem_usage(get_dataframe(train_terms))
annotate = _protein_terms.groupby('EntryID')['term'].apply(set)
labels = annotate.progress_apply(assign_labels)
labels.head()

In [None]:
def get_features_labels(ids_path='/kaggle/input/t5embeds/train_ids.npy',
                        embeds_path='/kaggle/input/t5embeds/train_embeds.npy',
                        label_series=None):
    """Load the training embeddings and the label rows that belong to them.

    Args:
        ids_path: ``.npy`` file holding the identifiers of the embedded
            entries, in the same order as the rows of the embedding matrix.
        embeds_path: ``.npy`` file holding the embedding matrix.
        label_series: Series mapping an identifier to its label vector.
            Defaults to the module-level ``labels`` Series.

    Returns:
        Tuple ``(X, y)``: the embedding array as loaded, and an array built
        from the label vectors selected in the order given by the ids file.

    Raises:
        KeyError: If an identifier from the ids file is missing from
            ``label_series``.
    """
    if label_series is None:
        label_series = labels
    train_ids = np.load(ids_path)
    X = np.load(embeds_path)
    # Explicit label-based lookup keeps the rows of y aligned with the rows of X.
    y = np.array(label_series.loc[train_ids].to_list())
    return X, y

In [None]:
def train_test_dataset(features, labels):
    """Shuffle and split the data 80/20 with a fixed seed.

    Returns the list produced by ``train_test_split``: train features,
    held-out features, train labels, held-out labels.
    """
    split = train_test_split(
        features,
        labels,
        test_size=0.2,
        random_state=42,
        shuffle=True,
    )
    return split


_features, _targets = get_features_labels()
X_train, X_val, y_train, y_val = train_test_dataset(_features, _targets)
print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [None]:
# Use the first CUDA device when one is available, otherwise the CPU.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(_device_name)
print(device)

In [None]:
def convert_to_torch(value, target_device=None):
    """Copy ``value`` into a float32 tensor on the chosen device.

    The tensor is created without ``requires_grad``: features and labels are
    constants of the optimisation, and tracking gradients for them would make
    autograd accumulate a dataset-sized gradient buffer on every backward pass.

    Args:
        value: Array-like data to convert.
        target_device: Device for the new tensor. Defaults to the
            module-level ``device``.

    Returns:
        A ``torch.float32`` tensor holding a copy of ``value``.
    """
    if target_device is None:
        target_device = device
    return torch.tensor(data=value, dtype=torch.float32, device=target_device)

In [None]:
class CAFA5Data(Dataset):
    """Map-style dataset that pairs each feature row with its label row."""

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The dataset length is taken from the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        sample = self.X_data[index]
        target = self.y_data[index]
        return sample, target

# Move both splits to tensors and wrap them as datasets. X_val / y_val are
# rebound to tensors here, while X_train / y_train keep their original arrays.
X_data, y_data = convert_to_torch(X_train), convert_to_torch(y_train)
X_val, y_val = convert_to_torch(X_val), convert_to_torch(y_val)
train_data = CAFA5Data(X_data, y_data)
test_data = CAFA5Data(X_val, y_val)

In [None]:
class CAFA5NNetBase(torch.nn.Module):
    """Training and validation helpers shared by the CAFA5 models.

    Subclasses provide ``forward``. Its output is passed directly to
    ``F.binary_cross_entropy``, which expects probabilities in [0, 1].
    """

    def training_step(self, batch):
        """Return the binary cross-entropy loss for one ``(features, labels)`` batch."""
        features, labels = batch
        out = self(features)
        return F.binary_cross_entropy(out, labels)

    def validation_step(self, batch):
        """Return the detached loss and the AUROC score for one validation batch."""
        features, labels = batch
        out = self(features)                        # Generate predictions
        loss = F.binary_cross_entropy(out, labels)  # Calculate loss
        acc = auroc(out, labels)                    # Score from the module-level auroc()
        return {'Validation_loss': loss.detach(), 'Validation_acc': acc}

    def validation_epoch_end(self, outputs):
        """Average the per-batch results of ``validation_step`` into floats.

        The returned keys are 'Validation_loss' and 'Validation_acc', the
        names that ``epoch_end`` reads.
        """
        batch_losses = [x['Validation_loss'] for x in outputs]
        epoch_loss = torch.stack(batch_losses).mean()   # Combine losses
        batch_accs = [x['Validation_acc'] for x in outputs]
        epoch_acc = torch.stack(batch_accs).mean()      # Combine scores
        return {'Validation_loss': epoch_loss.item(), 'Validation_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the training loss, validation loss and validation score of an epoch."""
        print("Epoch [{}], Train_loss: {:.4f}, Validation_loss: {:.4f}, Validation_acc: {:.4f}".format(
            epoch, result['Train_loss'], result['Validation_loss'], result['Validation_acc']))

In [None]:
class CAFA5NNet(CAFA5NNetBase):
    """Fully connected network: input -> 256 -> 128 -> output, ReLU between
    the linear layers and a sigmoid on the output."""

    def __init__(self, input_features, output_features):
        super().__init__()
        layers = [
            torch.nn.Linear(input_features, 256),
            torch.nn.ReLU(),
            torch.nn.Linear(256, 128),
            torch.nn.ReLU(),
            torch.nn.Linear(128, output_features),
            torch.nn.Sigmoid(),
        ]
        self.network = torch.nn.Sequential(*layers)

    def forward(self, inputs):
        """Run ``inputs`` through the layer stack."""
        return self.network(inputs)

In [None]:
# Size the network from the second dimension of the training arrays, then
# move it to the selected device (the cell output displays the module).
_n_inputs, _n_outputs = X_train.shape[1], y_train.shape[1]
model = CAFA5NNet(_n_inputs, _n_outputs)
model.to(device)

In [None]:
# Training hyperparameters used when building the data loaders and calling fit().
BATCH_SIZE = 32
EPOCHS = 100
# NOTE(review): 0.1 is a large step size for Adam — confirm it is intentional.
LEARNING_RATE = 0.1
# NOTE(review): not referenced anywhere else in the visible code.
MOMENTUM = 0.9
# Optimizer class; fit() instantiates it as opt_func(model.parameters(), lr).
OPT_FUNC = torch.optim.Adam

In [None]:
def get_dataloaders(dataset_type, batch, shuffle):
    """Wrap a dataset in a ``DataLoader``.

    Args:
        dataset_type: Dataset to iterate over.
        batch: Number of samples per batch.
        shuffle: Truthy to reshuffle the samples on every pass.

    Returns:
        The configured ``DataLoader``.
    """
    # The two original branches differed only in the literal passed as
    # `shuffle`, so the flag is forwarded directly.
    return DataLoader(dataset=dataset_type, batch_size=batch, shuffle=bool(shuffle))

In [None]:
def auroc(outputs, labels):
    """Compute the multilabel AUROC of one batch of predictions.

    Args:
        outputs: Predicted probabilities; assumed to be shaped
            (batch, num_labels) — TODO confirm against the caller.
        labels: 0/1 targets with the same shape as ``outputs``.

    Returns:
        Scalar tensor with the AUROC averaged over labels (torchmetrics'
        default averaging).
    """
    # Every sample carries one independent 0/1 target per term, so this is a
    # multilabel task sized by the label width, not a 3-class problem.
    metric = AUROC(task="multilabel", num_labels=labels.shape[1]).to(outputs.device)
    # torchmetrics expects integer targets.
    return metric(outputs, labels.int())

  
@torch.no_grad()
def evaluate(model, val_loader):
    """Run ``model.validation_step`` on every batch of ``val_loader`` with
    gradients disabled and return ``model.validation_epoch_end`` of the results."""
    model.eval()
    step_outputs = []
    for batch in val_loader:
        step_outputs.append(model.validation_step(batch))
    return model.validation_epoch_end(step_outputs)

  
def fit(epochs, lr, model, train_loader, val_loader, opt_func = OPT_FUNC):
    """Train ``model`` and evaluate it after every epoch.

    Args:
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate handed to the optimizer.
        model: Module exposing ``training_step`` and ``epoch_end`` and
            usable with ``evaluate``.
        train_loader: Iterable of training batches.
        val_loader: Iterable of validation batches.
        opt_func: Optimizer class, called as ``opt_func(model.parameters(), lr)``.

    Returns:
        List with one result dict per epoch: the dict returned by
        ``evaluate`` plus a 'Train_loss' entry holding the mean batch loss.
    """
    history = []
    optimizer = opt_func(model.parameters(), lr)
    for epoch in tqdm(range(epochs)):

        model.train()
        train_losses = []
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(batch)
            loss.backward()
            optimizer.step()
            # Keep only the value: storing the attached loss would hold every
            # batch's autograd graph in memory until the end of the epoch.
            train_losses.append(loss.detach())

        result = evaluate(model, val_loader)
        # torch.stack rejects an empty list, so report NaN for an empty loader.
        if train_losses:
            result['Train_loss'] = torch.stack(train_losses).mean().item()
        else:
            result['Train_loss'] = float('nan')
        model.epoch_end(epoch, result)
        history.append(result)

    return history

In [None]:
# Shuffle only the training batches; validation order stays fixed.
train_dl = get_dataloaders(dataset_type=train_data, batch=BATCH_SIZE, shuffle=True)
val_dl = get_dataloaders(dataset_type=test_data, batch=BATCH_SIZE, shuffle=False)
history = fit(
    epochs=EPOCHS,
    lr=LEARNING_RATE,
    model=model,
    train_loader=train_dl,
    val_loader=val_dl,
    opt_func=OPT_FUNC,
)

In [None]:
def plot_accuracies(history):
    """Plot the validation score recorded for each epoch.

    Args:
        history: List of per-epoch result dicts; each must contain the
            'Validation_acc' entry that the epoch report also reads.
    """
    # The results are stored under 'Validation_acc'; 'val_acc' never exists.
    accuracies = [x['Validation_acc'] for x in history]
    plt.plot(accuracies, '-x')
    plt.xlabel('Epoch')
    plt.ylabel('Accuracy')
    plt.title('Accuracy vs. No. of epochs')


plot_accuracies(history)

In [None]:
def plot_losses(history):
    """Draw the training and validation loss curves, one point per epoch."""
    train_curve = []
    val_curve = []
    for epoch_result in history:
        # A missing 'Train_loss' becomes None; 'Validation_loss' is required.
        train_curve.append(epoch_result.get('Train_loss'))
        val_curve.append(epoch_result['Validation_loss'])

    for curve, line_style in ((train_curve, '-bx'), (val_curve, '-rx')):
        plt.plot(curve, line_style)
    plt.xlabel('Epoch')
    plt.ylabel('Loss')
    plt.legend(['Training', 'Validation'])
    plt.title('Loss vs. No. of epochs')


plot_losses(history)

In [None]:
# Load the test identifiers and test embeddings from the t5embeds directory.
_T5_EMBEDS_DIR = '/kaggle/input/t5embeds'
test_ids = np.load(_T5_EMBEDS_DIR + '/test_ids.npy')
x_test = np.load(_T5_EMBEDS_DIR + '/test_embeds.npy')

In [None]:
chunk_size = 5_000

# torch modules have no predict(); run the forward pass in eval mode, without
# autograd, one slice at a time so the whole test set is never on the device
# at once.
model.eval()
_prediction_parts = []
with torch.no_grad():
    for _start in range(0, len(x_test), chunk_size):
        _inputs = torch.as_tensor(
            x_test[_start:_start + chunk_size], dtype=torch.float32, device=device
        )
        _prediction_parts.append(model(_inputs).cpu().numpy())

# NOTE(review): the column order must match the order in which the training
# label vectors were built; sorted term order is used here — verify against
# assign_labels.
term_columns = sorted(set(final_terms))

if _prediction_parts:
    predictions = np.concatenate(_prediction_parts)
else:
    predictions = np.empty((0, len(term_columns)), dtype=np.float32)
del x_test
gc.collect()

chunks = [range(i, min(i + chunk_size, len(predictions))) for i in range(0, len(predictions), chunk_size)]

print(f"processing {len(chunks)} chunks of {chunk_size} predictions each")

_chunk_frames = []
for chunk in chunks:
    print(f"processing chunk {chunk}")
    _rows = slice(chunk.start, chunk.stop)
    sub = pd.DataFrame(data=predictions[_rows], columns=term_columns, index=test_ids[_rows])
    # Long format: one row per (test id, term) pair with its predicted score.
    sub = sub.T.unstack().reset_index(name='prediction')
    # NOTE(review): the model ends in a sigmoid, so scores are almost always
    # > 0 and this filter removes very little — confirm the intended threshold.
    sub = sub.loc[sub['prediction'] > 0]
    _chunk_frames.append(sub)

# Concatenate once instead of growing the frame inside the loop.
final_sub = pd.concat(_chunk_frames) if _chunk_frames else pd.DataFrame()

final_sub.head()