In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from Bio import SeqIO
import h5py
from sklearn.preprocessing import MultiLabelBinarizer


In [2]:
# Specify the path to the project folder / the folder where data is located
path = '/Users/aleksandramaslova/Downloads/biological_data_pfp/train/'

## Part 1 - Dataset Preparation

Download annotations from CSV - file

In [3]:
data = pd.read_csv(path+'train_set.tsv', sep='\t')
data

Unnamed: 0,Protein_ID,aspect,GO_term
0,P91124,cellular_component,GO:0005575
1,P91124,cellular_component,GO:0110165
2,P91124,cellular_component,GO:0005737
3,P91124,cellular_component,GO:0005622
4,P91124,cellular_component,GO:0043226
...,...,...,...
4277042,P28271,biological_process,GO:0010608
4277043,P28271,biological_process,GO:0080090
4277044,P28271,biological_process,GO:0006417
4277045,P28271,biological_process,GO:0051246


In [4]:
# Check number of unique GO-terms for each aspect

print(f'Number of unique GO-Terms for biological process: {len(data[data.aspect == 'biological_process'].GO_term.unique())}')
print(f'Number of unique GO-Terms for molecular function: {len(data[data.aspect == 'molecular_function'].GO_term.unique())}')
print(f'Number of unique GO-Terms for cellular component: {len(data[data.aspect == 'cellular_component'].GO_term.unique())}')

Number of unique GO-Terms for biological process: 1487
Number of unique GO-Terms for molecular function: 839
Number of unique GO-Terms for cellular component: 678


In [5]:
# Collect all relevant GO-terms for each protein for each aspect
data = data.groupby(['Protein_ID', 'aspect'])['GO_term'].apply(set).reset_index()
data

Unnamed: 0,Protein_ID,aspect,GO_term
0,A0A009IHW8,biological_process,"{GO:0072521, GO:1901564, GO:0044237, GO:000679..."
1,A0A009IHW8,molecular_function,"{GO:0016787, GO:0016799, GO:0016798, GO:000382..."
2,A0A021WW32,biological_process,"{GO:0048468, GO:0048469, GO:0006996, GO:002170..."
3,A0A021WW32,cellular_component,"{GO:0043232, GO:0005634, GO:0000793, GO:009908..."
4,A0A021WZA4,cellular_component,"{GO:0110165, GO:0071944, GO:0005575, GO:000588..."
...,...,...,...
223395,X6RLK1,cellular_component,"{GO:0070013, GO:0005634, GO:0110165, GO:000562..."
223396,X6RLN4,cellular_component,"{GO:0110165, GO:0005622, GO:0005575, GO:000582..."
223397,X6RLP6,cellular_component,"{GO:0070013, GO:0005634, GO:0110165, GO:000562..."
223398,X6RLR1,cellular_component,"{GO:0070013, GO:0043232, GO:0005634, GO:011016..."


Download sequence information

In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parse the FASTA file to extract protein IDs and sequences
def parse_fasta(file_path):
    """
    Parses a FASTA file to extract protein IDs and sequences.
    Returns:
        protein_ids (list): A list of protein IDs.
        sequences (list): A list of corresponding sequences.
    """
    protein_ids = []
    sequences = []
    for record in SeqIO.parse(file_path, "fasta"):
        protein_id = record.id.split("|")[0]
        protein_ids.append(protein_id)
        sequences.append(str(record.seq))
    return protein_ids, sequences

# One-Hot Encoding Function for Protein Sequences
def one_hot_encode(sequence, vocab="ACDEFGHIKLMNPQRSTVWY"):
    """
    One-hot encodes a single protein sequence.
    Args:
        sequence (str): Protein sequence.
        vocab (str): Vocabulary of amino acid characters.
    Returns:
        np.ndarray: One-hot encoded matrix.
    """
    encoding = np.zeros((len(sequence), len(vocab)), dtype=int)
    char_to_index = {char: idx for idx, char in enumerate(vocab)}
    for i, char in enumerate(sequence):
        if char in char_to_index:
            encoding[i, char_to_index[char]] = 1
    return encoding

# Process Sequences (One-hot encode, pad, and return DataFrame)
def process_sequences(protein_ids, sequences, max_length, vocab="ACDEFGHIKLMNPQRSTVWY"):
    """
    One-hot encodes and pads/truncates protein sequences and returns them as a DataFrame.
    Args:
        protein_ids (list): List of protein IDs.
        sequences (list): List of protein sequences.
        max_length (int): Maximum length for padding/truncation.
        vocab (str): Vocabulary of amino acid characters.
    Returns:
        DataFrame: Table with protein IDs and corresponding processed (padded) sequences.
    """
    processed_data = []
    for protein_id, sequence in zip(protein_ids, sequences):
        encoded = one_hot_encode(sequence, vocab)

        # Pad or truncate the sequence to the max_length
        padded = pad_sequences(
            [np.argmax(encoded, axis=1)],  # Convert to integer encoding for padding
            maxlen=max_length,
            padding="post"
        )[0]  # Extract the single padded sequence

        processed_data.append([protein_id, padded])

    df = pd.DataFrame(processed_data, columns=["Protein_ID", "Processed_Seq"])
    return df


In [None]:
file_path = path+"train.fasta"
max_length = 2000
protein_ids, sequences = parse_fasta(file_path)
df_processed = process_sequences(protein_ids, sequences, max_length)

data = data.merge(df_processed, on='Protein_ID')
data

Download Domain information

In [6]:
# Step 1: Parse the train_protein2ipr.dat file to extract Protein_ID and InterPro domains
def parse_interpro_file(file_path):
    protein_to_domains = {}
    
# Read the InterPro data
    with open(file_path, "r") as f:
         for line in f:
            
             # Split the line by tab
            columns = line.strip().split("\t")
            
             # Ensure there are at least two columns: Protein_ID and InterPro domain
            if len(columns) < 2:
                print(f"Skipping malformed line: {line.strip()}")
                continue
            
             # Extract Protein_ID and InterPro domain (first two columns)
            protein_id, interpro_domain = columns[0], columns[1]
            
             # Add the domain to the list of domains for the protein
            if protein_id not in protein_to_domains:
                protein_to_domains[protein_id] = []
            protein_to_domains[protein_id].append(interpro_domain)
              
             
    
    return protein_to_domains


# Step 2: Create the DataFrame with Protein_ID and associated InterPro domains
def create_interpro_feature(file_path):
     # Parse the InterPro file
     interpro_data = parse_interpro_file(file_path)
    
     # Convert the dictionary into a pandas DataFrame
     protein_ids = list(interpro_data.keys())
     domains = ["; ".join(interpro_data[protein_id]) for protein_id in protein_ids]  # Combine multiple domains with ';'
    
     # Create a DataFrame with Protein_ID and InterPro_Domains
     df_domains = pd.DataFrame({
         'Protein_ID': protein_ids,
         'InterPro_Domains': domains
     })
     return df_domains

In [7]:
interpro_file_path = path + 'train_protein2ipr.dat'
prot_dom_dict = create_interpro_feature(interpro_file_path)

data = data.merge(prot_dom_dict, on='Protein_ID')
data

Unnamed: 0,Protein_ID,aspect,GO_term,InterPro_Domains
0,A0A009IHW8,biological_process,"{GO:0072521, GO:1901564, GO:0044237, GO:000679...",IPR000157; IPR000157; IPR000157; IPR035897; IP...
1,A0A009IHW8,molecular_function,"{GO:0016787, GO:0016799, GO:0016798, GO:000382...",IPR000157; IPR000157; IPR000157; IPR035897; IP...
2,A0A021WW32,biological_process,"{GO:0048468, GO:0048469, GO:0006996, GO:002170...",IPR006910; IPR039781; IPR049589
3,A0A021WW32,cellular_component,"{GO:0043232, GO:0005634, GO:0000793, GO:009908...",IPR006910; IPR039781; IPR049589
4,A0A021WZA4,cellular_component,"{GO:0110165, GO:0071944, GO:0005575, GO:000588...",IPR004481; IPR004481; IPR004837; IPR004837; IP...
...,...,...,...,...
214165,X6RKQ2,cellular_component,"{GO:0005634, GO:0005654, GO:0043231, GO:004323...",IPR015007
214166,X6RKS3,cellular_component,"{GO:0005634, GO:0110165, GO:0005622, GO:004322...",IPR000817; IPR000817; IPR000817; IPR000817; IP...
214167,X6RLP6,cellular_component,"{GO:0070013, GO:0005634, GO:0110165, GO:000562...",IPR000504; IPR000504; IPR000504; IPR012677; IP...
214168,X6RLR1,cellular_component,"{GO:0070013, GO:0043232, GO:0005634, GO:011016...",IPR009991; IPR009991


In [8]:
data["InterPro_Domains"] = data["InterPro_Domains"].apply(lambda x: [domain.strip() for domain in x.split(";")])
data

Unnamed: 0,Protein_ID,aspect,GO_term,InterPro_Domains
0,A0A009IHW8,biological_process,"{GO:0072521, GO:1901564, GO:0044237, GO:000679...","[IPR000157, IPR000157, IPR000157, IPR035897, I..."
1,A0A009IHW8,molecular_function,"{GO:0016787, GO:0016799, GO:0016798, GO:000382...","[IPR000157, IPR000157, IPR000157, IPR035897, I..."
2,A0A021WW32,biological_process,"{GO:0048468, GO:0048469, GO:0006996, GO:002170...","[IPR006910, IPR039781, IPR049589]"
3,A0A021WW32,cellular_component,"{GO:0043232, GO:0005634, GO:0000793, GO:009908...","[IPR006910, IPR039781, IPR049589]"
4,A0A021WZA4,cellular_component,"{GO:0110165, GO:0071944, GO:0005575, GO:000588...","[IPR004481, IPR004481, IPR004837, IPR004837, I..."
...,...,...,...,...
214165,X6RKQ2,cellular_component,"{GO:0005634, GO:0005654, GO:0043231, GO:004323...",[IPR015007]
214166,X6RKS3,cellular_component,"{GO:0005634, GO:0110165, GO:0005622, GO:004322...","[IPR000817, IPR000817, IPR000817, IPR000817, I..."
214167,X6RLP6,cellular_component,"{GO:0070013, GO:0005634, GO:0110165, GO:000562...","[IPR000504, IPR000504, IPR000504, IPR012677, I..."
214168,X6RLR1,cellular_component,"{GO:0070013, GO:0043232, GO:0005634, GO:011016...","[IPR009991, IPR009991]"


Download Embeddings

In [None]:
def load_prott5_embeddings(file_path):
    protein_ids = []
    embeddings = []

    with h5py.File(file_path, "r") as f:
        for protein_id in f.keys():
            embedding = f[protein_id][:]
            protein_ids.append(protein_id)
            embeddings.append(embedding)

    df_embeddings = pd.DataFrame({
        'Protein_ID': protein_ids,
        'Embedding': embeddings
    })

    return df_embeddings

In [None]:
df_embeddings = load_prott5_embeddings(path+"train_embeddings.h5")

data = data.merge(df_embeddings, on='Protein_ID')
data

In [None]:
# Apply L2-normalization to Embedding feature

def l2_normalize(embedding):
    norm = np.linalg.norm(embedding)
    if norm == 0:  # Avoid division by zero
        return embedding
    return embedding / norm

data['Embedding_norm'] = data['Embedding'].apply(l2_normalize)
data

Dataset Preparation for Machine Learning Models

In [9]:
# Divide three aspects into separate dataframes

bp_df = data[data['aspect'] == 'biological_process'].copy()
mf_df = data[data['aspect'] == 'molecular_function'].copy()
cc_df = data[data['aspect'] == 'cellular_component'].copy()

In [10]:
def encode_go_terms(dataframe):
    # Prepare for multi-label binarizer
    go_terms = dataframe['GO_term'].values.tolist()
    
    mlb = MultiLabelBinarizer()
    encoded_go_terms = mlb.fit_transform(go_terms)
    
    return dataframe['Protein_ID'].values, encoded_go_terms, mlb.classes_

In [11]:
prot_ids_bp, encoded_go_terms_bp, go_classes_bp = encode_go_terms(bp_df)
prot_ids_mf, encoded_go_terms_mf, go_classes_mf = encode_go_terms(mf_df)
prot_ids_cc, encoded_go_terms_cc, go_classes_cc = encode_go_terms(cc_df)

In [12]:
go_classes_bp

array(['GO:0000003', 'GO:0000041', 'GO:0000070', ..., 'GO:2001233',
       'GO:2001234', 'GO:2001242'], dtype=object)

In [13]:
go_classes_bp.shape

(1487,)

In [19]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

def encode_domains(dataframe):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataframe['InterPro_Domains'])

    max_domains = 303 # that is the largest number of domains withing the whole dataset (across 3 aspects) - must be adjasted for max number of domains for test_set

    dataframe["Domain_Tokenized"] = dataframe['InterPro_Domains'].apply(lambda x: tokenizer.texts_to_sequences([x])[0])

    # Pad the sequences to ensure they all have the same length
    dataframe["Padded_Domains"] = dataframe["Domain_Tokenized"].apply(lambda x: pad_sequences([x], maxlen=max_domains, padding='post')[0])
    return dataframe

In [20]:
bp_df = encode_domains(bp_df)
mf_df = encode_domains(mf_df)
cc_df = encode_domains(cc_df)

In [21]:
bp_df

Unnamed: 0,Protein_ID,aspect,GO_term,InterPro_Domains,Domain_Tokenized,Padded_Domains
0,A0A009IHW8,biological_process,"{GO:0072521, GO:1901564, GO:0044237, GO:000679...","[IPR000157, IPR000157, IPR000157, IPR035897, I...","[212, 212, 212, 304, 304]","[212, 212, 212, 304, 304, 0, 0, 0, 0, 0, 0, 0,..."
2,A0A021WW32,biological_process,"{GO:0048468, GO:0048469, GO:0006996, GO:002170...","[IPR006910, IPR039781, IPR049589]","[3763, 3934, 6468]","[3763, 3934, 6468, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
8,A0A023FFD0,biological_process,"{GO:0044092, GO:2000146, GO:0032101, GO:002305...",[IPR045797],[11316],"[11316, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,..."
13,A0A023GPJ3,biological_process,"{GO:0023057, GO:0002791, GO:0048523, GO:004851...","[IPR013087, IPR013087, IPR013087, IPR013087, I...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
15,A0A023GPK8,biological_process,"{GO:0090596, GO:0032502, GO:0032501, GO:005079...","[IPR003598, IPR003598, IPR003598, IPR003598, I...","[33, 33, 33, 33, 33, 33, 21, 21, 21, 21, 21, 2...","[33, 33, 33, 33, 33, 33, 21, 21, 21, 21, 21, 2..."
...,...,...,...,...,...,...
214120,X5KCU9,biological_process,"{GO:0010817, GO:1901360, GO:0008202, GO:000998...","[IPR001296, IPR028098]","[1434, 2572]","[1434, 2572, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
214122,X5KJC0,biological_process,"{GO:0010817, GO:1901360, GO:0008202, GO:000998...","[IPR001128, IPR001128, IPR001128, IPR001128, I...","[29, 29, 29, 29, 29, 1681, 1681, 1681, 796, 79...","[29, 29, 29, 29, 29, 1681, 1681, 1681, 796, 79..."
214124,X5L1L5,biological_process,"{GO:0010817, GO:1901360, GO:0008202, GO:000998...","[IPR001128, IPR001128, IPR001128, IPR001128, I...","[29, 29, 29, 29, 29, 1681, 1681, 1681, 796, 79...","[29, 29, 29, 29, 29, 1681, 1681, 1681, 796, 79..."
214126,X5L565,biological_process,"{GO:0010817, GO:1901360, GO:0008202, GO:000998...","[IPR001128, IPR001128, IPR001128, IPR001128, I...","[29, 29, 29, 29, 29, 1681, 1681, 1681, 796, 79...","[29, 29, 29, 29, 29, 1681, 1681, 1681, 796, 79..."


In [22]:
# Prepare features array and label array for future model
X_bp = np.vstack(np.array(bp_df['Padded_Domains'])).astype(np.float32)
y_bp = encoded_go_terms_bp

X_mf = np.vstack(np.array(mf_df['Padded_Domains'])).astype(np.float32)
y_mf = encoded_go_terms_mf

X_cc = np.vstack(np.array(cc_df['Padded_Domains'])).astype(np.float32)
y_cc = encoded_go_terms_cc

In [23]:
print(f'The shape of X_bp is: {X_bp.shape}')
print(f'The shape of y_bp is: {y_bp.shape}')
print(f'The shape of X_mf is: {X_mf.shape}')
print(f'The shape of y_mf is: {y_mf.shape}')
print(f'The shape of X_cc is: {X_cc.shape}')
print(f'The shape of y_cc is: {y_cc.shape}')

The shape of X_bp is: (79859, 303)
The shape of y_bp is: (79859, 1487)
The shape of X_mf is: (54545, 303)
The shape of y_mf is: (54545, 839)
The shape of X_cc is: (79766, 303)
The shape of y_cc is: (79766, 678)


In [None]:
"""
from sklearn.model_selection import train_test_split

X_train_bp, X_test_bp, y_train_bp, y_test_bp = train_test_split(X_bp, y_bp, test_size=0.3, random_state=42)
X_val_bp, X_test_bp, y_val_bp, y_test_bp = train_test_split(X_test_bp, y_test_bp, test_size=0.5, random_state=42)

X_train_mf, X_test_mf, y_train_mf, y_test_mf = train_test_split(X_mf, y_mf, test_size=0.3, random_state=42)
X_val_mf, X_test_mf, y_val_mf, y_test_mf = train_test_split(X_test_mf, y_test_mf, test_size=0.5, random_state=42)

X_train_cc, X_test_cc, y_train_cc, y_test_cc = train_test_split(X_cc, y_cc, test_size=0.3, random_state=42)
X_val_cc, X_test_cc, y_val_cc, y_test_cc = train_test_split(X_test_cc, y_test_cc, test_size=0.5, random_state=42)

"""


## Part 2 - Model Architecture and Training

In [24]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization

def build_model(input_dim, num_classes):
    model = Sequential([
        Dense(2000, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),  # Add Batch Norm after Dense
        Dropout(0.3),
        Dense(1500, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(1500, activation='relu'),
        BatchNormalization(),
        Dropout(0.3),
        Dense(num_classes, activation='sigmoid')
    ])
    return model


Training of the model for GO-terms related to Cellular Component Aspect

In [27]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

# 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

fold = 1
accuracy_scores_cc = []
precision_scores_cc =[]
recall_scores_cc = []

for train_index, val_index in kf.split(X_cc):
    print(f"Training on Fold {fold}...")

    # Split dataset into training and validation sets
    X_train, X_val = X_cc[train_index], X_cc[val_index]
    y_train, y_val = y_cc[train_index], y_cc[val_index]

    # Build and compile the model
    model_cc = build_model(input_dim=303, num_classes=678)
    model_cc.compile(optimizer=Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])

    # Train model
    model_cc.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_val, y_val))

    # Evaluate model
    val_predictions = model_cc.predict(X_val)
    val_predictions = (val_predictions > 0.5).astype(int)  # Convert probabilities to binary
    acc = accuracy_score(y_val.flatten(), val_predictions.flatten())
    prec = precision_score(y_val.flatten(), val_predictions.flatten(), zero_division=0)  # Handle zero division
    rec = recall_score(y_val.flatten(), val_predictions.flatten(), zero_division=0)
    accuracy_scores_cc.append(acc)
    precision_scores_cc.append(prec)
    recall_scores_cc.append(rec)

    print(f"Fold {fold} Accuracy: {acc:.4f}\n")
    print(f"Fold {fold} Precision: {prec:.4f}\n")
    print(f"Fold {fold} Recall: {rec:.4f}\n")
    fold += 1

# Print final average accuracy
print(f"Average Accuracy across 10 folds: {np.mean(accuracy_scores_cc):.4f}")
print(f"Average Precision across 10 folds: {np.mean(precision_scores_cc):.4f}")
print(f"Average Recall across 10 folds: {np.mean(recall_scores_cc):.4f}")


Training on Fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.5230 - loss: 0.2049 - precision_11: 0.1615 - recall_11: 0.4345 - val_accuracy: 0.8692 - val_loss: 0.0507 - val_precision_11: 0.7440 - val_recall_11: 0.4398
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8701 - loss: 0.0515 - precision_11: 0.7398 - recall_11: 0.4283 - val_accuracy: 0.8692 - val_loss: 0.0506 - val_precision_11: 0.7470 - val_recall_11: 0.4358
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8710 - loss: 0.0508 - precision_11: 0.7429 - recall_11: 0.4334 - val_accuracy: 0.8692 - val_loss: 0.0499 - val_precision_11: 0.7473 - val_recall_11: 0.4421
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8679 - loss: 0.0503 - precision_11: 0.7478 - recall_11: 0.4345 - val_accuracy: 0.8692 - val_loss: 0.0500 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 18ms/step - accuracy: 0.5204 - loss: 0.2062 - precision_12: 0.1607 - recall_12: 0.4349 - val_accuracy: 0.8664 - val_loss: 0.0515 - val_precision_12: 0.7433 - val_recall_12: 0.4353
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.8676 - loss: 0.0516 - precision_12: 0.7403 - recall_12: 0.4276 - val_accuracy: 0.8664 - val_loss: 0.0506 - val_precision_12: 0.7521 - val_recall_12: 0.4311
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8699 - loss: 0.0507 - precision_12: 0.7443 - recall_12: 0.4325 - val_accuracy: 0.8664 - val_loss: 0.0509 - val_precision_12: 0.7466 - val_recall_12: 0.4325
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8701 - loss: 0.0501 - precision_12: 0.7469 - recall_12: 0.4329 - val_accuracy: 0.8664 - val_loss: 0.0497 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 18ms/step - accuracy: 0.4934 - loss: 0.2054 - precision_13: 0.1601 - recall_13: 0.4320 - val_accuracy: 0.8540 - val_loss: 0.0503 - val_precision_13: 0.7214 - val_recall_13: 0.4391
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8671 - loss: 0.0519 - precision_13: 0.7404 - recall_13: 0.4259 - val_accuracy: 0.8711 - val_loss: 0.0490 - val_precision_13: 0.7527 - val_recall_13: 0.4353
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8673 - loss: 0.0507 - precision_13: 0.7434 - recall_13: 0.4309 - val_accuracy: 0.8711 - val_loss: 0.0491 - val_precision_13: 0.7526 - val_recall_13: 0.4414
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8700 - loss: 0.0500 - precision_13: 0.7471 - recall_13: 0.4355 - val_accuracy: 0.8711 - val_loss: 0.0486 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 19ms/step - accuracy: 0.5431 - loss: 0.2047 - precision_14: 0.1631 - recall_14: 0.4371 - val_accuracy: 0.8751 - val_loss: 0.0506 - val_precision_14: 0.7504 - val_recall_14: 0.4304
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 20ms/step - accuracy: 0.8683 - loss: 0.0513 - precision_14: 0.7401 - recall_14: 0.4298 - val_accuracy: 0.8751 - val_loss: 0.0503 - val_precision_14: 0.7514 - val_recall_14: 0.4352
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.8673 - loss: 0.0507 - precision_14: 0.7443 - recall_14: 0.4350 - val_accuracy: 0.8751 - val_loss: 0.0505 - val_precision_14: 0.7440 - val_recall_14: 0.4431
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8710 - loss: 0.0501 - precision_14: 0.7455 - recall_14: 0.4341 - val_accuracy: 0.8751 - val_loss: 0.0500 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 17ms/step - accuracy: 0.5191 - loss: 0.2056 - precision_15: 0.1618 - recall_15: 0.4331 - val_accuracy: 0.8664 - val_loss: 0.0500 - val_precision_15: 0.7545 - val_recall_15: 0.4354
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 17ms/step - accuracy: 0.8684 - loss: 0.0516 - precision_15: 0.7402 - recall_15: 0.4285 - val_accuracy: 0.8664 - val_loss: 0.0496 - val_precision_15: 0.7539 - val_recall_15: 0.4363
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 18ms/step - accuracy: 0.8706 - loss: 0.0508 - precision_15: 0.7422 - recall_15: 0.4328 - val_accuracy: 0.8664 - val_loss: 0.0492 - val_precision_15: 0.7539 - val_recall_15: 0.4399
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 17ms/step - accuracy: 0.8697 - loss: 0.0504 - precision_15: 0.7456 - recall_15: 0.4327 - val_accuracy: 0.8664 - val_loss: 0.0492 - val_precision_15: 0.

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.5172 - loss: 0.2051 - precision_16: 0.1613 - recall_16: 0.4349 - val_accuracy: 0.8685 - val_loss: 0.0504 - val_precision_16: 0.7645 - val_recall_16: 0.4290
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8695 - loss: 0.0517 - precision_16: 0.7400 - recall_16: 0.4271 - val_accuracy: 0.8685 - val_loss: 0.0498 - val_precision_16: 0.7583 - val_recall_16: 0.4406
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8711 - loss: 0.0508 - precision_16: 0.7440 - recall_16: 0.4306 - val_accuracy: 0.8685 - val_loss: 0.0495 - val_precision_16: 0.7624 - val_recall_16: 0.4392
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8709 - loss: 0.0500 - precision_16: 0.7474 - recall_16: 0.4333 - val_accuracy: 0.8685 - val_loss: 0.0488 - val_precision_16: 0.

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 19ms/step - accuracy: 0.5469 - loss: 0.2053 - precision_17: 0.1633 - recall_17: 0.4341 - val_accuracy: 0.8721 - val_loss: 0.0496 - val_precision_17: 0.7549 - val_recall_17: 0.4379
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.8662 - loss: 0.0516 - precision_17: 0.7408 - recall_17: 0.4295 - val_accuracy: 0.8721 - val_loss: 0.0497 - val_precision_17: 0.7526 - val_recall_17: 0.4397
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 19ms/step - accuracy: 0.8700 - loss: 0.0509 - precision_17: 0.7417 - recall_17: 0.4313 - val_accuracy: 0.8721 - val_loss: 0.0496 - val_precision_17: 0.7359 - val_recall_17: 0.4358
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8689 - loss: 0.0502 - precision_17: 0.7443 - recall_17: 0.4330 - val_accuracy: 0.8721 - val_loss: 0.0486 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 18ms/step - accuracy: 0.5728 - loss: 0.2053 - precision_18: 0.1617 - recall_18: 0.4336 - val_accuracy: 0.8662 - val_loss: 0.0498 - val_precision_18: 0.7547 - val_recall_18: 0.4350
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8631 - loss: 0.0517 - precision_18: 0.7427 - recall_18: 0.4283 - val_accuracy: 0.8662 - val_loss: 0.0495 - val_precision_18: 0.7411 - val_recall_18: 0.4418
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8692 - loss: 0.0510 - precision_18: 0.7409 - recall_18: 0.4312 - val_accuracy: 0.8662 - val_loss: 0.0488 - val_precision_18: 0.7522 - val_recall_18: 0.4446
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8702 - loss: 0.0502 - precision_18: 0.7470 - recall_18: 0.4347 - val_accuracy: 0.8662 - val_loss: 0.0487 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.5451 - loss: 0.2054 - precision_19: 0.1624 - recall_19: 0.4369 - val_accuracy: 0.8731 - val_loss: 0.0510 - val_precision_19: 0.7548 - val_recall_19: 0.4220
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8661 - loss: 0.0516 - precision_19: 0.7411 - recall_19: 0.4272 - val_accuracy: 0.8731 - val_loss: 0.0509 - val_precision_19: 0.7424 - val_recall_19: 0.4300
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8717 - loss: 0.0504 - precision_19: 0.7417 - recall_19: 0.4342 - val_accuracy: 0.8731 - val_loss: 0.0505 - val_precision_19: 0.7511 - val_recall_19: 0.4321
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 19ms/step - accuracy: 0.8710 - loss: 0.0499 - precision_19: 0.7462 - recall_19: 0.4358 - val_accuracy: 0.8731 - val_loss: 0.0497 - val_precision_19: 0.

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 19ms/step - accuracy: 0.5376 - loss: 0.2058 - precision_20: 0.1607 - recall_20: 0.4322 - val_accuracy: 0.8735 - val_loss: 0.0505 - val_precision_20: 0.7531 - val_recall_20: 0.4315
Epoch 2/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8682 - loss: 0.0514 - precision_20: 0.7386 - recall_20: 0.4299 - val_accuracy: 0.8735 - val_loss: 0.0509 - val_precision_20: 0.7548 - val_recall_20: 0.4245
Epoch 3/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 18ms/step - accuracy: 0.8714 - loss: 0.0507 - precision_20: 0.7442 - recall_20: 0.4331 - val_accuracy: 0.8735 - val_loss: 0.0504 - val_precision_20: 0.7326 - val_recall_20: 0.4380
Epoch 4/10
[1m2244/2244[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 18ms/step - accuracy: 0.8703 - loss: 0.0500 - precision_20: 0.7460 - recall_20: 0.4341 - val_accuracy: 0.8735 - val_loss: 0.0501 - val_precision_20: 0.

Training of the model for GO-terms related to Molecular Function Aspect

In [26]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score


# 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

fold = 1
accuracy_scores_mf = []
precision_scores_mf=[]
recall_scores_mf = []

for train_index, val_index in kf.split(X_mf):
    print(f"Training on Fold {fold}...")

    # Split dataset into training and validation sets
    X_train, X_val = X_mf[train_index], X_mf[val_index]
    y_train, y_val = y_mf[train_index], y_mf[val_index]

    # Build and compile the model
    model_mf = build_model(input_dim=303, num_classes=839)
    model_mf.compile(optimizer=Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])

    # Train model
    model_mf.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_val, y_val))

    # Evaluate model
    val_predictions = model_mf.predict(X_val)
    val_predictions = (val_predictions > 0.5).astype(int)  # Convert probabilities to binary
    acc = accuracy_score(y_val.flatten(), val_predictions.flatten())
    prec = precision_score(y_val.flatten(), val_predictions.flatten(), zero_division=0)  # Handle zero division
    rec = recall_score(y_val.flatten(), val_predictions.flatten(), zero_division=0)
    accuracy_scores_mf.append(acc)
    precision_scores_mf.append(prec)
    recall_scores_mf.append(rec)

    print(f"Fold {fold} Accuracy: {acc:.4f}\n")
    print(f"Fold {fold} Precision: {prec:.4f}\n")
    print(f"Fold {fold} Recall: {rec:.4f}\n")
    fold += 1

# Print final average accuracy
print(f"Average Accuracy across 10 folds: {np.mean(accuracy_scores_mf):.4f}")
print(f"Average Precision across 10 folds: {np.mean(precision_scores_mf):.4f}")
print(f"Average Recall across 10 folds: {np.mean(recall_scores_mf):.4f}")


Training on Fold 1...
Epoch 1/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 23ms/step - accuracy: 0.6067 - loss: 0.2457 - precision_1: 0.0483 - recall_1: 0.2777 - val_accuracy: 0.8456 - val_loss: 0.0424 - val_precision_1: 0.7067 - val_recall_1: 0.2357
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 22ms/step - accuracy: 0.8470 - loss: 0.0437 - precision_1: 0.7084 - recall_1: 0.2171 - val_accuracy: 0.8456 - val_loss: 0.0424 - val_precision_1: 0.7530 - val_recall_1: 0.2182
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 22ms/step - accuracy: 0.8472 - loss: 0.0428 - precision_1: 0.7144 - recall_1: 0.2201 - val_accuracy: 0.8456 - val_loss: 0.0414 - val_precision_1: 0.7667 - val_recall_1: 0.2156
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 22ms/step - accuracy: 0.8477 - loss: 0.0418 - precision_1: 0.7275 - recall_1: 0.2221 - val_accuracy: 0.8456 - val_loss: 0.0407 - v

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.6088 - loss: 0.2458 - precision_2: 0.0480 - recall_2: 0.2780 - val_accuracy: 0.8510 - val_loss: 0.0423 - val_precision_2: 0.7253 - val_recall_2: 0.2213
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8400 - loss: 0.0435 - precision_2: 0.7133 - recall_2: 0.2170 - val_accuracy: 0.8510 - val_loss: 0.0417 - val_precision_2: 0.7607 - val_recall_2: 0.2187
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.8462 - loss: 0.0427 - precision_2: 0.7205 - recall_2: 0.2209 - val_accuracy: 0.8510 - val_loss: 0.0412 - val_precision_2: 0.7361 - val_recall_2: 0.2317
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8430 - loss: 0.0421 - precision_2: 0.7192 - recall_2: 0.2216 - val_accuracy: 0.8510 - val_loss: 0.0405 - val_precision_2: 0.7209

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 21ms/step - accuracy: 0.6064 - loss: 0.2452 - precision_3: 0.0480 - recall_3: 0.2765 - val_accuracy: 0.8484 - val_loss: 0.0427 - val_precision_3: 0.7591 - val_recall_3: 0.2129
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 21ms/step - accuracy: 0.8461 - loss: 0.0435 - precision_3: 0.7099 - recall_3: 0.2170 - val_accuracy: 0.8484 - val_loss: 0.0420 - val_precision_3: 0.7535 - val_recall_3: 0.2168
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 22ms/step - accuracy: 0.8482 - loss: 0.0426 - precision_3: 0.7201 - recall_3: 0.2195 - val_accuracy: 0.8484 - val_loss: 0.0417 - val_precision_3: 0.6999 - val_recall_3: 0.2304
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 22ms/step - accuracy: 0.8454 - loss: 0.0418 - precision_3: 0.7199 - recall_3: 0.2240 - val_accuracy: 0.8484 - val_loss: 0.0411 - val_precision_3: 0.7468 - val_reca

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 20ms/step - accuracy: 0.6145 - loss: 0.2439 - precision_4: 0.0485 - recall_4: 0.2749 - val_accuracy: 0.8385 - val_loss: 0.0423 - val_precision_4: 0.7341 - val_recall_4: 0.2113
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8494 - loss: 0.0434 - precision_4: 0.7108 - recall_4: 0.2169 - val_accuracy: 0.8385 - val_loss: 0.0418 - val_precision_4: 0.7513 - val_recall_4: 0.2266
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8494 - loss: 0.0424 - precision_4: 0.7200 - recall_4: 0.2197 - val_accuracy: 0.8385 - val_loss: 0.0415 - val_precision_4: 0.7364 - val_recall_4: 0.2325
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8495 - loss: 0.0420 - precision_4: 0.7221 - recall_4: 0.2218 - val_accuracy: 0.8385 - val_loss: 0.0413 - val_precision_4: 0.7460 - val_reca

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.6108 - loss: 0.2451 - precision_5: 0.0485 - recall_5: 0.2793 - val_accuracy: 0.8502 - val_loss: 0.0422 - val_precision_5: 0.7667 - val_recall_5: 0.2109
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8466 - loss: 0.0436 - precision_5: 0.7073 - recall_5: 0.2161 - val_accuracy: 0.8502 - val_loss: 0.0419 - val_precision_5: 0.7527 - val_recall_5: 0.2185
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8460 - loss: 0.0428 - precision_5: 0.7191 - recall_5: 0.2185 - val_accuracy: 0.8502 - val_loss: 0.0417 - val_precision_5: 0.7368 - val_recall_5: 0.2190
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8475 - loss: 0.0418 - precision_5: 0.7228 - recall_5: 0.2237 - val_accuracy: 0.8502 - val_loss: 0.0410 - val_precision_5: 0.7504 - val_reca

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 20ms/step - accuracy: 0.6106 - loss: 0.2450 - precision_6: 0.0479 - recall_6: 0.2764 - val_accuracy: 0.8423 - val_loss: 0.0426 - val_precision_6: 0.7469 - val_recall_6: 0.2256
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.8461 - loss: 0.0431 - precision_6: 0.7100 - recall_6: 0.2195 - val_accuracy: 0.8423 - val_loss: 0.0419 - val_precision_6: 0.7504 - val_recall_6: 0.2278
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8476 - loss: 0.0424 - precision_6: 0.7215 - recall_6: 0.2208 - val_accuracy: 0.8423 - val_loss: 0.0416 - val_precision_6: 0.7442 - val_recall_6: 0.2217
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8478 - loss: 0.0418 - precision_6: 0.7215 - recall_6: 0.2254 - val_accuracy: 0.8423 - val_loss: 0.0414 - val_precision_6: 0.7306 - val_reca

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 19ms/step - accuracy: 0.6082 - loss: 0.2450 - precision_7: 0.0482 - recall_7: 0.2766 - val_accuracy: 0.8517 - val_loss: 0.0421 - val_precision_7: 0.7512 - val_recall_7: 0.2178
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 19ms/step - accuracy: 0.8470 - loss: 0.0434 - precision_7: 0.7126 - recall_7: 0.2171 - val_accuracy: 0.8517 - val_loss: 0.0416 - val_precision_7: 0.7391 - val_recall_7: 0.2200
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 19ms/step - accuracy: 0.8476 - loss: 0.0426 - precision_7: 0.7194 - recall_7: 0.2213 - val_accuracy: 0.8517 - val_loss: 0.0414 - val_precision_7: 0.7625 - val_recall_7: 0.2155
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 20ms/step - accuracy: 0.8465 - loss: 0.0420 - precision_7: 0.7263 - recall_7: 0.2237 - val_accuracy: 0.8517 - val_loss: 0.0405 - val_precision_7: 0.7414

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.6095 - loss: 0.2445 - precision_8: 0.0488 - recall_8: 0.2775 - val_accuracy: 0.8537 - val_loss: 0.0422 - val_precision_8: 0.7489 - val_recall_8: 0.2207
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 19ms/step - accuracy: 0.8478 - loss: 0.0433 - precision_8: 0.7119 - recall_8: 0.2167 - val_accuracy: 0.8537 - val_loss: 0.0414 - val_precision_8: 0.7612 - val_recall_8: 0.2114
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.8475 - loss: 0.0427 - precision_8: 0.7187 - recall_8: 0.2216 - val_accuracy: 0.8537 - val_loss: 0.0404 - val_precision_8: 0.7490 - val_recall_8: 0.2270
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8430 - loss: 0.0420 - precision_8: 0.7232 - recall_8: 0.2229 - val_accuracy: 0.8537 - val_loss: 0.0403 - val_precision_8: 0.7451 - val_reca

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 20ms/step - accuracy: 0.6094 - loss: 0.2445 - precision_9: 0.0482 - recall_9: 0.2761 - val_accuracy: 0.8425 - val_loss: 0.0427 - val_precision_9: 0.7687 - val_recall_9: 0.2137
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 21ms/step - accuracy: 0.8474 - loss: 0.0434 - precision_9: 0.7133 - recall_9: 0.2168 - val_accuracy: 0.8425 - val_loss: 0.0419 - val_precision_9: 0.7382 - val_recall_9: 0.2294
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 20ms/step - accuracy: 0.8444 - loss: 0.0426 - precision_9: 0.7177 - recall_9: 0.2216 - val_accuracy: 0.8425 - val_loss: 0.0417 - val_precision_9: 0.7586 - val_recall_9: 0.2221
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 21ms/step - accuracy: 0.8476 - loss: 0.0419 - precision_9: 0.7194 - recall_9: 0.2228 - val_accuracy: 0.8425 - val_loss: 0.0412 - val_precision_9: 0.7563 - val_reca

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 19ms/step - accuracy: 0.6067 - loss: 0.2463 - precision_10: 0.0479 - recall_10: 0.2765 - val_accuracy: 0.8445 - val_loss: 0.0427 - val_precision_10: 0.7741 - val_recall_10: 0.2094
Epoch 2/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 19ms/step - accuracy: 0.8471 - loss: 0.0435 - precision_10: 0.7082 - recall_10: 0.2156 - val_accuracy: 0.8445 - val_loss: 0.0421 - val_precision_10: 0.7375 - val_recall_10: 0.2265
Epoch 3/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 19ms/step - accuracy: 0.8489 - loss: 0.0425 - precision_10: 0.7210 - recall_10: 0.2208 - val_accuracy: 0.8445 - val_loss: 0.0418 - val_precision_10: 0.7498 - val_recall_10: 0.2190
Epoch 4/10
[1m1535/1535[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 19ms/step - accuracy: 0.8509 - loss: 0.0417 - precision_10: 0.7210 - recall_10: 0.2237 - val_accuracy: 0.8445 - val_loss: 0.0411 - val_preci

Training of the model for GO-terms related to Biological Process Aspect

In [28]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.metrics import Precision, Recall
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, precision_score, recall_score

# 10-Fold Cross-Validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

fold = 1
accuracy_scores_bp = []
precision_scores_bp=[]
recall_scores_bp = []

for train_index, val_index in kf.split(X_bp):
    print(f"Training on Fold {fold}...")

    # Split dataset into training and validation sets
    X_train, X_val = X_bp[train_index], X_bp[val_index]
    y_train, y_val = y_bp[train_index], y_bp[val_index]

    # Build and compile the model
    model_bp = build_model(input_dim=303, num_classes=1487)
    model_bp.compile(optimizer=Adam(learning_rate=0.0005, beta_1=0.9, beta_2=0.999, epsilon=1e-07),
                  loss='binary_crossentropy',
                  metrics=['accuracy', Precision(), Recall()])

    # Train model
    model_bp.fit(X_train, y_train, epochs=10, batch_size=32, verbose=1, validation_data=(X_val, y_val))

    # Evaluate model
    val_predictions = model_bp.predict(X_val)
    val_predictions = (val_predictions > 0.5).astype(int)  # Convert probabilities to binary
    acc = accuracy_score(y_val.flatten(), val_predictions.flatten())
    prec = precision_score(y_val.flatten(), val_predictions.flatten(), zero_division=0)  # Handle zero division
    rec = recall_score(y_val.flatten(), val_predictions.flatten(), zero_division=0)
    accuracy_scores_bp.append(acc)
    precision_scores_bp.append(prec)
    recall_scores_bp.append(rec)

    print(f"Fold {fold} Accuracy: {acc:.4f}\n")
    print(f"Fold {fold} Precision: {prec:.4f}\n")
    print(f"Fold {fold} Recall: {rec:.4f}\n")
    fold += 1

# Print final average accuracy
print(f"Average Accuracy across 10 folds: {np.mean(accuracy_scores_bp):.4f}")
print(f"Average Precision across 10 folds: {np.mean(precision_scores_bp):.4f}")
print(f"Average Recall across 10 folds: {np.mean(recall_scores_bp):.4f}")


Training on Fold 1...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 21ms/step - accuracy: 0.1031 - loss: 0.2233 - precision_21: 0.0615 - recall_21: 0.1523 - val_accuracy: 0.1317 - val_loss: 0.0839 - val_precision_21: 0.6115 - val_recall_21: 0.0711
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 19ms/step - accuracy: 0.1306 - loss: 0.0834 - precision_21: 0.6151 - recall_21: 0.0767 - val_accuracy: 0.1317 - val_loss: 0.0830 - val_precision_21: 0.6723 - val_recall_21: 0.0699
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 20ms/step - accuracy: 0.1308 - loss: 0.0825 - precision_21: 0.6346 - recall_21: 0.0763 - val_accuracy: 0.1317 - val_loss: 0.0824 - val_precision_21: 0.7357 - val_recall_21: 0.0721
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 20ms/step - accuracy: 0.1304 - loss: 0.0819 - precision_21: 0.6583 - recall_21: 0.0768 - val_accuracy: 0.1317 - val_loss: 0.0821 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1020 - loss: 0.2242 - precision_22: 0.0604 - recall_22: 0.1513 - val_accuracy: 0.1280 - val_loss: 0.0829 - val_precision_22: 0.7170 - val_recall_22: 0.0749
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 19ms/step - accuracy: 0.1307 - loss: 0.0837 - precision_22: 0.6179 - recall_22: 0.0776 - val_accuracy: 0.1280 - val_loss: 0.0821 - val_precision_22: 0.6936 - val_recall_22: 0.0804
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - accuracy: 0.1303 - loss: 0.0826 - precision_22: 0.6455 - recall_22: 0.0765 - val_accuracy: 0.1280 - val_loss: 0.0822 - val_precision_22: 0.6941 - val_recall_22: 0.0750
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 22ms/step - accuracy: 0.1306 - loss: 0.0821 - precision_22: 0.6609 - recall_22: 0.0762 - val_accuracy: 0.1280 - val_loss: 0.0815 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 20ms/step - accuracy: 0.1027 - loss: 0.2223 - precision_23: 0.0612 - recall_23: 0.1498 - val_accuracy: 0.1344 - val_loss: 0.0832 - val_precision_23: 0.7011 - val_recall_23: 0.0696
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 21ms/step - accuracy: 0.1283 - loss: 0.0843 - precision_23: 0.6194 - recall_23: 0.0759 - val_accuracy: 0.1344 - val_loss: 0.0826 - val_precision_23: 0.6831 - val_recall_23: 0.0810
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1282 - loss: 0.0826 - precision_23: 0.6419 - recall_23: 0.0759 - val_accuracy: 0.1344 - val_loss: 0.0821 - val_precision_23: 0.7515 - val_recall_23: 0.0642
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 22ms/step - accuracy: 0.1265 - loss: 0.0821 - precision_23: 0.6555 - recall_23: 0.0774 - val_accuracy: 0.1344 - val_loss: 0.0816 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 21ms/step - accuracy: 0.1043 - loss: 0.2226 - precision_24: 0.0612 - recall_24: 0.1511 - val_accuracy: 0.1246 - val_loss: 0.0822 - val_precision_24: 0.6737 - val_recall_24: 0.0838
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - accuracy: 0.1298 - loss: 0.0838 - precision_24: 0.6165 - recall_24: 0.0753 - val_accuracy: 0.1246 - val_loss: 0.0820 - val_precision_24: 0.6800 - val_recall_24: 0.0803
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - accuracy: 0.1315 - loss: 0.0828 - precision_24: 0.6373 - recall_24: 0.0769 - val_accuracy: 0.1246 - val_loss: 0.0816 - val_precision_24: 0.6044 - val_recall_24: 0.0780
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - accuracy: 0.1304 - loss: 0.0815 - precision_24: 0.6564 - recall_24: 0.0774 - val_accuracy: 0.1246 - val_loss: 0.0808 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1020 - loss: 0.2239 - precision_25: 0.0612 - recall_25: 0.1517 - val_accuracy: 0.1290 - val_loss: 0.0825 - val_precision_25: 0.6482 - val_recall_25: 0.0797
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1324 - loss: 0.0840 - precision_25: 0.6178 - recall_25: 0.0761 - val_accuracy: 0.1290 - val_loss: 0.0818 - val_precision_25: 0.7126 - val_recall_25: 0.0733
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1292 - loss: 0.0827 - precision_25: 0.6420 - recall_25: 0.0773 - val_accuracy: 0.1290 - val_loss: 0.0811 - val_precision_25: 0.7566 - val_recall_25: 0.0653
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1299 - loss: 0.0820 - precision_25: 0.6609 - recall_25: 0.0772 - val_accuracy: 0.1290 - val_loss: 0.0808 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 20ms/step - accuracy: 0.1024 - loss: 0.2229 - precision_26: 0.0620 - recall_26: 0.1520 - val_accuracy: 0.1296 - val_loss: 0.0832 - val_precision_26: 0.5720 - val_recall_26: 0.0751
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1312 - loss: 0.0839 - precision_26: 0.6213 - recall_26: 0.0766 - val_accuracy: 0.1296 - val_loss: 0.0820 - val_precision_26: 0.6681 - val_recall_26: 0.0828
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1307 - loss: 0.0827 - precision_26: 0.6413 - recall_26: 0.0762 - val_accuracy: 0.1296 - val_loss: 0.0814 - val_precision_26: 0.6667 - val_recall_26: 0.0742
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 20ms/step - accuracy: 0.1282 - loss: 0.0823 - precision_26: 0.6568 - recall_26: 0.0771 - val_accuracy: 0.1296 - val_loss: 0.0807 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 20ms/step - accuracy: 0.1021 - loss: 0.2231 - precision_27: 0.0611 - recall_27: 0.1494 - val_accuracy: 0.1297 - val_loss: 0.0843 - val_precision_27: 0.6725 - val_recall_27: 0.0795
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1300 - loss: 0.0837 - precision_27: 0.6182 - recall_27: 0.0766 - val_accuracy: 0.1297 - val_loss: 0.0827 - val_precision_27: 0.7206 - val_recall_27: 0.0747
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1312 - loss: 0.0830 - precision_27: 0.6442 - recall_27: 0.0755 - val_accuracy: 0.1297 - val_loss: 0.0818 - val_precision_27: 0.6764 - val_recall_27: 0.0832
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m45s[0m 20ms/step - accuracy: 0.1319 - loss: 0.0815 - precision_27: 0.6656 - recall_27: 0.0762 - val_accuracy: 0.1297 - val_loss: 0.0814 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 21ms/step - accuracy: 0.1029 - loss: 0.2229 - precision_28: 0.0623 - recall_28: 0.1514 - val_accuracy: 0.1329 - val_loss: 0.0829 - val_precision_28: 0.6679 - val_recall_28: 0.0779
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1295 - loss: 0.0838 - precision_28: 0.6193 - recall_28: 0.0773 - val_accuracy: 0.1329 - val_loss: 0.0808 - val_precision_28: 0.6748 - val_recall_28: 0.0817
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 21ms/step - accuracy: 0.1279 - loss: 0.0829 - precision_28: 0.6463 - recall_28: 0.0761 - val_accuracy: 0.1329 - val_loss: 0.0804 - val_precision_28: 0.6507 - val_recall_28: 0.0782
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 21ms/step - accuracy: 0.1303 - loss: 0.0813 - precision_28: 0.6620 - recall_28: 0.0767 - val_accuracy: 0.1329 - val_loss: 0.0802 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 21ms/step - accuracy: 0.1023 - loss: 0.2236 - precision_29: 0.0615 - recall_29: 0.1508 - val_accuracy: 0.1253 - val_loss: 0.0819 - val_precision_29: 0.7041 - val_recall_29: 0.0766
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 21ms/step - accuracy: 0.1316 - loss: 0.0844 - precision_29: 0.6107 - recall_29: 0.0759 - val_accuracy: 0.1253 - val_loss: 0.0818 - val_precision_29: 0.6804 - val_recall_29: 0.0667
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 21ms/step - accuracy: 0.1313 - loss: 0.0830 - precision_29: 0.6387 - recall_29: 0.0762 - val_accuracy: 0.1253 - val_loss: 0.0821 - val_precision_29: 0.7125 - val_recall_29: 0.0743
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - accuracy: 0.1305 - loss: 0.0819 - precision_29: 0.6604 - recall_29: 0.0764 - val_accuracy: 0.1253 - val_loss: 0.0805 - val_preci

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 21ms/step - accuracy: 0.1022 - loss: 0.2227 - precision_30: 0.0621 - recall_30: 0.1519 - val_accuracy: 0.1358 - val_loss: 0.0817 - val_precision_30: 0.6469 - val_recall_30: 0.0844
Epoch 2/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1308 - loss: 0.0836 - precision_30: 0.6161 - recall_30: 0.0777 - val_accuracy: 0.1358 - val_loss: 0.0815 - val_precision_30: 0.6551 - val_recall_30: 0.0713
Epoch 3/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1281 - loss: 0.0829 - precision_30: 0.6365 - recall_30: 0.0766 - val_accuracy: 0.1358 - val_loss: 0.0808 - val_precision_30: 0.7200 - val_recall_30: 0.0712
Epoch 4/10
[1m2247/2247[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m46s[0m 20ms/step - accuracy: 0.1293 - loss: 0.0825 - precision_30: 0.6563 - recall_30: 0.0768 - val_accuracy: 0.1358 - val_loss: 0.0805 - val_preci

## Part 3 - Obtaining the predictions for Test Set

In [29]:
test_path = '/Users/aleksandramaslova/Downloads/biological_data_pfp/test/'
test_data = pd.read_csv(test_path + 'test_ids.txt', header=None, names=["Protein_ID"])

test_data

Unnamed: 0,Protein_ID
0,O43747
1,Q969H0
2,Q9JMA2
3,P18065
4,A0A8I6AN32
...,...
995,P9WPA7
996,P13504
997,P70062
998,Q80TN5


In [30]:
test_data

Unnamed: 0,Protein_ID
0,O43747
1,Q969H0
2,Q9JMA2
3,P18065
4,A0A8I6AN32
...,...
995,P9WPA7
996,P13504
997,P70062
998,Q80TN5


In [31]:
def encode_domains_test(dataframe):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(dataframe['InterPro_Domains'])

    max_domains = 303

    dataframe["Domain_Tokenized"] = dataframe['InterPro_Domains'].apply(lambda x: tokenizer.texts_to_sequences([x])[0])

    # Pad the sequences to ensure they all have the same length
    dataframe["Padded_Domains"] = dataframe["Domain_Tokenized"].apply(lambda x: pad_sequences([x], maxlen=max_domains, padding='post')[0])
    return dataframe

In [32]:
# Upload domain information for test proteins
interpro_test_path = test_path + 'test_protein2ipr.dat'
domain_test_dict = create_interpro_feature(interpro_test_path)

# Add domain info to the test DataFrame and process it 
test_data = test_data.merge(domain_test_dict, on='Protein_ID')
test_data["InterPro_Domains"] = test_data["InterPro_Domains"].apply(lambda x: [domain.strip() for domain in x.split(";")])

# Encode domain info for model
test_data = encode_domains_test(test_data)
test_data


Unnamed: 0,Protein_ID,InterPro_Domains,Domain_Tokenized,Padded_Domains
0,O43747,"[IPR002553, IPR008152, IPR008152, IPR008153, I...","[1399, 345, 345, 642, 150, 643, 133, 1400]","[1399, 345, 345, 642, 150, 643, 133, 1400, 0, ..."
1,Q969H0,"[IPR001680, IPR001680, IPR001680, IPR001680, I...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ...","[7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, ..."
2,Q9JMA2,"[IPR002616, IPR002616, IPR004803, IPR004803, I...","[844, 844, 845, 845, 846, 846]","[844, 844, 845, 845, 846, 846, 0, 0, 0, 0, 0, ..."
3,P18065,"[IPR000716, IPR000716, IPR000716, IPR000716, I...","[192, 192, 192, 192, 192, 224, 224, 224, 225, ...","[192, 192, 192, 192, 192, 224, 224, 224, 225, ..."
4,A0A8I6AN32,"[IPR001876, IPR001876, IPR001876, IPR001876, I...","[268, 268, 268, 268, 847]","[268, 268, 268, 268, 847, 0, 0, 0, 0, 0, 0, 0,..."
...,...,...,...,...
976,P9WPA7,"[IPR004566, IPR004566, IPR004566, IPR004566, I...","[641, 641, 641, 641, 2545, 3, 3]","[641, 641, 641, 641, 2545, 3, 3, 0, 0, 0, 0, 0..."
977,P13504,"[IPR000157, IPR000157, IPR000157, IPR003599, I...","[472, 472, 472, 28, 28, 28, 148, 148, 148, 148...","[472, 472, 472, 28, 28, 28, 148, 148, 148, 148..."
978,P70062,"[IPR009071, IPR009071, IPR009071, IPR013558, I...","[87, 87, 87, 2546, 2547, 1360, 142, 142]","[87, 87, 87, 2546, 2547, 1360, 142, 142, 0, 0,..."
979,Q80TN5,"[IPR001594, IPR002110, IPR002110, IPR002110, I...","[2548, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ...","[2548, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, ..."


In [33]:
# Check if the max number of domains for test data does not exceed the padding lenght - 303 domains

max_testdomains = test_data['InterPro_Domains'].apply(len).max()
print(f'The maximum number of domains in one protein within test dataset:{max_testdomains}')



The maximum number of domains in one protein within test dataset:86


In [None]:
test_data

In [34]:
# Prepare test datasets
X_test_bp = np.vstack(np.array(test_data['Padded_Domains'])).astype(np.float32)
X_test_mf = np.vstack(np.array(test_data['Padded_Domains'])).astype(np.float32)
X_test_cc = np.vstack(np.array(test_data['Padded_Domains'])).astype(np.float32)

In [None]:
X_test_bp.shape

In [35]:
# Generate predictions for Biological Process aspect
predictions_bp = model_bp.predict(X_test_bp)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


In [36]:
# Generate predictions for Biological Process aspect
predictions_mf = model_mf.predict(X_test_mf)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step


In [37]:
# Generate predictions for Cellular Component aspect
predictions_cc = model_cc.predict(X_test_cc)

[1m31/31[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


In [38]:
# Extract top 10 (with highest probability) predictions for Biological Process aspect
df_predictions_bp = pd.DataFrame(predictions_bp, columns=[go_classes_bp])
df_predictions_bp.insert(0, 'Protein_ID', test_data.Protein_ID)

top_predictions_bp = []
df_predictions_bp.iloc[:, 1:] = df_predictions_bp.iloc[:, 1:].astype(float)
for i, row in df_predictions_bp.iterrows():
    protein_id = row["Protein_ID"]
    go_terms = row.iloc[1:].astype(float)  # Ensure numeric dtype
    
    
    # Get the top 10 predictions (GO terms with highest probability)
    top_10_indices = go_terms.nlargest(10).index  # Get top 10 column names
    top_10_probs = go_terms.nlargest(10).values   # Get corresponding probabilities

    # Store results
    for term, prob in zip(top_10_indices, top_10_probs):
        top_predictions_bp.append({"Protein_ID": protein_id, "GO_Term": term, "Probability": prob})

df_top_predictions_bp = pd.DataFrame(top_predictions_bp)
df_top_predictions_bp["GO_Term"] = df_top_predictions_bp["GO_Term"].astype(str).apply(lambda x: x.strip("()").replace(",", ""))

df_top_predictions_bp

Unnamed: 0,Protein_ID,GO_Term,Probability
0,O43747,'GO:0008150',0.999997
1,O43747,'GO:0009987',0.615575
2,O43747,'GO:0065007',0.412235
3,O43747,'GO:0050789',0.394671
4,O43747,'GO:0050896',0.368556
...,...,...,...
9805,Q9V2V6,'GO:0050794',0.389549
9806,Q9V2V6,'GO:0008152',0.331616
9807,Q9V2V6,'GO:0071704',0.302644
9808,Q9V2V6,'GO:0032502',0.272140


In [39]:
# Extract top 10 (with highest probability) predictions for Molecular Function aspect
df_predictions_mf = pd.DataFrame(predictions_mf, columns=[go_classes_mf])
df_predictions_mf.insert(0, 'Protein_ID', test_data.Protein_ID)

top_predictions_mf = []
df_predictions_mf.iloc[:, 1:] = df_predictions_mf.iloc[:, 1:].astype(float)
for i, row in df_predictions_mf.iterrows():
    protein_id = row["Protein_ID"]
    go_terms = row.iloc[1:].astype(float)  # Ensure numeric dtype
    
    
    # Get the top 10 predictions (GO terms with highest probability)
    top_10_indices = go_terms.nlargest(10).index  # Get top 10 column names
    top_10_probs = go_terms.nlargest(10).values   # Get corresponding probabilities

    # Store results
    for term, prob in zip(top_10_indices, top_10_probs):
        top_predictions_mf.append({"Protein_ID": protein_id, "GO_Term": term, "Probability": prob})

df_top_predictions_mf = pd.DataFrame(top_predictions_mf)
df_top_predictions_mf["GO_Term"] = df_top_predictions_mf["GO_Term"].astype(str).apply(lambda x: x.strip("()").replace(",", ""))

df_top_predictions_mf

Unnamed: 0,Protein_ID,GO_Term,Probability
0,O43747,'GO:0003674',0.999996
1,O43747,'GO:0003824',0.654997
2,O43747,'GO:0005488',0.511032
3,O43747,'GO:0005515',0.383997
4,O43747,'GO:0016787',0.241416
...,...,...,...
9805,Q9V2V6,'GO:0016491',0.156200
9806,Q9V2V6,'GO:0016787',0.138870
9807,Q9V2V6,'GO:0016740',0.137738
9808,Q9V2V6,'GO:0140096',0.131219


In [40]:
# Extract top 10 (with highest probability) predictions for Cellular Component aspect
df_predictions_cc = pd.DataFrame(predictions_cc, columns=[go_classes_cc])
df_predictions_cc.insert(0, 'Protein_ID', test_data.Protein_ID)

top_predictions_cc = []
df_predictions_cc.iloc[:, 1:] = df_predictions_cc.iloc[:, 1:].astype(float)
for i, row in df_predictions_cc.iterrows():
    protein_id = row["Protein_ID"]
    go_terms = row.iloc[1:].astype(float)  # Ensure numeric dtype
    
    
    # Get the top 10 predictions (GO terms with highest probability)
    top_10_indices = go_terms.nlargest(10).index  # Get top 10 column names
    top_10_probs = go_terms.nlargest(10).values   # Get corresponding probabilities

    # Store results
    for term, prob in zip(top_10_indices, top_10_probs):
        top_predictions_cc.append({"Protein_ID": protein_id, "GO_Term": term, "Probability": prob})

df_top_predictions_cc = pd.DataFrame(top_predictions_cc)
df_top_predictions_cc["GO_Term"] = df_top_predictions_cc["GO_Term"].astype(str).apply(lambda x: x.strip("()").replace(",", ""))

df_top_predictions_cc

Unnamed: 0,Protein_ID,GO_Term,Probability
0,O43747,'GO:0005575',0.999998
1,O43747,'GO:0110165',0.992166
2,O43747,'GO:0005622',0.750362
3,O43747,'GO:0043226',0.620377
4,O43747,'GO:0043229',0.611082
...,...,...,...
9805,Q9V2V6,'GO:0043229',0.579011
9806,Q9V2V6,'GO:0043227',0.525197
9807,Q9V2V6,'GO:0043231',0.519611
9808,Q9V2V6,'GO:0016020',0.320309


In [41]:
combined_predictions = pd.concat([df_top_predictions_bp, df_top_predictions_mf, df_top_predictions_cc], ignore_index=True)
combined_predictions

Unnamed: 0,Protein_ID,GO_Term,Probability
0,O43747,'GO:0008150',0.999997
1,O43747,'GO:0009987',0.615575
2,O43747,'GO:0065007',0.412235
3,O43747,'GO:0050789',0.394671
4,O43747,'GO:0050896',0.368556
...,...,...,...
29425,Q9V2V6,'GO:0043229',0.579011
29426,Q9V2V6,'GO:0043227',0.525197
29427,Q9V2V6,'GO:0043231',0.519611
29428,Q9V2V6,'GO:0016020',0.320309


In [43]:
sorted_df = combined_predictions.sort_values(by="Protein_ID").reset_index(drop=True)

In [44]:
sorted_df

Unnamed: 0,Protein_ID,GO_Term,Probability
0,A0A0B4JCV4,'GO:0022857',0.126721
1,A0A0B4JCV4,'GO:0110165',0.992916
2,A0A0B4JCV4,'GO:0008150',0.999997
3,A0A0B4JCV4,'GO:0009987',0.639608
4,A0A0B4JCV4,'GO:0065007',0.396411
...,...,...,...
29425,W7K139,'GO:0009987',0.642095
29426,W7K139,'GO:0065007',0.420684
29427,W7K139,'GO:0050789',0.392575
29428,W7K139,'GO:0050794',0.342651


In [45]:
sorted_df.to_csv("sorted_protein_predictions.csv", index=False)