In [1]:
# # Uncomment this if using Google Drive
# from google.colab import drive

# drive.mount('/content/drive/')

Mounted at /content/drive/


In [11]:
# PATH = '/content/drive/MyDrive/Padua/Classes/BioData/Biological Data Project/biological_data_pfp/'
PATH = './biological_data_pfp/'

# Embedding and ID arrays read from the test/ folder.
npy_file_path_embeddings = f"{PATH}test/test_embeddings.npy"
npy_file_path_ids = f"{PATH}test/test_ids.npy"
domain_embeddings_path = f"{PATH}test/test_domain_embeddings_pca.npy"
npy_file_path_ids_pca = f"{PATH}test/test_domain_embeddings_pca_ids.npy"

# Saved model locations, one per aspect (cc / mf / bp).
cc_model_path = f"{PATH}models/MLP_cellular_component_final"
mf_model_path = f"{PATH}models/MLP_molecular_function_final"
bp_model_path = f"{PATH}models/MLP_biological_process_final"

# CSV files holding the label order for each aspect.
cc_labels_order_path = f"{PATH}sorted_labels_per_aspect/labels_cc.csv"
mf_labels_order_path = f"{PATH}sorted_labels_per_aspect/labels_mf.csv"
bp_labels_order_path = f"{PATH}sorted_labels_per_aspect/labels_bp.csv"

# Destination of the final tab-separated submission.
submission_file_path = f"{PATH}submission.tsv"

# Library imports

In [2]:
import tensorflow as tf
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Required for progressbar widget
import progressbar

# Load the T5 embeddings

In [3]:
# Read the embedding array first, then the ID array, from the configured .npy files.
X_loaded, ids_loaded = (
    np.load(npy_path)
    for npy_path in (npy_file_path_embeddings, npy_file_path_ids)
)

In [4]:
# Wrap the loaded embedding matrix in a DataFrame: one "Column_<k>" per
# embedding dimension (k starts at 1), with the loaded IDs as the row index.
column_num = X_loaded.shape[1]
t5_column_names = [f"Column_{position}" for position in range(1, column_num + 1)]
t5_df = pd.DataFrame(X_loaded, index=pd.Index(ids_loaded), columns=t5_column_names)

print(t5_df.shape)

(1000, 1024)


In [5]:
# Drop the raw array name now that t5_df has been built from it.
# NOTE(review): t5_df may still reference the same buffer, so this may not
# actually release memory -- confirm if memory is the motivation.
del X_loaded

# Load domain embeddings

In [6]:
# Read the domain embedding array from the configured .npy file.
domain_embeddings = np.load(domain_embeddings_path)

In [12]:
# Read the IDs that label the rows of the domain embedding array.
# NOTE(review): allow_pickle=True unpickles objects stored in the file, which
# can execute arbitrary code -- only load this file from a trusted source.
domain_embeddings_ids = np.load(npy_file_path_ids_pca, allow_pickle=True)

In [13]:
# Wrap the domain embedding matrix in a DataFrame indexed by its IDs.
# Column numbers are shifted by 1024, presumably so the names continue after
# the T5 frame's Column_1..Column_1024 and do not clash when the frames are merged.
column_num = domain_embeddings.shape[1]
domain_column_names = [f"Column_{position + 1024}" for position in range(1, column_num + 1)]
domains_df = pd.DataFrame(
    domain_embeddings,
    index=pd.Index(domain_embeddings_ids),
    columns=domain_column_names,
)

print(domains_df.shape)

(981, 1024)


In [14]:
# Drop the raw array name now that domains_df has been built from it.
# NOTE(review): domains_df may still reference the same buffer, so this may
# not actually release memory -- confirm if memory is the motivation.
del domain_embeddings

# Combine the T5 and domain embeddings

In [15]:
# Join the two embedding frames on their row index, keeping only IDs present in both.
# NOTE(review): the printed shapes (1000 T5 rows vs 981 domain rows) mean IDs
# without a domain embedding are dropped here and get no predictions -- confirm
# this is intended.
complete_df = t5_df.merge(
    domains_df,
    how='inner',
    left_index=True,
    right_index=True,
)

In [16]:
# Drop the two source frames; only the merged complete_df is used from here on.
del t5_df, domains_df

# Get the order of the Protein ids

In [17]:
# Row order of complete_df as a plain list; reused below to label the
# flattened prediction arrays, so it must match the order passed to predict().
protein_ids_list = complete_df.index.tolist()

# Load the three models

In [18]:
# Load the three saved Keras models in cc, mf, bp order.
model_cc, model_mf, model_bp = (
    tf.keras.models.load_model(saved_model_path)
    for saved_model_path in (cc_model_path, mf_model_path, bp_model_path)
)

# Make predictions with each model

In [19]:
# Run the cellular-component model on the merged embeddings.
predictions_cc = model_cc.predict(complete_df)



In [20]:
# Display the shape of the cellular-component prediction array.
predictions_cc.shape

(981, 678)

In [21]:
# Run the molecular-function model on the merged embeddings.
predictions_mf = model_mf.predict(complete_df)



In [22]:
# Display the shape of the molecular-function prediction array.
predictions_mf.shape

(981, 839)

In [23]:
# Run the biological-process model on the merged embeddings.
predictions_bp = model_bp.predict(complete_df)



In [24]:
# Display the shape of the biological-process prediction array.
predictions_bp.shape

(981, 1487)

# Load the labels order for each aspect

In [25]:
# Read the first column of each header-less label CSV into a list (cc, mf, bp order).
labels_cc, labels_mf, labels_bp = (
    pd.read_csv(labels_csv_path, header=None).iloc[:, 0].tolist()
    for labels_csv_path in (cc_labels_order_path, mf_labels_order_path, bp_labels_order_path)
)

# Generate three dataframes with the predictions

In [26]:
# Build the long-format cellular-component table: one row per
# (protein, GO term) pair, in the same row-major order that
# predictions_cc.ravel() produces.
n_proteins_cc, n_terms_cc = predictions_cc.shape

# Fail early with a readable message if the ids or labels do not line up with
# the prediction matrix, rather than a generic pandas length-mismatch error.
if len(protein_ids_list) != n_proteins_cc or len(labels_cc) != n_terms_cc:
    raise ValueError(
        f"predictions_cc has shape {predictions_cc.shape}, but there are "
        f"{len(protein_ids_list)} protein ids and {len(labels_cc)} labels"
    )

df_submission_cc = pd.DataFrame({
    # Each protein id repeated once per GO term (a, a, ..., b, b, ...).
    'Protein Id': np.repeat(protein_ids_list, n_terms_cc),
    # The full label list repeated once per protein (x, y, ..., x, y, ...).
    'GO Term Id': np.tile(labels_cc, n_proteins_cc),
    'Prediction': predictions_cc.ravel(),
})

In [27]:
# Build the long-format molecular-function table: one row per
# (protein, GO term) pair, in the same row-major order that
# predictions_mf.ravel() produces.
n_proteins_mf, n_terms_mf = predictions_mf.shape

# Fail early with a readable message if the ids or labels do not line up with
# the prediction matrix, rather than a generic pandas length-mismatch error.
if len(protein_ids_list) != n_proteins_mf or len(labels_mf) != n_terms_mf:
    raise ValueError(
        f"predictions_mf has shape {predictions_mf.shape}, but there are "
        f"{len(protein_ids_list)} protein ids and {len(labels_mf)} labels"
    )

df_submission_mf = pd.DataFrame({
    # Each protein id repeated once per GO term (a, a, ..., b, b, ...).
    'Protein Id': np.repeat(protein_ids_list, n_terms_mf),
    # The full label list repeated once per protein (x, y, ..., x, y, ...).
    'GO Term Id': np.tile(labels_mf, n_proteins_mf),
    'Prediction': predictions_mf.ravel(),
})

In [28]:
# Build the long-format biological-process table: one row per
# (protein, GO term) pair, in the same row-major order that
# predictions_bp.ravel() produces.
n_proteins_bp, n_terms_bp = predictions_bp.shape

# Fail early with a readable message if the ids or labels do not line up with
# the prediction matrix, rather than a generic pandas length-mismatch error.
if len(protein_ids_list) != n_proteins_bp or len(labels_bp) != n_terms_bp:
    raise ValueError(
        f"predictions_bp has shape {predictions_bp.shape}, but there are "
        f"{len(protein_ids_list)} protein ids and {len(labels_bp)} labels"
    )

df_submission_bp = pd.DataFrame({
    # Each protein id repeated once per GO term (a, a, ..., b, b, ...).
    'Protein Id': np.repeat(protein_ids_list, n_terms_bp),
    # The full label list repeated once per protein (x, y, ..., x, y, ...).
    'GO Term Id': np.tile(labels_bp, n_proteins_bp),
    'Prediction': predictions_bp.ravel(),
})

# Combine the dataframes to one

In [29]:
# Stack the per-aspect tables (bp, then mf, then cc) under a fresh 0..n-1 index.
per_aspect_frames = [df_submission_bp, df_submission_mf, df_submission_cc]
df_submission = pd.concat(per_aspect_frames, ignore_index=True)

# Filter the dataframe by prediction score so that no Protein ID has more than 1500 predictions

In [30]:
# Minimum score a prediction needs in order to be kept.
PREDICTION_THRESHOLD = 0.2
# Upper bound on the number of predictions kept for a single protein.
MAX_PREDICTIONS_PER_PROTEIN = 1500

# Step 1: keep only the predictions scoring above the threshold.
above_threshold = df_submission[df_submission['Prediction'] > PREDICTION_THRESHOLD]

# Step 2: enforce the per-protein cap. Rank each protein's remaining
# predictions from highest to lowest score (ties broken by row order) and keep
# the top MAX_PREDICTIONS_PER_PROTEIN. Row order and index labels are
# preserved, so the result equals the plain threshold filter whenever no
# protein exceeds the cap.
rank_within_protein = (
    above_threshold
    .groupby('Protein Id')['Prediction']
    .rank(method='first', ascending=False)
)
filtered_df = above_threshold[rank_within_protein <= MAX_PREDICTIONS_PER_PROTEIN]

In [31]:
# Display the filtered predictions (pandas truncates the middle rows of long frames).
filtered_df

Unnamed: 0,Protein Id,GO Term Id,Prediction
0,A0A0B4JCV4,GO:0008150,0.950101
1,A0A0B4JCV4,GO:0009987,0.920557
2,A0A0B4JCV4,GO:0065007,0.849362
3,A0A0B4JCV4,GO:0050789,0.841747
4,A0A0B4JCV4,GO:0050794,0.839409
...,...,...,...
2946258,W7K139,GO:0032991,0.734570
2946266,W7K139,GO:0012505,0.216386
2946270,W7K139,GO:0031982,0.916510
2946278,W7K139,GO:0097708,0.943124


In [32]:
# Count how many filtered predictions each protein has
# (value_counts sorts the result by count, largest first).
id_counts = filtered_df['Protein Id'].value_counts()

# Largest number of predictions held by any single protein; this is the value
# to compare against the 1500-per-protein limit.
max_repeats = id_counts.max()
max_repeats

159

If `max_repeats` is greater than 1500, raise the prediction threshold used in the filtering cell above and re-run from there before writing the submission file.

In [33]:
# Write the filtered predictions as tab-separated text, without a header row
# and without the DataFrame index.
filtered_df.to_csv(
    submission_file_path,
    sep="\t",
    index=False,
    header=False,
)