In [None]:
!pip install biopython
# !pip install -r requirements.txt

import sys
import os
import logging
import yaml
from google.colab import drive
import pandas as pd
import numpy as np
import random

# Mount Google Drive so the project files under /content/drive are reachable.
drive.mount('/content/drive')

# Location of the project on Drive and of the paths derived from it.
# NOTE(review): OUTPUT_FILE is not referenced by any cell visible in this notebook.
PROJECT_DIR = "/content/drive/MyDrive/BiologicalData/progetto/biological_data_pfp/biological_data_pfp"
SRC_DIR = f"{PROJECT_DIR}/src"
CONFIG_FILE = f"{PROJECT_DIR}/notebooks/flavio/config.yaml"
OUTPUT_FILE = f"{PROJECT_DIR}/results/submission.tsv"

# Make the project's src/ directory importable; this has to run before the
# `data_preprocessing` import further down.
sys.path.append(SRC_DIR)

# Load the configuration data from the YAML file
with open(CONFIG_FILE, 'r') as f:
    config_data = yaml.safe_load(f)

# Project-local module, resolved through the SRC_DIR entry added to sys.path above.
import data_preprocessing as dp

# Seed NumPy's global generator and Python's `random` module with a fixed value.
np.random.seed(42)
random.seed(42)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the training data through the project helper, driven by the YAML config.
# NOTE(review): the type and shape of each returned object are defined in
# data_preprocessing.load_train_data — confirm there.
train_set, train_sequences, train_ids, train_embeddings, train_domains = dp.load_train_data(config_data)

In [None]:
# Split the training set into one table per aspect (MF, BP, CC).
# NOTE(review): presumably these are the three GO aspects — confirm in
# data_preprocessing.split_by_aspect.
train_mf, train_bp, train_cc = dp.split_by_aspect(train_set)

Custom test set: a random sample of training proteins held out as a test set.

In [None]:
# Hold out 1000 proteins, drawn without replacement from the training IDs, as a
# custom test set.
n_examples = 1000
# Draw from a dedicated, freshly seeded generator rather than NumPy's global one:
# the sample then depends only on `train_ids`, so re-running this cell (or running
# cells out of order) always selects the same proteins. RandomState(42) gives the
# same draw as `np.random.seed(42)` followed directly by `np.random.choice`,
# provided nothing else consumed the global generator in between.
custom_test_set_ids = np.random.RandomState(42).choice(train_ids, size=n_examples, replace=False)

In [None]:
# Column-less frame indexed by the held-out protein IDs; it is the left side of the
# index-based merges that assemble the test features.
test_set = pd.DataFrame(index=custom_test_set_ids)

In [None]:
# Sanity check: number of sampled test IDs (expected to equal n_examples).
len(custom_test_set_ids)

1000

In [None]:
def get_selected_embeddings_as_df(data, train_ids, train_embeddings):
    """Return the embeddings of the proteins listed in ``data`` as a DataFrame.

    Args:
        data: DataFrame with a ``protein_id`` column; IDs may repeat.
        train_ids: 1-D sequence of protein IDs. The position of an ID in this
            sequence is used as the row position in ``train_embeddings``.
        train_embeddings: Array that can be indexed with a list of row
            positions (e.g. a 2-D NumPy array).

    Returns:
        DataFrame with one row per unique protein in ``data`` (in order of
        first appearance), indexed by protein ID.

    Raises:
        IndexError: If a protein in ``data`` does not occur in ``train_ids``.
    """
    # Get unique protein IDs from the input data
    unique_proteins = data['protein_id'].unique()

    # Build the ID -> position lookup once instead of scanning `train_ids` for
    # every protein. `setdefault` keeps the first occurrence of a repeated ID.
    first_position = {}
    for position, protein in enumerate(np.asarray(train_ids).tolist()):
        first_position.setdefault(protein, position)

    missing = [protein for protein in unique_proteins if protein not in first_position]
    if missing:
        raise IndexError(
            f"{len(missing)} protein ID(s) not found in train_ids, e.g. {missing[:5]}"
        )

    # An explicit integer dtype keeps the indexing valid when there are no proteins.
    indices = np.array([first_position[protein] for protein in unique_proteins], dtype=np.intp)

    # Select the corresponding embeddings and label the rows with the protein IDs
    selected_embeddings = train_embeddings[indices]
    return pd.DataFrame(selected_embeddings, index=unique_proteins)

# One embeddings table per aspect, restricted to the proteins that occur in that
# aspect's training table.
df_embeddings_mf, df_embeddings_bp, df_embeddings_cc = (
    get_selected_embeddings_as_df(aspect_table, train_ids, train_embeddings)
    for aspect_table in (train_mf, train_bp, train_cc)
)

In [None]:
# Union of the per-aspect embeddings with a single row per protein (first
# occurrence kept); used below to look up the embeddings of the test proteins.
df_embeddings = pd.concat(
    [df_embeddings_mf, df_embeddings_cc, df_embeddings_bp], axis=0
).loc[lambda stacked: ~stacked.index.duplicated(keep='first')]

The three test DataFrames built below (`test_mf`, `test_cc`, `test_bp`) are identical for now; the BLAST results are added to each of them later.

In [None]:
# Attach the embedding of every test protein; the left merge on the index keeps
# all test IDs, one row each.
test_mf = pd.merge(test_set, df_embeddings, left_index=True, right_index=True, how='left')
test_cc = pd.merge(test_set, df_embeddings, left_index=True, right_index=True, how='left')
test_bp = pd.merge(test_set, df_embeddings, left_index=True, right_index=True, how='left')

Import blast data

In [None]:
# Read the per-aspect BLAST tables (tab-separated, first column used as the index).
df_blast_mf, df_blast_cc, df_blast_bp = (
    pd.read_csv(path_template.format(PROJECT_DIR), sep='\t', index_col=0)
    for path_template in (
        "{}/train/blast_dfs/df_blast_mf_all.tsv",
        "{}/train/blast_dfs/df_blast_cc_all.tsv",
        "{}/train/blast_dfs/df_blast_bp_all.tsv",
    )
)

In [None]:
# Shapes of the per-aspect embedding tables, then of the BLAST tables as loaded.
print(*(table.shape for table in (df_embeddings_mf, df_embeddings_bp, df_embeddings_cc)), sep="\n")
print("--------------")
print(*(table.shape for table in (df_blast_mf, df_blast_bp, df_blast_cc)), sep="\n")

(55698, 1024)
(83064, 1024)
(84638, 1024)
--------------
(105655, 839)
(111978, 1487)
(111747, 678)


Here we keep only the rows of each BLAST table whose protein is annotated in the corresponding aspect, i.e. appears in that aspect's embeddings DataFrame.

In [None]:
# Keep only the BLAST rows whose protein also has a row in the same aspect's
# embeddings table.
df_blast_mf = df_blast_mf.loc[df_blast_mf.index.isin(df_embeddings_mf.index)]
df_blast_bp = df_blast_bp.loc[df_blast_bp.index.isin(df_embeddings_bp.index)]
df_blast_cc = df_blast_cc.loc[df_blast_cc.index.isin(df_embeddings_cc.index)]

In [None]:
# Shapes again, after restricting the BLAST tables to each aspect's proteins.
print(*(table.shape for table in (df_embeddings_mf, df_embeddings_bp, df_embeddings_cc)), sep="\n")
print("--------------")
print(*(table.shape for table in (df_blast_mf, df_blast_bp, df_blast_cc)), sep="\n")

(55698, 1863)
(83064, 1024)
(84638, 1024)
--------------
(52053, 839)
(77451, 1487)
(77639, 678)


create the BLAST test set

In [None]:
# BLAST rows that belong to the held-out test proteins, per aspect.
df_blast_mf_test = df_blast_mf.loc[df_blast_mf.index.isin(test_mf.index)]
df_blast_bp_test = df_blast_bp.loc[df_blast_bp.index.isin(test_bp.index)]
df_blast_cc_test = df_blast_cc.loc[df_blast_cc.index.isin(test_cc.index)]

In [None]:
# Re-index the BLAST test rows on the full list of test IDs: test proteins without
# a BLAST row are kept and get NaN in every column.
df_blast_mf_test = pd.merge(test_set, df_blast_mf_test, left_index=True, right_index=True, how='left')
df_blast_bp_test = pd.merge(test_set, df_blast_bp_test, left_index=True, right_index=True, how='left')
df_blast_cc_test = pd.merge(test_set, df_blast_cc_test, left_index=True, right_index=True, how='left')

In [None]:
# Missing values in the BLAST test tables become 0.
df_blast_mf_test, df_blast_bp_test, df_blast_cc_test = (
    blast_test.fillna(0)
    for blast_test in (df_blast_mf_test, df_blast_bp_test, df_blast_cc_test)
)

In [None]:
# Remove the held-out test proteins from the per-aspect training embeddings.
df_embeddings_mf = df_embeddings_mf.loc[~df_embeddings_mf.index.isin(test_mf.index)]
df_embeddings_bp = df_embeddings_bp.loc[~df_embeddings_bp.index.isin(test_bp.index)]
df_embeddings_cc = df_embeddings_cc.loc[~df_embeddings_cc.index.isin(test_cc.index)]

In [None]:
# Remove the held-out test proteins from the per-aspect training BLAST tables.
df_blast_mf, df_blast_bp, df_blast_cc = (
    blast_train.loc[~blast_train.index.isin(blast_test.index)]
    for blast_train, blast_test in (
        (df_blast_mf, df_blast_mf_test),
        (df_blast_bp, df_blast_bp_test),
        (df_blast_cc, df_blast_cc_test),
    )
)

In [None]:
# Shapes of the training tables once the test proteins have been removed.
print(*(table.shape for table in (df_embeddings_mf, df_embeddings_bp, df_embeddings_cc)), sep="\n")
print("--------------")
print(*(table.shape for table in (df_blast_mf, df_blast_bp, df_blast_cc)), sep="\n")

(55251, 1863)
(82389, 1024)
(83948, 1024)
--------------
(51645, 839)
(76825, 1487)
(77009, 678)


free some RAM

In [None]:
# Drop the references to the large objects that are no longer needed
# (train_ids is kept). After this cell, earlier cells that use these names can
# only be re-run after reloading the data.
del train_set
del train_sequences
del train_embeddings
del train_domains
del df_embeddings

In [None]:
# Drop the per-aspect training tables as well.
del train_mf
del train_bp
del train_cc

In [None]:

def tfidf_transform(df):
    """Compute IDF weights from ``df`` and return its TF-IDF transform.

    Despite the name, this is the "fit and transform" step: the IDF weights are
    estimated from ``df`` itself and returned, so that ``tfidf_fit_transform``
    can apply them to another table.

    Args:
        df: Numeric DataFrame with one row per protein (document) and one
            column per term.

    Returns:
        Tuple ``(tfidf, idf)``. ``tfidf`` has the index and columns of ``df``.
        ``idf`` is a Series indexed by the columns of ``df`` holding
        ``log((N + 1) / (document_frequency + 1))``, where ``N`` is the number
        of rows and ``document_frequency`` counts the rows in which a column
        is > 0.
    """
    # Term frequency: every row divided by its own total.
    tf_matrix = df.div(df.sum(axis=1), axis=0)

    # Document frequency: number of rows in which each term is present (> 0).
    doc_freq = (df > 0).sum(axis=0)

    # Smoothed inverse document frequency.
    n_docs = df.shape[0]
    df_idf = np.log((n_docs + 1) / (doc_freq + 1))

    # Weight the term frequencies column by column. A row whose total is 0 yields
    # 0/0 = NaN above; map it to 0, as tfidf_fit_transform does, so that all-zero
    # rows stay all-zero instead of leaking NaN to the caller.
    df_blast_tfidf = tf_matrix.multiply(df_idf, axis=1).fillna(0)

    return df_blast_tfidf, df_idf

In [None]:
def tfidf_fit_transform(df, df_idf):
    """Apply previously computed IDF weights to ``df`` (used for the test set).

    Nothing is fitted here despite the name: the term frequencies of ``df`` are
    multiplied by the supplied ``df_idf`` weights.

    Args:
        df: Numeric DataFrame, one row per protein and one column per term.
        df_idf: Per-column weights, aligned to ``df`` by column label.

    Returns:
        DataFrame with the index and columns of ``df``; missing values
        (for example from rows whose total is 0) are replaced by 0.
    """
    row_totals = df.sum(axis=1)
    term_frequency = df.div(row_totals, axis=0)
    weighted = term_frequency.multiply(df_idf, axis=1)
    # Re-wrapping pins the result to exactly the rows and columns of the input.
    result = pd.DataFrame(weighted, columns=df.columns, index=df.index)
    return result.fillna(0)

In [None]:
# Compute the IDF weights on each training BLAST table and transform that table.
# NOTE: every statement rebinds its input name, so this cell is not idempotent —
# running it a second time would transform already-transformed values.
df_blast_mf, df_idf_mf = tfidf_transform(df_blast_mf)
df_blast_bp, df_idf_bp = tfidf_transform(df_blast_bp)
df_blast_cc, df_idf_cc = tfidf_transform(df_blast_cc)

# Weight the test tables with the IDF obtained from the matching training table
# (must run after the three calls above, which produce df_idf_*).
df_blast_mf_test = tfidf_fit_transform(df_blast_mf_test, df_idf_mf)
df_blast_bp_test = tfidf_fit_transform(df_blast_bp_test, df_idf_bp)
df_blast_cc_test = tfidf_fit_transform(df_blast_cc_test, df_idf_cc)

In [None]:
# Shapes of the TF-IDF transformed training BLAST tables.
print(*(table.shape for table in (df_blast_mf, df_blast_bp, df_blast_cc)), sep="\n")

(51645, 839)
(76825, 1487)
(77009, 678)


In [None]:
# Training features: embedding columns followed by the TF-IDF BLAST columns.
# Proteins without a BLAST row are kept (left merge) and get 0 in those columns.
df_blast_mf = df_embeddings_mf.merge(df_blast_mf, how='left', left_index=True, right_index=True).fillna(0)
df_blast_bp = df_embeddings_bp.merge(df_blast_bp, how='left', left_index=True, right_index=True).fillna(0)
df_blast_cc = df_embeddings_cc.merge(df_blast_cc, how='left', left_index=True, right_index=True).fillna(0)

# Test features, assembled the same way from the test embeddings.
df_blast_mf_test = test_mf.merge(df_blast_mf_test, how='left', left_index=True, right_index=True).fillna(0)
df_blast_bp_test = test_bp.merge(df_blast_bp_test, how='left', left_index=True, right_index=True).fillna(0)
df_blast_cc_test = test_cc.merge(df_blast_cc_test, how='left', left_index=True, right_index=True).fillna(0)


In [None]:
# Shapes of the combined training tables, then of the combined test tables.
print(*(table.shape for table in (df_blast_mf, df_blast_bp, df_blast_cc)), sep="\n")
print('-----------------')
print(*(table.shape for table in (df_blast_mf_test, df_blast_bp_test, df_blast_cc_test)), sep="\n")


(55251, 1863)
(82389, 2511)
(83948, 1702)
-----------------
(1000, 1863)
(1000, 2511)
(1000, 1702)


In [None]:
# Train and test tables must expose the same columns in the same order so that
# their rows are interpreted consistently. Check every aspect (not only MF) and
# stop with an error on a mismatch instead of only printing a message.
for aspect_name, train_table, test_table in (
    ("MF", df_blast_mf, df_blast_mf_test),
    ("BP", df_blast_bp, df_blast_bp_test),
    ("CC", df_blast_cc, df_blast_cc_test),
):
    if list(train_table.columns) == list(test_table.columns):
        print(f"{aspect_name}: The column order is the same in both DataFrames.")
    else:
        raise ValueError(
            f"{aspect_name}: The column order is different in the two DataFrames."
        )

The column order is the same in both DataFrames.


In [None]:
# Display the combined MF training table: embedding columns first, then the
# TF-IDF weighted GO-term columns.
df_blast_mf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,GO:1901682,GO:1901702,GO:1901981,GO:1902936,GO:1904315,GO:1904680,GO:1990756,GO:1990782,GO:1990837,GO:1990841
A0A009IHW8,0.068176,-0.046478,0.001752,-0.008583,0.003763,0.046265,-0.059662,-0.050385,-0.005173,0.008865,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
A0A023FBW4,0.002447,0.007053,0.064453,0.007271,-0.033569,-0.009933,-0.022186,-0.083862,-0.003841,-0.018631,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
A0A023FBW7,0.049316,0.020691,0.108643,0.016342,-0.051056,-0.017334,-0.042084,-0.154053,0.007347,0.029907,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
A0A023FDY8,0.056488,0.019241,0.112122,0.019608,-0.055939,-0.016129,-0.045105,-0.152466,0.003454,0.026855,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
A0A023FF81,-0.000163,0.041138,0.098633,0.012909,-0.031494,-0.016129,-0.014793,-0.157837,-0.018585,-0.000127,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
X5KCU9,0.030869,-0.041443,-0.026169,0.019669,0.008804,0.005413,-0.055847,-0.071655,0.000172,-0.013817,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
X5KJC0,0.053864,0.097351,0.010437,0.018051,-0.049103,0.045563,-0.038574,-0.054688,0.066528,0.006565,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
X5L1L5,0.052826,0.097229,0.010933,0.022873,-0.047455,0.046021,-0.037720,-0.054840,0.068909,0.006290,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0
X5L565,0.053955,0.097717,0.010117,0.022079,-0.047607,0.045807,-0.037964,-0.053772,0.068237,0.005848,...,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0


In [None]:
# Write every combined feature table to a tab-separated file in the current
# working directory (index and header included): train tables first, then test.
tfidf_exports = (
    ('df_blast_mf_train_tfidf.tsv', df_blast_mf),
    ('df_blast_bp_train_tfidf.tsv', df_blast_bp),
    ('df_blast_cc_train_tfidf.tsv', df_blast_cc),
    ('df_blast_mf_test_tfidf.tsv', df_blast_mf_test),
    ('df_blast_bp_test_tfidf.tsv', df_blast_bp_test),
    ('df_blast_cc_test_tfidf.tsv', df_blast_cc_test),
)
for file_name, feature_table in tfidf_exports:
    feature_table.to_csv(file_name, sep='\t', index=True, header=True)

In [None]:
#!mkdir blast_dfs
#!mv 'df_blast_mf_tfidf.tsv' blast_dfs/
#!mv 'df_blast_bp_tfidf.tsv' blast_dfs/
#!mv 'df_blast_cc_tfidf.tsv' blast_dfs/
!mv 'df_blast_mf_test_tfidf.tsv' blast_dfs/
!mv 'df_blast_bp_test_tfidf.tsv' blast_dfs/
!mv 'df_blast_cc_test_tfidf.tsv' blast_dfs/

!cp -r "blast_dfs" "$PROJECT_DIR/train/"