<a href="https://colab.research.google.com/github/archiebenn/BIOLM0050_kaggle/blob/master/protein_embedding_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install -q torch transformers sentencepiece h5py


In [None]:
import h5py
import numpy as np
import os
import torch

from transformers import T5EncoderModel, T5Tokenizer

In [None]:
!unzip bbinf-26-subcell.zip

In [None]:
# store the embeddings in a dictionary

def getEmbeddings(filename):
    """Read per-accession embeddings from an HDF5 file into a dictionary.

    Every top-level key of the file except 'metadata' is used as a dictionary
    key, and the entry stored under it is read fully into memory.

    Args:
        filename: Path of the HDF5 file; it is opened read-only.

    Returns:
        dict mapping each top-level key (other than 'metadata') to the data
        loaded from that entry.
    """
    with h5py.File(filename, 'r') as h5_file:
        # Slicing with [:] copies each entry out of the file, so the values
        # remain usable after the file is closed.
        return {
            key: h5_file[key][:]
            for key in h5_file.keys()
            if key != 'metadata'
        }


# Load the embeddings file into memory as a dict keyed by accession.
filename = "for_embed_prot_t5.h5"
embeddings_dict = getEmbeddings(filename)



In [None]:
import pandas as pd
# Labelled training table and the unlabelled table to predict on.
df_train = pd.read_csv('train.csv')
df_kaggle_test = pd.read_csv('test.csv')

In [None]:
# Label columns predicted by the classifier, one output per column.
target_cols = [
    'cytoplasm',
    'nucleus',
    'extracellular',
    'cell_surface',
    'mitochondrion',
    'endom',
]

# Label matrix: the target columns of the training table.
y = df_train.loc[:, target_cols]

In [None]:
# Fail fast: with no embeddings every row would become a zero-length vector and
# model fitting would fail later with a much less helpful error.
if not embeddings_dict:
    raise ValueError("embeddings_dict is empty. Please check the loading process.")

# Length of the first embedding's leading axis; sizes the zero placeholder below.
embedding_dim = next(iter(embeddings_dict.values())).shape[0]

# Accessions without an embedding are zero-filled below. Report how many, so the
# substitution is visible instead of silently entering the training data.
n_missing = int(df_train["acc"].map(lambda acc: acc not in embeddings_dict).sum())
if n_missing:
    print(f"Warning: {n_missing} of {len(df_train)} training accessions have no embedding; using zero vectors.")

# One row per training accession, in the order of df_train.
X = np.stack(df_train["acc"].apply(
    lambda acc: embeddings_dict.get(acc, np.zeros(embedding_dim))  # zero vector for missing embeddings
))

print(X.shape)

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Base estimator: logistic regression with the liblinear solver.
model = LogisticRegression(solver='liblinear', random_state=42, verbose=0)
# Alternative base estimator kept for experimentation:
#model=RandomForestClassifier(n_estimators=100, random_state=42)

# Wrap the base estimator so that one classifier is fitted per target column,
# then fit on the training split.
multi_output_model = MultiOutputClassifier(estimator=model)
multi_output_model.fit(X_train, y_train)

print("Multi-label classification model trained successfully.")

In [None]:
from sklearn.metrics import f1_score

# Score the held-out split with the macro-averaged F1 over the target columns.
y_pred = multi_output_model.predict(X_test)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average='macro')
print(f"F1 Score (macro average): {f1:.4f}")

# Ignore all below

1) Train a classifier on embeddings
2) Predict localisations for the human proteome
3) Identify the subset of cell-surface proteins (these are potential immunotherapy targets)
4) Rank the predicted cell surface human proteins by how many cancers they are differentially expressed in.
5) Manually check the candidate targets using the Human Protein Atlas. Targets should be:

    i) not expressed in all cells

    ii) especially not expressed in T-cells

In [None]:
# @title download genes upregulated in cancer
# Fetches a file by its Google Drive ID. The next cell reads "pan_cancer_de.csv" —
# presumably the file this download produces; TODO confirm.
!gdown 1gJmvIiAqFcXdtBkwgAciPkMIZIKsILxV



In [None]:
# Read the table and rename its accession column to "acc", the name used by the other tables.
df_pan_cancer = pd.read_csv("pan_cancer_de.csv")
df_pan_cancer = df_pan_cancer.rename(columns={"ACC": "acc"})

In [None]:
# Number of distinct cancer types recorded for each accession.
df_canc_count = (
    df_pan_cancer[["acc", "canc_type"]]
    .groupby("acc")["canc_type"]
    .nunique()
    .reset_index()
)

In [None]:
# @title download embeddings for human proteome
# Fetches a file by its Google Drive ID. The next cell opens
# 'UP000005640_9606_prot_t5.h5' — presumably the file this download produces; TODO confirm.
!gdown 1bgY8QuZx3BdNOiVPzo5kJ1ZqZyjqdkS6

In [None]:
# Load the human-proteome embeddings into a dict keyed by accession.
# Note: this rebinds `filename`, which earlier pointed at the training embeddings file.
filename = 'UP000005640_9606_prot_t5.h5'
embeddings_dict_human = getEmbeddings(filename)

In [None]:
# Validate and size the placeholder from the human-proteome dictionary itself.
# (This cell previously inspected `embeddings_dict`, the training dictionary,
# so an empty or differently sized human file would have gone unnoticed.)
if not embeddings_dict_human:
    raise ValueError("embeddings_dict_human is empty. Please check the loading process.")

# Kept separate from `embedding_dim`, which later cells use together with `embeddings_dict`.
human_embedding_dim = next(iter(embeddings_dict_human.values())).shape[0]

# Accessions absent from the human embeddings are zero-filled below; report how
# many, because predictions made from zero vectors are not meaningful.
n_missing = int(df_canc_count["acc"].map(lambda acc: acc not in embeddings_dict_human).sum())
if n_missing:
    print(f"Warning: {n_missing} of {len(df_canc_count)} accessions have no human embedding; using zero vectors.")

# NOTE: this rebinds `X`, which earlier held the training feature matrix.
X = np.stack(df_canc_count["acc"].apply(
    lambda acc: embeddings_dict_human.get(acc, np.zeros(human_embedding_dim))  # zero vector for missing embeddings
))

print(X.shape)

In [None]:
# Predict labels for the rows of `X`.
# NOTE: this rebinds `y_pred`, which earlier held the predictions for the held-out split.
y_pred = multi_output_model.predict(X)

In [None]:
# Label the prediction columns with the target names.
df_hum_preds = pd.DataFrame(data=y_pred, columns=target_cols)

In [None]:
# Attach the accession for each prediction row; the two frames are aligned on their index.
df_hum_preds = df_hum_preds.assign(acc=df_canc_count["acc"])

In [None]:
# Keep the rows predicted as cell_surface and order them by cancer-type count, highest first.
(
    df_canc_count
    .merge(df_hum_preds.loc[df_hum_preds["cell_surface"] == 1], on="acc")
    .sort_values("canc_type", ascending=False)
)

# Predict localisations at scale

In [None]:
# Build the feature matrix for the test table: one embedding per accession,
# with a zero vector wherever embeddings_dict has no entry.
X_kaggle_test = np.stack([
    embeddings_dict.get(acc, np.zeros(embedding_dim))
    for acc in df_kaggle_test["acc"]
])

print(X_kaggle_test.shape)

# Predict every target column for the test rows.
y_kaggle_pred = multi_output_model.predict(X_kaggle_test)
print("Predictions generated with shape:", y_kaggle_pred.shape)

In [None]:
# Place the 'Id' column next to the predicted labels, matched on row index.
y_pred_df = pd.concat(
    [
        df_kaggle_test['Id'],
        pd.DataFrame(y_kaggle_pred, columns=y_train.columns),
    ],
    axis="columns",
)

In [None]:
# Write the predictions to CSV without the DataFrame index.
y_pred_df.to_csv("seq_t5embed_log_reg.csv", index=False)

In [None]:
!head seq_ohe_log_reg.csv

In [None]:
# NOTE(review): hard-coded absolute Google Drive path — presumably requires Drive to be
# mounted at /content/drive first; no mount step is visible in this notebook. Confirm before Run All.
%cd /content/drive/MyDrive/ai-medicine/2026/bioinformatics-modular/project_datasets/protein_embeddings

In [None]:
# HDF5 file opened by the cells below. This rebinds `filename`, which earlier cells
# used for other embedding files.
# NOTE(review): hard-coded absolute path; consider a configurable data directory.
filename='/content/drive/MyDrive/ai-medicine/2026/bioinformatics-modular/project_datasets/protein_embeddings/protein.sequence.embeddings.v12.0.h5'

In [None]:
!wget https://stringdb-downloads.org/download/species.v12.0.txt

In [None]:
import pandas as pd
# Tab-separated species table.
df_spec = pd.read_csv("species.v12.0.txt", sep="\t")

In [None]:
# Taxon ids of all rows whose domain is "Eukaryotes".
tids = list(df_spec.loc[df_spec["domain"] == "Eukaryotes", "#taxon_id"])

In [None]:
# Show how many taxa were selected and a short preview instead of dumping the whole list.
len(tids), tids[:10]

In [None]:
import itertools
# Exploratory pass over the HDF5 file at `filename`: print the attributes stored on
# 'metadata', then run the classifier on the first two taxa in `tids` as a smoke test.
with h5py.File(filename, 'r') as f:
    meta_keys = f['metadata'].attrs.keys()
    for key in meta_keys:
        print(key, f['metadata'].attrs[key])

    print(type( f['species']))

    for species in tids[0:2]:
        print(species)
        species_str = str(species)
        # Skip taxa that are listed in the species table but absent from the file,
        # instead of failing with a KeyError (the full run in the next cell does the same).
        if species_str not in f['species']:
            print(f"  Species ID {species} not found in the HDF5 file. Skipping.")
            continue
        embeddings = f['species'][species_str]['embeddings'][:]
        proteins = f['species'][species_str]['proteins'][:]
        tax_pred = multi_output_model.predict(embeddings)
        # protein names are stored as bytes, convert them to strings
        proteins = [p.decode('utf-8') for p in proteins]

In [None]:
import pandas as pd
import h5py

# Predict localisations for every taxon in `tids`. One DataFrame is collected per
# taxon and they are concatenated once at the end.
all_predictions_dfs = []

with h5py.File(filename, 'r') as f:
    print("Starting prediction and concatenation for selected eukaryotic species...")

    species_group = f['species']
    for counter, species_id in enumerate(tids, start=1):
        species_str = str(species_id)
        print("doing species", species_str, counter)

        # Taxa listed in the species table but absent from the file are skipped.
        if species_str not in species_group:
            print(f"  Species ID {species_id} not found in the HDF5 file. Skipping.")
            continue

        embeddings = species_group[species_str]['embeddings'][:]
        proteins = species_group[species_str]['proteins'][:]
        tax_pred = multi_output_model.predict(embeddings)

        # protein names are stored as bytes, convert them to strings
        proteins_decoded = [p.decode('utf-8') for p in proteins]

        # Prediction columns, preceded by taxon_id and protein_accession.
        df_current_species = pd.DataFrame(tax_pred, columns=target_cols)
        df_current_species.insert(0, 'protein_accession', proteins_decoded)
        df_current_species.insert(0, 'taxon_id', species_id)

        all_predictions_dfs.append(df_current_species)
        print(f"  Processed taxon {species_id} with {len(proteins_decoded)} proteins. Added to list.")

if all_predictions_dfs:
    combined_predictions_df = pd.concat(all_predictions_dfs, ignore_index=True)
    print("\nSuccessfully concatenated predictions for all processed taxa.")
    print("Shape of combined_predictions_df:", combined_predictions_df.shape)
    print("First 5 rows of combined_predictions_df:")
    print(combined_predictions_df.head())

    # Written only when predictions exist. Writing unconditionally raised a
    # NameError on a fresh kernel, or saved a stale frame from an earlier run.
    # The index is written on purpose: the later aggregation cell drops the
    # resulting "Unnamed: 0" column after reading this file back.
    combined_predictions_df.to_csv('euk_loc_t5_logreg_preds.csv')
else:
    print("\nNo predictions were generated. The 'all_predictions_dfs' list is empty.")

In [None]:
# Line count of the predictions file.
!wc -l euk_loc_t5_logreg_preds.csv

In [None]:
# Read the saved predictions back from disk.
df_loc = pd.read_csv("euk_loc_t5_logreg_preds.csv")

In [None]:
# Per-taxon column sums of the numeric columns. Despite the variable name these
# are sums, not means. "Unnamed: 0" is the saved row index from the CSV and is dropped.
df_loc_mean = (
    df_loc
    .groupby("taxon_id")
    .sum(numeric_only=True)
    .drop(columns=["Unnamed: 0"])
)

In [None]:
# Divide each row by its row total, so every taxon's values sum to 1.
# NOTE(review): the next cell rebinds `result` to a plain copy of df_loc_mean, so this
# normalised frame is discarded when the notebook runs top to bottom — confirm which is intended.
result = df_loc_mean.div(df_loc_mean.sum(axis=1), axis=0)

In [None]:
# Work on a copy so the column added below does not modify df_loc_mean.
result = df_loc_mean.copy()

In [None]:
# Row total over the numeric columns. Any existing "total" column is left out of
# the sum, so re-running this cell gives the same value instead of adding the
# previous total into the new one.
result["total"] = (
    result
    .drop(columns="total", errors="ignore")
    .select_dtypes(include="number")
    .sum(axis=1)
)

In [None]:
# Join the species table to the per-taxon results on taxon_id.
(
    df_spec
    .rename(columns={"#taxon_id": "taxon_id"})
    .merge(result.reset_index(), on="taxon_id")
)