In [1]:
import scanpy as sc
from cellflow.model import CellFlow
import requests
import pandas as pd



In [2]:
from UniProtMapper import ProtMapper
def get_protein_sequence_by_gene(gene_name):
    mapper = ProtMapper()
    result, failed = mapper.get(
        ids=gene_name, from_db="Gene_Name", to_db="UniProtKB"
    )
    result = result[(result['Organism'] == "Homo sapiens (Human)")&(result['Reviewed'] == "reviewed")]
    protein = result.iloc[0]["Entry"]
    # print(protein)
    # Define the UniProt API endpoint
    sequence_url = f"https://www.uniprot.org/uniprot/{protein}.fasta"
    sequence_response = requests.get(sequence_url)
        
    if sequence_response.status_code == 200:
        # Extract and return the protein sequence
        sequence = ''.join(sequence_response.text.splitlines()[1:])
        return sequence
    else:
        return "NONE"

# Example usage
# gene_name = "PTPRC"  # Replace with your gene name
# protein_sequence = get_protein_sequence_by_gene(gene_name)
# print(f"Protein Sequence for {gene_name}:\n{protein_sequence}")

  import pkg_resources


In [3]:
# filePath = "data/vcc_data/adata_Training.h5ad"
filePath = "data/vcc_sample.h5ad"

In [4]:
adata = sc.read_h5ad(filePath)

In [5]:
adata

AnnData object with n_obs × n_vars = 88509 × 18080
    obs: 'target_gene', 'guide_id', 'batch', 'control'
    var: 'gene_id'

In [6]:
adata.obs['control'] = [(lambda x: True if x == "non-targeting" else False)(x) for x in adata.obs['target_gene']]

In [7]:
cf = CellFlow(adata)

In [8]:
# Prepare gene embeddings
# Sort out target genes
genes = adata.obs[adata.obs['control'] == False]['target_gene'].to_list()
genes = list(set(genes))


In [9]:
# Parameters for preparing data
sample_rep = "X"
control_key = "control"
perturbation_covariates = {"gene": "target_gene"}
split_covariates = ["batch"]
perturbation_covariate_reps = {"gene": "gene_embeddings"}

In [10]:
embedding = pd.DataFrame(columns=["gene", "protein", "embedding"])
embedding["gene"] = genes
embedding.index = genes

In [11]:
t = embedding.iloc[1:5]

In [12]:
embedding["protein"] = embedding['gene'].apply(get_protein_sequence_by_gene)

Fetched: 500 / 617
Retrying in 3s
Fetched: 30 / 30
Fetched: 500 / 909
Fetched: 500 / 886
Fetched: 500 / 819
Fetched: 500 / 625
Fetched: 362 / 362
Fetched: 500 / 689
Fetched: 500 / 545
Retrying in 3s
Fetched: 500 / 805
Fetched: 82 / 82
Fetched: 500 / 916
Fetched: 500 / 802
Fetched: 451 / 451
Fetched: 348 / 348
Fetched: 366 / 366
Fetched: 227 / 227
Fetched: 500 / 864
Fetched: 452 / 452
Retrying in 3s
Fetched: 451 / 451
Fetched: 500 / 742
Fetched: 500 / 1206
Fetched: 500 / 604
Retrying in 3s
Fetched: 500 / 803
Fetched: 500 / 798
Retrying in 3s
Fetched: 500 / 1011
Retrying in 3s
Retrying in 3s
Fetched: 500 / 875
Fetched: 500 / 1026
Fetched: 348 / 348
Retrying in 3s
Fetched: 500 / 555
Fetched: 225 / 225
Retrying in 3s
Fetched: 500 / 824
Retrying in 3s
Fetched: 500 / 661
Fetched: 368 / 368
Retrying in 3s
Fetched: 339 / 339
Fetched: 500 / 756
Fetched: 500 / 606
Fetched: 500 / 719
Fetched: 236 / 236
Fetched: 106 / 106
Fetched: 500 / 652
Fetched: 500 / 521
Fetched: 500 / 639
Fetched: 500 / 752


In [13]:
embedding

Unnamed: 0,gene,protein,embedding
STAT6,STAT6,MSLWGLVSKMPPEKVQRLYVDFPQHLRHLLGDWLESQPWEFLVGSD...,
ZNF714,ZNF714,MNVMLENYKNLVFLAGIAVSKQDPITSLEQEKEPWNMKICEMVDES...,
MED1,MED1,MGTTGLESLSLGDRGAAPTVTSSERLVPDPPNDLRKEDVAMELERV...,
BRD9,BRD9,MGKKHKKHKAEWRSSYEDYADKPLEKPLKLVLKVGGSEVTELSGSG...,
CASP2,CASP2,MAAPSAGSWSTFQHKELMAADRGRRILGVCGMHPHHQETLKKNRVV...,
...,...,...,...
SALL4,SALL4,MSRRKQAKPQHINSEEDQGEQQPQQQTPEFADAAPAAPAAGELGAP...,
HIRA,HIRA,MKLLKPTWVNHNGKPIFSVDIHPDGTKFATGGQGQDSGKVVIWNMS...,
IDE,IDE,MRYRLAWLLHPALPSTFRSVLGARLPPPERLCGFQKKTYSKMNNPA...,
SV2A,SV2A,MEEGFRDRAAFIRGAKDIAKEVKKHAAKKVVKGLDRVQDEYSRRSY...,


In [19]:
import torch
from esm import pretrained
class ESMConverter:
  def __init__(self, model:str):
    self.model, self.alphabet = pretrained.load_model_and_alphabet(model)
    self.batch_converter = self.alphabet.get_batch_converter()

  def convert(self, sequences):
    batch_labels, batch_strs, batch_tokens = self.batch_converter(sequences)
    with torch.no_grad():
      token_embeddings = self.model(batch_tokens, repr_layers=[33])
      embeddings = token_embeddings['representations'][33]
      average_embeddings = embeddings.mean(dim=1)
    return average_embeddings

In [20]:
converter = ESMConverter("esm2_t33_650M_UR50D")

In [33]:
sequences = list(zip(embedding['gene'], embedding['protein']))
em = []
for s in sequences:
  em.append(converter.convert([s]))

In [36]:
embedding['embedding'] = em

In [39]:
pd.to_pickle(embedding,"subsample_gene_embedding.pkl")

In [58]:
mapper = ProtMapper()

result, failed = mapper.get(
    ids=["TP53"], from_db="Gene_Name", to_db="UniProtKB"
)

Fetched: 500 / 734


In [70]:
result[(result['Reviewed'] == "reviewed") & (result['Organism'] == "Homo sapiens (Human)")]

Unnamed: 0,From,Entry,Entry Name,Reviewed,Protein names,Gene Names,Organism,Length
6,TP53,P04637,P53_HUMAN,reviewed,Cellular tumor antigen p53 (Antigen NY-CO-13) ...,TP53 P53,Homo sapiens (Human),393
