# 2nd approach: using the whole protein sequence

In [34]:
from Bio import SeqIO
from pyfaidx import Fasta
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Lookup filled while parsing the FASTA file: transcript ID -> protein sequence string.
edited_sequences = dict()

# Peptide FASTA file to parse; a bare file name, so it is resolved against the working directory.
fasta_file = "Homo_sapiens.GRCh38.pep.all.fa"

# Function to extract the transcript ID from the header
def get_transcript_id(header):
    """Return the transcript ID carried by a FASTA header line.

    The header is split on whitespace and the first token beginning with
    ``transcript:`` is selected; the text between its first and second
    ``:`` is returned (e.g. ``transcript:ENST00000361390.2`` gives
    ``ENST00000361390.2``).

    Args:
        header: Full description line of a FASTA record.

    Returns:
        The transcript ID string, or ``None`` when no token starts with
        ``transcript:``.
    """
    tagged_tokens = (token for token in header.split() if token.startswith("transcript:"))
    first_match = next(tagged_tokens, None)
    if first_match is None:
        return None
    return first_match.split(":")[1]

# Walk the FASTA records and index each protein sequence by the transcript ID
# found in its description line; a repeated ID overwrites the earlier entry.
for record in SeqIO.parse(fasta_file, "fasta"):
    transcript_id = get_transcript_id(record.description)
    edited_sequences.update({transcript_id: str(record.seq)})

In [39]:
# One row per parsed FASTA entry: the versioned transcript ID and its protein sequence.
prot_seq = pd.DataFrame(
    list(edited_sequences.items()),
    columns=["Version_stable_ID", "protein_seq"],
)
def remove_version(stable_id):
    """Strip the version suffix from a versioned stable ID.

    Everything from the first ``.`` onward is discarded, so
    ``ENST00000361390.2`` becomes ``ENST00000361390``. An ID that contains
    no ``.`` is returned unchanged.

    Args:
        stable_id: Stable ID string, optionally followed by ``.<version>``.

    Returns:
        The portion of ``stable_id`` before its first ``.``.
    """
    base_id, _, _ = stable_id.partition(".")
    return base_id
# Version-less transcript ID; this column is the join key used when merging with the BioMart table.
prot_seq["id"] = prot_seq["Version_stable_ID"].map(remove_version)


## Get the Canonical dataset

In [12]:
from pybiomart import Dataset

# Open the human gene dataset on the Ensembl BioMart server.
dataset = Dataset(host='http://www.ensembl.org', name='hsapiens_gene_ensembl')

# Fetch gene and transcript identifiers together with the canonical-transcript flag.
df = dataset.query(
    attributes=[
        'ensembl_gene_id',
        'external_gene_name',
        'ensembl_transcript_id',
        'transcript_is_canonical',
    ]
)

# Keep only the rows whose "Ensembl Canonical" flag equals 1, i.e. the canonical
# transcripts; .copy() detaches the result from `df`.
transcrip_df = df.loc[df["Ensembl Canonical"].eq(1)].copy()

# Merge the two datasets

In [41]:
# Inner-join the canonical transcripts with the parsed protein sequences, matching
# the BioMart transcript ID against the version-less ID from the FASTA headers.
cano_df = transcrip_df.merge(
    prot_seq,
    left_on='Transcript stable ID',
    right_on='id',
)

# Keep only the gene name, gene ID, versioned transcript ID and sequence, as an independent copy.
clean_df = cano_df.loc[:, ["Gene name", "Gene stable ID", "Version_stable_ID", "protein_seq"]].copy()
clean_df.head()

Unnamed: 0,Gene name,Gene stable ID,Version_stable_ID,protein_seq
0,MT-ND1,ENSG00000198888,ENST00000361390.2,MPMANLLLLIVPILIAMAFLMLTERKILGYMQLRKGPNVVGPYGLL...
1,MT-ND2,ENSG00000198763,ENST00000361453.3,MNPLAQPVIYSTIFAGTLITALSSHWFFTWVGLEMNMLAFIPVLTK...
2,MT-CO1,ENSG00000198804,ENST00000361624.2,MFADRWLFSTNHKDIGTLYLLFGAWAGVLGTALSLLIRAELGQPGN...
3,MT-CO2,ENSG00000198712,ENST00000361739.1,MAHAAQVGLQDATSPIMEELITFHDHALMIIFLICFLVLYALFLTL...
4,MT-ATP8,ENSG00000228253,ENST00000361851.1,MPQLNTTVWPTMITPMLLTLFLITQLKMLNTNYHLPPSPKPMKMKN...
...,...,...,...,...
23818,DNAJC16,ENSG00000116138,ENST00000375847.8,MEVRKLSISWQFLIVLVLILQILSALDFDPYRVLGVSRTASQADIK...
23819,AGMAT,ENSG00000116771,ENST00000375826.4,MLRLLASGCARGPGPGVGARPAAGLFHPGRRQSRQASDAPRNQPPS...
23820,DDI2,ENSG00000197312,ENST00000480945.6,MLLTVYCVRRDLSEVTFSLQVDADFELHNFRALCELESGIPAAESQ...
23821,RSC1A1,ENSG00000215695,ENST00000345034.2,MSSLPTSDGFNHPARSSGQSPDVGNPMSLARSVSASVCPIKPSDSD...


### Export to csv

In [42]:
# Write the merged table to a CSV file, leaving out the DataFrame index.
clean_df.to_csv(path_or_buf="canonical_protein_seq.csv", index=False)

In [1]:
# NOTE(review): the recorded output of this cell is an ImportError for
# `SeqVecEmbedder`. Presumably the optional SeqVec dependencies of
# bio_embeddings are missing from this environment — confirm and fix the
# environment before relying on this cell.
from bio_embeddings.embed import SeqVecEmbedder

# Instantiate the embedder with its default constructor arguments.
embedder = SeqVecEmbedder()

  from .autonotebook import tqdm as notebook_tqdm


ImportError: cannot import name 'SeqVecEmbedder' from 'bio_embeddings.embed' (c:\Users\joaop\anaconda3\envs\dna\lib\site-packages\bio_embeddings\embed\__init__.py)