In [14]:
from pyfaidx import Fasta
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np


# Input files, given as paths relative to the notebook's working directory.
SFARI_CSV = "sfari_ed.csv"
CDNA_FASTA = 'Homo_sapiens.GRCh38.cdna.all.fa'

# Table read from the SFARI CSV (not referenced by the cells shown below).
sfari = pd.read_csv(SFARI_CSV)

# Handle on the cDNA FASTA; later cells list its record names with
# `genes.keys()` and fetch individual records with `genes[name]`.
genes = Fasta(CDNA_FASTA)


## Full list of the transcripts

In [15]:
# One row per FASTA record. Record names look like "ENST00000361390.2", so keep
# the full versioned name and, separately, the part before the first dot.
fasta_record_names = list(genes.keys())
names = pd.DataFrame(fasta_record_names, columns=["id_version"])
names["id"] = names["id_version"].str.split(".").str[0]

# Get the Canonical dataset

In [16]:
from pybiomart import Dataset

# Human gene dataset on the Ensembl BioMart server.
# NOTE(review): www.ensembl.org presumably serves the current Ensembl release;
# confirm it matches the release of the local cDNA FASTA.
dataset = Dataset(name='hsapiens_gene_ensembl', host='http://www.ensembl.org')

# Attributes requested for every row of the query result.
biomart_attributes = [
    'ensembl_gene_id',
    'external_gene_name',
    'ensembl_transcript_id',
    'transcript_is_canonical',
    'entrezgene_id',
]
df = dataset.query(attributes=biomart_attributes)

In [17]:
# Keep only the rows flagged as the Ensembl canonical transcript; take a copy so
# that later column edits do not touch `df`.
is_canonical = df["Ensembl Canonical"].eq(1)
transcrip_df = df.loc[is_canonical].copy()

In [18]:
def format_float(x):
    """Render a non-NaN float as the string of its integer part.

    Turns a value such as ``4535.0`` into ``"4535"``. NaN and any value that
    is not a float are returned unchanged.
    """
    if not isinstance(x, float) or np.isnan(x):
        return x
    return str(int(x))

# Convert float-typed NCBI gene IDs to plain integer strings (missing values
# stay as they are), then give the column a short name.
ncbi_column = "NCBI gene (formerly Entrezgene) ID"
transcrip_df[ncbi_column] = transcrip_df[ncbi_column].apply(format_float)
transcrip_df = transcrip_df.rename(columns={ncbi_column: "NCBI_id"})


### Merge the full transcripts with the canonical transcripts

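The purpose of this merge is to attach the versioned transcript ID from the FASTA (the `id_version` column, e.g. `ENST00000361390.2`) to each canonical transcript.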

In [19]:
# Inner join on the unversioned transcript ID: keeps the canonical transcripts
# that have a record in the FASTA and attaches their versioned record name.
cano_df = transcrip_df.merge(
    names,
    how="inner",
    left_on="Transcript stable ID",
    right_on="id",
)

In [20]:
# Keep only the gene name, gene ID, versioned transcript ID and NCBI ID.
output_columns = ["Gene name", "Gene stable ID", "id_version", "NCBI_id"]
clean_df = cano_df.loc[:, output_columns].copy()

In [21]:
# Preview the merged table; the rendered output shows only its first and last rows.
clean_df

Unnamed: 0,Gene name,Gene stable ID,id_version,NCBI_id
0,MT-ND1,ENSG00000198888,ENST00000361390.2,4535
1,MT-ND2,ENSG00000198763,ENST00000361453.3,4536
2,MT-CO1,ENSG00000198804,ENST00000361624.2,4512
3,MT-CO2,ENSG00000198712,ENST00000361739.1,4513
4,MT-ATP8,ENSG00000228253,ENST00000361851.1,4509
...,...,...,...,...
42064,CHCHD2P6,ENSG00000235084,ENST00000454346.1,
42065,CD24P1,ENSG00000236500,ENST00000422383.1,
42066,DDI2,ENSG00000197312,ENST00000480945.6,84301
42067,RSC1A1,ENSG00000215695,ENST00000345034.2,6248


In [22]:
# Number of missing values in each column of the merged table.
missing_per_column = clean_df.isna().sum()
print(missing_per_column)

Gene name          6697
Gene stable ID        0
id_version            0
NCBI_id           18308
dtype: int64


# Add the sequences

In [23]:
def get_seq(id):
    """Return the sequence of the FASTA record named ``id``.

    Looks the record up in the module-level ``genes`` object and returns the
    ``.seq`` of its full-length slice. Returns ``None`` when ``genes`` has no
    record with that name.
    """
    if id not in genes:
        return None
    whole_record = genes[id][:]
    return whole_record.seq

In [24]:
# New table: the merged columns plus a `seq` column holding each transcript's
# sequence, looked up in the FASTA by its versioned ID. `clean_df` is left as is.
clean_dna = clean_df.assign(seq=clean_df["id_version"].apply(get_seq))

In [25]:
# Preview the table with the added `seq` column; the rendered output shows only its first and last rows.
clean_dna

Unnamed: 0,Gene name,Gene stable ID,id_version,NCBI_id,seq
0,MT-ND1,ENSG00000198888,ENST00000361390.2,4535,ATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCG...
1,MT-ND2,ENSG00000198763,ENST00000361453.3,4536,ATTAATCCCCTGGCCCAACCCGTCATCTACTCTACCATCTTTGCAG...
2,MT-CO1,ENSG00000198804,ENST00000361624.2,4512,ATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTG...
3,MT-CO2,ENSG00000198712,ENST00000361739.1,4513,ATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCTA...
4,MT-ATP8,ENSG00000228253,ENST00000361851.1,4509,ATGCCCCAACTAAATACTACCGTATGGCCCACCATAATTACCCCCA...
...,...,...,...,...,...
42064,CHCHD2P6,ENSG00000235084,ENST00000454346.1,,GGAAGCCGAAGCCACACCTCCCGCATGGCCCCTCCGGCCAGCCGGG...
42065,CD24P1,ENSG00000236500,ENST00000422383.1,,GCAATGGTGGACAGGCTCAGGCTGGGGCTGCTGCTTCTGGCACTGC...
42066,DDI2,ENSG00000197312,ENST00000480945.6,84301,AGACGGACTCGCAGGCGTGTGGCGGCGGCCGTGCTTGCTAGTGAGG...
42067,RSC1A1,ENSG00000215695,ENST00000345034.2,6248,AAGAGAAACCCGAGTTTGAGGACCTTATTTTATTCTACGCTGTTTA...


In [26]:
# Write the table with sequences to disk, leaving out the row index.
output_csv = "transcript_seq.csv"
clean_dna.to_csv(output_csv, index=False)