In [1]:
! pip install g2papi



In [11]:
import os
import json
#os.listdir('../../../g2p/Genomics2Proteins_portal/server/data')

filepath = '../../../g2p/Genomics2Proteins_portal/server/data/g2p_metadata_december_2024.json'
with open(filepath, 'r') as file:
    data = json.load(file)


uniprot_to_gene = {}
for gene, entries in data.items():
    for entry in entries:
        uniprot = entry['UniprotKB_Entry']
        if not uniprot in uniprot_to_gene:
            uniprot_to_gene[uniprot] = []
    uniprot_to_gene[uniprot].append(gene)


In [32]:
# Get genes and find associated transcript for canonical isoform of each:

# 1. Call the g2p API to get the gene
import g2papi

uniprot_list_path = "../data/uniprots.txt" # newline separated list of uniprots
uniprot_list = open(uniprot_list_path, 'r').read().splitlines()

uniprot_transcript_dict = {}

found = 0
notfound = 0
from tqdm import tqdm
for uniprot in tqdm(uniprot_list):
    if uniprot in uniprot_to_gene:
        if len(uniprot_to_gene[uniprot]) < 1:
            notfound += 1
            continue

        found += 1
        gene = uniprot_to_gene[uniprot][0]
        uniprot_transcript_dict[uniprot] = g2papi.get_gene_transcript_protein_isoform_structure(gene, uniprot)
    else:
        notfound += 1

# This list has 14,000 genes.... probably should have checked that first


  0%|          | 0/14004 [00:00<?, ?it/s]

100%|██████████| 14004/14004 [34:00<00:00,  6.86it/s] 


In [33]:
import pandas as pd
# make a giant dataframe (each entry in the list is a df, so just concat them all
transcript_df = pd.concat([pd.DataFrame(uniprot_transcript_dict[uniprot]) for uniprot in uniprot_transcript_dict.keys()])

# save the transcript df to a file
transcript_df.to_csv('../data/uniprot_transcript_df.csv', index=False)


In [34]:
print(notfound, found)
transcript_df.head()

11149 2855


Unnamed: 0,UniProtKB,UniProt Isoform,Ensembl Gene Id,Ensembl Protein Id,Ensembl Transcript Id,RefSeq mRNA Id,PDB Ids
0,Q9H400,Q9H400-1(*),ENSG00000203896,ENSP00000309521,ENST00000309546(*),NM_017806(MANE),not-available
1,Q9H400,not-assigned,ENSG00000203896,ENSP00000477561,ENST00000480139,NM_001305654,
2,Q9H400,not-assigned,ENSG00000203896,ENSP00000477561,ENST00000480139,NM_001305655,
0,Q9NR71,Q9NR71-1(*),ENSG00000188611,ENSP00000378897,ENST00000395526,not-assigned,4WGK
1,Q9NR71,Q9NR71-1(*),ENSG00000188611,ENSP00000506746,ENST00000682911(*),NM_019893(MANE),4WGK


In [55]:
# filter where UniProt Isoform column has (*) (means it's a canonical isoform)
transcript_df = transcript_df[transcript_df['UniProt Isoform'].notna()]
canonical_df = transcript_df[transcript_df['UniProt Isoform'].str.contains('(\*)')]# ENST00000373020.9

# remove (*) from Ensembl Transcript Id
canonical_df["Ensembl Transcript Id"] = canonical_df["Ensembl Transcript Id"].str.replace('\(\*\)', '')


transcripts = canonical_df['Ensembl Transcript Id'].unique()

  canonical_df = transcript_df[transcript_df['UniProt Isoform'].str.contains('(\*)')]# ENST00000373020.9
  canonical_df["Ensembl Transcript Id"] = canonical_df["Ensembl Transcript Id"].str.replace('\(\*\)', '')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  canonical_df["Ensembl Transcript Id"] = canonical_df["Ensembl Transcript Id"].str.replace('\(\*\)', '')


In [61]:

# gtex data is stored in a giant txt table, we can only read one line at a time. first line is header
gtex_filepath = '../data/GTEx_Analysis_v10_RSEMv1.3.3_transcripts_tpm.txt'


def get_completed_transcripts():
    if os.path.exists('../data/gtex_filtered_transcripts.csv'):
        return list(pd.read_csv('../data/gtex_filtered_transcripts.csv')['transcript_id'].unique())
    else:
        return []

completed_transcripts = get_completed_transcripts()

# add all tmp transcripts to completed_transcripts
if not os.path.exists('../data/tmp'):
    os.makedirs('../data/tmp')
tmp_files = os.listdir('../data/tmp')
tmp_transcripts = [file.split('_')[-1].split('.')[0] for file in tmp_files]
completed_transcripts.extend(tmp_transcripts)
completed_transcripts = set(completed_transcripts)
transcripts = set(transcripts)

# if there is a file already, load it
if os.path.exists('../data/gtex_filtered_transcripts.csv'):
    gtex_df = pd.read_csv('../data/gtex_filtered_transcripts.csv')

# get the header
from tqdm import tqdm
with open(gtex_filepath, 'r') as file:
    header = file.readline()
    header = header.split('\t')

    # now loop through each line
    for line in tqdm(file, total=244940):
        # split the line by tab
        fields = line.split('\t')
        # get the transcript id
        transcript_id = fields[0].split('.')[0]
        # if the transcript id is in the transcripts list, print the line
        if transcript_id in transcripts and transcript_id not in completed_transcripts:
            # add the line to the dataframe with pdconcat
            new_row = pd.DataFrame([fields])
            # save to tmp file
            tmp_filename = f'../data/tmp/gtex_filtered_transcripts_{transcript_id}.csv'
            if not os.path.exists("../data/tmp"):
                os.makedirs("../data/tmp")
            new_row.to_csv(tmp_filename, index=False)
            #new_df = pd.concat([new_df, pd.DataFrame([fields])])

import glob
# first read all of the tmp files and concat them
tmp_files = glob.glob('../data/tmp/*.csv')
tmp_df = pd.concat([pd.read_csv(file) for file in tmp_files])


gtex_df = pd.read_csv('../data/gtex_filtered_transcripts.csv')
gtex_df = pd.concat([gtex_df, tmp_df])

# remove tmp files
for file in tmp_files:
    os.remove(file)

# save the gtex df to a file
gtex_df.to_csv('../data/gtex_filtered_transcripts.csv', index=False)


100%|█████████▉| 244939/244940 [3:39:28<00:00, 18.60it/s]     


In [44]:
gtex_df.shape

(1568, 19790)

In [47]:
# convert header to column names by splitting on tab
gtex_df.columns = header

# strip /n from the end of every row and the column names
gtex_df = gtex_df.applymap(lambda x: x.rstrip('\n') if isinstance(x, str) else x)

gtex_df.head()
# save df to csv



Unnamed: 0,transcript_id,gene_id,GTEX-1117F-0005-SM-HL9SH,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,GTEX-1117F-0011-R6a-SM-GI4VX,...,GTEX-ZZPU-1326-SM-5GZWS,GTEX-ZZPU-1426-SM-5GZZ6,GTEX-ZZPU-1826-SM-5E43L,GTEX-ZZPU-2126-SM-5EGIU,GTEX-ZZPU-2226-SM-5EGIV,GTEX-ZZPU-2326-SM-GOQYU,GTEX-ZZPU-2426-SM-5E44I,GTEX-ZZPU-2526-SM-GOQZ3,GTEX-ZZPU-2626-SM-5E45Y,GTEX-ZZPU-2726-SM-5NQ8O\n
0,ENST00000522656.5,ENSG00000003989.18,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,...,0.3,0.0,0.09,0.35,0.0,0.44,0.8,0.1,2.16,0.88
0,ENST00000640220.1,ENSG00000003989.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0,ENST00000356634.7,ENSG00000004487.18,0.0,3.26,0.0,1.58,3.19,2.44,2.79,3.6,...,6.92,5.98,11.26,22.7,9.66,10.14,9.98,8.76,10.34,6.91
0,ENST00000421138.6,ENSG00000004700.16,0.01,0.19,0.46,0.31,0.27,1.14,0.93,0.68,...,2.08,2.65,3.65,1.97,3.21,2.02,4.8,2.78,3.97,2.23
0,ENST00000621914.4,ENSG00000006125.18,0.78,19.38,7.35,15.56,15.48,9.09,18.4,21.18,...,16.67,8.09,14.55,22.14,6.04,12.52,9.84,8.13,2.49,12.8


In [49]:
# remove /n from column names
gtex_df.columns = gtex_df.columns.str.rstrip('\n')

gtex_df.to_csv('../data/gtex_filtered_transcripts.csv', index=False)

In [65]:
# ok, so we have 2 things. we need to figure out what those ids mean in the column names, 
# and we need to add a gene name column

gtex_df["transcript_id"] = gtex_df["transcript_id"].str.split('.').str[0]
gtex_df.head()



Unnamed: 0,transcript_id,gene_id,GTEX-1117F-0005-SM-HL9SH,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,GTEX-1117F-0011-R6a-SM-GI4VX,...,19780,19781,19782,19783,19784,19785,19786,19787,19788,19789
0,ENST00000522656,ENSG00000003989.18,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,...,,,,,,,,,,
1,ENST00000640220,ENSG00000003989.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,,,,,,,
2,ENST00000356634,ENSG00000004487.18,0.0,3.26,0.0,1.58,3.19,2.44,2.79,3.6,...,,,,,,,,,,
3,ENST00000421138,ENSG00000004700.16,0.01,0.19,0.46,0.31,0.27,1.14,0.93,0.68,...,,,,,,,,,,
4,ENST00000621914,ENSG00000006125.18,0.78,19.38,7.35,15.56,15.48,9.09,18.4,21.18,...,,,,,,,,,,


In [62]:
canonical_df.head()

Unnamed: 0,UniProtKB,UniProt Isoform,Ensembl Gene Id,Ensembl Protein Id,Ensembl Transcript Id,RefSeq mRNA Id,PDB Ids
0,Q9H400,Q9H400-1(*),ENSG00000203896,ENSP00000309521,ENST00000309546,NM_017806(MANE),not-available
0,Q9NR71,Q9NR71-1(*),ENSG00000188611,ENSP00000378897,ENST00000395526,not-assigned,4WGK
1,Q9NR71,Q9NR71-1(*),ENSG00000188611,ENSP00000506746,ENST00000682911,NM_019893(MANE),4WGK
0,Q86XT9,Q86XT9(*),ENSG00000149932,ENSP00000279396,ENST00000279396,NM_001083613(MANE),not-available
1,Q86XT9,Q86XT9(*),ENSG00000149932,ENSP00000279396,ENST00000279396,NM_001369688,not-available


In [66]:
# join canonical df to gtex_df on "Ensembl Transcript Id" (canonical df) and transcript_id (gtex_df)
gtex_gene_df = gtex_df.merge(canonical_df, left_on="transcript_id", right_on="Ensembl Transcript Id", how="left")


gtex_gene_df.to_csv('../data/gtex_gene_df.csv', index=False)

In [67]:
gtex_gene_df.head()


Unnamed: 0,transcript_id,gene_id,GTEX-1117F-0005-SM-HL9SH,GTEX-1117F-0011-R10b-SM-GI4VE,GTEX-1117F-0011-R11b-SM-GIN8R,GTEX-1117F-0011-R2b-SM-GI4VL,GTEX-1117F-0011-R3a-SM-GJ3PJ,GTEX-1117F-0011-R4b-SM-GI4VM,GTEX-1117F-0011-R5a-SM-GI4VW,GTEX-1117F-0011-R6a-SM-GI4VX,...,19787,19788,19789,UniProtKB,UniProt Isoform,Ensembl Gene Id,Ensembl Protein Id,Ensembl Transcript Id,RefSeq mRNA Id,PDB Ids
0,ENST00000522656,ENSG00000003989.18,0.0,0.0,0.0,0.0,0.02,0.0,0.02,0.0,...,,,,P52569,P52569-1(*),ENSG00000003989,ENSP00000430464,ENST00000522656,not-assigned,not-available
1,ENST00000640220,ENSG00000003989.18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,,,,P52569,P52569-1(*),ENSG00000003989,ENSP00000492016,ENST00000640220,not-assigned,not-available
2,ENST00000356634,ENSG00000004487.18,0.0,3.26,0.0,1.58,3.19,2.44,2.79,3.6,...,,,,O60341,O60341-1(*),ENSG00000004487,ENSP00000349049,ENST00000356634,NM_015013,"2COM,2DW4,2EJR,2H94,2HKO,2IW5,2L3D,2UXN,2UXX,2..."
3,ENST00000421138,ENSG00000004700.16,0.01,0.19,0.46,0.31,0.27,1.14,0.93,0.68,...,,,,P46063,P46063(*),ENSG00000004700,ENSP00000395449,ENST00000421138,not-assigned,"2V1X,2WWY,4U7D,6JTZ,8YRS"
4,ENST00000621914,ENSG00000006125.18,0.78,19.38,7.35,15.56,15.48,9.09,18.4,21.18,...,,,,P63010,P63010-1(*),ENSG00000006125,ENSP00000482315,ENST00000621914,NM_001282,"1E42,2G30,2IV8,2IV9,2JKR,2JKT,2VGL,2XA7,4UQI,5..."


## Load the GTEX-id attribute map to a df

In [4]:
import os

# GTEx_Analysis_v10_Annotations_SampleAttributesDD.xlsx
# GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt

# load the xlsx file
import pandas as pd

gtex_col_names = pd.read_excel('../data/GTEx_Analysis_v10_Annotations_SampleAttributesDD.xlsx')
gtex_col_names.head()

# these are description of each column found in the txt file
# the column names are in the VARNAME column


Unnamed: 0,VARNAME,VARDESC,ACCESS,DOCFILE,TYPE,UNITS,COMMENT1,COMMENT2,VALUES,Unnamed: 9,...,Unnamed: 48,Unnamed: 49,Unnamed: 50,Unnamed: 51,Unnamed: 52,Unnamed: 53,Unnamed: 54,Unnamed: 55,Unnamed: 56,Unnamed: 57
0,SAMPID,"Sample ID, GTEx Public Sample ID",Open,,string,,,,,,...,,,,,,,,,,
1,SMATSSCR,Autolysis Score,Open,PRC Case Summary Report,"integer, encoded value",,Autolysis,The destruction of organism cells or tissues b...,0=None,1=Mild,...,,,,,,,,,,
2,SMNABTCH,Nucleic Acid Isolation Batch ID,Open,LDACC,string,,Generated at LDACC,Batch when DNA/RNA was isolated and extracted ...,,,...,,,,,,,,,,
3,SMNABTCHT,Type of nucleic acid isolation batch,Open,LDACC,string,,Generated at LDACC,The process by which DNA/RNA was isolated,,,...,,,,,,,,,,
4,SMNABTCHD,Date of nucleic acid isolation batch,Open,LDACC,string,,Generated at LDACC,The date on which DNA/RNA was isolated,,,...,,,,,,,,,,


In [11]:

# load the txt file
gtex_id_map = pd.read_csv('../data/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt', sep='\t')
# now set the column names to the VARNAME column

# first filter to only include rows where ACCESS is Open
#colnames = gtex_col_names[gtex_col_names['ACCESS'] == 'Open']

# now set the column names to the VARNAME column
#gtex_id_map.columns = colnames['VARNAME']
gtex_id_map.head()


# save first 10 rows to csv
gtex_id_map.head(10).to_csv('../data/gtex_id_map_sample.csv', index=False)




  gtex_id_map = pd.read_csv('../data/GTEx_Analysis_v10_Annotations_SampleAttributesDS.txt', sep='\t')


In [13]:
# Load the gtex_gene_df
gtex_gene_df = pd.read_csv('../data/gtex_gene_df.csv')

smts_map = gtex_id_map['SMTS']

# get the unique tissue types by making a set of the smts_map
tissue_types = set(smts_map)

# make a new gene_df with a column for each tissue type
# get all cols for each tissue type from gtex_gene_df, and then take the mean of each
tissue_wise_df = pd.DataFrame()
from tqdm import tqdm
for tissue in tqdm(tissue_types):
    # first get all gtex_id_map rows where SMTS is the tissue type
    tissue_rows = gtex_id_map[gtex_id_map['SMTS'] == tissue]

    # get the corresponding columns from gtex_gene_df
    tissue_sample_ids = tissue_rows['SAMPID']
    tissue_cols = gtex_gene_df.columns[gtex_gene_df.columns.isin(tissue_sample_ids)]
    tissue_df = gtex_gene_df[tissue_cols]

    # now take the mean of each column
    tissue_df = tissue_df.mean(axis=1)
    tissue_wise_df[tissue] = tissue_df

# save tissue_wise_df to csv
tissue_wise_df.to_csv('../data/tissue_wise_df.csv', index=False)

tissue_wise_df.head()

  gtex_gene_df = pd.read_csv('../data/gtex_gene_df.csv')
100%|██████████| 31/31 [00:04<00:00,  6.62it/s]


Unnamed: 0,Blood Vessel,Bladder,Liver,Vagina,Adrenal Gland,Adipose Tissue,Nerve,Colon,Muscle,Thyroid,...,Breast,Blood,Small Intestine,Esophagus,Salivary Gland,Ovary,Spleen,Skin,Cervix Uteri,Prostate
0,0.293159,0.101818,0.173298,0.114706,0.067695,0.292483,0.316149,0.048486,0.854817,0.163918,...,0.159786,0.000584,0.031416,0.064854,0.048453,0.691347,0.007906,0.212674,0.160851,0.052872
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.001186,0.0,0.0,0.0,0.0,0.0,0.000442,0.0,0.0
2,7.575737,11.46961,1.708262,9.404471,7.957661,7.122098,7.718284,9.081632,8.801467,7.791316,...,7.312082,6.774513,7.335354,8.761103,9.931547,12.081036,5.650903,11.599183,12.103191,7.573723
3,3.496031,3.208312,1.380213,1.992824,2.706169,3.062014,1.889,2.029686,2.452579,2.456637,...,2.727393,4.873956,2.655531,2.181781,3.188066,2.072383,1.228736,5.991142,2.325106,1.301738
4,15.247477,21.453896,3.794929,16.870353,12.504068,16.495081,21.040881,16.834919,2.301834,22.358246,...,17.214125,9.471283,14.669912,16.441407,14.494641,26.117979,14.624729,18.321055,20.259149,16.677553


In [17]:
print(tissue_wise_df.shape)
print(gtex_gene_df.shape)
print(gtex_gene_df.columns)

# add the following columns to tissue_wise_df:
# 'UniProtKB', 'UniProt Isoform',
# 'Ensembl Gene Id', 'Ensembl Protein Id', 'Ensembl Transcript Id',
# 'RefSeq mRNA Id'

# get the columns from gtex_gene_df
metadata_cols = ['UniProtKB', 'UniProt Isoform', 'Ensembl Gene Id', 'Ensembl Protein Id', 'Ensembl Transcript Id', 'RefSeq mRNA Id']

# add the columns to tissue_wise_df
tissue_wise_df = pd.concat([tissue_wise_df, gtex_gene_df[metadata_cols]], axis=1)
tissue_wise_df.head()


(6217, 31)
(6217, 39587)
Index(['transcript_id', 'gene_id', 'GTEX-1117F-0005-SM-HL9SH',
       'GTEX-1117F-0011-R10b-SM-GI4VE', 'GTEX-1117F-0011-R11b-SM-GIN8R',
       'GTEX-1117F-0011-R2b-SM-GI4VL', 'GTEX-1117F-0011-R3a-SM-GJ3PJ',
       'GTEX-1117F-0011-R4b-SM-GI4VM', 'GTEX-1117F-0011-R5a-SM-GI4VW',
       'GTEX-1117F-0011-R6a-SM-GI4VX',
       ...
       '19787', '19788', '19789', 'UniProtKB', 'UniProt Isoform',
       'Ensembl Gene Id', 'Ensembl Protein Id', 'Ensembl Transcript Id',
       'RefSeq mRNA Id', 'PDB Ids'],
      dtype='object', length=39587)


Unnamed: 0,Blood Vessel,Bladder,Liver,Vagina,Adrenal Gland,Adipose Tissue,Nerve,Colon,Muscle,Thyroid,...,Spleen,Skin,Cervix Uteri,Prostate,UniProtKB,UniProt Isoform,Ensembl Gene Id,Ensembl Protein Id,Ensembl Transcript Id,RefSeq mRNA Id
0,0.293159,0.101818,0.173298,0.114706,0.067695,0.292483,0.316149,0.048486,0.854817,0.163918,...,0.007906,0.212674,0.160851,0.052872,P52569,P52569-1(*),ENSG00000003989,ENSP00000430464,ENST00000522656,not-assigned
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000442,0.0,0.0,P52569,P52569-1(*),ENSG00000003989,ENSP00000492016,ENST00000640220,not-assigned
2,7.575737,11.46961,1.708262,9.404471,7.957661,7.122098,7.718284,9.081632,8.801467,7.791316,...,5.650903,11.599183,12.103191,7.573723,O60341,O60341-1(*),ENSG00000004487,ENSP00000349049,ENST00000356634,NM_015013
3,3.496031,3.208312,1.380213,1.992824,2.706169,3.062014,1.889,2.029686,2.452579,2.456637,...,1.228736,5.991142,2.325106,1.301738,P46063,P46063(*),ENSG00000004700,ENSP00000395449,ENST00000421138,not-assigned
4,15.247477,21.453896,3.794929,16.870353,12.504068,16.495081,21.040881,16.834919,2.301834,22.358246,...,14.624729,18.321055,20.259149,16.677553,P63010,P63010-1(*),ENSG00000006125,ENSP00000482315,ENST00000621914,NM_001282


In [18]:
# print shape
print(tissue_wise_df.shape)

# save to csv
tissue_wise_df.to_csv('../data/tissue_wise_df.csv', index=False)



(6217, 37)
