In [25]:
import pandas as pd
from mygene import MyGeneInfo
mg = MyGeneInfo()

In [None]:
# Load, filter for human, and save
mrna_path = "../datasets/VESICLEPEDIA_PROTEIN_MRNA_DETAILS_5.1.txt"
mrna_df = pd.read_csv(mrna_path, sep="\t", on_bad_lines='skip', engine='python')
human_mrna_df = mrna_df[mrna_df['SPECIES'] == "Homo sapiens"]
human_mrna_df.to_csv("../datasets/mrna_clean.csv", index=False)

gene_path = "../datasets/VESICLEPEDIA_GENE_DETAILS_5.1.txt"
gene_df = pd.read_csv(gene_path, sep="\t", on_bad_lines='skip', engine='python')
human_gene_df = gene_df[gene_df['SPECIES'] == "Homo sapiens"]
human_gene_df.to_csv("../datasets/gene_clean.csv", index=False)

# Standardize column names
human_mrna_df.columns = human_mrna_df.columns.str.lower().str.strip().str.replace(" ", "_")
human_gene_df.columns = human_gene_df.columns.str.lower().str.strip().str.replace(" ", "_")

# Merge mRNA & Gene
merged_df = pd.merge(
    human_mrna_df, 
    human_gene_df[['entrez_gene_id', 'gene_symbol', 'gene_name', 'hgnc']],
    on='entrez_gene_id',
    how='left'
)

# If gene_symbol is present on both sides, keep gene_symbol from gene_df, if NA, keep gene_symbol from mRNA side
if {'gene_symbol_x', 'gene_symbol_y'}.issubset(merged_df.columns):
    merged_df['gene_symbol'] = merged_df['gene_symbol_y'].combine_first(merged_df['gene_symbol_x'])
    merged_df.drop(columns=['gene_symbol_x', 'gene_symbol_y'], inplace=True)
else:
    merged_df.rename(columns={'gene_symbol': 'gene_symbol'}, inplace=True)


In [28]:
# Create merged, standardized data where each record has:
# Verified gene symbol
# Entrez ID
# Study-level attributes (biofluid, vesicle type, method)
symbols = merged_df['gene_symbol'].dropna().unique().tolist()
mapping = mg.querymany(symbols, scopes='symbol', fields='symbol,entrezgene', species='human')

map_df = pd.DataFrame(mapping)[['query', 'symbol']].drop_duplicates()
merged_map_df = merged_df.merge(map_df, left_on='gene_symbol', right_on='query', how='left')
merged_map_df['gene_symbol'] = merged_map_df['symbol'].combine_first(merged_map_df['gene_symbol'])
merged_map_df.drop(columns=['query', 'symbol'], inplace=True)
merged_map_df

414 input query terms found dup hits:	[('ALDOAP2', 2), ('ALOX12P2', 2), ('ANXA2P1', 2), ('ANXA2P2', 2), ('ANXA2P3', 2), ('BAK1P1', 2), ('B
2396 input query terms found no hit:	['His3.3B', 'His3.3A', 'AARS', 'ABP1', 'ACPP', 'ADRBK1', 'ADRBK2', 'ADSS', 'AES', 'AIM1', 'ANXA8L2',


Unnamed: 0,content_id,content_type,entrez_gene_id_x,gene_symbol,species,experiment_id,methods,entrez_gene_id_y,gene_name,hgnc
0,37705,protein,31848,His3.3B,Homo sapiens,308,Mass spectrometry [MALDI TOF/TOF]|Mass spectro...,,,
1,37706,protein,33736,His3.3A,Homo sapiens,308,Mass spectrometry [MALDI TOF/TOF]|Mass spectro...,,,
2,2616,protein,1,A1BG,Homo sapiens,63,Mass spectrometry,1.0,alpha-1-B glycoprotein,5.0
3,16540,protein,1,A1BG,Homo sapiens,159,Mass spectrometry [LTQ],1.0,alpha-1-B glycoprotein,5.0
4,43432,protein,1,A1BG,Homo sapiens,353,Mass spectrometry [Orbitrap Velos],1.0,alpha-1-B glycoprotein,5.0
...,...,...,...,...,...,...,...,...,...,...
463351,38707,protein,397418,-,Homo sapiens,306,Mass spectrometry [MALDI TOF/TOF]|Mass spectro...,102725035.0,leukocyte immunoglobulin-like receptor subfami...,0.0
463352,38707,protein,397418,-,Homo sapiens,306,Mass spectrometry [MALDI TOF/TOF]|Mass spectro...,102725214.0,bolA-like protein 2-like,0.0
463353,38707,protein,397418,-,Homo sapiens,306,Mass spectrometry [MALDI TOF/TOF]|Mass spectro...,102725225.0,transmembrane protein C16orf54-like,0.0
463354,38707,protein,397418,-,Homo sapiens,306,Mass spectrometry [MALDI TOF/TOF]|Mass spectro...,102725406.0,UPF0627 protein ENSP00000358171-like,0.0
