In [16]:
import pandas as pd
import numpy as np
import os

In [4]:
# Load the tab-separated gene-gene edge list and preview its first rows.
graph_df = pd.read_csv(
    '/itf-fi-ml/shared/users/ziyuzh/svm/data/gmb/pcbi.1004120.s003.tsv',
    sep="\t",
)
graph_df.head(n=3)

Unnamed: 0,gene_ID_1,gene_ID_2,gene_symbol_1,gene_symbol_2,sources
0,1,310,A1BG,ANXA7,signaling
1,1,1026,A1BG,CDKN1A,signaling
2,1,2886,A1BG,GRB7,signaling


In [7]:
# Show the raw column labels exactly as parsed; later cells index the frame with these labels.
graph_df.columns

Index(['gene_ID_1    ', 'gene_ID_2 ', ' gene_symbol_1 ', ' gene_symbol_2',
       '  sources'],
      dtype='object')

In [9]:
# Symbol -> gene-ID lookups, one per side of the edge list.
# The column labels are used with their whitespace padding, so the padded
# names are spelled out verbatim here; symbols are stripped before use as keys.
# IDs are normalised to stripped strings up front so that every value in these
# dicts has the same type as the entries later added via `str(...).strip()`.
gene_dict1 = dict(zip(
    graph_df[' gene_symbol_1 '].str.strip(),
    graph_df['gene_ID_1    '].astype(str).str.strip(),
))
gene_dict2 = dict(zip(
    graph_df[' gene_symbol_2'].str.strip(),
    graph_df['gene_ID_2 '].astype(str).str.strip(),
))


In [53]:
# Fold symbols that appear only on the second side of an edge into gene_dict1.
# Entries already present in gene_dict1 are left untouched.
for symbol, gmb_id in gene_dict2.items():
    if symbol in gene_dict1:
        continue
    gene_dict1[str(symbol).strip()] = str(gmb_id).strip()

In [54]:
# Sizes of the combined lookup and of the second-side lookup.
tuple(len(lookup) for lookup in (gene_dict1, gene_dict2))

(13458, 11027)

In [17]:
#### string gene_id map
# Build three lookup dicts from the local STRING files:
#   stringId2name    : STRING protein id -> upper-cased preferred name
#   name2stringId    : upper-cased preferred name -> STRING protein id
#   aliases2stringId : upper-cased alias -> STRING protein id
local_stringdb = os.path.join('/itf-fi-ml/shared/users/ziyuzh/svm/data/stringdb','2023')

# Preferred names (protein info table).
ppidf = pd.read_csv(
    os.path.join(local_stringdb, '9606.protein.info.v12.0.txt'),
    sep='\t',
    header=0,
    usecols=['#string_protein_id', 'preferred_name'],
).assign(preferred_name=lambda info: info['preferred_name'].str.upper())
stringId2name = dict(zip(ppidf['#string_protein_id'], ppidf['preferred_name']))
name2stringId = dict(zip(ppidf['preferred_name'], ppidf['#string_protein_id']))

# Aliases table; `ppidf` is rebound to it, as in the rest of the notebook.
# NOTE(review): duplicates are dropped on the raw alias *before* upper-casing,
# so aliases differing only by case can still collide afterwards (the later
# row then wins in the dict) -- confirm this is acceptable.
ppidf = (
    pd.read_csv(
        os.path.join(local_stringdb, '9606.protein.aliases.v12.0.txt'),
        sep='\t',
        header=0,
        usecols=['#string_protein_id', 'alias'],
    )
    .drop_duplicates(['alias'], keep='first')
    .assign(alias=lambda aliases: aliases['alias'].str.upper())
)
aliases2stringId = dict(zip(ppidf['alias'], ppidf['#string_protein_id']))

def string_convert(gene):
    """Map a gene symbol to a STRING protein id.

    The preferred-name table (`name2stringId`) is consulted first, then the
    alias table (`aliases2stringId`). Both tables are keyed by upper-cased
    names, so the query is stripped and upper-cased before the lookup;
    otherwise mixed-case symbols (e.g. ``C1orf112``) could never match.

    Parameters
    ----------
    gene : str
        Gene symbol to look up. Non-string values (e.g. NaN, None) are
        treated as unmapped.

    Returns
    -------
    str or None
        The STRING protein id, or None when the symbol is found in neither
        table.
    """
    if not isinstance(gene, str):
        return None

    key = gene.strip().upper()
    if key in name2stringId:
        return name2stringId[key]
    # Falls back to the alias table; `.get` yields None when absent.
    return aliases2stringId.get(key)

In [55]:
# One row per gene symbol with its gene ID from the combined lookup.
gene_id_df = pd.DataFrame(
    data=[(symbol, gmb_id) for symbol, gmb_id in gene_dict1.items()],
    columns=['gene_symbol', 'gmb_id'],
)

In [56]:
# Attach the STRING protein id for every symbol (None where no match exists).
gene_id_df = gene_id_df.assign(
    string_id=gene_id_df['gene_symbol'].map(string_convert)
)

In [31]:
# Load the semicolon-separated disease table. `header=0` together with `names`
# replaces the file's own header row with the two labels given here.
disease_df = pd.read_csv(
    '/itf-fi-ml/shared/users/ziyuzh/svm/data/gmb/pcbi.1004120.s004.tsv',
    sep=";",
    names=["disease", "gene"],
    header=0,
)
disease_df


Unnamed: 0,disease,gene
0,adrenal gland diseases,3758/215/3762/1589/1585/6770/2516/6557/5573/...
1,alzheimer disease,5663/23036/348/5664/55103/10452/1191/2629/2...
2,amino acid metabolism inborn errors,445/383/2109/3815/11234/388552/4286/275/2110/...
3,amyotrophic lateral sclerosis,6647/1639/6311/57679/10133/2521/998/22920/2...
4,anemia aplastic,3458/6125/6135/7015/2187/6229/2177/2175/2176/...
...,...,...
65,spondylarthropathies,100507436/10085/147/4276/25949/3107/3593/3...
66,tauopathies,4137/5663/23036/348/5664/55103/10452/10228/25...
67,uveal diseases,4340/1121/4276/29113/3595/4016/149233/8870/4...
68,varicose veins,51438/7247/652/5629/2043/8622/83478/201456/3...


In [57]:
# Flatten the disease table into two parallel lists: one gene ID per entry in
# `gene_id`, and the disease it belongs to at the same position in `disease_list`.
gene_id = []
disease_list = []

for disease_name, gene_field in zip(disease_df['disease'], disease_df['gene']):
    # The gene field is a '/'-delimited string; trim whitespace around each ID.
    stripped_ids = [token.strip() for token in gene_field.split('/')]

    gene_id += stripped_ids
    # Repeat the disease once per gene so both lists stay aligned.
    disease_list += [disease_name] * len(stripped_ids)


In [None]:
# Long-format disease-gene association table built from the two parallel lists.
# NOTE(review): the rendered table further down shows this column as `disease`,
# while the code here names it `disease_id` -- confirm which label is intended.
gmb_dga = pd.DataFrame(
    data=list(zip(gene_id, disease_list)),
    columns=['gmb_id', 'disease_id'],
)


In [60]:
# The join key must have the same dtype on both sides, so cast both to str.
gene_id_df['gmb_id'] = gene_id_df['gmb_id'].astype(str)
gmb_dga['gmb_id'] = gmb_dga['gmb_id'].astype(str)

# Left join keeps every disease-gene row, adding symbol and STRING id where known.
# NOTE(review): if several symbols share one gmb_id this join duplicates rows -- verify.
gmb_dga = pd.merge(left=gmb_dga, right=gene_id_df, how='left', on='gmb_id')


In [61]:
# Display the merged disease-gene table for inspection.
gmb_dga

Unnamed: 0,gmb_id,disease,gene_symbol,string_id
0,3758,adrenal gland diseases,KCNJ1,9606.ENSP00000376432
1,215,adrenal gland diseases,ABCD1,9606.ENSP00000218104
2,3762,adrenal gland diseases,KCNJ5,9606.ENSP00000433295
3,1589,adrenal gland diseases,CYP21A2,9606.ENSP00000496625
4,1585,adrenal gland diseases,CYP11B2,9606.ENSP00000325822
...,...,...,...,...
2838,3586,vasculitis,IL10,9606.ENSP00000412237
2839,54535,vasculitis,CCHCR1,9606.ENSP00000379566
2840,2212,vasculitis,FCGR2A,9606.ENSP00000271450
2841,3106,vasculitis,HLA-B,9606.ENSP00000399168


In [69]:
# Keep only associations whose gene was mapped to a STRING protein id.
has_string_id = gmb_dga['string_id'].notna()
gmb_dga = gmb_dga[has_string_id]


In [71]:
# Persist the filtered table as CSV without the row index.
gmb_dga.to_csv(
    '/itf-fi-ml/shared/users/ziyuzh/svm/data/gmb/gmb_string.csv',
    index=False,
)