In [1]:
import pandas as pd
import gseapy as gp
import os

In [2]:
for name in gp.get_library_name('Human'):
    if 'go' in name.lower():
        print(name)

GO_Biological_Process_2013
GO_Biological_Process_2015
GO_Biological_Process_2017
GO_Biological_Process_2017b
GO_Biological_Process_2018
GO_Biological_Process_2021
GO_Biological_Process_2023
GO_Cellular_Component_2013
GO_Cellular_Component_2015
GO_Cellular_Component_2017
GO_Cellular_Component_2017b
GO_Cellular_Component_2018
GO_Cellular_Component_2021
GO_Cellular_Component_2023
GO_Molecular_Function_2013
GO_Molecular_Function_2015
GO_Molecular_Function_2017
GO_Molecular_Function_2017b
GO_Molecular_Function_2018
GO_Molecular_Function_2021
GO_Molecular_Function_2023
SynGO_2022
SynGO_2024


In [2]:
year = '2023'
annotations = ['GO_Biological_Process_'+year,'GO_Cellular_Component_'+year,'GO_Molecular_Function_'+year]

In [3]:
gp.get_library(name=annotations[0],organism='human')

{"'De Novo' AMP Biosynthetic Process (GO:0044208)": ['ATIC',
  'PAICS',
  'PFAS',
  'ADSS1',
  'ADSS2',
  'GART'],
 "'De Novo' Post-Translational Protein Folding (GO:0051084)": ['SDF2L1',
  'HSPA9',
  'CCT2',
  'HSPA6',
  'ST13',
  'ENTPD5',
  'HSPA1L',
  'HSPA5',
  'PTGES3',
  'HSPA8',
  'HSPA7',
  'DNAJB13',
  'HSPA2',
  'DNAJB14',
  'HSPE1',
  'DNAJC18',
  'GAK',
  'DNAJC7',
  'DNAJB12',
  'HSPA1A',
  'ST13P5',
  'HSPA1B',
  'ERO1A',
  'SELENOF',
  'HSPA14',
  'HSPA13',
  'DNAJB1',
  'CHCHD4',
  'DNAJB5',
  'DNAJB4',
  'SDF2',
  'UGGT1'],
 '2-Oxoglutarate Metabolic Process (GO:0006103)': ['IDH1',
  'PHYH',
  'GOT2',
  'MRPS36',
  'GOT1',
  'IDH2',
  'ADHFE1',
  'GPT2',
  'TAT',
  'DLST',
  'OGDHL',
  'L2HGDH',
  'D2HGDH',
  'OGDH'],
 "3'-UTR-mediated mRNA Destabilization (GO:0061158)": ['UPF1',
  'TRIM71',
  'RC3H1',
  'ZFP36L1',
  'ZFP36L2',
  'MOV10',
  'KHSRP',
  'ZC3H12D',
  'ZFP36',
  'ZC3H12A',
  'DHX36',
  'DND1',
  'PLEKHN1',
  'RBM24',
  'TARDBP'],
 "3'-UTR-mediated mRNA St

In [4]:
#### string gene_id map
local_stringdb = os.path.join('/itf-fi-ml/shared/users/ziyuzh/svm/data/stringdb','2023')

ppidf = pd.read_csv(os.path.join(local_stringdb,'9606.protein.info.v12.0.txt'), sep='\t', header=0, usecols=['#string_protein_id', 'preferred_name'])
ppidf['preferred_name'] = ppidf['preferred_name'].str.upper()
stringId2name = ppidf.set_index('#string_protein_id')['preferred_name'].to_dict()
name2stringId = ppidf.set_index('preferred_name')['#string_protein_id'].to_dict()
ppidf = pd.read_csv(os.path.join(local_stringdb,'9606.protein.aliases.v12.0.txt'), sep='\t', header=0, usecols=['#string_protein_id', 'alias']).drop_duplicates(['alias'], keep='first')
ppidf['alias'] = ppidf['alias'].str.upper()
aliases2stringId = ppidf.set_index('alias')['#string_protein_id'].to_dict()

def string_convert(gene):
    if gene in name2stringId.keys():
        return name2stringId[gene]
    elif gene in aliases2stringId.keys():
        return aliases2stringId[gene]
    else:
        return None

In [5]:
from collections import defaultdict
def invert_dict(original_dict):
    inverted_dict = defaultdict(list)

    for key, values in original_dict.items():
        for value in values:
            inverted_dict[string_convert(value)].append(key)  # Swap key and value
    return dict(inverted_dict) 



In [6]:
go_dict = dict()
for anno in annotations:
    anno_dict = gp.get_library(name=anno,organism='human')
    go_dict.update(anno_dict)

In [7]:

gene_list = [
    x
    for xs in list(go_dict.values())
    for x in xs
]

In [8]:
# Create a DataFrame with 0s
RowFeatures = pd.DataFrame(0, index=list(set(gene_list)), columns=go_dict.keys())

# Fill the DataFrame using dictionary comprehension and vectorized operations
for key, genes in go_dict.items():
    RowFeatures.loc[genes, key] = 1

In [9]:
RowFeatures

Unnamed: 0,'De Novo' AMP Biosynthetic Process (GO:0044208),'De Novo' Post-Translational Protein Folding (GO:0051084),2-Oxoglutarate Metabolic Process (GO:0006103),3'-UTR-mediated mRNA Destabilization (GO:0061158),3'-UTR-mediated mRNA Stabilization (GO:0070935),3'-Phosphoadenosine 5'-Phosphosulfate Metabolic Process (GO:0050427),5S Class rRNA Transcription By RNA Polymerase III (GO:0042791),7-Methylguanosine RNA Capping (GO:0009452),7-Methylguanosine Cap Hypermethylation (GO:0036261),7-Methylguanosine mRNA Capping (GO:0006370),...,Voltage-Gated Potassium Channel Activity Involved In Cardiac Muscle Cell Action Potential Repolarization (GO:0086008),Voltage-Gated Potassium Channel Activity Involved In Ventricular Cardiac Muscle Cell Action Potential Repolarization (GO:1902282),Voltage-Gated Sodium Channel Activity (GO:0005248),Volume-Sensitive Anion Channel Activity (GO:0005225),Water Channel Activity (GO:0015250),Water Transmembrane Transporter Activity (GO:0005372),Wide Pore Channel Activity (GO:0022829),Xylosyltransferase Activity (GO:0042285),Zinc Ion Binding (GO:0008270),Zinc Ion Transmembrane Transporter Activity (GO:0005385)
RAD23A,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
VPS54,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SBK1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
TNNI1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
GNA14,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ALDOC,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
SAMD10,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
IDS,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
HOXB-AS3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [10]:
go_df = RowFeatures.reset_index()
go_df['string_id'] = go_df['index'].map(string_convert)
go_df = go_df[~go_df['string_id'].isna()]
go_df = go_df.drop(columns='index')
new_columns = ['string_id'] + [f'feature_{i}' for i, col in enumerate(go_df.columns) if col != 'string_id']
# Reorder the DataFrame so that 'string_id' is the first column
df_combined = go_df[['string_id'] + [col for col in go_df.columns if col != 'string_id']]

df_combined.columns = new_columns
print(len(df_combined))
df_combined = df_combined.drop_duplicates(subset='string_id')
print(len(df_combined))


15914
15907


In [11]:
df_combined.to_csv(f'/itf-fi-ml/shared/users/ziyuzh/svm/data/GO/GO_{year}_all_features.csv',index=False)

In [96]:
len(go_dict.keys())

1539

In [97]:
from sklearn.decomposition import TruncatedSVD
if int(len(go_dict.keys())/3) > 1000:
    n_comp = 1000
else:
    n_comp = int(len(go_dict.keys())/3)
svd = TruncatedSVD(n_components=n_comp)
svdModel = svd.fit(RowFeatures)
visits_emb = svdModel.transform(RowFeatures)

In [98]:
go_df = pd.DataFrame(data=visits_emb, index=RowFeatures.index).reset_index()

In [99]:
go_df['string_id'] = go_df['index'].map(string_convert)

In [100]:
go_df = go_df[~go_df['string_id'].isna()]

In [101]:
go_df = go_df.drop(columns='index')

In [11]:
new_columns = ['string_id'] + [f'feature_{i}' for i, col in enumerate(go_df.columns) if col != 'string_id']
# Reorder the DataFrame so that 'string_id' is the first column
df_combined = go_df[['string_id'] + [col for col in go_df.columns if col != 'string_id']]
df_combined.columns = new_columns

NameError: name 'go_df' is not defined

In [103]:
df_combined

Unnamed: 0,string_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,...,feature_503,feature_504,feature_505,feature_506,feature_507,feature_508,feature_509,feature_510,feature_511,feature_512
0,9606.ENSP00000299766,2.550427,1.770823,-0.905457,0.472975,-0.278504,0.517632,0.050550,0.273291,1.132882,...,-0.085742,-0.037529,0.008060,0.033995,0.045863,-0.034890,-0.041629,0.021116,0.134906,0.066826
1,9606.ENSP00000339381,0.128143,-0.022666,0.081022,0.062613,0.152593,-0.088547,-0.063762,-0.071746,0.040301,...,-0.008824,0.048617,0.015191,0.048312,0.034061,0.036813,-0.022937,0.020521,0.027391,-0.025598
2,9606.ENSP00000239882,1.453872,-2.702339,-1.367520,-0.114336,-0.625128,-0.522973,0.088926,-0.049478,0.350334,...,-0.038521,-0.054019,0.054739,0.015925,0.005877,0.009057,0.068284,-0.013370,-0.001787,-0.002013
3,9606.ENSP00000380594,0.037089,0.011461,0.014264,-0.011807,0.056298,-0.032786,-0.029042,-0.027369,0.032848,...,-0.022867,-0.039707,0.022203,0.014371,0.026395,-0.004217,-0.011041,0.014336,0.008268,0.010492
4,9606.ENSP00000304250,0.229985,-0.036084,0.245019,0.112346,0.163196,-0.163405,-0.305747,-0.191674,-0.118890,...,0.016532,-0.017277,0.006034,-0.024026,0.006796,-0.000233,-0.007882,-0.029695,0.003936,-0.002176
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9811,9606.ENSP00000291592,0.151039,-0.236987,-0.020471,-0.068693,0.042749,0.195256,0.044545,0.017818,0.155633,...,-0.002116,-0.002251,0.001323,-0.002048,-0.000652,-0.002516,0.001424,0.000516,0.001792,-0.000908
9812,9606.ENSP00000324804,3.219197,-1.069475,1.780813,0.359029,0.852111,0.049525,-0.450672,-0.398389,0.141780,...,0.118744,0.062175,0.081027,-0.150741,-0.068153,0.106391,0.053627,-0.048938,-0.093984,-0.040482
9813,9606.ENSP00000300571,0.151039,-0.236987,-0.020471,-0.068693,0.042749,0.195256,0.044545,0.017818,0.155633,...,-0.002116,-0.002251,0.001323,-0.002048,-0.000652,-0.002516,0.001424,0.000516,0.001792,-0.000908
9814,9606.ENSP00000294742,0.408142,-0.275195,0.257251,-0.290618,0.534377,0.003924,0.108968,0.246958,0.452413,...,0.101412,0.039913,0.040435,-0.008072,-0.034300,-0.039135,0.006986,-0.027037,-0.015124,0.034419


In [104]:
df_combined.to_csv(f'/itf-fi-ml/shared/users/ziyuzh/svm/data/GO/GO_{year}_features.csv',index=False)