In [9]:
import numpy as np
import pandas as pd
import os
from functools import reduce

In [10]:
# Move into the project directory so the relative paths used throughout the
# notebook resolve. Setting PDAC_FEATURE_GENERATION_DIR lets another machine
# point at its own checkout; when it is unset the original location is used.
os.chdir(os.environ.get(
    'PDAC_FEATURE_GENERATION_DIR',
    '/Users/anb/Documents/CMEB-Lab/Projects/ML-DRG-PDAC-2025/pdac_feature_generation',
))

**Negative Genes**

In [11]:
# Input folders for the negative gene set; the three folder names differ only
# in their CC / MF / BP suffix.
_neg_genesets_prefix = 'local_basal_results/ClosenessCentrality_tables_GO/local_basal_negative_genesets'
neg_CC_folderpath = _neg_genesets_prefix + 'CC'
neg_MF_folderpath = _neg_genesets_prefix + 'MF'
neg_BP_folderpath = _neg_genesets_prefix + 'BP'

In [12]:
# Bytes form (os.fsencode) of the three folder paths; these are the values
# handed to os.listdir when the folders are scanned.
neg_CC_folder, neg_MF_folder, neg_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (neg_CC_folderpath, neg_MF_folderpath, neg_BP_folderpath)
)

**Negative - Get CC files**

In [13]:
# Accumulator for every table read from the CC, MF and BP folders
neg_df_list = list()

In [14]:
# Read every .tsv table in the negative CC folder and append it to neg_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for CC_file in sorted(os.listdir(neg_CC_folder)):
    CC_filename = os.fsdecode(CC_file)  # a bytes path yields bytes names
    if not CC_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(neg_CC_folderpath, CC_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    neg_df_list.append(df)

In [15]:
# Number of tables collected in neg_df_list at this point
print(f'{len(neg_df_list)}')

17


**Negative - Get MF files**

In [16]:
# Read every .tsv table in the negative MF folder and append it to neg_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for MF_file in sorted(os.listdir(neg_MF_folder)):
    MF_filename = os.fsdecode(MF_file)  # a bytes path yields bytes names
    if not MF_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(neg_MF_folderpath, MF_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    neg_df_list.append(df)

In [17]:
# Number of tables collected in neg_df_list at this point (cumulative)
print(f'{len(neg_df_list)}')

36


**Negative - Get BP files**

In [18]:
# Read every .tsv table in the negative BP folder and append it to neg_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for BP_file in sorted(os.listdir(neg_BP_folder)):
    BP_filename = os.fsdecode(BP_file)  # a bytes path yields bytes names
    if not BP_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(neg_BP_folderpath, BP_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    neg_df_list.append(df)

In [19]:
# Number of tables collected in neg_df_list at this point (cumulative)
print(f'{len(neg_df_list)}')

54


**Negative - Get the list of all gene symbols**

In [20]:
# TSV file holding the gene list for the negative set
neg_all_genes_filepath = '/'.join([
    'local_basal_results',
    'filtered-degs-genes-only',
    'local_basal_negative_genes.tsv',
])

In [21]:
# Load the tab-separated gene list into a DataFrame
neg_all_genes_df = pd.read_csv(filepath_or_buffer=neg_all_genes_filepath, delimiter='\t')

In [22]:
# Display the negative gene list
neg_all_genes_df

Unnamed: 0,ID
0,BRIP1
1,FHAD1
2,WDR19
3,LIN54
4,UBA52
...,...
416,KRAS
417,OAZ2
418,LEO1
419,MTHFSD


In [23]:
# Name the gene column 'hgnc_symbol', the key the later merges join on
neg_all_genes_df = neg_all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [24]:
# Gene list ordered alphabetically by symbol
neg_genes_list_df = neg_all_genes_df.sort_values(by='hgnc_symbol', ascending=True)

In [25]:
# Display the sorted negative gene list
neg_genes_list_df

Unnamed: 0,hgnc_symbol
18,ABCB4
117,ABHD14B
195,ACER3
105,ACSM2A
187,ACTL6A
...,...
19,ZNF695
379,ZNF75D
241,ZNF835
46,ZNF90


**Negative - Combine files**

In [26]:
# Fold all loaded tables into one wide table: successive outer joins on
# 'hgnc_symbol' keep every gene that appears in at least one table.
neg_combined_df = reduce(
    lambda merged, nxt: merged.merge(nxt, how='outer', on=['hgnc_symbol']),
    neg_df_list,
)

In [27]:
# Order the merged table alphabetically by gene symbol
neg_combined_df = neg_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [28]:
# Display the merged table for the negative gene set
neg_combined_df

Unnamed: 0,hgnc_symbol,GO:0005654,GO:0044297,GO:0005815,GO:0005829,GO:0031974,GO:0005622,GO:0005813,GO:0005634,GO:0015630,...,GO:0010629,GO:0032446,GO:0043687,GO:0006281,GO:0016236,GO:0007267,GO:0006397,GO:0016032,GO:0034330,GO:0006974
0,ACSM2A,,,,,,,,,,...,,,,,,,,,,
1,ACTL6A,0.477477,,,,0.450000,0.415638,,0.448052,,...,,,,0.565217,,,,,,0.515152
2,ACTR5,0.344156,,,,0.328125,0.312693,,0.327014,,...,,,,0.433333,,,,,,0.414634
3,ADAR,0.404580,,,0.401786,0.379518,0.374074,,0.401163,,...,0.466667,,,,,,,0.625000,,
4,ADCY5,,,,,,,,,,...,,,,,,0.75,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
156,WNK1,,,,0.284810,,0.267905,,0.273810,,...,,0.363636,0.360000,,,,,,,0.326923
157,WWOX,,,,,,,,,,...,,,,,,,,,,
158,YOD1,,,,,,0.245146,,,,...,,,0.321429,,,,,,,
159,ZMYM1,,,,,,0.277473,,0.289916,,...,,,,,,,,,,


In [29]:
# Rows whose gene symbol occurs more than once (every occurrence is kept);
# an empty result means each symbol is unique.
duplicates = neg_combined_df.loc[
    neg_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,GO:0005654,GO:0044297,GO:0005815,GO:0005829,GO:0031974,GO:0005622,GO:0005813,GO:0005634,GO:0015630,...,GO:0010629,GO:0032446,GO:0043687,GO:0006281,GO:0016236,GO:0007267,GO:0006397,GO:0016032,GO:0034330,GO:0006974


**Negative - Combine results with the list of all gene symbols**

In [30]:
# Left join onto the full gene list so genes missing from the merged table
# are still present in the result (with empty values).
neg_results_df = neg_genes_list_df.merge(neg_combined_df, on='hgnc_symbol', how='left')

In [31]:
# Order the result alphabetically by gene symbol
neg_results_df = neg_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [32]:
# Count of missing values in each column
neg_empty_cells = neg_results_df.isnull().sum(axis=0)
neg_empty_cells

hgnc_symbol      0
GO:0005654     367
GO:0044297     416
GO:0005815     413
GO:0005829     364
GO:0031974     357
GO:0005622     312
GO:0005813     413
GO:0005634     351
GO:0015630     413
GO:0005759     414
GO:0042175     417
GO:0031981     364
GO:0005739     403
GO:0070013     357
GO:0140513     399
GO:0043233     357
GO:0043025     416
GO:0032559     414
GO:0005524     408
GO:0140657     410
GO:0098772     404
GO:0016818     415
GO:0140096     398
GO:1901265     401
GO:0016462     415
GO:0016491     415
GO:0097367     410
GO:0008047     417
GO:0140677     413
GO:0016817     415
GO:0032553     405
GO:0017076     403
GO:0035639     404
GO:0016740     407
GO:0032555     405
GO:0003824     379
GO:0044419     415
GO:0043412     391
GO:0016567     410
GO:0036211     392
GO:0044281     411
GO:0006302     416
GO:0070647     404
GO:0016071     407
GO:0010629     406
GO:0032446     408
GO:0043687     402
GO:0006281     407
GO:0016236     415
GO:0007267     407
GO:0006397     414
GO:0016032  

In [33]:
# Fill every missing cell with 0
neg_results_df = neg_results_df.fillna(value=0)

In [34]:
# Display the final table for the negative gene set
neg_results_df

Unnamed: 0,hgnc_symbol,GO:0005654,GO:0044297,GO:0005815,GO:0005829,GO:0031974,GO:0005622,GO:0005813,GO:0005634,GO:0015630,...,GO:0010629,GO:0032446,GO:0043687,GO:0006281,GO:0016236,GO:0007267,GO:0006397,GO:0016032,GO:0034330,GO:0006974
0,ABCB4,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
1,ABHD14B,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
2,ACER3,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
3,ACSM2A,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
4,ACTL6A,0.477477,0.0,0.0,0.0,0.45,0.415638,0.0,0.448052,0.0,...,0.000000,0.0,0.0,0.565217,0.0,0.0,0.0,0.000000,0.0,0.515152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,ZNF695,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
417,ZNF75D,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
418,ZNF835,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000
419,ZNF90,0.000000,0.0,0.0,0.0,0.00,0.000000,0.0,0.000000,0.0,...,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.000000


In [35]:
# Rows whose gene symbol occurs more than once (every occurrence is kept);
# an empty result means each symbol is unique.
duplicates = neg_results_df.loc[
    neg_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,GO:0005654,GO:0044297,GO:0005815,GO:0005829,GO:0031974,GO:0005622,GO:0005813,GO:0005634,GO:0015630,...,GO:0010629,GO:0032446,GO:0043687,GO:0006281,GO:0016236,GO:0007267,GO:0006397,GO:0016032,GO:0034330,GO:0006974


**Negative - Save File**

In [None]:
# Write the final table to CSV. The output folder is created first so the
# save does not fail when the folder does not exist yet.
# NOTE(review): with its defaults to_csv also writes the row index as an
# unnamed first column - confirm downstream readers expect that.
neg_output_filepath = 'local_basal_results/ML_input_files/ClosenessCentrality_tables/GO/local_basal_negative.csv'
os.makedirs(os.path.dirname(neg_output_filepath), exist_ok=True)
neg_results_df.to_csv(neg_output_filepath)

**Positive Genes**

In [37]:
# Input folders for the positive gene set; the three folder names differ only
# in their CC / MF / BP suffix.
_pos_genesets_prefix = 'local_basal_results/ClosenessCentrality_tables_GO/local_basal_positive_genesets'
pos_CC_folderpath = _pos_genesets_prefix + 'CC'
pos_MF_folderpath = _pos_genesets_prefix + 'MF'
pos_BP_folderpath = _pos_genesets_prefix + 'BP'

In [38]:
# Bytes form (os.fsencode) of the three folder paths; these are the values
# handed to os.listdir when the folders are scanned.
pos_CC_folder, pos_MF_folder, pos_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (pos_CC_folderpath, pos_MF_folderpath, pos_BP_folderpath)
)

**Positive - Get CC files**

In [39]:
# Accumulator for every table read from the CC, MF and BP folders
pos_df_list = list()

In [40]:
# Read every .tsv table in the positive CC folder and append it to pos_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for CC_file in sorted(os.listdir(pos_CC_folder)):
    CC_filename = os.fsdecode(CC_file)  # a bytes path yields bytes names
    if not CC_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(pos_CC_folderpath, CC_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    pos_df_list.append(df)

In [41]:
# Number of tables collected in pos_df_list at this point
print(f'{len(pos_df_list)}')

26


**Positive - Get MF files**

In [42]:
# Read every .tsv table in the positive MF folder and append it to pos_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for MF_file in sorted(os.listdir(pos_MF_folder)):
    MF_filename = os.fsdecode(MF_file)  # a bytes path yields bytes names
    if not MF_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(pos_MF_folderpath, MF_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    pos_df_list.append(df)

In [43]:
# Number of tables collected in pos_df_list at this point (cumulative)
print(f'{len(pos_df_list)}')

34


**Positive - Get BP files**

In [44]:
# Read every .tsv table in the positive BP folder and append it to pos_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for BP_file in sorted(os.listdir(pos_BP_folder)):
    BP_filename = os.fsdecode(BP_file)  # a bytes path yields bytes names
    if not BP_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(pos_BP_folderpath, BP_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    pos_df_list.append(df)

In [45]:
# Number of tables collected in pos_df_list at this point (cumulative)
print(f'{len(pos_df_list)}')

175


**Positive - Get the list of all gene symbols**

In [46]:
# TSV file holding the gene list for the positive set
# NOTE(review): the file is named 'sig' rather than 'positive' - confirm this
# is the intended list.
pos_all_genes_filepath = '/'.join([
    'local_basal_results',
    'filtered-degs-genes-only',
    'local_basal_sig_genes.tsv',
])

In [47]:
# Load the tab-separated gene list into a DataFrame
pos_all_genes_df = pd.read_csv(filepath_or_buffer=pos_all_genes_filepath, delimiter='\t')

In [48]:
# Display the positive gene list
pos_all_genes_df

Unnamed: 0,ID
0,SLC2A1
1,SLC16A3
2,TNS4
3,SCEL
4,ZBED2
...,...
610,DNASE1
611,TFF2
612,SPRR3
613,CACNA1E


In [49]:
# Name the gene column 'hgnc_symbol', the key the later merges join on
pos_all_genes_df = pos_all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [50]:
# Gene list ordered alphabetically by symbol
pos_genes_list_df = pos_all_genes_df.sort_values(by='hgnc_symbol', ascending=True)

In [51]:
# Display the sorted positive gene list
pos_genes_list_df

Unnamed: 0,hgnc_symbol
154,A1CF
195,ABAT
114,ABCA3
459,ABCC8
506,ACE2
...,...
422,ZBTB16
189,ZNF185
273,ZNF365
254,ZNF469


**Positive - Combine files**

In [52]:
# Fold all loaded tables into one wide table: successive outer joins on
# 'hgnc_symbol' keep every gene that appears in at least one table.
pos_combined_df = reduce(
    lambda merged, nxt: merged.merge(nxt, how='outer', on=['hgnc_symbol']),
    pos_df_list,
)

In [53]:
# Order the merged table alphabetically by gene symbol
pos_combined_df = pos_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [54]:
# Display the merged table for the positive gene set
pos_combined_df

Unnamed: 0,hgnc_symbol,GO:0099080,GO:0099081,GO:0043228,GO:0036477,GO:0032991,GO:0062023,GO:0045111,GO:0030312,GO:0071944,...,GO:0055080,GO:0055082,GO:0030154,GO:0048762,GO:0060429,GO:0009306,GO:0009887,GO:0098813,GO:0042886,GO:0035987
0,ABAT,,,,,,,,,,...,,,,,,,,,,
1,ABCC8,,,,,,,,,,...,,0.5,,,,0.615385,,,0.514286,
2,ACE2,,,,,,,,,,...,,,,,,,,,,
3,ACP5,,,,,,,,,,...,,,,,,,,,,
4,ACSL5,,,0.252964,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
332,WNT2,,,,,0.295547,0.442857,,0.447368,0.381356,...,,,0.406452,0.444444,0.363636,,0.381818,,,
333,WNT5A,,,,,,0.534483,,0.531250,0.445545,...,,,0.466667,0.666667,0.409091,,0.567568,,,
334,XAF1,,,,,,,,,,...,,,,,,,,,,
335,ZNF469,,,,,,,,,,...,,,,,,,,,,


In [55]:
# Rows whose gene symbol occurs more than once (every occurrence is kept);
# an empty result means each symbol is unique.
duplicates = pos_combined_df.loc[
    pos_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,GO:0099080,GO:0099081,GO:0043228,GO:0036477,GO:0032991,GO:0062023,GO:0045111,GO:0030312,GO:0071944,...,GO:0055080,GO:0055082,GO:0030154,GO:0048762,GO:0060429,GO:0009306,GO:0009887,GO:0098813,GO:0042886,GO:0035987


**Positive - Combine results with the list of all gene symbols**

In [56]:
# Left join onto the full gene list so genes missing from the merged table
# are still present in the result (with empty values).
pos_results_df = pos_genes_list_df.merge(pos_combined_df, on='hgnc_symbol', how='left')

In [57]:
# Order the result alphabetically by gene symbol
pos_results_df = pos_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [58]:
# Count of missing values in each column
pos_empty_cells = pos_results_df.isnull().sum(axis=0)
pos_empty_cells

hgnc_symbol      0
GO:0099080     587
GO:0099081     598
GO:0043228     543
GO:0036477     602
              ... 
GO:0009306     598
GO:0009887     590
GO:0098813     599
GO:0042886     596
GO:0035987     604
Length: 176, dtype: int64

In [59]:
# Fill every missing cell with 0
pos_results_df = pos_results_df.fillna(value=0)

In [60]:
# Display the final table for the positive gene set
pos_results_df

Unnamed: 0,hgnc_symbol,GO:0099080,GO:0099081,GO:0043228,GO:0036477,GO:0032991,GO:0062023,GO:0045111,GO:0030312,GO:0071944,...,GO:0055080,GO:0055082,GO:0030154,GO:0048762,GO:0060429,GO:0009306,GO:0009887,GO:0098813,GO:0042886,GO:0035987
0,A1CF,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
1,ABAT,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
2,ABCA3,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
3,ABCC8,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.615385,0.0,0.0,0.514286,0.0
4,ACE2,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,ZBTB16,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
611,ZNF185,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
612,ZNF365,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0
613,ZNF469,0.000000,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.000000,0.0


In [61]:
# Rows whose gene symbol occurs more than once (every occurrence is kept);
# an empty result means each symbol is unique.
duplicates = pos_results_df.loc[
    pos_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,GO:0099080,GO:0099081,GO:0043228,GO:0036477,GO:0032991,GO:0062023,GO:0045111,GO:0030312,GO:0071944,...,GO:0055080,GO:0055082,GO:0030154,GO:0048762,GO:0060429,GO:0009306,GO:0009887,GO:0098813,GO:0042886,GO:0035987


**Positive - Save File**

In [None]:
# Write the final table to CSV. The output folder is created first so the
# save does not fail when the folder does not exist yet.
# NOTE(review): with its defaults to_csv also writes the row index as an
# unnamed first column - confirm downstream readers expect that.
pos_output_filepath = 'local_basal_results/ML_input_files/ClosenessCentrality_tables/GO/local_basal_positive.csv'
os.makedirs(os.path.dirname(pos_output_filepath), exist_ok=True)
pos_results_df.to_csv(pos_output_filepath)

**ALL Genes**

In [63]:
# Input folders for the complete gene set; the three folder names differ only
# in their CC / MF / BP suffix.
_all_genesets_prefix = 'local_basal_results/ClosenessCentrality_tables_GO/local_basal_all_genesets'
all_CC_folderpath = _all_genesets_prefix + 'CC'
all_MF_folderpath = _all_genesets_prefix + 'MF'
all_BP_folderpath = _all_genesets_prefix + 'BP'

In [64]:
# Bytes form (os.fsencode) of the three folder paths; these are the values
# handed to os.listdir when the folders are scanned.
all_CC_folder, all_MF_folder, all_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (all_CC_folderpath, all_MF_folderpath, all_BP_folderpath)
)

**ALL - Get CC files**

In [65]:
# Accumulator for every table read from the CC, MF and BP folders
all_df_list = list()

In [66]:
# Read every .tsv table in the all-genes CC folder and append it to all_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for CC_file in sorted(os.listdir(all_CC_folder)):
    CC_filename = os.fsdecode(CC_file)  # a bytes path yields bytes names
    if not CC_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(all_CC_folderpath, CC_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    all_df_list.append(df)

In [67]:
# Number of tables collected in all_df_list at this point
print(f'{len(all_df_list)}')

96


**ALL - Get MF files**

In [68]:
# Read every .tsv table in the all-genes MF folder and append it to all_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for MF_file in sorted(os.listdir(all_MF_folder)):
    MF_filename = os.fsdecode(MF_file)  # a bytes path yields bytes names
    if not MF_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(all_MF_folderpath, MF_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    all_df_list.append(df)

In [69]:
# Number of tables collected in all_df_list at this point (cumulative)
print(f'{len(all_df_list)}')

189


**ALL - Get BP files**

In [70]:
# Read every .tsv table in the all-genes BP folder and append it to all_df_list.
# os.listdir returns entries in arbitrary order, so the listing is sorted to
# keep the order of the loaded tables (and of the columns later merged from
# them) identical on every run and machine.
for BP_file in sorted(os.listdir(all_BP_folder)):
    BP_filename = os.fsdecode(BP_file)  # a bytes path yields bytes names
    if not BP_filename.endswith('.tsv'):
        continue  # skip anything that is not a TSV table

    df = pd.read_csv(os.path.join(all_BP_folderpath, BP_filename), sep='\t')

    # Replace whatever index read_csv produced with a fresh 0..n-1 index
    df = df.reset_index(drop=True)

    all_df_list.append(df)

In [71]:
# Number of tables collected in all_df_list at this point (cumulative)
print(f'{len(all_df_list)}')

1055


**ALL - Get the list of all gene symbols**

In [72]:
# TSV file holding the complete gene list
all_genes_filepath = '/'.join([
    'local_basal_results',
    'filtered-degs-genes-only',
    'local_basal_all_genes.tsv',
])

In [73]:
# Load the tab-separated gene list into a DataFrame
all_genes_df = pd.read_csv(filepath_or_buffer=all_genes_filepath, delimiter='\t')

In [74]:
# Display the complete gene list
all_genes_df

Unnamed: 0,ID
0,SLC2A1
1,SLC16A3
2,TNS4
3,SCEL
4,ZBED2
...,...
15389,MAP4
15390,MTHFD2
15391,FGF5
15392,GLOD4


In [75]:
# Name the gene column 'hgnc_symbol', the key the later merges join on
all_genes_df = all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [76]:
# Gene list ordered alphabetically by symbol
all_genes_list_df = all_genes_df.sort_values(by='hgnc_symbol', ascending=True)

In [77]:
# Display the sorted complete gene list
all_genes_list_df

Unnamed: 0,hgnc_symbol
6285,A1BG
211,A1CF
6849,A2M
13474,A2ML1
11664,A3GALT2
...,...
5943,ZXDC
10684,ZYG11A
4407,ZYG11B
3480,ZYX


**ALL - Combine files**

In [78]:
# Fold all loaded tables into one wide table: successive outer joins on
# 'hgnc_symbol' keep every gene that appears in at least one table.
all_combined_df = reduce(
    lambda merged, nxt: merged.merge(nxt, how='outer', on=['hgnc_symbol']),
    all_df_list,
)

In [79]:
# Order the merged table alphabetically by gene symbol
all_combined_df = all_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [80]:
# Display the merged table for the complete gene set
all_combined_df

Unnamed: 0,hgnc_symbol,GO:1902495,GO:0031252,GO:1904813,GO:0016363,GO:0031045,GO:0035579,GO:0002102,GO:0098982,GO:0034705,...,GO:0042058,GO:0000280,GO:0000727,GO:0014020,GO:0098754,GO:1903037,GO:0002695,GO:0035987,GO:0032024,GO:0050851
0,A2M,,,,,,,,,,...,,,,,,,,,,
1,A4GALT,,,,,,,,,,...,,,,,,,,,,
2,AACS,,,,,,,,,,...,,,,,,,,,,
3,AADAC,,,,,,,,,,...,,,,,,,,,,
4,AANAT,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3583,ZNF804A,,,,,,,,,,...,,,,,,,,,,
3584,ZNFX1,,,,,,,,,,...,,,,,,,,,,
3585,ZWILCH,,,,,,,,,,...,,0.60355,,,,,,,,
3586,ZWINT,,,,,,,,,,...,,0.63354,,,,,,,,


In [81]:
# Rows whose gene symbol occurs more than once (every occurrence is kept);
# an empty result means each symbol is unique.
duplicates = all_combined_df.loc[
    all_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,GO:1902495,GO:0031252,GO:1904813,GO:0016363,GO:0031045,GO:0035579,GO:0002102,GO:0098982,GO:0034705,...,GO:0042058,GO:0000280,GO:0000727,GO:0014020,GO:0098754,GO:1903037,GO:0002695,GO:0035987,GO:0032024,GO:0050851


**ALL - Combine results with the list of all gene symbols**

In [82]:
# Left join onto the full gene list so genes missing from the merged table
# are still present in the result (with empty values).
all_results_df = all_genes_list_df.merge(all_combined_df, on='hgnc_symbol', how='left')

In [83]:
# Order the result alphabetically by gene symbol
all_results_df = all_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [84]:
# Count of missing values in each column
all_empty_cells = all_results_df.isnull().sum(axis=0)
all_empty_cells

hgnc_symbol        0
GO:1902495     15250
GO:0031252     15274
GO:1904813     15350
GO:0016363     15368
               ...  
GO:1903037     15279
GO:0002695     15334
GO:0035987     15377
GO:0032024     15377
GO:0050851     15334
Length: 1056, dtype: int64

In [85]:
# Fill every missing cell with 0
all_results_df = all_results_df.fillna(value=0)

In [86]:
# Display the final table for the complete gene set
all_results_df

Unnamed: 0,hgnc_symbol,GO:1902495,GO:0031252,GO:1904813,GO:0016363,GO:0031045,GO:0035579,GO:0002102,GO:0098982,GO:0034705,...,GO:0042058,GO:0000280,GO:0000727,GO:0014020,GO:0098754,GO:1903037,GO:0002695,GO:0035987,GO:0032024,GO:0050851
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15389,ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15390,ZYG11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15391,ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15392,ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [87]:
# Rows whose gene symbol occurs more than once (every occurrence is kept);
# an empty result means each symbol is unique.
duplicates = all_results_df.loc[
    all_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,GO:1902495,GO:0031252,GO:1904813,GO:0016363,GO:0031045,GO:0035579,GO:0002102,GO:0098982,GO:0034705,...,GO:0042058,GO:0000280,GO:0000727,GO:0014020,GO:0098754,GO:1903037,GO:0002695,GO:0035987,GO:0032024,GO:0050851


**ALL - Save File**

In [None]:
# Write the final table to CSV. The output folder is created first so the
# save does not fail when the folder does not exist yet.
# NOTE(review): with its defaults to_csv also writes the row index as an
# unnamed first column - confirm downstream readers expect that.
all_output_filepath = 'local_basal_results/ML_input_files/ClosenessCentrality_tables/GO/local_basal_all.csv'
os.makedirs(os.path.dirname(all_output_filepath), exist_ok=True)
all_results_df.to_csv(all_output_filepath)