In [1]:
import numpy as np
import pandas as pd
import os
from functools import reduce

In [2]:
# Switch to the project root so that the relative paths used in the rest of
# the notebook resolve. The location can be overridden with the
# PDAC_PROJECT_DIR environment variable; when it is not set, the original
# absolute path is used, so existing behaviour is unchanged.
os.chdir(os.environ.get(
    'PDAC_PROJECT_DIR',
    '/Users/anb/Documents/CMEB-Lab/Projects/ML-DRG-PDAC-2025/pdac_feature_generation',
))

**Negative Genes**

In [3]:
# Folders holding the closeness-centrality tables of the negative gene set,
# one folder per GO category (CC, MF, BP); all three share the same root.
neg_go_root = 'local_classic_results/ClosenessCentrality_tables_GO'
neg_CC_folderpath = f'{neg_go_root}/local_classic_negative_genesetsCC'
neg_MF_folderpath = f'{neg_go_root}/local_classic_negative_genesetsMF'
neg_BP_folderpath = f'{neg_go_root}/local_classic_negative_genesetsBP'

In [4]:
# Byte-string (filesystem-encoded) versions of the three folder paths;
# these are what the loading loops hand to os.listdir.
neg_CC_folder, neg_MF_folder, neg_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (neg_CC_folderpath, neg_MF_folderpath, neg_BP_folderpath)
)

**Negative - Get CC files**

In [5]:
# Accumulator for the loaded tables; the CC, MF and BP loading cells
# each append their dataframes to it.
neg_df_list = list()

In [6]:
# Load every .tsv table found in the negative CC folder into neg_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of neg_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates neg_df_list first.
for CC_file in sorted(os.listdir(neg_CC_folder)):
    CC_filename = os.fsdecode(CC_file)
    if CC_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(neg_CC_folderpath, CC_filename), sep='\t')

        # store the dataframe in the list
        neg_df_list.append(df)

In [7]:
# check the number of dataframes in the list 
print(len(neg_df_list))

4


**Negative - Get MF files**

In [8]:
# Load every .tsv table found in the negative MF folder into neg_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of neg_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates neg_df_list (and the CC loading cell) first.
for MF_file in sorted(os.listdir(neg_MF_folder)):
    MF_filename = os.fsdecode(MF_file)
    if MF_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(neg_MF_folderpath, MF_filename), sep='\t')

        # store the dataframe in the list
        neg_df_list.append(df)

In [9]:
# check the number of dataframes in the list 
print(len(neg_df_list))

6


**Negative - Get BP files**

In [10]:
# Load every .tsv table found in the negative BP folder into neg_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of neg_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates neg_df_list (and the CC/MF loading cells) first.
for BP_file in sorted(os.listdir(neg_BP_folder)):
    BP_filename = os.fsdecode(BP_file)
    if BP_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(neg_BP_folderpath, BP_filename), sep='\t')

        # store the dataframe in the list
        neg_df_list.append(df)

In [11]:
# check the number of dataframes in the list 
print(len(neg_df_list))

16


**Negative - Get a list of all gene symbols**

In [12]:
# Specify the file path of the genes list
neg_all_genes_filepath = 'local_classic_results/filtered-degs-genes-only/local_classic_negative_genes.tsv'

In [13]:
# get the file of genes list
neg_all_genes_df = pd.read_csv(neg_all_genes_filepath, sep='\t')

In [14]:
neg_all_genes_df

Unnamed: 0,ID
0,GEN1
1,BRIP1
2,TMC2
3,BCAS3
4,MTBP
...,...
199,SLC3A2
200,YTHDF2
201,TMEM106A
202,LYSMD4


In [15]:
# Give the gene column the name 'hgnc_symbol', which is the key the later
# merges join on
neg_all_genes_df = neg_all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [16]:
# Sort the genes ID alphabetically
neg_genes_list_df = neg_all_genes_df.sort_values('hgnc_symbol')

In [17]:
neg_genes_list_df

Unnamed: 0,hgnc_symbol
72,ACAP1
40,ADAM22
35,ADAMTS17
127,AGPS
5,ALS2
...,...
167,ZNF12
163,ZNF134
197,ZNF226
43,ZNF835


**Negative - Combine files**

In [18]:
# Outer-join all loaded tables on the gene symbol, so the result has one row
# per gene that appears in any table and NaN where a gene is missing from one.
# reduce() on an empty list fails with an opaque TypeError, so fail early with
# a message that points at the real cause (no .tsv tables were loaded).
if not neg_df_list:
    raise ValueError(
        'neg_df_list is empty: no .tsv tables were loaded from the negative CC/MF/BP folders'
    )
neg_combined_df = reduce(
    lambda left, right: pd.merge(left, right, on=['hgnc_symbol'], how='outer'),
    neg_df_list,
)

In [19]:
# Sort the genes ID alphabetically
neg_combined_df = neg_combined_df.sort_values('hgnc_symbol')

In [20]:
neg_combined_df

Unnamed: 0,hgnc_symbol,GO:0043229,GO:0043231,GO:0043227,GO:0043226,GO:0005198,GO:0005488,GO:0009057,GO:0009719,GO:0042221,GO:0016071,GO:0051246,GO:0050896,GO:0071495,GO:0044238,GO:0006396,GO:0080090
0,ADAM22,,,,1.000000,,1.000000,,,,,,,,,,
1,ALS2,0.750000,0.750000,0.666667,0.750000,,0.666667,,,,,,,,1.000000,,
2,ASXL1,,,,,,,,0.5,0.75,,,0.454545,0.5,,,
3,BDNF,0.207650,0.213333,0.195312,0.202073,,0.187919,,,,,,,,,,
4,CAND1,0.136201,0.152381,0.171233,0.137809,,0.171779,,,,,,,,0.179856,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56,TRMT6,0.138686,0.156098,0.179856,0.141304,,0.184211,,,,0.454545,,,,0.192308,0.538462,
57,TUBGCP3,0.184466,,,0.179724,,,,,,,,,,,,
58,UBL5,0.209945,0.235294,0.231481,0.206349,,0.217054,,,,,,,,0.225225,,
59,WWOX,,,,,,,,0.5,,,,0.454545,0.5,,,


In [21]:
# Show every row whose 'hgnc_symbol' occurs more than once
# (keep=False flags all occurrences, not only the repeats)
neg_dup_mask = neg_combined_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = neg_combined_df.loc[neg_dup_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0043229,GO:0043231,GO:0043227,GO:0043226,GO:0005198,GO:0005488,GO:0009057,GO:0009719,GO:0042221,GO:0016071,GO:0051246,GO:0050896,GO:0071495,GO:0044238,GO:0006396,GO:0080090


**Negative - Combine Results with a List of All Gene Symbols**

In [22]:
# Left-join the combined table onto the full gene list: every gene in
# neg_genes_list_df is kept, and genes absent from neg_combined_df get NaN
neg_results_df = neg_genes_list_df.merge(neg_combined_df, on='hgnc_symbol', how='left')

In [23]:
# Sort the genes ID alphabetically
neg_results_df = neg_results_df.sort_values('hgnc_symbol')

In [24]:
neg_empty_cells = neg_results_df.isnull().sum()
neg_empty_cells

hgnc_symbol      0
GO:0043229     161
GO:0043231     162
GO:0043227     170
GO:0043226     158
GO:0005198     200
GO:0005488     165
GO:0009057     197
GO:0009719     200
GO:0042221     200
GO:0016071     198
GO:0051246     198
GO:0050896     196
GO:0071495     200
GO:0044238     170
GO:0006396     196
GO:0080090     188
dtype: int64

In [25]:
# Replace missing values with 0
neg_results_df = neg_results_df.fillna(0)

In [26]:
neg_results_df

Unnamed: 0,hgnc_symbol,GO:0043229,GO:0043231,GO:0043227,GO:0043226,GO:0005198,GO:0005488,GO:0009057,GO:0009719,GO:0042221,GO:0016071,GO:0051246,GO:0050896,GO:0071495,GO:0044238,GO:0006396,GO:0080090
0,ACAP1,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ADAM22,0.00,0.00,0.000000,1.00,0.0,1.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ADAMTS17,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AGPS,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ALS2,0.75,0.75,0.666667,0.75,0.0,0.666667,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,ZNF12,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200,ZNF134,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201,ZNF226,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202,ZNF835,0.00,0.00,0.000000,0.00,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
# check duplicated values in the 'hgnc_symbol' column
duplicates = neg_results_df[neg_results_df.duplicated('hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0043229,GO:0043231,GO:0043227,GO:0043226,GO:0005198,GO:0005488,GO:0009057,GO:0009719,GO:0042221,GO:0016071,GO:0051246,GO:0050896,GO:0071495,GO:0044238,GO:0006396,GO:0080090


**Negative - Save File**

In [None]:
# Save the DataFrame as a CSV file.
# The output directory is created first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
neg_output_path = 'local_classic_results/ML_input_files/ClosenessCentrality_tables/GO/local_classic_negative.csv'
os.makedirs(os.path.dirname(neg_output_path), exist_ok=True)
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; kept as is because downstream readers may rely on it — confirm.
neg_results_df.to_csv(neg_output_path)

**Positive Genes**

In [29]:
# Folders holding the closeness-centrality tables of the positive gene set,
# one folder per GO category (CC, MF, BP); all three share the same root.
pos_go_root = 'local_classic_results/ClosenessCentrality_tables_GO'
pos_CC_folderpath = f'{pos_go_root}/local_classic_positive_genesetsCC'
pos_MF_folderpath = f'{pos_go_root}/local_classic_positive_genesetsMF'
pos_BP_folderpath = f'{pos_go_root}/local_classic_positive_genesetsBP'

In [30]:
# Byte-string (filesystem-encoded) versions of the three folder paths;
# these are what the loading loops hand to os.listdir.
pos_CC_folder, pos_MF_folder, pos_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (pos_CC_folderpath, pos_MF_folderpath, pos_BP_folderpath)
)

**Positive - Get CC files**

In [31]:
# Accumulator for the loaded tables; the CC, MF and BP loading cells
# each append their dataframes to it.
pos_df_list = list()

In [32]:
# Load every .tsv table found in the positive CC folder into pos_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of pos_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates pos_df_list first.
for CC_file in sorted(os.listdir(pos_CC_folder)):
    CC_filename = os.fsdecode(CC_file)
    if CC_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(pos_CC_folderpath, CC_filename), sep='\t')

        # store the dataframe in the list
        pos_df_list.append(df)

In [33]:
# check the number of dataframes in the list 
print(len(pos_df_list))

0


**Positive - Get MF files**

In [34]:
# Load every .tsv table found in the positive MF folder into pos_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of pos_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates pos_df_list (and the CC loading cell) first.
for MF_file in sorted(os.listdir(pos_MF_folder)):
    MF_filename = os.fsdecode(MF_file)
    if MF_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(pos_MF_folderpath, MF_filename), sep='\t')

        # store the dataframe in the list
        pos_df_list.append(df)

In [35]:
# check the number of dataframes in the list 
print(len(pos_df_list))

1


**Positive - Get BP files**

In [36]:
# Load every .tsv table found in the positive BP folder into pos_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of pos_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates pos_df_list (and the CC/MF loading cells) first.
for BP_file in sorted(os.listdir(pos_BP_folder)):
    BP_filename = os.fsdecode(BP_file)
    if BP_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(pos_BP_folderpath, BP_filename), sep='\t')

        # store the dataframe in the list
        pos_df_list.append(df)

In [37]:
# check the number of dataframes in the list 
print(len(pos_df_list))

5


**Positive - Get a list of all gene symbols**

In [38]:
# Specify the file path of the genes list
pos_all_genes_filepath = 'local_classic_results/filtered-degs-genes-only/local_classic_sig_genes.tsv'

In [39]:
# get the file of genes list
pos_all_genes_df = pd.read_csv(pos_all_genes_filepath, sep='\t')

In [40]:
pos_all_genes_df

Unnamed: 0,ID
0,TMPRSS4
1,CTSE
2,S100P
3,AGR2
4,STYK1
...,...
354,MSMB
355,APOBEC2
356,DMBT1
357,FXYD2


In [41]:
# Give the gene column the name 'hgnc_symbol', which is the key the later
# merges join on
pos_all_genes_df = pos_all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [42]:
# Sort the genes ID alphabetically
pos_genes_list_df = pos_all_genes_df.sort_values('hgnc_symbol')

In [43]:
pos_genes_list_df

Unnamed: 0,hgnc_symbol
79,ACSL5
42,ADAM28
77,ADAM8
198,ADARB2
269,ADCYAP1R1
...,...
141,WNT5A
98,ZBED2
207,ZBTB16
273,ZNF469


**Positive - Combine files**

In [44]:
# Outer-join all loaded tables on the gene symbol, so the result has one row
# per gene that appears in any table and NaN where a gene is missing from one.
# reduce() on an empty list fails with an opaque TypeError, so fail early with
# a message that points at the real cause (no .tsv tables were loaded).
if not pos_df_list:
    raise ValueError(
        'pos_df_list is empty: no .tsv tables were loaded from the positive CC/MF/BP folders'
    )
pos_combined_df = reduce(
    lambda left, right: pd.merge(left, right, on=['hgnc_symbol'], how='outer'),
    pos_df_list,
)

In [45]:
# Sort the genes ID alphabetically
pos_combined_df = pos_combined_df.sort_values('hgnc_symbol')

In [46]:
pos_combined_df

Unnamed: 0,hgnc_symbol,GO:0016787,GO:0030003,GO:0050801,GO:0006873,GO:0055080
0,ATP4A,,1.0,1.0,1.0,1.0
1,CEL,0.923077,,,,
2,CPA1,1.0,,,,
3,CPA2,0.923077,,,,
4,CPB1,0.923077,,,,
5,CTRB1,1.0,,,,
6,CTRB2,0.923077,,,,
7,CTRC,1.0,,,,
8,CTRL,0.8,,,,
9,FXYD2,,1.0,1.0,1.0,1.0


In [47]:
# Show every row whose 'hgnc_symbol' occurs more than once
# (keep=False flags all occurrences, not only the repeats)
pos_dup_mask = pos_combined_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = pos_combined_df.loc[pos_dup_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0016787,GO:0030003,GO:0050801,GO:0006873,GO:0055080


**Positive - Combine Results with a List of All Gene Symbols**

In [48]:
# Left-join the combined table onto the full gene list: every gene in
# pos_genes_list_df is kept, and genes absent from pos_combined_df get NaN
pos_results_df = pos_genes_list_df.merge(pos_combined_df, on='hgnc_symbol', how='left')

In [49]:
# Sort the genes ID alphabetically
pos_results_df = pos_results_df.sort_values('hgnc_symbol')

In [50]:
pos_empty_cells = pos_results_df.isnull().sum()
pos_empty_cells

hgnc_symbol      0
GO:0016787     346
GO:0030003     350
GO:0050801     350
GO:0006873     350
GO:0055080     350
dtype: int64

In [51]:
# Replace missing values with 0
pos_results_df = pos_results_df.fillna(0)

In [52]:
pos_results_df

Unnamed: 0,hgnc_symbol,GO:0016787,GO:0030003,GO:0050801,GO:0006873,GO:0055080
0,ACSL5,0.0,0.0,0.0,0.0,0.0
1,ADAM28,0.0,0.0,0.0,0.0,0.0
2,ADAM8,0.0,0.0,0.0,0.0,0.0
3,ADARB2,0.0,0.0,0.0,0.0,0.0
4,ADCYAP1R1,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...
354,WNT5A,0.0,0.0,0.0,0.0,0.0
355,ZBED2,0.0,0.0,0.0,0.0,0.0
356,ZBTB16,0.0,0.0,0.0,0.0,0.0
357,ZNF469,0.0,0.0,0.0,0.0,0.0


In [53]:
# check duplicated values in the 'hgnc_symbol' column
duplicates = pos_results_df[pos_results_df.duplicated('hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0016787,GO:0030003,GO:0050801,GO:0006873,GO:0055080


**Positive - Save File**

In [None]:
# Save the DataFrame as a CSV file.
# The output directory is created first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
pos_output_path = 'local_classic_results/ML_input_files/ClosenessCentrality_tables/GO/local_classic_positive.csv'
os.makedirs(os.path.dirname(pos_output_path), exist_ok=True)
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; kept as is because downstream readers may rely on it — confirm.
pos_results_df.to_csv(pos_output_path)

**ALL Genes**

In [55]:
# Folders holding the closeness-centrality tables of the full gene set,
# one folder per GO category (CC, MF, BP); all three share the same root.
all_go_root = 'local_classic_results/ClosenessCentrality_tables_GO'
all_CC_folderpath = f'{all_go_root}/local_classic_all_genesetsCC'
all_MF_folderpath = f'{all_go_root}/local_classic_all_genesetsMF'
all_BP_folderpath = f'{all_go_root}/local_classic_all_genesetsBP'

In [56]:
# Byte-string (filesystem-encoded) versions of the three folder paths;
# these are what the loading loops hand to os.listdir.
all_CC_folder, all_MF_folder, all_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (all_CC_folderpath, all_MF_folderpath, all_BP_folderpath)
)

**ALL - Get CC files**

In [57]:
# Accumulator for the loaded tables; the CC, MF and BP loading cells
# each append their dataframes to it.
all_df_list = list()

In [58]:
# Load every .tsv table found in the "all genes" CC folder into all_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of all_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates all_df_list first.
for CC_file in sorted(os.listdir(all_CC_folder)):
    CC_filename = os.fsdecode(CC_file)
    if CC_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(all_CC_folderpath, CC_filename), sep='\t')

        # store the dataframe in the list
        all_df_list.append(df)

In [59]:
# check the number of dataframes in the list 
print(len(all_df_list))

58


**ALL - Get MF files**

In [60]:
# Load every .tsv table found in the "all genes" MF folder into all_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of all_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates all_df_list (and the CC loading cell) first.
for MF_file in sorted(os.listdir(all_MF_folder)):
    MF_filename = os.fsdecode(MF_file)
    if MF_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(all_MF_folderpath, MF_filename), sep='\t')

        # store the dataframe in the list
        all_df_list.append(df)

In [61]:
# check the number of dataframes in the list 
print(len(all_df_list))

97


**ALL - Get BP files**

In [62]:
# Load every .tsv table found in the "all genes" BP folder into all_df_list.
# os.listdir() returns entries in arbitrary order, so the names are sorted to
# keep the order of all_df_list (and hence the column order of the merged
# table built from it) reproducible across runs and machines.
# NOTE: re-running this cell appends the same tables again; re-run the cell
# that creates all_df_list (and the CC/MF loading cells) first.
for BP_file in sorted(os.listdir(all_BP_folder)):
    BP_filename = os.fsdecode(BP_file)
    if BP_filename.endswith('.tsv'):
        # read the tab-separated file as a dataframe; read_csv already assigns
        # a fresh 0..n-1 index, so no index reset is needed
        df = pd.read_csv(os.path.join(all_BP_folderpath, BP_filename), sep='\t')

        # store the dataframe in the list
        all_df_list.append(df)

In [63]:
# check the number of dataframes in the list 
print(len(all_df_list))

528


**ALL - Get a list of all gene symbols**

In [64]:
# Specify the file path of the genes list
all_genes_filepath = 'local_classic_results/filtered-degs-genes-only/local_classic_all_genes.tsv'

In [65]:
# get the file of genes list
all_genes_df = pd.read_csv(all_genes_filepath, sep='\t')

In [66]:
all_genes_df

Unnamed: 0,ID
0,TMPRSS4
1,CTSE
2,S100P
3,GYPC
4,AGR2
...,...
15389,CGB2
15390,SALL1
15391,INTS9
15392,ACTL7A


In [67]:
# Give the gene column the name 'hgnc_symbol', which is the key the later
# merges join on
all_genes_df = all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [68]:
# Sort the genes ID alphabetically
all_genes_list_df = all_genes_df.sort_values('hgnc_symbol')

In [69]:
all_genes_list_df

Unnamed: 0,hgnc_symbol
6176,A1BG
2875,A1CF
13870,A2M
3534,A2ML1
9767,A3GALT2
...,...
10183,ZXDC
9982,ZYG11A
12003,ZYG11B
10052,ZYX


**ALL - Combine files**

In [70]:
# Outer-join all loaded tables on the gene symbol, so the result has one row
# per gene that appears in any table and NaN where a gene is missing from one.
# reduce() on an empty list fails with an opaque TypeError, so fail early with
# a message that points at the real cause (no .tsv tables were loaded).
if not all_df_list:
    raise ValueError(
        'all_df_list is empty: no .tsv tables were loaded from the all-genes CC/MF/BP folders'
    )
all_combined_df = reduce(
    lambda left, right: pd.merge(left, right, on=['hgnc_symbol'], how='outer'),
    all_df_list,
)

In [71]:
# Sort the genes ID alphabetically
all_combined_df = all_combined_df.sort_values('hgnc_symbol')

In [72]:
all_combined_df

Unnamed: 0,hgnc_symbol,GO:0031252,GO:0002102,GO:0005902,GO:0005721,GO:0062023,GO:0005912,GO:0098992,GO:0000922,GO:0005938,...,GO:0071280,GO:0002285,GO:0002440,GO:0043434,GO:0042058,GO:0000280,GO:0098754,GO:1903037,GO:0035987,GO:0050851
0,ABAT,,,,,,,,,,...,,,,,,,,,,
1,ABCA12,,,,,,,,,,...,,,,,,,,,,
2,ABCC1,,,,,,,,,,...,,,,,,,,,,
3,ABCC3,,,,,,,,,,...,,,,,,,,,,
4,ABCC5,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2797,ZNF469,,,,,,,,,,...,,,,,,,,,,
2798,ZNF598,,,,,,,,,,...,,,,,,,,,,
2799,ZNFX1,,,,,,,,,,...,,,,,,,,,,
2800,ZWILCH,,,,,,,,,,...,,,,,,0.557292,,,,


In [73]:
# Show every row whose 'hgnc_symbol' occurs more than once
# (keep=False flags all occurrences, not only the repeats)
all_dup_mask = all_combined_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = all_combined_df.loc[all_dup_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0031252,GO:0002102,GO:0005902,GO:0005721,GO:0062023,GO:0005912,GO:0098992,GO:0000922,GO:0005938,...,GO:0071280,GO:0002285,GO:0002440,GO:0043434,GO:0042058,GO:0000280,GO:0098754,GO:1903037,GO:0035987,GO:0050851


**ALL - Combine Results with a List of All Gene Symbols**

In [74]:
# Left-join the combined table onto the full gene list: every gene in
# all_genes_list_df is kept, and genes absent from all_combined_df get NaN
all_results_df = all_genes_list_df.merge(all_combined_df, on='hgnc_symbol', how='left')

In [75]:
# Sort the genes ID alphabetically
all_results_df = all_results_df.sort_values('hgnc_symbol')

In [76]:
all_empty_cells = all_results_df.isnull().sum()
all_empty_cells

hgnc_symbol        0
GO:0031252     15265
GO:0002102     15377
GO:0005902     15379
GO:0005721     15385
               ...  
GO:0000280     15286
GO:0098754     15358
GO:1903037     15286
GO:0035987     15379
GO:0050851     15334
Length: 529, dtype: int64

In [77]:
# Replace missing values with 0
all_results_df = all_results_df.fillna(0)

In [78]:
all_results_df

Unnamed: 0,hgnc_symbol,GO:0031252,GO:0002102,GO:0005902,GO:0005721,GO:0062023,GO:0005912,GO:0098992,GO:0000922,GO:0005938,...,GO:0071280,GO:0002285,GO:0002440,GO:0043434,GO:0042058,GO:0000280,GO:0098754,GO:1903037,GO:0035987,GO:0050851
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15389,ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15390,ZYG11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15391,ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15392,ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [79]:
# check duplicated values in the 'hgnc_symbol' column
duplicates = all_results_df[all_results_df.duplicated('hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0031252,GO:0002102,GO:0005902,GO:0005721,GO:0062023,GO:0005912,GO:0098992,GO:0000922,GO:0005938,...,GO:0071280,GO:0002285,GO:0002440,GO:0043434,GO:0042058,GO:0000280,GO:0098754,GO:1903037,GO:0035987,GO:0050851


**ALL - Save File**

In [None]:
# Save the DataFrame as a CSV file.
# The output directory is created first: to_csv does not create missing
# directories and would otherwise fail on a fresh checkout.
all_output_path = 'local_classic_results/ML_input_files/ClosenessCentrality_tables/GO/local_classic_all.csv'
os.makedirs(os.path.dirname(all_output_path), exist_ok=True)
# NOTE(review): to_csv writes the row index as an unnamed first column by
# default; kept as is because downstream readers may rely on it — confirm.
all_results_df.to_csv(all_output_path)