In [1]:
import numpy as np
import pandas as pd
import os
from functools import reduce

**Negative Genes**

In [2]:
# Folders holding the per-GO-term tables for the negative genes,
# one folder per GO namespace (CC, MF, BP).
neg_CC_folderpath, neg_MF_folderpath, neg_BP_folderpath = (
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_negative_genesetsCC',
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_negative_genesetsMF',
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_negative_genesetsBP',
)

In [3]:
# Filesystem-encoded (bytes) form of each negative folder path
neg_CC_folder, neg_MF_folder, neg_BP_folder = (
    os.fsencode(folder_path)
    for folder_path in (neg_CC_folderpath, neg_MF_folderpath, neg_BP_folderpath)
)

**Negative - Get CC files**

In [4]:
# Start with no tables loaded. The loading cells below only append to this
# list, so re-run this cell before re-running any of them.
neg_df_list = list()

In [5]:
# Load every .tsv table from the negative CC folder into neg_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for CC_filename in sorted(os.listdir(neg_CC_folderpath)):
    if not CC_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(neg_CC_folderpath, CC_filename), sep='\t').reset_index(drop=True)

    neg_df_list.append(df)

In [6]:
# Sanity check: number of tables loaded so far (CC tables only at this point)
print(len(neg_df_list))

25


**Negative - Get MF files**

In [7]:
# Load every .tsv table from the negative MF folder into neg_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for MF_filename in sorted(os.listdir(neg_MF_folderpath)):
    if not MF_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(neg_MF_folderpath, MF_filename), sep='\t').reset_index(drop=True)

    neg_df_list.append(df)

In [8]:
# Sanity check: running total of tables loaded so far (CC + MF)
print(len(neg_df_list))

39


**Negative - Get BP files**

In [9]:
# Load every .tsv table from the negative BP folder into neg_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for BP_filename in sorted(os.listdir(neg_BP_folderpath)):
    if not BP_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(neg_BP_folderpath, BP_filename), sep='\t').reset_index(drop=True)

    neg_df_list.append(df)

In [10]:
# Sanity check: running total of tables loaded so far (CC + MF + BP)
print(len(neg_df_list))

68


**Negative - Get the list of all gene symbols**

In [11]:
# Path of the tab-separated file listing the negative genes (read in the next cell)
neg_all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/filtered-degs-genes-only/local_basal_negative_genes.tsv'

In [12]:
# Read the negative gene list (tab-separated) into a DataFrame
neg_all_genes_df = pd.read_csv(filepath_or_buffer=neg_all_genes_filepath, delimiter='\t')

In [13]:
neg_all_genes_df

Unnamed: 0,ID
0,BRIP1
1,FHAD1
2,WDR19
3,LIN54
4,UBA52
...,...
416,KRAS
417,OAZ2
418,LEO1
419,MTHFSD


In [14]:
# Rename 'ID' to 'hgnc_symbol', the key column used by the merges further down
neg_all_genes_df = neg_all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [15]:
# Order the gene list alphabetically by symbol
neg_genes_list_df = neg_all_genes_df.sort_values(by='hgnc_symbol')

In [16]:
neg_genes_list_df

Unnamed: 0,hgnc_symbol
18,ABCB4
117,ABHD14B
195,ACER3
105,ACSM2A
187,ACTL6A
...,...
19,ZNF695
379,ZNF75D
241,ZNF835
46,ZNF90


**Negative - Combine files**

In [17]:
def _merge_neg_tables(left, right):
    """Outer-join two tables on their shared 'hgnc_symbol' column."""
    return left.merge(right, on=['hgnc_symbol'], how='outer')


# Fold all loaded tables into one wide frame: one row per gene symbol that
# appears in any table, NaN where a table has no row for that gene.
neg_combined_df = reduce(_merge_neg_tables, neg_df_list)

In [18]:
# Order the combined table alphabetically by gene symbol
neg_combined_df = neg_combined_df.sort_values(by='hgnc_symbol')

In [19]:
neg_combined_df

Unnamed: 0,hgnc_symbol,GO:0005622,GO:0005634,GO:0005654,GO:0005681,GO:0005739,GO:0005747,GO:0005759,GO:0005813,GO:0005815,...,GO:0042776,GO:0043412,GO:0043687,GO:0044248,GO:0044281,GO:0044419,GO:0046034,GO:0070647,GO:0098727,GO:0140888
119,ACSM2A,,,,,0.300000,,0.666667,,,...,,,,,0.333333,,,,,
56,ACTL6A,0.415638,0.448052,0.477477,,,,,,,...,,,,,,,,,1.0,
81,ACTR5,0.312693,0.327014,0.344156,,,,,,,...,,,,,,,,,,
36,ADAR,0.374074,0.401163,0.404580,,,,,,,...,,0.376623,,,,1.0,,,,0.75
157,ADCY5,,,,,,,,,,...,,,,,0.500000,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26,WNK1,0.267905,0.273810,,,,,,,,...,,0.290000,0.361702,,,,,0.365854,,
121,WWOX,,,,,0.357143,,,,,...,,,,,,,,,,
79,YOD1,0.245146,,,,,,,,,...,,0.278846,0.293103,,,,,0.300000,,
62,ZMYM1,0.277473,0.289916,,,,,,,,...,,,,,,,,,,


In [20]:
# Show every row whose gene symbol occurs more than once in the combined
# table (all occurrences are kept); an empty frame means symbols are unique.
duplicates = neg_combined_df.loc[neg_combined_df.duplicated(subset='hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0005622,GO:0005634,GO:0005654,GO:0005681,GO:0005739,GO:0005747,GO:0005759,GO:0005813,GO:0005815,...,GO:0042776,GO:0043412,GO:0043687,GO:0044248,GO:0044281,GO:0044419,GO:0046034,GO:0070647,GO:0098727,GO:0140888


**Negative - Combine results with the list of all gene symbols**

In [21]:
# Left-join onto the full gene list so every listed gene gets a row, even if
# it appears in none of the loaded tables (its columns are then NaN).
neg_results_df = neg_genes_list_df.merge(neg_combined_df, on='hgnc_symbol', how='left')

In [22]:
# Order the result table alphabetically by gene symbol
neg_results_df = neg_results_df.sort_values(by='hgnc_symbol')

In [23]:
# Count the missing (NaN) entries in each column of the result table
neg_empty_cells = neg_results_df.isna().sum()
neg_empty_cells

hgnc_symbol      0
GO:0005622     312
GO:0005634     351
GO:0005654     367
GO:0005681     416
              ... 
GO:0044419     415
GO:0046034     417
GO:0070647     405
GO:0098727     417
GO:0140888     417
Length: 69, dtype: int64

In [24]:
# Encode every missing entry left by the merges as 0
neg_results_df = neg_results_df.fillna(value=0)

In [25]:
neg_results_df

Unnamed: 0,hgnc_symbol,GO:0005622,GO:0005634,GO:0005654,GO:0005681,GO:0005739,GO:0005747,GO:0005759,GO:0005813,GO:0005815,...,GO:0042776,GO:0043412,GO:0043687,GO:0044248,GO:0044281,GO:0044419,GO:0046034,GO:0070647,GO:0098727,GO:0140888
0,ABCB4,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,ABHD14B,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,ACER3,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,ACSM2A,0.000000,0.000000,0.000000,0.0,0.3,0.0,0.666667,0.0,0.0,...,0.0,0.0,0.0,0.0,0.333333,0.0,0.0,0.0,0.0,0.0
4,ACTL6A,0.415638,0.448052,0.477477,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,ZNF695,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
417,ZNF75D,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
418,ZNF835,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
419,ZNF90,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0


In [26]:
# Show every row whose gene symbol occurs more than once in the final table
# (all occurrences are kept); an empty frame means symbols are unique.
duplicates = neg_results_df.loc[neg_results_df.duplicated(subset='hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0005622,GO:0005634,GO:0005654,GO:0005681,GO:0005739,GO:0005747,GO:0005759,GO:0005813,GO:0005815,...,GO:0042776,GO:0043412,GO:0043687,GO:0044248,GO:0044281,GO:0044419,GO:0046034,GO:0070647,GO:0098727,GO:0140888


**Negative - Save File**

In [27]:
# Write the final negative table to CSV. to_csv runs with its defaults, so
# the row index is written as an unnamed first column.
neg_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ML_input_files/ClosenessCentrality_tables/GO/local_basal_negative.csv'
)

**Positive Genes**

In [28]:
# Folders holding the per-GO-term tables for the positive genes,
# one folder per GO namespace (CC, MF, BP).
pos_CC_folderpath, pos_MF_folderpath, pos_BP_folderpath = (
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_positive_genesetsCC',
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_positive_genesetsMF',
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_positive_genesetsBP',
)

In [29]:
# Filesystem-encoded (bytes) form of each positive folder path
pos_CC_folder, pos_MF_folder, pos_BP_folder = (
    os.fsencode(folder_path)
    for folder_path in (pos_CC_folderpath, pos_MF_folderpath, pos_BP_folderpath)
)

**Positive - Get CC files**

In [30]:
# Start with no tables loaded. The loading cells below only append to this
# list, so re-run this cell before re-running any of them.
pos_df_list = list()

In [31]:
# Load every .tsv table from the positive CC folder into pos_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for CC_filename in sorted(os.listdir(pos_CC_folderpath)):
    if not CC_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(pos_CC_folderpath, CC_filename), sep='\t').reset_index(drop=True)

    pos_df_list.append(df)

In [32]:
# Sanity check: number of tables loaded so far (CC tables only at this point)
print(len(pos_df_list))

55


**Positive - Get MF files**

In [33]:
# Load every .tsv table from the positive MF folder into pos_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for MF_filename in sorted(os.listdir(pos_MF_folderpath)):
    if not MF_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(pos_MF_folderpath, MF_filename), sep='\t').reset_index(drop=True)

    pos_df_list.append(df)

In [34]:
# Sanity check: running total of tables loaded so far (CC + MF)
print(len(pos_df_list))

101


**Positive - Get BP files**

In [35]:
# Load every .tsv table from the positive BP folder into pos_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for BP_filename in sorted(os.listdir(pos_BP_folderpath)):
    if not BP_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(pos_BP_folderpath, BP_filename), sep='\t').reset_index(drop=True)

    pos_df_list.append(df)

In [36]:
# Sanity check: running total of tables loaded so far (CC + MF + BP)
print(len(pos_df_list))

438


**Positive - Get the list of all gene symbols**

In [37]:
# Path of the tab-separated file listing the positive genes (read in the next cell)
pos_all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/filtered-degs-genes-only/local_basal_sig_genes.tsv'

In [38]:
# Read the positive gene list (tab-separated) into a DataFrame
pos_all_genes_df = pd.read_csv(filepath_or_buffer=pos_all_genes_filepath, delimiter='\t')

In [39]:
pos_all_genes_df

Unnamed: 0,ID
0,SLC2A1
1,SLC16A3
2,TNS4
3,SCEL
4,ZBED2
...,...
610,DNASE1
611,TFF2
612,SPRR3
613,CACNA1E


In [40]:
# Rename 'ID' to 'hgnc_symbol', the key column used by the merges further down
pos_all_genes_df = pos_all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [41]:
# Order the gene list alphabetically by symbol
pos_genes_list_df = pos_all_genes_df.sort_values(by='hgnc_symbol')

In [42]:
pos_genes_list_df

Unnamed: 0,hgnc_symbol
154,A1CF
195,ABAT
114,ABCA3
459,ABCC8
506,ACE2
...,...
422,ZBTB16
189,ZNF185
273,ZNF365
254,ZNF469


**Positive - Combine files**

In [43]:
def _merge_pos_tables(left, right):
    """Outer-join two tables on their shared 'hgnc_symbol' column."""
    return left.merge(right, on=['hgnc_symbol'], how='outer')


# Fold all loaded tables into one wide frame: one row per gene symbol that
# appears in any table, NaN where a table has no row for that gene.
pos_combined_df = reduce(_merge_pos_tables, pos_df_list)

In [44]:
# Order the combined table alphabetically by gene symbol
pos_combined_df = pos_combined_df.sort_values(by='hgnc_symbol')

In [45]:
pos_combined_df

Unnamed: 0,hgnc_symbol,GO:0000775,GO:0000776,GO:0000779,GO:0001533,GO:0005576,GO:0005581,GO:0005583,GO:0005604,GO:0005615,...,GO:1903828,GO:1903900,GO:1904950,GO:1904951,GO:1905818,GO:1990573,GO:2000026,GO:2000027,GO:2000145,GO:2000147
411,A1CF,,,,,,,,,,...,,,,,,,,,,
386,ABAT,,,,,,,,,,...,,,,,,,,,,
256,ABCC8,,,,,,,,,,...,0.666667,,0.666667,0.571429,,0.857143,,,,
237,ACE2,,,,,,,,,,...,,,,,,,,,,
124,ACP5,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
165,WNT2,,,,,,,,,,...,,,,,,,0.357143,1.0,,
42,WNT5A,,,,,0.457317,,,,,...,,,,,,,0.526316,1.0,0.530612,0.608696
199,XAF1,,,,,,,,,,...,,,,,,,,,,
143,ZNF469,,,,,,,,,,...,,,,,,,,,,


In [46]:
# Show every row whose gene symbol occurs more than once in the combined
# table (all occurrences are kept); an empty frame means symbols are unique.
duplicates = pos_combined_df.loc[pos_combined_df.duplicated(subset='hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000775,GO:0000776,GO:0000779,GO:0001533,GO:0005576,GO:0005581,GO:0005583,GO:0005604,GO:0005615,...,GO:1903828,GO:1903900,GO:1904950,GO:1904951,GO:1905818,GO:1990573,GO:2000026,GO:2000027,GO:2000145,GO:2000147


**Positive - Combine results with the list of all gene symbols**

In [47]:
# Left-join onto the full gene list so every listed gene gets a row, even if
# it appears in none of the loaded tables (its columns are then NaN).
pos_results_df = pos_genes_list_df.merge(pos_combined_df, on='hgnc_symbol', how='left')

In [48]:
# Order the result table alphabetically by gene symbol
pos_results_df = pos_results_df.sort_values(by='hgnc_symbol')

In [49]:
# Count the missing (NaN) entries in each column of the result table
pos_empty_cells = pos_results_df.isna().sum()
pos_empty_cells

hgnc_symbol      0
GO:0000775     602
GO:0000776     607
GO:0000779     605
GO:0001533     606
              ... 
GO:1990573     608
GO:2000026     594
GO:2000027     611
GO:2000145     588
GO:2000147     600
Length: 439, dtype: int64

In [50]:
# Encode every missing entry left by the merges as 0
pos_results_df = pos_results_df.fillna(value=0)

In [51]:
pos_results_df

Unnamed: 0,hgnc_symbol,GO:0000775,GO:0000776,GO:0000779,GO:0001533,GO:0005576,GO:0005581,GO:0005583,GO:0005604,GO:0005615,...,GO:1903828,GO:1903900,GO:1904950,GO:1904951,GO:1905818,GO:1990573,GO:2000026,GO:2000027,GO:2000145,GO:2000147
0,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
1,ABAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
2,ABCA3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
3,ABCC8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.666667,0.0,0.666667,0.571429,0.0,0.857143,0.0,0.0,0.0,0.0
4,ACE2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,ZBTB16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
611,ZNF185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
612,ZNF365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0
613,ZNF469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.0,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0


In [52]:
# Show every row whose gene symbol occurs more than once in the final table
# (all occurrences are kept); an empty frame means symbols are unique.
duplicates = pos_results_df.loc[pos_results_df.duplicated(subset='hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000775,GO:0000776,GO:0000779,GO:0001533,GO:0005576,GO:0005581,GO:0005583,GO:0005604,GO:0005615,...,GO:1903828,GO:1903900,GO:1904950,GO:1904951,GO:1905818,GO:1990573,GO:2000026,GO:2000027,GO:2000145,GO:2000147


**Positive - Save File**

In [53]:
# Write the final positive table to CSV. to_csv runs with its defaults, so
# the row index is written as an unnamed first column.
pos_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ML_input_files/ClosenessCentrality_tables/GO/local_basal_positive.csv'
)

**ALL Genes**

In [54]:
# Folders holding the per-GO-term tables for the full gene set,
# one folder per GO namespace (CC, MF, BP).
all_CC_folderpath, all_MF_folderpath, all_BP_folderpath = (
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_all_genesetsCC',
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_all_genesetsMF',
    'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_GO/local_basal_all_genesetsBP',
)

In [55]:
# Filesystem-encoded (bytes) form of each all-genes folder path
all_CC_folder, all_MF_folder, all_BP_folder = (
    os.fsencode(folder_path)
    for folder_path in (all_CC_folderpath, all_MF_folderpath, all_BP_folderpath)
)

**ALL - Get CC files**

In [56]:
# Start with no tables loaded. The loading cells below only append to this
# list, so re-run this cell before re-running any of them.
all_df_list = list()

In [57]:
# Load every .tsv table from the all-genes CC folder into all_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for CC_filename in sorted(os.listdir(all_CC_folderpath)):
    if not CC_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(all_CC_folderpath, CC_filename), sep='\t').reset_index(drop=True)

    all_df_list.append(df)

In [58]:
# Sanity check: number of tables loaded so far (CC tables only at this point)
print(len(all_df_list))

191


**ALL - Get MF files**

In [59]:
# Load every .tsv table from the all-genes MF folder into all_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for MF_filename in sorted(os.listdir(all_MF_folderpath)):
    if not MF_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(all_MF_folderpath, MF_filename), sep='\t').reset_index(drop=True)

    all_df_list.append(df)

In [60]:
# Sanity check: running total of tables loaded so far (CC + MF)
print(len(all_df_list))

422


**ALL - Get BP files**

In [61]:
# Load every .tsv table from the all-genes BP folder into all_df_list.
# The directory listing is sorted because os.listdir() returns entries in
# arbitrary order; sorting keeps the table order (and therefore the column
# order of the merged frame built later) the same on every machine.
for BP_filename in sorted(os.listdir(all_BP_folderpath)):
    if not BP_filename.endswith('.tsv'):
        continue  # skip anything that is not a table

    # Read the tab-separated table and discard whatever index read_csv
    # produced, leaving a plain 0..n-1 index.
    df = pd.read_csv(os.path.join(all_BP_folderpath, BP_filename), sep='\t').reset_index(drop=True)

    all_df_list.append(df)

In [62]:
# Sanity check: running total of tables loaded so far (CC + MF + BP)
print(len(all_df_list))

2149


**ALL - Get the list of all gene symbols**

In [63]:
# Path of the tab-separated file listing all genes (read in the next cell)
all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/filtered-degs-genes-only/local_basal_all_genes.tsv'

In [64]:
# Read the full gene list (tab-separated) into a DataFrame
all_genes_df = pd.read_csv(filepath_or_buffer=all_genes_filepath, delimiter='\t')

In [65]:
all_genes_df

Unnamed: 0,ID
0,SLC2A1
1,SLC16A3
2,TNS4
3,SCEL
4,ZBED2
...,...
15389,MAP4
15390,MTHFD2
15391,FGF5
15392,GLOD4


In [66]:
# Rename 'ID' to 'hgnc_symbol', the key column used by the merges further down
all_genes_df = all_genes_df.rename(columns={'ID': 'hgnc_symbol'})

In [67]:
# Order the gene list alphabetically by symbol
all_genes_list_df = all_genes_df.sort_values(by='hgnc_symbol')

In [68]:
all_genes_list_df

Unnamed: 0,hgnc_symbol
6285,A1BG
211,A1CF
6849,A2M
13474,A2ML1
11664,A3GALT2
...,...
5943,ZXDC
10684,ZYG11A
4407,ZYG11B
3480,ZYX


**ALL - Combine files**

In [69]:
def _merge_all_tables(left, right):
    """Outer-join two tables on their shared 'hgnc_symbol' column."""
    return left.merge(right, on=['hgnc_symbol'], how='outer')


# Fold all loaded tables into one wide frame: one row per gene symbol that
# appears in any table, NaN where a table has no row for that gene.
all_combined_df = reduce(_merge_all_tables, all_df_list)

In [70]:
# Order the combined table alphabetically by gene symbol
all_combined_df = all_combined_df.sort_values(by='hgnc_symbol')

In [71]:
all_combined_df

Unnamed: 0,hgnc_symbol,GO:0000139,GO:0000228,GO:0000307,GO:0000323,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000781,...,GO:2001056,GO:2001187,GO:2001204,GO:2001233,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001251
2106,A2M,,,,,,,,,,...,,,,,,,,,,
107,A4GALT,0.272727,,,,,,,,,...,,,,,,,,,,
2854,AADAC,,,,,,,,,,...,,,,,,,,,,
4221,AANAT,,,,,,,,,,...,,,,,,,,,,
3027,AATF,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1764,ZNF804A,,,,,,,,,,...,,,,,,,,,,
2463,ZNFX1,,,,,,,,,,...,,,,,,,,,,
443,ZWILCH,,,,,,0.679612,0.701299,0.742857,,...,,,,,,,,,,0.806452
409,ZWINT,,,,,,0.707071,0.701299,0.722222,,...,,,,,,,,,,0.833333


In [72]:
# Show every row whose gene symbol occurs more than once in the combined
# table (all occurrences are kept); an empty frame means symbols are unique.
duplicates = all_combined_df.loc[all_combined_df.duplicated(subset='hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000139,GO:0000228,GO:0000307,GO:0000323,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000781,...,GO:2001056,GO:2001187,GO:2001204,GO:2001233,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001251


**ALL - Combine results with the list of all gene symbols**

In [73]:
# Left-join onto the full gene list so every listed gene gets a row, even if
# it appears in none of the loaded tables (its columns are then NaN).
all_results_df = all_genes_list_df.merge(all_combined_df, on='hgnc_symbol', how='left')

In [74]:
# Order the result table alphabetically by gene symbol
all_results_df = all_results_df.sort_values(by='hgnc_symbol')

In [75]:
# Count the missing (NaN) entries in each column of the result table
all_empty_cells = all_results_df.isna().sum()
all_empty_cells

hgnc_symbol        0
GO:0000139     15247
GO:0000228     15337
GO:0000307     15378
GO:0000323     15227
               ...  
GO:2001236     15345
GO:2001237     15358
GO:2001238     15375
GO:2001242     15360
GO:2001251     15368
Length: 2150, dtype: int64

In [76]:
# Encode every missing entry left by the merges as 0
all_results_df = all_results_df.fillna(value=0)

In [77]:
all_results_df

Unnamed: 0,hgnc_symbol,GO:0000139,GO:0000228,GO:0000307,GO:0000323,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000781,...,GO:2001056,GO:2001187,GO:2001204,GO:2001233,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001251
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15389,ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15390,ZYG11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15391,ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15392,ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Show every row whose gene symbol occurs more than once in the final table
# (all occurrences are kept); an empty frame means symbols are unique.
duplicates = all_results_df.loc[all_results_df.duplicated(subset='hgnc_symbol', keep=False)]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000139,GO:0000228,GO:0000307,GO:0000323,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000781,...,GO:2001056,GO:2001187,GO:2001204,GO:2001233,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001251


**ALL - Save File**

In [79]:
# Write the final all-genes table to CSV. to_csv runs with its defaults, so
# the row index is written as an unnamed first column.
all_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ML_input_files/ClosenessCentrality_tables/GO/local_basal_all.csv'
)