In [1]:
import numpy as np
import pandas as pd
import os
from functools import reduce

**Negative Genes**

In [2]:
# Root folder that holds the closeness-centrality tables. It is written once so
# that relocating the data is a one-line change instead of three.
neg_tables_root = 'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/ClosenessCentrality_tables'

# One sub-folder per gene-set category (CC, MF, BP) for the negative genes.
neg_CC_folderpath = neg_tables_root + '/qm_negative_genesetsCC'
neg_MF_folderpath = neg_tables_root + '/qm_negative_genesetsMF'
neg_BP_folderpath = neg_tables_root + '/qm_negative_genesetsBP'

In [3]:
# Filesystem-encoded (bytes) form of each negative folder path, in CC / MF / BP order.
neg_CC_folder, neg_MF_folder, neg_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (neg_CC_folderpath, neg_MF_folderpath, neg_BP_folderpath)
)

**Negative - Get CC files**

In [4]:
# Accumulator for the negative tables; the CC, MF and BP loading steps each append to it.
neg_df_list = list()

In [5]:
# Load every '.tsv' table in the negative CC folder into neg_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for CC_filename in sorted(os.listdir(neg_CC_folderpath)):
    if not CC_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    neg_df_list.append(
        pd.read_csv(os.path.join(neg_CC_folderpath, CC_filename), sep='\t')
    )

In [6]:
# Report how many tables are in the list so far (only the CC folder has been loaded).
loaded_table_count = len(neg_df_list)
print(loaded_table_count)

111


**Negative - Get MF files**

In [7]:
# Load every '.tsv' table in the negative MF folder into neg_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for MF_filename in sorted(os.listdir(neg_MF_folderpath)):
    if not MF_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    neg_df_list.append(
        pd.read_csv(os.path.join(neg_MF_folderpath, MF_filename), sep='\t')
    )

In [8]:
# Report how many tables are in the list so far (CC and MF folders loaded).
loaded_table_count = len(neg_df_list)
print(loaded_table_count)

207


**Negative - Get BP files**

In [9]:
# Load every '.tsv' table in the negative BP folder into neg_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for BP_filename in sorted(os.listdir(neg_BP_folderpath)):
    if not BP_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    neg_df_list.append(
        pd.read_csv(os.path.join(neg_BP_folderpath, BP_filename), sep='\t')
    )

In [10]:
# Report the total number of tables collected from the CC, MF and BP folders.
loaded_table_count = len(neg_df_list)
print(loaded_table_count)

542


**Negative - Get a list of all gene symbols**

In [11]:
# Path to the TSV that lists the gene symbols of the negative gene set.
# It is read in the next step and later used as the left side of a left merge,
# so every gene listed here gets a row in the final table.
neg_all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/filtered-degs-genes-only/qm_negative_genes.tsv'

In [12]:
# Read the tab-separated gene list into a DataFrame.
neg_all_genes_df = pd.read_csv(
    filepath_or_buffer=neg_all_genes_filepath,
    delimiter='\t',
)

In [13]:
# Show the gene list as loaded, before the 'ID' column is renamed.
neg_all_genes_df

Unnamed: 0,ID
0,A1BG
1,AAAS
2,AACS
3,AADAT
4,AAGAB
...,...
3392,ZSWIM1
3393,ZW10
3394,ZXDA
3395,ZXDB


In [14]:
# Give the symbol column the same name ('hgnc_symbol') that the later merges join on.
# The frame is modified in place.
neg_all_genes_df.rename({'ID': 'hgnc_symbol'}, axis='columns', inplace=True)

In [15]:
# Order the gene list alphabetically (ascending) by gene symbol.
neg_genes_list_df = neg_all_genes_df.sort_values(by='hgnc_symbol', ascending=True)

In [16]:
# Show the gene list after the column rename and the alphabetical sort.
neg_genes_list_df

Unnamed: 0,hgnc_symbol
0,A1BG
1,AAAS
2,AACS
3,AADAT
4,AAGAB
...,...
3392,ZSWIM1
3393,ZW10
3394,ZXDA
3395,ZXDB


**Negative - Combine files**

In [17]:
# Fold all loaded tables into one wide table, outer-joining them one after
# another on 'hgnc_symbol' so that a gene present in any table keeps a row.
neg_combined_df = reduce(
    lambda merged_so_far, next_table: merged_so_far.merge(
        next_table, on=['hgnc_symbol'], how='outer'
    ),
    neg_df_list,
)

In [18]:
# Order the merged table alphabetically (ascending) by gene symbol.
neg_combined_df = neg_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [19]:
# Show the merged table; cells are NaN where a gene did not appear in the
# input table that contributed that column (result of the outer joins).
neg_combined_df

Unnamed: 0,hgnc_symbol,GO:0000118,GO:0000123,GO:0000151,GO:0000228,GO:0000323,GO:0000421,GO:0000502,GO:0000775,GO:0000776,...,GO:1903322,GO:1903530,GO:1990542,GO:2000045,GO:2000142,GO:2000144,GO:2000278,GO:2000573,GO:2001235,GO:2001252
679,A1BG,,,,,,,,,,...,,,,,,,,,,
1291,AAAS,,,,,,,,,,...,,,,,,,,,,
1465,AACS,,,,,,,,,,...,,0.333333,,,,,,,,
1500,AADAT,,,,,,,,,,...,,,,,,,,,,
189,AAR2,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1,ZNHIT1,0.571429,,,0.392857,,,,,,...,,,,,1.0,,,,,
942,ZNHIT3,,,,,,,,,,...,,,,,,,,,,
1026,ZNRF3,,,,,,,,,,...,,,,,,,,,,
1053,ZSCAN16,,,,,,,,,,...,,,,,,,,,,


In [20]:
# Collect every row whose gene symbol occurs more than once (keep=False marks
# all occurrences, not only the repeats). An empty result means the symbols
# are unique.
repeated_symbol_mask = neg_combined_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = neg_combined_df.loc[repeated_symbol_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000118,GO:0000123,GO:0000151,GO:0000228,GO:0000323,GO:0000421,GO:0000502,GO:0000775,GO:0000776,...,GO:1903322,GO:1903530,GO:1990542,GO:2000045,GO:2000142,GO:2000144,GO:2000278,GO:2000573,GO:2001235,GO:2001252


**Negative - Combine Results with a List of All Gene Symbols**

In [21]:
# Left-join the merged values onto the full gene list: every listed gene keeps
# a row, and genes without any value get NaN in the value columns.
neg_results_df = neg_genes_list_df.merge(
    right=neg_combined_df,
    on='hgnc_symbol',
    how='left',
)

In [22]:
# Order the result table alphabetically (ascending) by gene symbol.
neg_results_df = neg_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [23]:
# Count the missing (NaN) entries in each column of the result table.
neg_empty_cells = neg_results_df.isna().sum(axis=0)
neg_empty_cells

hgnc_symbol       0
GO:0000118     3388
GO:0000123     3380
GO:0000151     3371
GO:0000228     3374
               ... 
GO:2000144     3380
GO:2000278     3385
GO:2000573     3390
GO:2001235     3392
GO:2001252     3383
Length: 543, dtype: int64

In [24]:
# Put 0 in every cell that is still missing after the merge.
neg_results_df = neg_results_df.fillna(value=0)

In [25]:
# Show the final table after missing values were replaced with 0.
neg_results_df

Unnamed: 0,hgnc_symbol,GO:0000118,GO:0000123,GO:0000151,GO:0000228,GO:0000323,GO:0000421,GO:0000502,GO:0000775,GO:0000776,...,GO:1903322,GO:1903530,GO:1990542,GO:2000045,GO:2000142,GO:2000144,GO:2000278,GO:2000573,GO:2001235,GO:2001252
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,AAAS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AACS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.333333,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AADAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,AAGAB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3392,ZSWIM1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3393,ZW10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3394,ZXDA,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3395,ZXDB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
# Collect every row whose gene symbol occurs more than once (keep=False marks
# all occurrences, not only the repeats). An empty result means the symbols
# are unique.
repeated_symbol_mask = neg_results_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = neg_results_df.loc[repeated_symbol_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000118,GO:0000123,GO:0000151,GO:0000228,GO:0000323,GO:0000421,GO:0000502,GO:0000775,GO:0000776,...,GO:1903322,GO:1903530,GO:1990542,GO:2000045,GO:2000142,GO:2000144,GO:2000278,GO:2000573,GO:2001235,GO:2001252


**Negative - Save File**

In [27]:
# Write the finished negative table to a CSV file.
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column as well — confirm the consumer of this file expects it.
neg_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/ML_input_files/ClosenessCentrality_tables/qm_negative.csv'
)

**Positive Genes**

In [28]:
# Root folder that holds the closeness-centrality tables. It is written once so
# that relocating the data is a one-line change instead of three.
pos_tables_root = 'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/ClosenessCentrality_tables'

# One sub-folder per gene-set category (CC, MF, BP) for the positive genes.
pos_CC_folderpath = pos_tables_root + '/qm_positive_genesetsCC'
pos_MF_folderpath = pos_tables_root + '/qm_positive_genesetsMF'
pos_BP_folderpath = pos_tables_root + '/qm_positive_genesetsBP'

In [29]:
# Filesystem-encoded (bytes) form of each positive folder path, in CC / MF / BP order.
pos_CC_folder, pos_MF_folder, pos_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (pos_CC_folderpath, pos_MF_folderpath, pos_BP_folderpath)
)

**Positive - Get CC files**

In [30]:
# Accumulator for the positive tables; the CC, MF and BP loading steps each append to it.
pos_df_list = list()

In [31]:
# Load every '.tsv' table in the positive CC folder into pos_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for CC_filename in sorted(os.listdir(pos_CC_folderpath)):
    if not CC_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    pos_df_list.append(
        pd.read_csv(os.path.join(pos_CC_folderpath, CC_filename), sep='\t')
    )

In [32]:
# Report how many tables are in the list so far (only the CC folder has been loaded).
loaded_table_count = len(pos_df_list)
print(loaded_table_count)

53


**Positive - Get MF files**

In [33]:
# Load every '.tsv' table in the positive MF folder into pos_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for MF_filename in sorted(os.listdir(pos_MF_folderpath)):
    if not MF_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    pos_df_list.append(
        pd.read_csv(os.path.join(pos_MF_folderpath, MF_filename), sep='\t')
    )

In [34]:
# Report how many tables are in the list so far (CC and MF folders loaded).
loaded_table_count = len(pos_df_list)
print(loaded_table_count)

127


**Positive - Get BP files**

In [35]:
# Load every '.tsv' table in the positive BP folder into pos_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for BP_filename in sorted(os.listdir(pos_BP_folderpath)):
    if not BP_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    pos_df_list.append(
        pd.read_csv(os.path.join(pos_BP_folderpath, BP_filename), sep='\t')
    )

In [36]:
# Report the total number of tables collected from the CC, MF and BP folders.
loaded_table_count = len(pos_df_list)
print(loaded_table_count)

899


**Positive - Get a list of all gene symbols**

In [37]:
# Path to the TSV that lists the gene symbols used for the positive section.
# It is read in the next step and later used as the left side of a left merge,
# so every gene listed here gets a row in the final table.
# NOTE(review): this file is named 'qm_sig_genes.tsv', whereas the other
# sections read 'qm_negative_genes.tsv' / 'qm_all_genes.tsv' — confirm that it
# is the intended gene list for the positive gene sets.
pos_all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/filtered-degs-genes-only/qm_sig_genes.tsv'

In [38]:
# Read the tab-separated gene list into a DataFrame.
pos_all_genes_df = pd.read_csv(
    filepath_or_buffer=pos_all_genes_filepath,
    delimiter='\t',
)

In [39]:
# Show the gene list as loaded, before the 'ID' column is renamed.
pos_all_genes_df

Unnamed: 0,ID
0,A1CF
1,A2M
2,AADAC
3,AATK
4,ABAT
...,...
3758,ZSCAN10
3759,ZSCAN2
3760,ZWILCH
3761,ZWINT


In [40]:
# Give the symbol column the same name ('hgnc_symbol') that the later merges join on.
# The frame is modified in place.
pos_all_genes_df.rename({'ID': 'hgnc_symbol'}, axis='columns', inplace=True)

In [41]:
# Order the gene list alphabetically (ascending) by gene symbol.
pos_genes_list_df = pos_all_genes_df.sort_values(by='hgnc_symbol', ascending=True)

In [42]:
# Show the gene list after the column rename and the alphabetical sort.
pos_genes_list_df

Unnamed: 0,hgnc_symbol
0,A1CF
1,A2M
2,AADAC
3,AATK
4,ABAT
...,...
3758,ZSCAN10
3759,ZSCAN2
3760,ZWILCH
3761,ZWINT


**Positive - Combine files**

In [43]:
# Fold all loaded tables into one wide table, outer-joining them one after
# another on 'hgnc_symbol' so that a gene present in any table keeps a row.
pos_combined_df = reduce(
    lambda merged_so_far, next_table: merged_so_far.merge(
        next_table, on=['hgnc_symbol'], how='outer'
    ),
    pos_df_list,
)

In [44]:
# Order the merged table alphabetically (ascending) by gene symbol.
pos_combined_df = pos_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [45]:
# Show the merged table; cells are NaN where a gene did not appear in the
# input table that contributed that column (result of the outer joins).
pos_combined_df

Unnamed: 0,hgnc_symbol,GO:0001772,GO:0005581,GO:0005604,GO:0005743,GO:0005759,GO:0005767,GO:0005783,GO:0005788,GO:0005884,...,GO:2000351,GO:2000401,GO:2000403,GO:2000404,GO:2000406,GO:2000514,GO:2000648,GO:2001057,GO:2001235,GO:2001236
1348,AADAC,,,,,,,,,,...,,,,,,,,,,
92,ABAT,,,,,0.520833,,,,,...,,,,,,,,,,
621,ABCA1,,,,,,,,,,...,,,,,,,,,,
928,ABCC2,,,,,,,,,,...,,,,,,,,,,
860,ABCC6,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1146,ZFPM2,,,,,,,,,,...,,,,,,,,,,
588,ZG16,,,,,,,,,,...,,,,,,,,,,
1496,ZNF281,,,,,,,,,,...,,,,,,,,,,
1548,ZNF521,,,,,,,,,,...,,,,,,,,,,


In [46]:
# Collect every row whose gene symbol occurs more than once (keep=False marks
# all occurrences, not only the repeats). An empty result means the symbols
# are unique.
repeated_symbol_mask = pos_combined_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = pos_combined_df.loc[repeated_symbol_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0001772,GO:0005581,GO:0005604,GO:0005743,GO:0005759,GO:0005767,GO:0005783,GO:0005788,GO:0005884,...,GO:2000351,GO:2000401,GO:2000403,GO:2000404,GO:2000406,GO:2000514,GO:2000648,GO:2001057,GO:2001235,GO:2001236


**Positive - Combine Results with a List of All Gene Symbols**

In [47]:
# Left-join the merged values onto the full gene list: every listed gene keeps
# a row, and genes without any value get NaN in the value columns.
pos_results_df = pos_genes_list_df.merge(
    right=pos_combined_df,
    on='hgnc_symbol',
    how='left',
)

In [48]:
# Order the result table alphabetically (ascending) by gene symbol.
pos_results_df = pos_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [49]:
# Count the missing (NaN) entries in each column of the result table.
pos_empty_cells = pos_results_df.isna().sum(axis=0)
pos_empty_cells

hgnc_symbol       0
GO:0001772     3744
GO:0005581     3741
GO:0005604     3738
GO:0005743     3752
               ... 
GO:2000514     3729
GO:2000648     3750
GO:2001057     3745
GO:2001235     3740
GO:2001236     3728
Length: 900, dtype: int64

In [50]:
# Put 0 in every cell that is still missing after the merge.
pos_results_df = pos_results_df.fillna(value=0)

In [51]:
# Show the final table after missing values were replaced with 0.
pos_results_df

Unnamed: 0,hgnc_symbol,GO:0001772,GO:0005581,GO:0005604,GO:0005743,GO:0005759,GO:0005767,GO:0005783,GO:0005788,GO:0005884,...,GO:2000351,GO:2000401,GO:2000403,GO:2000404,GO:2000406,GO:2000514,GO:2000648,GO:2001057,GO:2001235,GO:2001236
0,A1CF,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A2M,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,AADAC,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AATK,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ABAT,0.0,0.0,0.0,0.0,0.520833,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3758,ZSCAN10,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3759,ZSCAN2,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3760,ZWILCH,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3761,ZWINT,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [52]:
# Collect every row whose gene symbol occurs more than once (keep=False marks
# all occurrences, not only the repeats). An empty result means the symbols
# are unique.
repeated_symbol_mask = pos_results_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = pos_results_df.loc[repeated_symbol_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0001772,GO:0005581,GO:0005604,GO:0005743,GO:0005759,GO:0005767,GO:0005783,GO:0005788,GO:0005884,...,GO:2000351,GO:2000401,GO:2000403,GO:2000404,GO:2000406,GO:2000514,GO:2000648,GO:2001057,GO:2001235,GO:2001236


**Positive - Save File**

In [53]:
# Write the finished positive table to a CSV file.
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column as well — confirm the consumer of this file expects it.
pos_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/ML_input_files/ClosenessCentrality_tables/qm_positive.csv'
)

**ALL Genes**

In [54]:
# Root folder that holds the closeness-centrality tables. It is written once so
# that relocating the data is a one-line change instead of three.
all_tables_root = 'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/ClosenessCentrality_tables'

# One sub-folder per gene-set category (CC, MF, BP) for the full gene list.
all_CC_folderpath = all_tables_root + '/qm_all_genesetsCC'
all_MF_folderpath = all_tables_root + '/qm_all_genesetsMF'
all_BP_folderpath = all_tables_root + '/qm_all_genesetsBP'

In [55]:
# Filesystem-encoded (bytes) form of each "all genes" folder path, in CC / MF / BP order.
all_CC_folder, all_MF_folder, all_BP_folder = (
    os.fsencode(folderpath)
    for folderpath in (all_CC_folderpath, all_MF_folderpath, all_BP_folderpath)
)

**ALL - Get CC files**

In [56]:
# Accumulator for the "all genes" tables; the CC, MF and BP loading steps each append to it.
all_df_list = list()

In [57]:
# Load every '.tsv' table in the "all genes" CC folder into all_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for CC_filename in sorted(os.listdir(all_CC_folderpath)):
    if not CC_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    all_df_list.append(
        pd.read_csv(os.path.join(all_CC_folderpath, CC_filename), sep='\t')
    )

In [58]:
# Report how many tables are in the list so far (only the CC folder has been loaded).
loaded_table_count = len(all_df_list)
print(loaded_table_count)

158


**ALL - Get MF files**

In [59]:
# Load every '.tsv' table in the "all genes" MF folder into all_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for MF_filename in sorted(os.listdir(all_MF_folderpath)):
    if not MF_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    all_df_list.append(
        pd.read_csv(os.path.join(all_MF_folderpath, MF_filename), sep='\t')
    )

In [60]:
# Report how many tables are in the list so far (CC and MF folders loaded).
loaded_table_count = len(all_df_list)
print(loaded_table_count)

375


**ALL - Get BP files**

In [61]:
# Load every '.tsv' table in the "all genes" BP folder into all_df_list.
# os.listdir() returns names in arbitrary, filesystem-dependent order, so they
# are sorted first: the column order of the table later merged from this list
# is then the same on every machine.
for BP_filename in sorted(os.listdir(all_BP_folderpath)):
    if not BP_filename.endswith('.tsv'):
        continue  # ignore anything that is not a TSV table
    # read_csv already returns a fresh 0..n-1 row index, so no reset is needed
    all_df_list.append(
        pd.read_csv(os.path.join(all_BP_folderpath, BP_filename), sep='\t')
    )

In [62]:
# Report the total number of tables collected from the CC, MF and BP folders.
loaded_table_count = len(all_df_list)
print(loaded_table_count)

2102


**ALL - Get a list of all gene symbols**

In [63]:
# Path to the TSV that lists the gene symbols of the complete ("all") gene set.
# It is read in the next step and later used as the left side of a left merge,
# so every gene listed here gets a row in the final table.
all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/filtered-degs-genes-only/qm_all_genes.tsv'

In [64]:
# Read the tab-separated gene list into a DataFrame.
all_genes_df = pd.read_csv(
    filepath_or_buffer=all_genes_filepath,
    delimiter='\t',
)

In [65]:
# Show the gene list as loaded, before the 'ID' column is renamed.
all_genes_df

Unnamed: 0,ID
0,A1BG
1,A1CF
2,A2M
3,A4GALT
4,A4GNT
...,...
14433,ZXDB
14434,ZXDC
14435,ZYG11B
14436,ZYX


In [66]:
# Give the symbol column the same name ('hgnc_symbol') that the later merges join on.
# The frame is modified in place.
all_genes_df.rename({'ID': 'hgnc_symbol'}, axis='columns', inplace=True)

In [67]:
# Order the gene list alphabetically (ascending) by gene symbol.
all_genes_list_df = all_genes_df.sort_values(by='hgnc_symbol', ascending=True)

In [68]:
# Show the gene list after the column rename and the alphabetical sort.
all_genes_list_df

Unnamed: 0,hgnc_symbol
0,A1BG
1,A1CF
2,A2M
3,A4GALT
4,A4GNT
...,...
14433,ZXDB
14434,ZXDC
14435,ZYG11B
14436,ZYX


**ALL - Combine files**

In [69]:
# Fold all loaded tables into one wide table, outer-joining them one after
# another on 'hgnc_symbol' so that a gene present in any table keeps a row.
all_combined_df = reduce(
    lambda merged_so_far, next_table: merged_so_far.merge(
        next_table, on=['hgnc_symbol'], how='outer'
    ),
    all_df_list,
)

In [70]:
# Order the merged table alphabetically (ascending) by gene symbol.
all_combined_df = all_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [71]:
# Show the merged table; cells are NaN where a gene did not appear in the
# input table that contributed that column (result of the outer joins).
all_combined_df

Unnamed: 0,hgnc_symbol,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000922,GO:0000932,GO:0000940,GO:0001725,GO:0001726,...,GO:2001233,GO:2001234,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001244,GO:2001251,GO:2001267
3348,AADAC,,,,,,,,,,...,,,,,,,,,,
4696,AADAT,,,,,,,,,,...,,,,,,,,,,
554,AAR2,,,,,,,,,,...,,,,,,,,,,
2923,ABAT,,,,,,,,,,...,,,,,,,,,,
1381,ABCA1,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
510,ZRSR2,,,,,,,,,,...,,,,,,,,,,
144,ZWILCH,,0.529412,0.530864,0.547619,,,,,,...,,,,,,,,,0.535714,
59,ZWINT,,0.540773,0.514970,0.538012,,,1.0,,,...,,,,,,,,,0.625000,
4578,ZYG11B,,,,,,,,,,...,,,,,,,,,,


In [72]:
# Collect every row whose gene symbol occurs more than once (keep=False marks
# all occurrences, not only the repeats). An empty result means the symbols
# are unique.
repeated_symbol_mask = all_combined_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = all_combined_df.loc[repeated_symbol_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000922,GO:0000932,GO:0000940,GO:0001725,GO:0001726,...,GO:2001233,GO:2001234,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001244,GO:2001251,GO:2001267


**ALL - Combine Results with a List of All Gene Symbols**

In [73]:
# Left-join the merged values onto the full gene list: every listed gene keeps
# a row, and genes without any value get NaN in the value columns.
all_results_df = all_genes_list_df.merge(
    right=all_combined_df,
    on='hgnc_symbol',
    how='left',
)

In [74]:
# Order the result table alphabetically (ascending) by gene symbol.
all_results_df = all_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [75]:
# Count the missing (NaN) entries in each column of the result table.
all_empty_cells = all_results_df.isna().sum(axis=0)
all_empty_cells

hgnc_symbol        0
GO:0000502     14391
GO:0000775     14311
GO:0000776     14351
GO:0000779     14345
               ...  
GO:2001238     14421
GO:2001242     14398
GO:2001244     14424
GO:2001251     14407
GO:2001267     14431
Length: 2103, dtype: int64

In [76]:
# Put 0 in every cell that is still missing after the merge.
all_results_df = all_results_df.fillna(value=0)

In [77]:
# Show the final table after missing values were replaced with 0.
all_results_df

Unnamed: 0,hgnc_symbol,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000922,GO:0000932,GO:0000940,GO:0001725,GO:0001726,...,GO:2001233,GO:2001234,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001244,GO:2001251,GO:2001267
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A4GALT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A4GNT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14433,ZXDB,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14434,ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14435,ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14436,ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.607843,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [78]:
# Collect every row whose gene symbol occurs more than once (keep=False marks
# all occurrences, not only the repeats). An empty result means the symbols
# are unique.
repeated_symbol_mask = all_results_df.duplicated(subset='hgnc_symbol', keep=False)
duplicates = all_results_df.loc[repeated_symbol_mask]
duplicates

Unnamed: 0,hgnc_symbol,GO:0000502,GO:0000775,GO:0000776,GO:0000779,GO:0000922,GO:0000932,GO:0000940,GO:0001725,GO:0001726,...,GO:2001233,GO:2001234,GO:2001235,GO:2001236,GO:2001237,GO:2001238,GO:2001242,GO:2001244,GO:2001251,GO:2001267


**ALL - Save File**

In [79]:
# Write the finished "all genes" table to a CSV file.
# NOTE(review): with pandas' default index=True the row index is written as an
# unnamed first column as well — confirm the consumer of this file expects it.
all_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_Collision_data/qm_results/ML_input_files/ClosenessCentrality_tables/qm_all.csv'
)