In [1]:
import numpy as np
import pandas as pd
import os
from functools import reduce

**NEGATIVE Genes**

In [2]:
# Folder that is scanned for the negative-gene .tsv tables.
# NOTE(review): absolute, machine-specific path — the notebook only runs as written on this machine.
neg_folderpath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_msigdb/local_basal_negative_genesets'

In [3]:
# Encode the folder path to bytes using the filesystem encoding.
# os.listdir() on a bytes path returns bytes entries, which is why the
# listing code decodes each name with os.fsdecode().
neg_folder = os.fsencode(neg_folderpath)

**NEGATIVE - Get files**

In [4]:
# Container for the per-file DataFrames (starts empty)
neg_df_list = list()

In [5]:
# Load every .tsv table in the folder into a list of DataFrames.
# - The list is rebuilt from scratch instead of appended to, so re-running
#   this cell cannot load the same tables twice.
# - The names are sorted because os.listdir() returns entries in arbitrary
#   order, which would otherwise make the merged column order depend on the
#   platform/filesystem.
neg_tsv_names = sorted(
    name
    for name in map(os.fsdecode, os.listdir(neg_folder))
    if name.endswith('.tsv')
)
neg_df_list = [
    # reset_index(drop=True) keeps the original guarantee of a plain 0..n-1 row index
    pd.read_csv(os.path.join(neg_folderpath, name), sep='\t').reset_index(drop=True)
    for name in neg_tsv_names
]

In [6]:
# Sanity check: how many .tsv tables were loaded into neg_df_list
print(len(neg_df_list))

26


**NEGATIVE - Get a list of all gene symbols**

In [7]:
# Path of the tab-separated file listing the negative genes (read in the next cell).
# NOTE(review): absolute, machine-specific path.
neg_all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/filtered-degs-genes-only/local_basal_negative_genes.tsv'

In [8]:
# Load the tab-separated gene list into a DataFrame
neg_all_genes_df = pd.read_csv(filepath_or_buffer=neg_all_genes_filepath, delimiter='\t')

In [9]:
# Display the raw gene list (the last expression of a cell is rendered as its output)
neg_all_genes_df

Unnamed: 0,ID
0,BRIP1
1,FHAD1
2,WDR19
3,LIN54
4,UBA52
...,...
416,KRAS
417,OAZ2
418,LEO1
419,MTHFSD


In [10]:
# Keep only the gene-symbol column ('ID').
# .copy() makes this an independent frame, so modifying it later neither
# aliases neg_all_genes_df nor raises SettingWithCopyWarning.
neg_genes_list_df = neg_all_genes_df[['ID']].copy()

In [11]:
# Rename 'ID' to 'hgnc_symbol', the key column used when merging with the tables.
# The result is reassigned rather than renamed with inplace=True: mutating a
# frame that was derived from a column selection in place triggers
# SettingWithCopyWarning.
neg_genes_list_df = neg_genes_list_df.rename(columns={'ID': 'hgnc_symbol'})

In [12]:
# Order the gene symbols alphabetically
neg_genes_list_df = neg_genes_list_df.sort_values(by='hgnc_symbol', ascending=True)

In [13]:
# Display the sorted gene-symbol list (row labels still carry the pre-sort positions)
neg_genes_list_df

Unnamed: 0,hgnc_symbol
18,ABCB4
117,ABHD14B
195,ACER3
105,ACSM2A
187,ACTL6A
...,...
19,ZNF695
379,ZNF75D
241,ZNF835
46,ZNF90


**Negative - Combine files**

In [14]:
# Fold all loaded tables into one wide frame via successive outer joins on 'hgnc_symbol'
neg_combined_df = reduce(
    lambda merged, table: merged.merge(table, how='outer', on='hgnc_symbol'),
    neg_df_list,
)

In [15]:
# Order the combined table alphabetically by gene symbol
neg_combined_df = neg_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [16]:
# Display the combined table; NaN marks a gene that is absent from a given table
neg_combined_df

Unnamed: 0,hgnc_symbol,M10237,M1036,M10501,M11961,M12113,M13893,M14427,M149,M16189,...,M42508,M4409,M5275,M5336,M543,M5611,M6782,M80,M864,M9585
6,ACTL6A,0.461538,,,0.384615,,,,0.333333,0.545455,...,,,,,,0.464789,,,,
33,ADAM22,,,,,,,1.0,,,...,,,,,,,,,,
79,ADAR,,,,,,,,,,...,,,,,,,,,,0.452381
74,ARFGEF2,,,,,,,,,,...,,,,,,0.366667,,,,0.372549
10,ATP6V1C1,,1.0,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
43,USP39,,,,,,,,0.393939,,...,0.6,,,,,,,0.395833,,
22,UTP15,,,0.4,,,1.0,,,,...,,,,,,,,,,
53,UVRAG,,,,,,,,,,...,,,,,,0.388235,,,,0.395833
58,VCPIP1,,,,,,,,,,...,,,,,,,,,,


In [17]:
# List every row whose 'hgnc_symbol' occurs more than once
# (an empty frame means the symbols are unique)
duplicates = neg_combined_df.loc[
    neg_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M10237,M1036,M10501,M11961,M12113,M13893,M14427,M149,M16189,...,M42508,M4409,M5275,M5336,M543,M5611,M6782,M80,M864,M9585


**Negative - Combine Results with a List of All Gene Symbols**

In [18]:
# Left-join the combined table onto the full gene list so every listed gene
# gets a row, even if it appears in none of the tables
neg_results_df = neg_genes_list_df.merge(neg_combined_df, on='hgnc_symbol', how='left')

In [19]:
# Order the result alphabetically by gene symbol
neg_results_df = neg_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [20]:
# Count the missing values in each column
neg_empty_cells = neg_results_df.isna().sum()
neg_empty_cells

hgnc_symbol      0
M10237         412
M1036          411
M10501         412
M11961         410
M12113         414
M13893         415
M14427         408
M149           405
M16189         408
M1743          417
M19745         415
M19806         401
M2241          416
M2325          405
M2368          415
M39736         413
M42508         406
M4409          412
M5275          417
M5336          410
M543           407
M5611          387
M6782          416
M80            397
M864           413
M9585          401
dtype: int64

In [21]:
# Substitute 0 for every missing value
neg_results_df = neg_results_df.fillna(value=0)

In [22]:
# Display the final table after missing values were replaced with 0
neg_results_df

Unnamed: 0,hgnc_symbol,M10237,M1036,M10501,M11961,M12113,M13893,M14427,M149,M16189,...,M42508,M4409,M5275,M5336,M543,M5611,M6782,M80,M864,M9585
0,ABCB4,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,ABHD14B,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,ACER3,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,ACSM2A,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,ACTL6A,0.461538,0.0,0.0,0.384615,0.0,0.0,0.0,0.333333,0.545455,...,0.0,0.0,0.0,0.0,0.0,0.464789,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416,ZNF695,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
417,ZNF75D,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
418,ZNF835,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
419,ZNF90,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [23]:
# List every row whose 'hgnc_symbol' occurs more than once
# (an empty frame means the symbols are unique)
duplicates = neg_results_df.loc[
    neg_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M10237,M1036,M10501,M11961,M12113,M13893,M14427,M149,M16189,...,M42508,M4409,M5275,M5336,M543,M5611,M6782,M80,M864,M9585


**Negative - Save File**

In [24]:
# Write the final table to CSV with default settings
# (so the row index is written as the first, unnamed column)
neg_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ML_input_files/ClosenessCentrality_tables/msigdb/local_basal_negative.csv'
)

**POSITIVE Genes**

In [25]:
# Folder that is scanned for the positive-gene .tsv tables.
# NOTE(review): absolute, machine-specific path — the notebook only runs as written on this machine.
pos_folderpath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_msigdb/local_basal_positive_genesets'

In [26]:
# Encode the folder path to bytes using the filesystem encoding.
# os.listdir() on a bytes path returns bytes entries, which is why the
# listing code decodes each name with os.fsdecode().
pos_folder = os.fsencode(pos_folderpath)

**POSITIVE - Get files**

In [27]:
# Container for the per-file DataFrames (starts empty)
pos_df_list = list()

In [28]:
# Load every .tsv table in the folder into a list of DataFrames.
# - The list is rebuilt from scratch instead of appended to, so re-running
#   this cell cannot load the same tables twice.
# - The names are sorted because os.listdir() returns entries in arbitrary
#   order, which would otherwise make the merged column order depend on the
#   platform/filesystem.
pos_tsv_names = sorted(
    name
    for name in map(os.fsdecode, os.listdir(pos_folder))
    if name.endswith('.tsv')
)
pos_df_list = [
    # reset_index(drop=True) keeps the original guarantee of a plain 0..n-1 row index
    pd.read_csv(os.path.join(pos_folderpath, name), sep='\t').reset_index(drop=True)
    for name in pos_tsv_names
]

In [29]:
# Sanity check: how many .tsv tables were loaded into pos_df_list
print(len(pos_df_list))

347


**POSITIVE - Get a list of all gene symbols**

In [30]:
# Path of the tab-separated file listing the positive genes (read in the next cell).
# NOTE(review): this file is named '..._sig_genes.tsv' whereas the negative list is
# '..._negative_genes.tsv' — confirm this is the intended positive gene list.
# NOTE(review): absolute, machine-specific path.
pos_all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/filtered-degs-genes-only/local_basal_sig_genes.tsv'

In [31]:
# Load the tab-separated gene list into a DataFrame
pos_all_genes_df = pd.read_csv(filepath_or_buffer=pos_all_genes_filepath, delimiter='\t')

In [32]:
# Display the raw gene list (the last expression of a cell is rendered as its output)
pos_all_genes_df

Unnamed: 0,ID
0,SLC2A1
1,SLC16A3
2,TNS4
3,SCEL
4,ZBED2
...,...
610,DNASE1
611,TFF2
612,SPRR3
613,CACNA1E


In [33]:
# Keep only the gene-symbol column ('ID').
# .copy() makes this an independent frame, so modifying it later neither
# aliases pos_all_genes_df nor raises SettingWithCopyWarning.
pos_genes_list_df = pos_all_genes_df[['ID']].copy()

In [34]:
# Rename 'ID' to 'hgnc_symbol', the key column used when merging with the tables.
# The result is reassigned rather than renamed with inplace=True: mutating a
# frame that was derived from a column selection in place triggers
# SettingWithCopyWarning.
pos_genes_list_df = pos_genes_list_df.rename(columns={'ID': 'hgnc_symbol'})

In [35]:
# Order the gene symbols alphabetically
pos_genes_list_df = pos_genes_list_df.sort_values(by='hgnc_symbol', ascending=True)

In [36]:
# Display the sorted gene-symbol list (row labels still carry the pre-sort positions)
pos_genes_list_df

Unnamed: 0,hgnc_symbol
154,A1CF
195,ABAT
114,ABCA3
459,ABCC8
506,ACE2
...,...
422,ZBTB16
189,ZNF185
273,ZNF365
254,ZNF469


**Positive - Combine files**

In [37]:
# Fold all loaded tables into one wide frame via successive outer joins on 'hgnc_symbol'
pos_combined_df = reduce(
    lambda merged, table: merged.merge(table, how='outer', on='hgnc_symbol'),
    pos_df_list,
)

In [38]:
# Order the combined table alphabetically by gene symbol
pos_combined_df = pos_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [39]:
# Display the combined table; NaN marks a gene that is absent from a given table
pos_combined_df

Unnamed: 0,hgnc_symbol,M10041,M10165,M10237,M10279,M10320,M10351,M10431,M10501,M10508,...,M9197,M9199,M9257,M929,M9365,M9483,M9524,M9585,M9639,M9893
383,A1CF,,,,,,,,,,...,,,,,,,,,,
213,ABAT,,,,,,,,,,...,,,,,,,,,,
216,ABCC8,,,,,,,,,,...,,,,,,,,,,
312,ACE2,,,,,,,,,,...,,,,,,,,,,
243,ACP5,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223,WNT2,,,,,,,,,,...,,,,,,,,,,
72,WNT5A,,,,,,0.615385,,,,...,,,,,,,,,0.428571,
110,XAF1,,,,,,,,,,...,,,,,,,,,,
365,ZNF469,,,,,,,,,,...,,,,,,,,,,


In [40]:
# List every row whose 'hgnc_symbol' occurs more than once
# (an empty frame means the symbols are unique)
duplicates = pos_combined_df.loc[
    pos_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M10041,M10165,M10237,M10279,M10320,M10351,M10431,M10501,M10508,...,M9197,M9199,M9257,M929,M9365,M9483,M9524,M9585,M9639,M9893


**Positive - Combine Results with a List of All Gene Symbols**

In [41]:
# Left-join the combined table onto the full gene list so every listed gene
# gets a row, even if it appears in none of the tables
pos_results_df = pos_genes_list_df.merge(pos_combined_df, on='hgnc_symbol', how='left')

In [42]:
# Order the result alphabetically by gene symbol
pos_results_df = pos_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [43]:
# Count the missing values in each column
pos_empty_cells = pos_results_df.isna().sum()
pos_empty_cells

hgnc_symbol      0
M10041         607
M10165         599
M10237         582
M10279         593
              ... 
M9483          607
M9524          606
M9585          598
M9639          603
M9893          604
Length: 348, dtype: int64

In [44]:
# Substitute 0 for every missing value
pos_results_df = pos_results_df.fillna(value=0)

In [45]:
# Display the final table after missing values were replaced with 0
pos_results_df

Unnamed: 0,hgnc_symbol,M10041,M10165,M10237,M10279,M10320,M10351,M10431,M10501,M10508,...,M9197,M9199,M9257,M929,M9365,M9483,M9524,M9585,M9639,M9893
0,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ABAT,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ABCA3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ABCC8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ACE2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
610,ZBTB16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
611,ZNF185,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
612,ZNF365,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
613,ZNF469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
# List every row whose 'hgnc_symbol' occurs more than once
# (an empty frame means the symbols are unique)
duplicates = pos_results_df.loc[
    pos_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M10041,M10165,M10237,M10279,M10320,M10351,M10431,M10501,M10508,...,M9197,M9199,M9257,M929,M9365,M9483,M9524,M9585,M9639,M9893


**Positive - Save File**

In [47]:
# Write the final table to CSV with default settings
# (so the row index is written as the first, unnamed column)
pos_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ML_input_files/ClosenessCentrality_tables/msigdb/local_basal_positive.csv'
)

**ALL Genes**

In [48]:
# Folder that is scanned for the all-genes .tsv tables.
# NOTE(review): absolute, machine-specific path — the notebook only runs as written on this machine.
all_folderpath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ClosenessCentrality_tables_msigdb/local_basal_all_genesets'

In [49]:
# Encode the folder path to bytes using the filesystem encoding.
# os.listdir() on a bytes path returns bytes entries, which is why the
# listing code decodes each name with os.fsdecode().
all_folder = os.fsencode(all_folderpath)

**ALL - Get files**

In [50]:
# Container for the per-file DataFrames (starts empty)
all_df_list = list()

In [51]:
# Load every .tsv table in the folder into a list of DataFrames.
# - The list is rebuilt from scratch instead of appended to, so re-running
#   this cell cannot load the same tables twice.
# - The names are sorted because os.listdir() returns entries in arbitrary
#   order, which would otherwise make the merged column order depend on the
#   platform/filesystem.
all_tsv_names = sorted(
    name
    for name in map(os.fsdecode, os.listdir(all_folder))
    if name.endswith('.tsv')
)
all_df_list = [
    # reset_index(drop=True) keeps the original guarantee of a plain 0..n-1 row index
    pd.read_csv(os.path.join(all_folderpath, name), sep='\t').reset_index(drop=True)
    for name in all_tsv_names
]

In [52]:
# Sanity check: how many .tsv tables were loaded into all_df_list
print(len(all_df_list))

1673


**ALL - Get a list of all gene symbols**

In [53]:
# Path of the tab-separated file listing all genes (read in the next cell).
# NOTE(review): absolute, machine-specific path.
all_genes_filepath = 'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/filtered-degs-genes-only/local_basal_all_genes.tsv'

In [54]:
# Load the tab-separated gene list into a DataFrame
all_genes_df = pd.read_csv(filepath_or_buffer=all_genes_filepath, delimiter='\t')

In [55]:
# Display the raw gene list (the last expression of a cell is rendered as its output)
all_genes_df

Unnamed: 0,ID
0,SLC2A1
1,SLC16A3
2,TNS4
3,SCEL
4,ZBED2
...,...
15389,MAP4
15390,MTHFD2
15391,FGF5
15392,GLOD4


In [56]:
# Keep only the gene-symbol column ('ID').
# .copy() makes this an independent frame, so modifying it later neither
# aliases all_genes_df nor raises SettingWithCopyWarning.
all_genes_list_df = all_genes_df[['ID']].copy()

In [57]:
# Rename 'ID' to 'hgnc_symbol', the key column used when merging with the tables.
# The result is reassigned rather than renamed with inplace=True: mutating a
# frame that was derived from a column selection in place triggers
# SettingWithCopyWarning.
all_genes_list_df = all_genes_list_df.rename(columns={'ID': 'hgnc_symbol'})

In [58]:
# Order the gene symbols alphabetically
all_genes_list_df = all_genes_list_df.sort_values(by='hgnc_symbol', ascending=True)

In [59]:
# Display the sorted gene-symbol list (row labels still carry the pre-sort positions)
all_genes_list_df

Unnamed: 0,hgnc_symbol
6285,A1BG
211,A1CF
6849,A2M
13474,A2ML1
11664,A3GALT2
...,...
5943,ZXDC
10684,ZYG11A
4407,ZYG11B
3480,ZYX


**ALL - Combine files**

In [60]:
# Fold all loaded tables into one wide frame via successive outer joins on 'hgnc_symbol'
all_combined_df = reduce(
    lambda merged, table: merged.merge(table, how='outer', on='hgnc_symbol'),
    all_df_list,
)

In [61]:
# Order the combined table alphabetically by gene symbol
all_combined_df = all_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [62]:
# Display the combined table; NaN marks a gene that is absent from a given table
all_combined_df

Unnamed: 0,hgnc_symbol,M1,M10041,M10065,M1007,M10117,M1012,M10134,M10137,M10142,...,M9905,M9911,M9936,M9940,M9946,M9948,M9951,M998,M9982,M999
4044,A1CF,,,,,,,,,,...,,,,,,,,,,
1724,A2M,,,,,,,,,,...,,,,,,,,,,
3699,A4GALT,,,,,,,,,,...,,,,,,,,,,
691,AADAC,,,,,,,,,,...,,,,,,,,,,
2252,AANAT,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3734,ZNHIT6,,,,,,,,,,...,,,,,,,,,,
3778,ZSCAN20,,,,,,,,,,...,,,,,,,,,,
1913,ZWILCH,,,,,,,,,,...,,,,,,,,,,
223,ZWINT,,,,,,,,,,...,,,,,,,,,,1.0


In [63]:
# List every row whose 'hgnc_symbol' occurs more than once
# (an empty frame means the symbols are unique)
duplicates = all_combined_df.loc[
    all_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M1,M10041,M10065,M1007,M10117,M1012,M10134,M10137,M10142,...,M9905,M9911,M9936,M9940,M9946,M9948,M9951,M998,M9982,M999


**ALL - Combine Results with a List of All Gene Symbols**

In [64]:
# Left-join the combined table onto the full gene list so every listed gene
# gets a row, even if it appears in none of the tables
all_results_df = all_genes_list_df.merge(all_combined_df, on='hgnc_symbol', how='left')

In [65]:
# Order the result alphabetically by gene symbol
all_results_df = all_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [66]:
# Count the missing values in each column
all_empty_cells = all_results_df.isna().sum()
all_empty_cells

hgnc_symbol        0
M1             15371
M10041         15362
M10065         15374
M1007          15387
               ...  
M9948          15382
M9951          15383
M998           15364
M9982          15305
M999           15378
Length: 1674, dtype: int64

In [67]:
# Substitute 0 for every missing value
all_results_df = all_results_df.fillna(value=0)

In [68]:
# Display the final table after missing values were replaced with 0
all_results_df

Unnamed: 0,hgnc_symbol,M1,M10041,M10065,M1007,M10117,M1012,M10134,M10137,M10142,...,M9905,M9911,M9936,M9940,M9946,M9948,M9951,M998,M9982,M999
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
2,A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
3,A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
4,A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15389,ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
15390,ZYG11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
15391,ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.00000,0.0,0.0,0.0
15392,ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.37037,0.0,0.0,0.0


In [69]:
# List every row whose 'hgnc_symbol' occurs more than once
# (an empty frame means the symbols are unique)
duplicates = all_results_df.loc[
    all_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M1,M10041,M10065,M1007,M10117,M1012,M10134,M10137,M10142,...,M9905,M9911,M9936,M9940,M9946,M9948,M9951,M998,M9982,M999


**ALL - Save File**

In [70]:
# Write the final table to CSV with default settings
# (so the row index is written as the first, unnamed column)
all_results_df.to_csv(
    path_or_buf=r'C:/Users/WCVan/Downloads/pdac_analysis_vangie/local_basal_results/ML_input_files/ClosenessCentrality_tables/msigdb/local_basal_all.csv'
)