In [26]:
import numpy as np
import pandas as pd
import os
from functools import reduce

In [27]:
# Move into the project folder so that the relative paths used in this notebook
# resolve. The original location stays the default, but it only exists on one
# machine, so it can be overridden with the PDAC_FEATURE_GENERATION_DIR
# environment variable instead of editing the notebook.
project_dir = os.environ.get(
    'PDAC_FEATURE_GENERATION_DIR',
    '/Users/anb/Documents/CMEB-Lab/Projects/ML-DRG-PDAC-2025/pdac_feature_generation',
)
os.chdir(project_dir)

**NEGATIVE Genes**

In [28]:
# Folder (relative to the working directory) that holds the per-gene-set
# closeness-centrality tables for the negative genes
neg_folderpath = (
    'local_classic_results/ClosenessCentrality_tables_msigdb/'
    'local_classic_negative_genesets'
)

In [29]:
# Get the folder as a bytes path: os.listdir() on a bytes path returns bytes
# entries, which is why the file loop decodes each entry with os.fsdecode()
neg_folder = os.fsencode(neg_folderpath)

**NEGATIVE - Get files**

In [30]:
# Initialize an empty list that will collect one DataFrame per .tsv table
# found in the negative gene-set folder
neg_df_list = []

In [31]:
# Load every closeness-centrality table (*.tsv) found in the negative folder.
#
# The list is rebuilt from scratch here, so re-running this cell can never
# append the same tables a second time. File names are sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the column order of the merged table differ between machines or runs.
neg_tsv_names = sorted(
    name
    for name in map(os.fsdecode, os.listdir(neg_folder))
    if name.endswith('.tsv')
)
neg_df_list = [
    # reset_index(drop=True) replaces whatever index read_csv produced with a
    # fresh 0..n-1 range and does not keep the old index as a column
    pd.read_csv(os.path.join(neg_folderpath, name), sep='\t').reset_index(drop=True)
    for name in neg_tsv_names
]

In [32]:
# Check how many dataframes were collected in the list
# (one is appended for each .tsv file that was read)
print(len(neg_df_list))

10


**NEGATIVE - Get a list of all genes symbols**

In [33]:
# Location (relative to the working directory) of the tab-separated file that
# lists the negative genes
neg_all_genes_filepath = (
    'local_classic_results/filtered-degs-genes-only/'
    'local_classic_negative_genes.tsv'
)

In [34]:
# Read the tab-separated gene list into a DataFrame
neg_all_genes_df = pd.read_table(neg_all_genes_filepath, sep='\t')

In [35]:
# Display the gene table that was just read, as a visual sanity check
neg_all_genes_df

Unnamed: 0,ID
0,GEN1
1,BRIP1
2,TMC2
3,BCAS3
4,MTBP
...,...
199,SLC3A2
200,YTHDF2
201,TMEM106A
202,LYSMD4


In [36]:
# Create a data frame that contains only the column of gene symbols.
# The double brackets select a one-column DataFrame rather than a Series.
neg_genes_list_df = neg_all_genes_df[['ID']]

In [37]:
# Rename the gene column from 'ID' to 'hgnc_symbol'.
# The result is reassigned instead of using inplace=True: the frame was
# selected out of another DataFrame, and renaming a fresh copy avoids mutating
# a derived object in place (a common source of SettingWithCopyWarning).
neg_genes_list_df = neg_genes_list_df.rename(columns={'ID': 'hgnc_symbol'})

In [38]:
# Order the rows alphabetically by gene symbol (the original row labels are kept)
neg_genes_list_df = neg_genes_list_df.sort_values(by='hgnc_symbol', ascending=True)

In [39]:
# Display the renamed, alphabetically sorted gene list
neg_genes_list_df

Unnamed: 0,hgnc_symbol
72,ACAP1
40,ADAM22
35,ADAMTS17
127,AGPS
5,ALS2
...,...
167,ZNF12
163,ZNF134
197,ZNF226
43,ZNF835


**Negative - Combine files**

In [40]:
# Fold the list of tables into one wide table: each step outer-joins the next
# table onto the running result using 'hgnc_symbol' as the key, so a gene that
# appears in any table keeps a row and is NaN where a table does not list it.
neg_combined_df = reduce(
    lambda merged, table: merged.merge(table, how='outer', on=['hgnc_symbol']),
    neg_df_list,
)

In [41]:
# Order the combined table alphabetically by gene symbol
neg_combined_df = neg_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [42]:
# Display the combined table (one row per gene symbol, NaN where a merged
# table has no value for that gene)
neg_combined_df

Unnamed: 0,hgnc_symbol,M543,M1743,M10501,M734,M16843,M5611,M80,M18120,M14427,M17728
0,ADAM22,,,,,,,,,1.0,
1,BRIP1,0.75,,,,,,0.833333,1.0,0.714286,
2,CKAP5,1.0,,,,,1.0,1.0,,,
3,CLSPN,0.75,,,,,,,0.6,,
4,COX7C,,,,,,,,,,0.444444
5,CSTF3,,,,1.0,0.666667,,,,,
6,EAF1,,,,0.666667,,,,,,
7,FIP1L1,,,,1.0,1.0,,,,,
8,GEN1,,1.0,,,,,0.5,0.6,,
9,GINS1,0.5,,1.0,,,1.0,0.416667,,0.384615,


In [43]:
# Collect every row whose 'hgnc_symbol' occurs more than once; keep=False
# flags all occurrences rather than only the repeats. An empty result means
# the symbols in the combined table are unique.
duplicates = neg_combined_df.loc[
    neg_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M543,M1743,M10501,M734,M16843,M5611,M80,M18120,M14427,M17728


**Negative - Combine Results with A List of All Genes Symbols**

In [44]:
# Left-join the combined table onto the full gene list, so every gene in the
# list keeps a row even when the combined table has no entry for it
neg_results_df = neg_genes_list_df.merge(neg_combined_df, on='hgnc_symbol', how='left')

In [45]:
# Order the result table alphabetically by gene symbol
neg_results_df = neg_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [46]:
# Count the missing (NaN) cells in each column of the result table
neg_empty_cells = neg_results_df.isna().sum()
neg_empty_cells

hgnc_symbol      0
M543           195
M1743          198
M10501         200
M734           195
M16843         196
M5611          195
M80            196
M18120         200
M14427         196
M17728         195
dtype: int64

In [47]:
# Fill every missing value in the result table with 0
neg_results_df = neg_results_df.fillna(value=0)

In [48]:
# Display the final table after missing values were replaced with 0
neg_results_df

Unnamed: 0,hgnc_symbol,M543,M1743,M10501,M734,M16843,M5611,M80,M18120,M14427,M17728
0,ACAP1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ADAM22,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,ADAMTS17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,AGPS,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ALS2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
199,ZNF12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
200,ZNF134,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
201,ZNF226,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
202,ZNF835,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [49]:
# Collect every row of the result table whose 'hgnc_symbol' occurs more than
# once (keep=False flags all occurrences); an empty result means no duplicates
duplicates = neg_results_df.loc[
    neg_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M543,M1743,M10501,M734,M16843,M5611,M80,M18120,M14427,M17728


**Negative - Save File**

In [50]:
# Save the DataFrame as a CSV file.
# The destination folder is created first: to_csv() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
neg_output_path = 'local_classic_results/ML_input_files/ClosenessCentrality_tables/msigdb/local_classic_negative.csv'
os.makedirs(os.path.dirname(neg_output_path), exist_ok=True)
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column - confirm downstream readers expect it.
neg_results_df.to_csv(neg_output_path)

**POSITIVE Genes**

In [51]:
# Folder (relative to the working directory) that holds the per-gene-set
# closeness-centrality tables for the positive genes
pos_folderpath = (
    'local_classic_results/ClosenessCentrality_tables_msigdb/'
    'local_classic_positive_genesets'
)

In [52]:
# Get the folder as a bytes path: os.listdir() on a bytes path returns bytes
# entries, which is why the file loop decodes each entry with os.fsdecode()
pos_folder = os.fsencode(pos_folderpath)

**POSITIVE - Get files**

In [53]:
# Initialize an empty list that will collect one DataFrame per .tsv table
# found in the positive gene-set folder
pos_df_list = []

In [54]:
# Load every closeness-centrality table (*.tsv) found in the positive folder.
#
# The list is rebuilt from scratch here, so re-running this cell can never
# append the same tables a second time. File names are sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the column order of the merged table differ between machines or runs.
pos_tsv_names = sorted(
    name
    for name in map(os.fsdecode, os.listdir(pos_folder))
    if name.endswith('.tsv')
)
pos_df_list = [
    # reset_index(drop=True) replaces whatever index read_csv produced with a
    # fresh 0..n-1 range and does not keep the old index as a column
    pd.read_csv(os.path.join(pos_folderpath, name), sep='\t').reset_index(drop=True)
    for name in pos_tsv_names
]

In [55]:
# Check how many dataframes were collected in the list
# (one is appended for each .tsv file that was read)
print(len(pos_df_list))

42


**POSITIVE - Get a list of all genes symbols**

In [56]:
# Location (relative to the working directory) of the tab-separated file that
# lists the genes for the positive set.
# NOTE(review): this file is named "sig_genes" while the other two gene lists
# follow the "<set>_genes" pattern - confirm this is the intended input.
pos_all_genes_filepath = (
    'local_classic_results/filtered-degs-genes-only/'
    'local_classic_sig_genes.tsv'
)

In [57]:
# Read the tab-separated gene list into a DataFrame
pos_all_genes_df = pd.read_table(pos_all_genes_filepath, sep='\t')

In [58]:
# Display the gene table that was just read, as a visual sanity check
pos_all_genes_df

Unnamed: 0,ID
0,TMPRSS4
1,CTSE
2,S100P
3,AGR2
4,STYK1
...,...
354,MSMB
355,APOBEC2
356,DMBT1
357,FXYD2


In [59]:
# Create a data frame that contains only the column of gene symbols.
# The double brackets select a one-column DataFrame rather than a Series.
pos_genes_list_df = pos_all_genes_df[['ID']]

In [60]:
# Rename the gene column from 'ID' to 'hgnc_symbol'.
# The result is reassigned instead of using inplace=True: the frame was
# selected out of another DataFrame, and renaming a fresh copy avoids mutating
# a derived object in place (a common source of SettingWithCopyWarning).
pos_genes_list_df = pos_genes_list_df.rename(columns={'ID': 'hgnc_symbol'})

In [61]:
# Order the rows alphabetically by gene symbol (the original row labels are kept)
pos_genes_list_df = pos_genes_list_df.sort_values(by='hgnc_symbol', ascending=True)

In [62]:
# Display the renamed, alphabetically sorted gene list
pos_genes_list_df

Unnamed: 0,hgnc_symbol
79,ACSL5
42,ADAM28
77,ADAM8
198,ADARB2
269,ADCYAP1R1
...,...
141,WNT5A
98,ZBED2
207,ZBTB16
273,ZNF469


**Positive - Combine files**

In [63]:
# Fold the list of tables into one wide table: each step outer-joins the next
# table onto the running result using 'hgnc_symbol' as the key, so a gene that
# appears in any table keeps a row and is NaN where a table does not list it.
pos_combined_df = reduce(
    lambda merged, table: merged.merge(table, how='outer', on=['hgnc_symbol']),
    pos_df_list,
)

In [64]:
# Order the combined table alphabetically by gene symbol
pos_combined_df = pos_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [65]:
# Display the combined table (one row per gene symbol, NaN where a merged
# table has no value for that gene)
pos_combined_df

Unnamed: 0,hgnc_symbol,M46596,M4210,M46621,M46609,M9524,M12671,M15336,M10702,M17454,...,M2342,M6744,M15472,M19062,M12527,M17299,M5062,M46606,M2533,M14791
0,ADAM8,,,,,,,,,,...,,,,,,,,,,
1,AGR2,0.761905,,,0.666667,,0.75,0.800000,,,...,,0.777778,,0.677419,,0.833333,,,,
2,AGR3,0.500000,,,,,,,,,...,,,,0.446809,,,,,,
3,AKR1B10,,,,,,,,1.0,,...,,,,,,,,,,
4,ANPEP,,,,,,,,,,...,,,,,,,,,,0.4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
125,TMPRSS4,,,,,,,,,,...,,1.000000,,0.500000,,,,,0.263158,
126,TSPAN1,0.457143,,,0.421053,,0.50,0.461538,,1.0,...,,0.500000,,0.428571,,0.500000,,,,
127,TSPAN8,0.444444,,,0.421053,,,0.461538,,,...,,,,,,,,,,
128,VSIG1,0.444444,,,,,,,,,...,,,,,,,,,,


In [66]:
# Collect every row whose 'hgnc_symbol' occurs more than once; keep=False
# flags all occurrences rather than only the repeats. An empty result means
# the symbols in the combined table are unique.
duplicates = pos_combined_df.loc[
    pos_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M46596,M4210,M46621,M46609,M9524,M12671,M15336,M10702,M17454,...,M2342,M6744,M15472,M19062,M12527,M17299,M5062,M46606,M2533,M14791


**Positive - Combine Results with A List of All Genes Symbols**

In [67]:
# Left-join the combined table onto the full gene list, so every gene in the
# list keeps a row even when the combined table has no entry for it
pos_results_df = pos_genes_list_df.merge(pos_combined_df, on='hgnc_symbol', how='left')

In [68]:
# Order the result table alphabetically by gene symbol
pos_results_df = pos_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [69]:
# Count the missing (NaN) cells in each column of the result table
pos_empty_cells = pos_results_df.isna().sum()
pos_empty_cells

hgnc_symbol      0
M46596         340
M4210          346
M46621         346
M46609         350
M9524          353
M12671         349
M15336         344
M10702         339
M17454         352
M126           352
M4306          329
M18311         353
M9185          351
M246           338
M16667         342
M9032          349
M3645          337
M929           354
M5547          354
M48093         345
M8706          341
M13661         353
M8124          342
M10961         347
M13273         350
M10431         353
M12795         353
M2591          355
M3102          352
M289           349
M19982         349
M1884          344
M2342          345
M6744          345
M15472         346
M19062         331
M12527         350
M17299         348
M5062          343
M46606         349
M2533          348
M14791         348
dtype: int64

In [70]:
# Fill every missing value in the result table with 0
pos_results_df = pos_results_df.fillna(value=0)

In [71]:
# Display the final table after missing values were replaced with 0
pos_results_df

Unnamed: 0,hgnc_symbol,M46596,M4210,M46621,M46609,M9524,M12671,M15336,M10702,M17454,...,M2342,M6744,M15472,M19062,M12527,M17299,M5062,M46606,M2533,M14791
0,ACSL5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ADAM28,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ADAM8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ADARB2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ADCYAP1R1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
354,WNT5A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
355,ZBED2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
356,ZBTB16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
357,ZNF469,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [72]:
# Collect every row of the result table whose 'hgnc_symbol' occurs more than
# once (keep=False flags all occurrences); an empty result means no duplicates
duplicates = pos_results_df.loc[
    pos_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M46596,M4210,M46621,M46609,M9524,M12671,M15336,M10702,M17454,...,M2342,M6744,M15472,M19062,M12527,M17299,M5062,M46606,M2533,M14791


**Positive - Save File**

In [73]:
# Save the DataFrame as a CSV file.
# The destination folder is created first: to_csv() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
pos_output_path = 'local_classic_results/ML_input_files/ClosenessCentrality_tables/msigdb/local_classic_positive.csv'
os.makedirs(os.path.dirname(pos_output_path), exist_ok=True)
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column - confirm downstream readers expect it.
pos_results_df.to_csv(pos_output_path)

**ALL Genes**

In [74]:
# Folder (relative to the working directory) that holds the per-gene-set
# closeness-centrality tables for all genes
all_folderpath = (
    'local_classic_results/ClosenessCentrality_tables_msigdb/'
    'local_classic_all_genesets'
)

In [75]:
# Get the folder as a bytes path: os.listdir() on a bytes path returns bytes
# entries, which is why the file loop decodes each entry with os.fsdecode()
all_folder = os.fsencode(all_folderpath)

**ALL - Get files**

In [76]:
# Initialize an empty list that will collect one DataFrame per .tsv table
# found in the all-genes gene-set folder
all_df_list = []

In [77]:
# Load every closeness-centrality table (*.tsv) found in the all-genes folder.
#
# The list is rebuilt from scratch here, so re-running this cell can never
# append the same tables a second time. File names are sorted because
# os.listdir() returns entries in arbitrary order, which would otherwise make
# the column order of the merged table differ between machines or runs.
all_tsv_names = sorted(
    name
    for name in map(os.fsdecode, os.listdir(all_folder))
    if name.endswith('.tsv')
)
all_df_list = [
    # reset_index(drop=True) replaces whatever index read_csv produced with a
    # fresh 0..n-1 range and does not keep the old index as a column
    pd.read_csv(os.path.join(all_folderpath, name), sep='\t').reset_index(drop=True)
    for name in all_tsv_names
]

In [78]:
# Check how many dataframes were collected in the list
# (one is appended for each .tsv file that was read)
print(len(all_df_list))

1221


**ALL - Get a list of all genes symbols**

In [79]:
# Location (relative to the working directory) of the tab-separated file that
# lists all genes
all_genes_filepath = (
    'local_classic_results/filtered-degs-genes-only/'
    'local_classic_all_genes.tsv'
)

In [80]:
# Read the tab-separated gene list into a DataFrame
all_genes_df = pd.read_table(all_genes_filepath, sep='\t')

In [81]:
# Display the gene table that was just read, as a visual sanity check
all_genes_df

Unnamed: 0,ID
0,TMPRSS4
1,CTSE
2,S100P
3,GYPC
4,AGR2
...,...
15389,CGB2
15390,SALL1
15391,INTS9
15392,ACTL7A


In [82]:
# Create a data frame that contains only the column of gene symbols.
# The double brackets select a one-column DataFrame rather than a Series.
all_genes_list_df = all_genes_df[['ID']]

In [83]:
# Rename the gene column from 'ID' to 'hgnc_symbol'.
# The result is reassigned instead of using inplace=True: the frame was
# selected out of another DataFrame, and renaming a fresh copy avoids mutating
# a derived object in place (a common source of SettingWithCopyWarning).
all_genes_list_df = all_genes_list_df.rename(columns={'ID': 'hgnc_symbol'})

In [84]:
# Order the rows alphabetically by gene symbol (the original row labels are kept)
all_genes_list_df = all_genes_list_df.sort_values(by='hgnc_symbol', ascending=True)

In [85]:
# Display the renamed, alphabetically sorted gene list
all_genes_list_df

Unnamed: 0,hgnc_symbol
6176,A1BG
2875,A1CF
13870,A2M
3534,A2ML1
9767,A3GALT2
...,...
10183,ZXDC
9982,ZYG11A
12003,ZYG11B
10052,ZYX


**ALL - Combine files**

In [86]:
# Fold the list of tables into one wide table: each step outer-joins the next
# table onto the running result using 'hgnc_symbol' as the key, so a gene that
# appears in any table keeps a row and is NaN where a table does not list it.
# NOTE(review): this performs one merge per table in the list; with a long
# list the repeated merges may be slow - consider profiling this cell.
all_combined_df = reduce(
    lambda merged, table: merged.merge(table, how='outer', on=['hgnc_symbol']),
    all_df_list,
)

In [87]:
# Order the combined table alphabetically by gene symbol
all_combined_df = all_combined_df.sort_values(by='hgnc_symbol', ascending=True)

In [88]:
# Display the combined table (one row per gene symbol, NaN where a merged
# table has no value for that gene)
all_combined_df

Unnamed: 0,hgnc_symbol,M145,M8365,M8417,M16174,M186,M3218,M6189,M46582,M19938,...,M39818,M2123,M16637,M13968,M148,M160,M7098,M14791,M11825,M39617
0,A1CF,,,,,,,,,,...,,,,,,,,,,
1,AANAT,,,,,,,,,,...,,,,,,,,,,
2,ABAT,,,,,,,,,,...,,,,,,,,,,
3,ABCA12,,,,,,,,,,...,,,,,,,,,,
4,ABCA13,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4257,ZNHIT6,,,,,,,,,,...,,,,,,,,,,
4258,ZNRF2,,,,,,,,,,...,,,,,,,,,,
4259,ZSCAN20,,,,,,,,,,...,,,,,,,,,,
4260,ZWILCH,,,,,,,,,,...,,,,,,,,,,


In [89]:
# Collect every row whose 'hgnc_symbol' occurs more than once; keep=False
# flags all occurrences rather than only the repeats. An empty result means
# the symbols in the combined table are unique.
duplicates = all_combined_df.loc[
    all_combined_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M145,M8365,M8417,M16174,M186,M3218,M6189,M46582,M19938,...,M39818,M2123,M16637,M13968,M148,M160,M7098,M14791,M11825,M39617


**ALL - Combine Results with A List of All Genes Symbols**

In [90]:
# Left-join the combined table onto the full gene list, so every gene in the
# list keeps a row even when the combined table has no entry for it
all_results_df = all_genes_list_df.merge(all_combined_df, on='hgnc_symbol', how='left')

In [91]:
# Order the result table alphabetically by gene symbol
all_results_df = all_results_df.sort_values(by='hgnc_symbol', ascending=True)

In [92]:
# Count the missing (NaN) cells in each column of the result table
all_empty_cells = all_results_df.isna().sum()
all_empty_cells

hgnc_symbol        0
M145           15351
M8365          15364
M8417          15381
M16174         15341
               ...  
M160           15375
M7098          15371
M14791         15331
M11825         15378
M39617         15369
Length: 1222, dtype: int64

In [93]:
# Fill every missing value in the result table with 0
all_results_df = all_results_df.fillna(value=0)

In [94]:
# Display the final table after missing values were replaced with 0
all_results_df

Unnamed: 0,hgnc_symbol,M145,M8365,M8417,M16174,M186,M3218,M6189,M46582,M19938,...,M39818,M2123,M16637,M13968,M148,M160,M7098,M14791,M11825,M39617
0,A1BG,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,A1CF,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,A2M,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,A2ML1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,A3GALT2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15389,ZXDC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15390,ZYG11A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15391,ZYG11B,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15392,ZYX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [95]:
# Collect every row of the result table whose 'hgnc_symbol' occurs more than
# once (keep=False flags all occurrences); an empty result means no duplicates
duplicates = all_results_df.loc[
    all_results_df.duplicated(subset='hgnc_symbol', keep=False)
]
duplicates

Unnamed: 0,hgnc_symbol,M145,M8365,M8417,M16174,M186,M3218,M6189,M46582,M19938,...,M39818,M2123,M16637,M13968,M148,M160,M7098,M14791,M11825,M39617


**ALL - Save File**

In [96]:
# Save the DataFrame as a CSV file.
# The destination folder is created first: to_csv() does not create missing
# parent directories and would otherwise fail on a fresh checkout.
all_output_path = 'local_classic_results/ML_input_files/ClosenessCentrality_tables/msigdb/local_classic_all.csv'
os.makedirs(os.path.dirname(all_output_path), exist_ok=True)
# NOTE(review): with the default index=True the row index is written as an
# extra unnamed first column - confirm downstream readers expect it.
all_results_df.to_csv(all_output_path)