In [2]:
import pandas as pd

# Load the column names from each file
file_paths = {
    "pcawg_found_columns_iqr_corr": "pcawg_found_columns_iqr_corr.txt",
    "pcawg_found_columns_prim": "pcawg_found_columns_prim.txt",
    "pcawg_found_columns_sith_corr": "pcawg_found_columns_sith_corr.txt",
    "tgca_found_columns_iqr_corr": "tgca_found_columns_iqr_corr.txt",
    "tgca_found_columns_prim": "tgca_found_columns_prim.txt",
    "tgca_found_columns_sith_corr": "tgca_found_columns_sith_corr.txt",
}

# Extract column names from each file
column_sets = {}
for name, path in file_paths.items():
    with open(path, "r") as file:
        columns = [line.split(",")[0].strip().split(": ")[1] for line in file.readlines() if "Column:" in line]
        column_sets[name] = set(columns)

# Find common columns across all six files
common_columns = set.intersection(*column_sets.values())

# Find pairwise similarities
pairwise_similarities = {}
file_names = list(column_sets.keys())

for i in range(len(file_names)):
    for j in range(i + 1, len(file_names)):
        file1, file2 = file_names[i], file_names[j]
        shared_columns = column_sets[file1].intersection(column_sets[file2])
        pairwise_similarities[(file1, file2)] = shared_columns

# Convert to DataFrame for better visualization
similarities_df = pd.DataFrame([(k[0], k[1], len(v)) for k, v in pairwise_similarities.items()],
                               columns=["File 1", "File 2", "Common Columns"])

similarities_df

Unnamed: 0,File 1,File 2,Common Columns
0,pcawg_found_columns_iqr_corr,pcawg_found_columns_prim,3
1,pcawg_found_columns_iqr_corr,pcawg_found_columns_sith_corr,20
2,pcawg_found_columns_iqr_corr,tgca_found_columns_iqr_corr,0
3,pcawg_found_columns_iqr_corr,tgca_found_columns_prim,0
4,pcawg_found_columns_iqr_corr,tgca_found_columns_sith_corr,0
5,pcawg_found_columns_prim,pcawg_found_columns_sith_corr,3
6,pcawg_found_columns_prim,tgca_found_columns_iqr_corr,0
7,pcawg_found_columns_prim,tgca_found_columns_prim,0
8,pcawg_found_columns_prim,tgca_found_columns_sith_corr,0
9,pcawg_found_columns_sith_corr,tgca_found_columns_iqr_corr,0


In [5]:
# Create a dictionary to store pairwise common columns as DataFrames
pairwise_common_columns = {}

for (file1, file2), shared_columns in pairwise_similarities.items():
    df = pd.DataFrame(list(shared_columns), columns=["Common Columns"])
    pairwise_common_columns[f"{file1} & {file2}"] = df

pairwise_common_columns


{'pcawg_found_columns_iqr_corr & pcawg_found_columns_prim':      Common Columns
 0        cg27086014
 1  icgc_specimen_id
 2    icgc_sample_id,
 'pcawg_found_columns_iqr_corr & pcawg_found_columns_sith_corr':       Common Columns
 0         cg00870279
 1         cg07235253
 2     icgc_sample_id
 3         cg08532569
 4         cg16401270
 5         cg10982913
 6         cg12164232
 7         cg17163967
 8         cg16338877
 9         cg16272777
 10        cg19477190
 11        cg08598483
 12        cg01842321
 13        cg22635155
 14        cg06805320
 15        cg27086014
 16        cg14387626
 17        cg21142743
 18  icgc_specimen_id
 19        cg01861555,
 'pcawg_found_columns_iqr_corr & tgca_found_columns_iqr_corr': Empty DataFrame
 Columns: [Common Columns]
 Index: [],
 'pcawg_found_columns_iqr_corr & tgca_found_columns_prim': Empty DataFrame
 Columns: [Common Columns]
 Index: [],
 'pcawg_found_columns_iqr_corr & tgca_found_columns_sith_corr': Empty DataFrame
 Columns: [Common

In [7]:
# Extract column names from each file
column_sets = {}
for name, path in file_paths.items():
    with open(path, "r") as file:
        columns = [line.split(",")[0].strip().split(": ")[1] for line in file.readlines() if "Column:" in line]
        column_sets[name] = set(columns)

# Collect all unique probes from all files
all_probes = list(set.union(*column_sets.values()))

# Create a DataFrame with probes as rows and files as columns
presence_matrix = pd.DataFrame(index=all_probes, columns=file_paths.keys(), dtype=int)

# Populate the DataFrame with 1 (present) or 0 (absent)
for file, probes in column_sets.items():
    presence_matrix[file] = presence_matrix.index.isin(probes).astype(int)

# Reset index to move probes into the first column
presence_matrix.reset_index(inplace=True)
presence_matrix.rename(columns={"index": "Probe"}, inplace=True)


In [8]:
presence_matrix

Unnamed: 0,Probe,pcawg_found_columns_iqr_corr,pcawg_found_columns_prim,pcawg_found_columns_sith_corr,tgca_found_columns_iqr_corr,tgca_found_columns_prim,tgca_found_columns_sith_corr
0,cg24788333,0,0,0,1,0,0
1,cg13621113,0,0,0,0,0,1
2,cg17368114,1,0,0,0,0,0
3,cg09110388,0,0,0,1,0,0
4,cg00829845,0,1,0,0,0,0
...,...,...,...,...,...,...,...
1455,cg06483559,0,0,0,0,0,1
1456,cg26788570,1,0,0,0,0,0
1457,cg24429159,0,1,0,0,0,0
1458,cg08088970,0,1,0,0,0,0


In [10]:
presence_matrix.to_csv('../_OUTPUTS_/common_probes_presence_matrix.csv', index=False)