In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib.colors import ListedColormap
import os
from comut import comut
from comut import fileparsers
import palettable
import matplotlib

In [None]:
# Prepare data: keep only the mutational-signature rows whose sample is listed in the ID file.
# NOTE(review): the next cell writes its (ordered) result to this same output path.
import csv

# File locations
input_tsv = '/Hartwig_mutational_sig_data.tsv'  # full signature table
sample_ids_tsv = '/Hartwig_Pt_Sample_ID.tsv'  # table whose 'Sample ID' column lists the samples to keep
output_tsv = '/Hartwig_mutational_sig_data_filtered.tsv'  # destination for the filtered table

# Gather the wanted IDs into a set (order is irrelevant for this cell)
sample_ids = set()
with open(sample_ids_tsv, 'r') as f:
    for id_row in csv.DictReader(f, delimiter='\t'):
        sample_ids.add(id_row['Sample ID'])

# Stream the main table: copy the header, then every row whose 'sample_id' is wanted
with open(input_tsv, 'r') as infile, open(output_tsv, 'w', newline='') as outfile:
    reader = csv.DictReader(infile, delimiter='\t')
    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(row for row in reader if row['sample_id'] in sample_ids)

In [None]:
import csv

# Define file paths
input_tsv = '/Hartwig_mutational_sig_data.tsv'  # Path to the main TSV file
sample_ids_tsv = '/Hartwig_Pt_Sample_ID.tsv'  # Path to the file containing sample IDs
output_tsv = '/Hartwig_mutational_sig_data_filtered.tsv'  # Path to the output TSV file

# Step 1: Read the sample IDs in order from the ID file
with open(sample_ids_tsv, 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    sample_ids = [row['Sample ID'] for row in reader]  # List keeps the ID-file order for Step 3

# Set copy of the same IDs: the per-row membership test in Step 2 becomes a hash lookup
# instead of a scan over the whole list.
wanted_ids = set(sample_ids)

# Step 2: Read and filter rows from the main TSV file
with open(input_tsv, 'r') as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    fieldnames = reader.fieldnames  # Preserve original column order

    # Index the kept rows by sample ID; if an ID occurs on several rows, the last row wins
    rows_by_id = {row['sample_id']: row for row in reader if row['sample_id'] in wanted_ids}

# Step 3: Write rows in the order of sample IDs to the output file
with open(output_tsv, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')

    # Write the header to the output file
    writer.writeheader()

    # Follow the ID-file order; IDs that have no row in the main file are skipped
    for sample_id in sample_ids:
        matched_row = rows_by_id.get(sample_id)
        if matched_row is not None:
            writer.writerow(matched_row)


In [None]:
import csv

# File locations
filtered_tsv = '/Hartwig_mutational_sig_data_filtered.tsv'  # filtered, ordered signature table (input)
output_contribution_tsv = '/Hartwig_mutational_sig_relative_contribution.tsv'  # relative contributions (output)

sample_id_col = 'sample_id'
normalized_rows = []

# Pass 1: read every sample and rescale its signature values by their row total
with open(filtered_tsv, 'r') as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    fieldnames = reader.fieldnames

    # Every column other than the sample identifier is treated as a signature
    signature_cols = [name for name in fieldnames if name != sample_id_col]

    for row in reader:
        signature_values = [float(row[name]) for name in signature_cols]
        total = sum(signature_values)

        if total > 0:
            normalized_values = [value / total for value in signature_values]
        else:
            # Total is not greater than zero: report 0 for every signature
            normalized_values = [0 for _ in signature_values]

        # Sample ID first, then one entry per signature column
        normalized_row = {sample_id_col: row[sample_id_col]}
        normalized_row.update(zip(signature_cols, normalized_values))
        normalized_rows.append(normalized_row)

# Pass 2: write the rescaled table using the input's column layout
with open(output_contribution_tsv, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()
    for normalized_row in normalized_rows:
        writer.writerow(normalized_row)


In [None]:
import csv

# File locations
input_tsv = '/Hartwig_EACmets_drivergenes.tsv'  # driver-gene table (one row per record)
sample_ids_tsv = '/Hartwig_Pt_Sample_ID.tsv'  # table whose 'Sample ID' column lists the samples to keep
output_tsv = '/Hartwig_EACmets_drivergenes_filtered.tsv'  # destination for the filtered table

# IDs to retain, as a set for constant-time membership checks
with open(sample_ids_tsv, 'r') as f:
    sample_ids = set(id_row['Sample ID'] for id_row in csv.DictReader(f, delimiter='\t'))

# Copy the header and every row whose 'sampleId' is in the retained set
with open(input_tsv, 'r') as infile, open(output_tsv, 'w', newline='') as outfile:
    reader = csv.DictReader(infile, delimiter='\t')
    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames, delimiter='\t')
    writer.writeheader()

    kept_rows = (row for row in reader if row['sampleId'] in sample_ids)
    writer.writerows(kept_rows)


In [None]:
# Imported here so the cell does not depend on an earlier cell having imported csv
import csv
from collections import defaultdict

# Define file paths
# Read the sample-filtered driver table (written to this path by the driver-gene filter cell),
# so the "filtered_wide" output only contains the retained samples.
# NOTE(review): this cell used to read the unfiltered '/Hartwig_EACmets_drivergenes.tsv' — confirm intent.
input_tsv = '/Hartwig_EACmets_drivergenes_filtered.tsv'  # Path to the main TSV file
output_tsv = '/Hartwig_EACmets_drivergenes_filtered_wide.tsv'  # Path to the output TSV file

# Step 1: Process the data
# data[sample][gene] -> mutation class
data = defaultdict(lambda: defaultdict(str))  # Nested dictionary to store mutation classes

genes = set()  # To collect all unique gene names for columns

with open(input_tsv, 'r') as infile:
    reader = csv.DictReader(infile, delimiter='\t')

    for row in reader:
        sample_id = row['sampleId']
        gene_name = row['gene']
        mutation_class = row['mutationClass']

        # A later row for the same sample/gene pair replaces the earlier value
        data[sample_id][gene_name] = mutation_class
        genes.add(gene_name)

# Step 2: Create the wide-format data
genes = sorted(genes)  # Sort gene names for consistent column order
fieldnames = ['sampleId'] + genes  # Columns for the output file

output_rows = []
for sample_id, gene_data in data.items():
    row = {'sampleId': sample_id}
    # Add mutation classes under the respective gene columns
    for gene in genes:
        row[gene] = gene_data.get(gene, '')  # Use empty string if no data for the gene
    output_rows.append(row)

# Step 3: Write the wide-format data to a TSV file
with open(output_tsv, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')

    # Write header
    writer.writeheader()

    # Write rows
    writer.writerows(output_rows)

In [None]:
import csv

# File locations
input_tsv = '/DO_ID_WGS_id_PCAWG.tsv'  # donor-to-WGS ID table
sample_ids_tsv = '/DO_filtered_PCAWG_list.tsv'  # table whose 'DO_filtered_list' column lists the donors to keep
output_tsv = '/DO_ID_WGS_id_PCAWG_filtered.tsv'  # destination for the filtered table

# Donors to retain
with open(sample_ids_tsv, 'r') as f:
    id_reader = csv.DictReader(f, delimiter='\t')
    sample_ids = {id_row['DO_filtered_list'] for id_row in id_reader}

# Copy the header, then only the rows whose 'icgc_donor_id' is retained
with open(input_tsv, 'r') as infile, open(output_tsv, 'w', newline='') as outfile:
    reader = csv.DictReader(infile, delimiter='\t')
    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames, delimiter='\t')
    writer.writeheader()

    for row in reader:
        if row['icgc_donor_id'] not in sample_ids:
            continue  # donor not on the list
        writer.writerow(row)

In [None]:
import csv

# File locations
input_tsv = '/PCAWG_driver_mutations_ICGC_EAC.tsv'  # driver-mutation table
sample_ids_tsv = '/DO_ID_WGS_id_PCAWG_filtered.tsv'  # table whose 'tumor_wgs_aliquot_id' column lists the samples to keep
output_tsv = '/PCAWG_driver_mutations_ICGC_EAC_filtered.tsv'  # destination for the filtered table

# Aliquot IDs to retain
with open(sample_ids_tsv, 'r') as f:
    sample_ids = set()
    for id_row in csv.DictReader(f, delimiter='\t'):
        sample_ids.add(id_row['tumor_wgs_aliquot_id'])

# Copy the header, then every row whose 'sample_id' is one of the retained aliquot IDs
with open(input_tsv, 'r') as infile, open(output_tsv, 'w', newline='') as outfile:
    reader = csv.DictReader(infile, delimiter='\t')
    writer = csv.DictWriter(outfile, fieldnames=reader.fieldnames, delimiter='\t')
    writer.writeheader()
    writer.writerows(filter(lambda row: row['sample_id'] in sample_ids, reader))


In [None]:
from collections import defaultdict

# File locations
input_tsv = '/PCAWG_driver_mutations_ICGC_EAC_filtered.tsv'  # long table: one row per driver record
output_tsv = '/PCAWG_driver_mutations_ICGC_EAC_filtered_wide.tsv'  # wide table: one row per sample

# Long -> nested mapping: data[sample_id][gene] holds the 'category' value.
# A later row for the same sample/gene pair replaces the earlier value.
data = defaultdict(lambda: defaultdict(str))
genes = set()  # every gene name seen; becomes the column set

with open(input_tsv, 'r') as infile:
    for record in csv.DictReader(infile, delimiter='\t'):
        data[record['sample_id']][record['gene']] = record['category']
        genes.add(record['gene'])

# Nested mapping -> wide rows, with gene columns in alphabetical order
genes = sorted(genes)
fieldnames = ['sample_id'] + genes

output_rows = [
    # Genes without a record for this sample get an empty string
    {'sample_id': sample_id, **{gene: gene_data.get(gene, '') for gene in genes}}
    for sample_id, gene_data in data.items()
]

# Write header and rows
with open(output_tsv, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')
    writer.writeheader()
    for wide_row in output_rows:
        writer.writerow(wide_row)

In [None]:
import csv

# Define file paths
input_tsv = '/PCAWG_signatures_fileID.tsv'  # Path to the main TSV file
sample_ids_tsv = '/DO_ID_WGS_id_PCAWG_filtered.tsv'  # Path to the file containing sample IDs
output_tsv = '/PCAWG_signatures_filtered.tsv'  # Path to the output TSV file

# Column of the signature table that is matched against 'tumor_wgs_aliquot_id'.
# One column is used both to filter rows (Step 2) and to key the lookup (Step 3); filtering on
# 'file' while keying on 'file_id' would make the two steps disagree about which rows match.
# NOTE(review): 'file_id' is used because it is the key Step 3 looks up — confirm that this
# column (and not 'file') holds the aliquot ID.
id_column = 'file_id'

# Step 1: Read the sample IDs in order from the ID file
with open(sample_ids_tsv, 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    sample_ids = [row['tumor_wgs_aliquot_id'] for row in reader]  # List keeps the ID-file order for Step 3

# Set copy of the same IDs so the per-row membership test is a hash lookup, not a list scan
wanted_ids = set(sample_ids)

# Step 2: Read and filter rows from the main TSV file
with open(input_tsv, 'r') as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    fieldnames = reader.fieldnames  # Preserve original column order

    # Index the kept rows by ID; if an ID occurs on several rows, the last row wins
    rows_by_id = {row[id_column]: row for row in reader if row[id_column] in wanted_ids}

# Step 3: Write rows in the order of sample IDs to the output file
with open(output_tsv, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames, delimiter='\t')

    # Write the header to the output file
    writer.writeheader()

    # Follow the ID-file order; IDs that have no row in the main file are skipped
    for sample_id in sample_ids:
        matched_row = rows_by_id.get(sample_id)
        if matched_row is not None:
            writer.writerow(matched_row)

In [None]:
import csv

# Define file paths
input_tsv = '/PCAWG_driver_mutations_ICGC_EAC_filtered_wide.tsv'  # Path to the main TSV file
# NOTE(review): other cells read this ID table from '/DO_ID_WGS_id_PCAWG_filtered.tsv'
# (no 'COMUT_plot_input_files' directory) — confirm which location is correct.
sample_ids_tsv = '/COMUT_plot_input_files/DO_ID_WGS_id_PCAWG_filtered.tsv'  # Path to the file containing sample IDs
output_tsv = '/PCAWG_driver_mutations_ICGC_EAC_filtered_wide-updated.tsv'  # Path to the output TSV file

# Step 1: Read the sample IDs in order from the ID file
with open(sample_ids_tsv, 'r') as f:
    reader = csv.DictReader(f, delimiter='\t')
    sample_ids = [row['tumor_wgs_aliquot_id'] for row in reader]  # List keeps the ID-file order for Step 3

# Set copy of the same IDs so the per-row membership test is a hash lookup, not a list scan
wanted_ids = set(sample_ids)

# List of columns to keep
columns_to_keep = ['ERBB2', 'EGFR', 'CCND1', 'TP53', 'JAK2']

# Step 2: Read and filter rows from the main TSV file
with open(input_tsv, 'r') as infile:
    reader = csv.DictReader(infile, delimiter='\t')
    fieldnames = reader.fieldnames  # Preserve original column order

    # Output columns: 'sample_id' plus the requested genes that actually exist in the input,
    # in the order given by `columns_to_keep` (absent genes are silently left out)
    fieldnames_to_write = ['sample_id'] + [col for col in columns_to_keep if col in fieldnames]

    # Index the kept rows by sample ID; if an ID occurs on several rows, the last row wins
    rows_by_id = {row['sample_id']: row for row in reader if row['sample_id'] in wanted_ids}

# Step 3: Write rows in the order of sample IDs to the output file
with open(output_tsv, 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=fieldnames_to_write, delimiter='\t')

    # Write the header to the output file
    writer.writeheader()

    # Follow the ID-file order; IDs that have no row in the main file are skipped
    for sample_id in sample_ids:
        matched_row = rows_by_id.get(sample_id)
        if matched_row is None:
            continue
        # Keep only the selected columns of the matched row
        filtered_row = {key: matched_row[key] for key in fieldnames_to_write}
        writer.writerow(filtered_row)


In [None]:
# Step 1: Load the per-sample table that has one "<GENE> alteration" column per gene
# NOTE(review): the input file name ends in "_long" although it is treated as a wide table here.
driver_genes_wide = pd.read_csv('/driver_gene_all_samples_long.csv')

# Step 2: Reshape the data into long format using pd.melt (one row per sample/gene pair)
alteration_columns = [
    'EGFR alteration',
    'CCND1 alteration',
    'ERBB2 alteration',
    'TP53 alteration',
    'JAK2 alteration',
]
driver_genes_mut = pd.melt(
    driver_genes_wide,
    id_vars=['Sample ID'],           # Keep 'Sample ID' as identifier
    value_vars=alteration_columns,
    var_name='category',             # New column holding the source column name
    value_name='value',              # New column holding the alteration value
)

# Step 3: Rename columns to match desired output
driver_genes_mut = driver_genes_mut.rename(columns={'Sample ID': 'sample'})

# Step 4: Remove " alteration" from the names in the 'category' column, leaving the gene symbol
driver_genes_mut['category'] = driver_genes_mut['category'].str.replace(' alteration', '', regex=False)

# Step 5: Save the long table. index=False keeps the melt's positional row index out of the
# file, so it does not come back as an unnamed extra column when the CSV is read again.
driver_genes_mut.to_csv('/comut_driver_genes.csv', index=False)


In [None]:
## Load data

# Input tables for the figure
metadata_csv = '/All_sample_metadata.csv'
wgd_csv = '/comut_WGD.csv'
driver_genes_csv = '/comut_driver_genes_filtered_by_driver_likelihood.csv'
mutsig_csv = '/comut_mutsig_contribution.csv'

data = pd.read_csv(metadata_csv)                 # per-sample metadata
data_WGD = pd.read_csv(wgd_csv)                  # WGD table
driver_genes_mut = pd.read_csv(driver_genes_csv) # driver-gene alterations
data_mutsig = pd.read_csv(mutsig_csv)            # mutational-signature contributions



In [None]:
#library and functions
def plot_categorical(comut, data, name, column, mapping = None):
    """Add one categorical track to a CoMut-like object.

    A copy of ``data`` is extended with a constant ``category`` column (set to
    ``name``) and a ``value`` column (copied from ``column``); the caller's frame
    is left untouched. The copy is handed to ``comut.add_categorical_data``.

    Args:
        comut: object exposing ``add_categorical_data(data, name=..., mapping=...)``.
        data: pandas DataFrame holding ``column``.
        name: track name; also written into the ``category`` column.
        column: name of the column whose values become ``value``.
        mapping: forwarded unchanged to ``add_categorical_data``.
    """
    # The callable makes 'value' read from the frame *after* 'category' has been set,
    # matching sequential column assignment.
    track = data.assign(category=name, value=lambda frame: frame[column])
    comut.add_categorical_data(track, name=name, mapping=mapping)

def plot_continuous(comut, data, name, column, mapping, cat_mapping, value_range):
    """Add one continuous track to a CoMut-like object.

    A copy of ``data`` is extended with a constant ``category`` column (set to
    ``name``) and a ``value`` column (copied from ``column``); the caller's frame
    is left untouched. The copy is handed to ``comut.add_continuous_data``.

    Args:
        comut: object exposing ``add_continuous_data``.
        data: pandas DataFrame holding ``column``.
        name: track name; also written into the ``category`` column.
        column: name of the column whose values become ``value``.
        mapping, cat_mapping, value_range: forwarded unchanged to ``add_continuous_data``.
    """
    # The callable makes 'value' read from the frame *after* 'category' has been set,
    # matching sequential column assignment.
    track = data.assign(category=name, value=lambda frame: frame[column])
    comut.add_continuous_data(
        track,
        name=name,
        mapping=mapping,
        cat_mapping=cat_mapping,
        value_range=value_range,
    )

def plot_bar(comut, data, name, columns, mapping=None):
    """Add one stacked bar track to a CoMut-like object.

    Only the ``sample`` column and the requested ``columns`` of a copy of ``data``
    are passed to ``comut.add_bar_data``; ``name`` is used both as the track name
    and as the y-axis label.

    Args:
        comut: object exposing ``add_bar_data``.
        data: pandas DataFrame with a ``sample`` column and every entry of ``columns``.
        name: track name and y-axis label.
        columns: list of column names to plot.
        mapping: forwarded unchanged to ``add_bar_data``.
    """
    wanted_columns = ['sample'] + columns
    bar_values = data.copy()[wanted_columns]
    comut.add_bar_data(bar_values, stacked=True, name=name, ylabel=name, mapping=mapping)
    
def plot_scatter(comut, data, name, columns, mapping=None, scatter_kwargs=None):
    """Add one scatter track to a CoMut-like object.

    Only the ``sample`` column and the requested ``columns`` of a copy of ``data``
    are passed to ``comut.add_scatter_data``; ``name`` is used both as the track
    name and as the y-axis label.

    Args:
        comut: object exposing ``add_scatter_data``.
        data: pandas DataFrame with a ``sample`` column and every entry of ``columns``.
        name: track name and y-axis label.
        columns: list of column names to plot.
        mapping: forwarded unchanged to ``add_scatter_data``.
        scatter_kwargs: forwarded unchanged to ``add_scatter_data``.
    """
    wanted_columns = ['sample'] + columns
    scatter_values = data.copy()[wanted_columns]
    comut.add_scatter_data(
        scatter_values,
        stacked=True,
        name=name,
        ylabel=name,
        mapping=mapping,
        scatter_kwargs=scatter_kwargs,
    )

In [None]:

from matplotlib import rcParams

# Figure-wide typography: one font family plus per-element font sizes
custom_rcParams = dict([
    ('font.family', 'Arial'),
    ('font.size', 13),
    ('axes.labelsize', 13),
    ('legend.fontsize', 9),
    ('xtick.labelsize', 7),
    ('ytick.labelsize', 9),
])
rcParams.update(custom_rcParams)

# Colour palettes from palettable: one matplotlib colormap and two colour lists
purp_7 = palettable.cartocolors.sequential.Purp_7.mpl_colormap
vivid_10 = palettable.cartocolors.qualitative.Vivid_10.mpl_colors
balance_6 = palettable.cmocean.diverging.Balance_6.mpl_colors

# plot_bar / plot_scatter select a 'sample' column, so mirror 'Sample ID' under that name
data['sample'] = data['Sample ID']

# Make 'Group' an ordered categorical so that sorting follows this cohort order
# (values outside the list become missing)
group_order = ['EAC Brain Mets', 'PCAWG Primary EAC', 'Hartwig EAC Mets']
data['Group'] = pd.Categorical(data['Group'], categories=group_order, ordered=True)

# Order samples by cohort, then by decreasing SNV count within each cohort
data.sort_values(by=['Group', 'SNV count'], ascending=[True, False], inplace=True)

# Copy of the SNV count capped at 200,000
data['snv_overlap'] = data['SNV count'].clip(upper=200_000)

comut1 = comut.CoMut()

# First track: sample location, with one colour per location label
location_colors = {
    'Esophagus': '#C86050',
    'Brain Met': '#6685c2',
    'Esophageal Met': '#f1b6da',
    'Liver Met': '#fee391',
    'Lymph Met': '#fdb863',
    'Lung Met': '#d8daeb',
    'Other Met': '#F3BF5A',
}
plot_categorical(comut1, data, "Location", 'Location', mapping=location_colors)

#Driver genes
def plot_snv_and_cnv(comut1, data, driver_genes_mut):
    """Add the "Driver Gene Alterations" categorical track to a CoMut-like object.

    Rows of ``driver_genes_mut`` are ordered by gene (reverse of ``gene_list``, so
    'JAK2' rows come first and 'TP53' rows last), rows with a missing ``value`` are
    dropped, and the result is passed to ``comut1.add_categorical_data``.

    The input frame is not modified: sorting is done on a copy rather than in place.
    Rows whose ``category`` is not in ``gene_list`` are placed after all listed genes
    instead of raising ``ValueError``. Ties keep their input order (stable sort).

    Args:
        comut1: object exposing ``add_categorical_data(data, name=..., mapping=...)``.
        data: not used by this function; kept so existing positional calls keep working.
        driver_genes_mut: pandas DataFrame with at least ``category`` and ``value`` columns.
    """
    gene_list = ['TP53', 'ERBB2', 'EGFR', 'CCND1', 'JAK2']
    # Position of each gene in gene_list; unlisted genes map to NaN and therefore sort last
    gene_rank = {gene: position for position, gene in enumerate(gene_list)}

    ordered = driver_genes_mut.sort_values(
        by="category",
        key=lambda column: column.map(gene_rank),
        ascending=False,
        kind="mergesort",  # stable, so rows of one gene keep their original relative order
    )

    # Only rows that actually carry an alteration value are plotted
    driver_genes = ordered[ordered['value'].notnull()]

    mut_mapping = {'Mutation': {'facecolor':'#BCDD78','edgecolor': 'white'},
                    'Deletion': {'facecolor':'#8AB0D0', 'edgecolor': 'white', 'linewidth': 0.75},
                    'Amplification': {'facecolor':'#EB8777', 'edgecolor': 'white', 'linewidth': 0.75}, 'No data': 'white', 'No alteration':'white',
                   }

    comut1.add_categorical_data(driver_genes, name = "Driver Gene Alterations", mapping = mut_mapping)

# Driver-gene alteration track (plot_snv_and_cnv does not read its second argument)
plot_snv_and_cnv(comut1, data, driver_genes_mut)

# Whole-genome-doubling status track
plot_categorical(comut1, data, "WGD",'WGD status', mapping={'Yes':"#4d4d4d",'No':'#ffffff'})

# Structural-variant count bars
plot_bar(comut1, data, "SV count", ['SV count'],  mapping={'SV count': '#EB8777'})

# Mutational-signature bars: plot_bar selects a 'sample' column, so mirror 'Sample ID'
data_mutsig['sample'] = data_mutsig['Sample ID']

plot_bar(comut1, data_mutsig, 'MutSig',  ['MutSig 17',  'Other'],mapping={'MutSig 17':'#01665e', 'Other':'#f5f5f5'})

# SNV count bars, capped for display only. The cap is applied to a copy so the raw
# 'SNV count' values in `data` survive; overwriting them would make a re-run of the
# plotting cells sort samples by the capped values instead of the real counts.
snv_display_cap = 80000
data_snv_capped = data.assign(**{'SNV count': data['SNV count'].clip(upper=snv_display_cap)})

plot_bar(comut1, data_snv_capped, 'SNV count', ['SNV count'], mapping={'SNV count': '#ab7ca3'})

# Cohort membership track
plot_categorical(comut1, data, "Group",'Group', mapping={'PCAWG Primary EAC':'#C86050', 'Hartwig EAC Mets':'#F3BF5A', 'EAC Brain Mets':'#6685c2'})




### plot

# Relative panel heights, keyed by track name (the `name` each track was added under).
# The SNV bar track is registered as 'SNV count'; a key of 'SNV' matches no track, so the
# override would not be applied.
# NOTE(review): this enlarges the SNV panel relative to the previously rendered figure — confirm intended.
heights = {'SNV count': 5}
hspace = 0.08
wspace = 0.05

comut1.plot_comut(figsize = (14,8), x_padding = 0.08, y_padding = 0.07, tri_padding = 0.08, heights=heights,
                hspace = hspace,
                wspace= wspace)

comut1.add_unified_legend(ncol = 1)

# Save the plot as a PNG file
output_file = "/comut_PCAWG_Hartwig_BrainmetEAC_plot.png"
plt.savefig(output_file, dpi=300, bbox_inches='tight')  # Save with high resolution
print(f"Plot saved to {output_file}")