# Randomly assign mutations to samples in each cancer type

Based on the combination of risk factors, there are 27 possible samples per cancer type. For each of these, we will create 10 different random samples, each containing a fixed set of drivers but random set of passengers. 

We will export two tables (one for skin and another for lung cancers), containing 270 rows each. 

In [1]:
import pandas as pd
import numpy as np

#### Output files 

In [12]:
skin_output_f = 'skin_db.txt'
lung_output_f = 'lung_db.txt'

#### Samples

In [13]:
# Age
var_items1 = ['1a', '1b', '1c']
# Solar exposure / smoking
var_items2 = ['2a', '2b', '2c']
# Solar protection / passive smoking
var_items3 = ['3a', '3b', '3c']
# Sample number
var_items4 = [f'4{i}' for i in 'abcdefghij']
var_items4

['4a', '4b', '4c', '4d', '4e', '4f', '4g', '4h', '4i', '4j']

#### Passenger mutations (common between cancer types)

In [14]:
mutations_f = '/home/claudia/Claudia/outreach/cancerdetective/mutations_db.tsv'
mutations_df = pd.read_csv(mutations_f, sep='\t', header=0)
mutations_df.head(2)

Unnamed: 0,cancer_type,mutation_id,gene,description,aa_change,dna_change,driver_passenger,og_tsg,targeted_therapy,targeted_therapy_approved
0,lung,FBXW7_R505P,FBXW7,F-box and WD repeat domain containing 7,R505P,chr4:152326136_C>G,driver,tsg,,False
1,lung,FAM135B_G186E,FAM135B,Family with sequence similarity 135 member B,G186E,chr8:138243054_C>T,driver,og,,False


In [15]:
passengers_df = mutations_df.loc[mutations_df['driver_passenger'] == 'passenger']
print(len(passengers_df))
passengers_df.head(2)

193


Unnamed: 0,cancer_type,mutation_id,gene,description,aa_change,dna_change,driver_passenger,og_tsg,targeted_therapy,targeted_therapy_approved
528,lung,OR8B2_A119E,OR8B2,Olfactory receptor family 8 subfamily B member 2,A119E,chr11:124382988_G>T,passenger,unknown,,False
529,lung,OR51A4_R124G,OR51A4,Olfactory receptor family 51 subfamily A member 4,R124G,chr11:4946731_T>C,passenger,unknown,,False


In [16]:
passenger_list = passengers_df['mutation_id'].tolist()
len(passenger_list)

193

## Skin melanomas

#### Drivers

Genes: BRAF, NRAS, HRAS, KRAS, CDKN2A, NF1, TP53

Driver mutations are assigned based on TGCA-SKCM frequencies. This is a simplified version: 
- 50% of melanomas are BRAF mutant. 
- 30% of melanomas are Ras mutant: NRAS (most frequently), HRAS or KRAS. These three are mutually exclusive
- 10-15% of melanomas are CDKN2A, or NF1, or TP53 mutated
- BRAF mutations can occur alone or together with CDKN2A, NF1, TP53 
- Ras mutations can occur alone or together with CDKN2A, NF1, TP53

In [17]:
drivers_skin_dict = {
    'a': ['BRAF_V600E'], 
    'b': ['BRAF_G469A'], 
    'c': ['BRAF_V600E', 'TP53_R248G', 'NF1_R1276P'], 
    'd': ['BRAF_V600E', 'CDKN2A_P81L'], 
    'e': ['BRAF_V600E', 'NF1_R1276P'], 
    'f': ['NRAS_Q61H'], 
    'g': ['NRAS_Q61H', 'CDKN2A_P81L'], 
    'h': ['NRAS_Q61K', 'TP53_Y220C', 'PTEN_D162G'], 
    'i': ['KRAS_G12D', 'CDKN2A_P81L'], 
    'j': ['HRAS_G12V', 'TP53_L145R'],
}

In [18]:
lines = []
ctype = 'a'
variable_mutation_values = {'a': 1, 'b':2, 'c':3}
for v1 in var_items1: 
    for v2 in var_items2: 
        for v3 in var_items3: 
            # Sample type 
            sample_type = '_'.join([v1, v2, v3])
            # Number of mutations to be assigned to the sample
            exp_mut_count = sum([variable_mutation_values[i[1]] for i in [v1, v2, v3]])
            
            # Iterate over the 10 random samples in each possible sample type (e.g., 1a_2a_3a)
            for v4 in var_items4: 
                # Get drivers
                sample_drivers = drivers_skin_dict[v4[1]]
                # Get random passengers
                passengers_n = exp_mut_count - len(sample_drivers)
                # Add 1 passenger if none
                passengers_n = 1 if passengers_n == 0 else passengers_n
                sample_passengers = list(np.random.choice(passenger_list, size=passengers_n, replace=False))
                lines.append(pd.DataFrame([[
                    sample_type, 
                    sample_type + '_' + v4,
                    exp_mut_count,
                    ';'.join(sorted(sample_drivers)), 
                    ';'.join(sorted(sample_passengers)), 
                    ';'.join(sorted(sample_drivers+sample_passengers))
                ]]))
melanomas_results = pd.concat(lines)
melanomas_results.columns = [
    'sample_type', 
    'number_mutations',
    'sample', 
    'drivers', 
    'passengers', 
    'total'
]

In [19]:
melanomas_results.head(15)

Unnamed: 0,sample_type,number_mutations,sample,drivers,passengers,total
0,1a_2a_3a,1a_2a_3a_4a,3,BRAF_V600E,OR5L2_P283S;OR8D2_M145I,BRAF_V600E;OR5L2_P283S;OR8D2_M145I
0,1a_2a_3a,1a_2a_3a_4b,3,BRAF_G469A,CSH1_V153M;KRT28_P251L,BRAF_G469A;CSH1_V153M;KRT28_P251L
0,1a_2a_3a,1a_2a_3a_4c,3,BRAF_V600E;NF1_R1276P;TP53_R248G,SULT2A1_T259I,BRAF_V600E;NF1_R1276P;SULT2A1_T259I;TP53_R248G
0,1a_2a_3a,1a_2a_3a_4d,3,BRAF_V600E;CDKN2A_P81L,OR5J2_S67L,BRAF_V600E;CDKN2A_P81L;OR5J2_S67L
0,1a_2a_3a,1a_2a_3a_4e,3,BRAF_V600E;NF1_R1276P,CACNA1S_N143K,BRAF_V600E;CACNA1S_N143K;NF1_R1276P
0,1a_2a_3a,1a_2a_3a_4f,3,NRAS_Q61H,KCNJ3_P462L;OR4K17_L45F,KCNJ3_P462L;NRAS_Q61H;OR4K17_L45F
0,1a_2a_3a,1a_2a_3a_4g,3,CDKN2A_P81L;NRAS_Q61H,GP2_E265Q,CDKN2A_P81L;GP2_E265Q;NRAS_Q61H
0,1a_2a_3a,1a_2a_3a_4h,3,NRAS_Q61K;PTEN_D162G;TP53_Y220C,OR1D2_H88Q,NRAS_Q61K;OR1D2_H88Q;PTEN_D162G;TP53_Y220C
0,1a_2a_3a,1a_2a_3a_4i,3,CDKN2A_P81L;KRAS_G12D,GPR119_L288M,CDKN2A_P81L;GPR119_L288M;KRAS_G12D
0,1a_2a_3a,1a_2a_3a_4j,3,HRAS_G12V;TP53_L145R,IQCF2_G146S,HRAS_G12V;IQCF2_G146S;TP53_L145R


In [22]:
# Check there are no repeated samples
len(melanomas_results), len(melanomas_results['total'].unique())

(270, 270)

In [23]:
# Export
melanomas_results.to_csv(skin_output_f, sep='\t', header=True, index=False)

## Lung cancers

#### Drivers

Genes: TP53, EGFR, KRAS, NFE2L2, KEAP1, CDKN2A, PTEN, CDH10, KMT2D, KMT2C

Driver mutations are assigned based on TGCA-LUSC and TCGA-LUAD frequencies. This is a simplified version: 
- 60-80% of lung cancers are TP53 mutant
- NFE2L2 and KEAP1 are mutually exclusive

In [24]:
drivers_lung_dict = {
    'a': ['TP53_R248G', 'KRAS_G12C', 'NFE2L2 G31R'], 
    'b': ['TP53_R273H', 'EGFR_L858R'], 
    'c': ['TP53_Y220C', 'EGFR_L861Q'], 
    'd': ['TP53_R248G', 'NFE2L2 G31R'], 
    'e': ['TP53_R273H', 'KEAP1_V155F'], 
    'f': ['EGFR_L861Q'], 
    'g': ['TP53_L145R', 'CDKN2A_P81L'], 
    'h': ['TP53_P151S', 'EGFR_L858R', 'PTEN_D162G'], 
    'i': ['KEAP1_C23Y', 'EGFR_L861Q'], 
    'j': ['EGFR_L858R'],
}

In [25]:
lines = []
ctype = 'a'
variable_mutation_values = {'a': 1, 'b':2, 'c':3}
for v1 in var_items1: 
    for v2 in var_items2: 
        for v3 in var_items3: 
            # Sample type 
            sample_type = '_'.join([v1, v2, v3])
            # Number of mutations to be assigned to the sample
            exp_mut_count = sum([variable_mutation_values[i[1]] for i in [v1, v2, v3]])
            
            # Iterate over the 10 random samples in each possible sample type (e.g., 1a_2a_3a)
            for v4 in var_items4: 
                # Get drivers
                sample_drivers = drivers_lung_dict[v4[1]]
                # Get random passengers
                passengers_n = exp_mut_count - len(sample_drivers)
                # Add 1 passenger if none
                passengers_n = 1 if passengers_n == 0 else passengers_n
                sample_passengers = list(np.random.choice(passenger_list, size=passengers_n, replace=False))
                lines.append(pd.DataFrame([[
                    sample_type, 
                    sample_type + '_' + v4,
                    exp_mut_count,
                    ';'.join(sorted(sample_drivers)), 
                    ';'.join(sorted(sample_passengers)), 
                    ';'.join(sorted(sample_drivers+sample_passengers))
                ]]))
lung_results = pd.concat(lines)
lung_results.columns = [
    'sample_type', 
    'number_mutations',
    'sample', 
    'drivers', 
    'passengers', 
    'total'
]

In [26]:
lung_results.head(15)

Unnamed: 0,sample_type,number_mutations,sample,drivers,passengers,total
0,1a_2a_3a,1a_2a_3a_4a,3,KRAS_G12C;NFE2L2 G31R;TP53_R248G,OR10J3_A77T,KRAS_G12C;NFE2L2 G31R;OR10J3_A77T;TP53_R248G
0,1a_2a_3a,1a_2a_3a_4b,3,EGFR_L858R;TP53_R273H,AHSG_P358L,AHSG_P358L;EGFR_L858R;TP53_R273H
0,1a_2a_3a,1a_2a_3a_4c,3,EGFR_L861Q;TP53_Y220C,OR9K2_L335H,EGFR_L861Q;OR9K2_L335H;TP53_Y220C
0,1a_2a_3a,1a_2a_3a_4d,3,NFE2L2 G31R;TP53_R248G,SULT2A1_T259I,NFE2L2 G31R;SULT2A1_T259I;TP53_R248G
0,1a_2a_3a,1a_2a_3a_4e,3,KEAP1_V155F;TP53_R273H,TFAP2D_P313S,KEAP1_V155F;TFAP2D_P313S;TP53_R273H
0,1a_2a_3a,1a_2a_3a_4f,3,EGFR_L861Q,C8orf86_R125W;OR1C1_L215F,C8orf86_R125W;EGFR_L861Q;OR1C1_L215F
0,1a_2a_3a,1a_2a_3a_4g,3,CDKN2A_P81L;TP53_L145R,TP53TG5_D203N,CDKN2A_P81L;TP53TG5_D203N;TP53_L145R
0,1a_2a_3a,1a_2a_3a_4h,3,EGFR_L858R;PTEN_D162G;TP53_P151S,IL21_R34S,EGFR_L858R;IL21_R34S;PTEN_D162G;TP53_P151S
0,1a_2a_3a,1a_2a_3a_4i,3,EGFR_L861Q;KEAP1_C23Y,TECTB_R302W,EGFR_L861Q;KEAP1_C23Y;TECTB_R302W
0,1a_2a_3a,1a_2a_3a_4j,3,EGFR_L858R,OR4C46_A101V;OR5D13_E85K,EGFR_L858R;OR4C46_A101V;OR5D13_E85K


In [27]:
# Check there are no repeated samples
len(lung_results), len(lung_results['total'].unique())

(270, 270)

In [23]:
# Export
lung_results.to_csv(lung_output_f, sep='\t', header=True, index=False)