# Randomly assign mutations to samples in each cancer type

In [1]:
import pandas as pd
import numpy as np

#### Samples

In [2]:
# Age
var_items1 = ['1a', '1b', '1c']
# Solar exposure / smoking
var_items2 = ['2a', '2b', '2c']
# Solar protection / passive smoking
var_items3 = ['3a', '3b', '3c']
# Sample number
var_items4 = [f'4{i}' for i in 'abcdefghij']
var_items4

['4a', '4b', '4c', '4d', '4e', '4f', '4g', '4h', '4i', '4j']

#### Passenger mutations (common between cancer types)

In [3]:
mutations_f = '/home/claudia/Claudia/outreach/cancerdetective/mutations_db.tsv'
mutations_df = pd.read_csv(mutations_f, sep='\t', header=0)
mutations_df.head(2)

Unnamed: 0,cancer_type,mutation_id,gene,description,aa_change,dna_change,driver_passenger,og_tsg,targeted_therapy,targeted_therapy_approved
0,lung,FBXW7_R505P,FBXW7,F-box and WD repeat domain containing 7,R505P,chr4:152326136_C>G,driver,tsg,,False
1,lung,FAM135B_G186E,FAM135B,Family with sequence similarity 135 member B,G186E,chr8:138243054_C>T,driver,og,,False


In [4]:
passengers_df = mutations_df.loc[mutations_df['driver_passenger'] == 'passenger']
print(len(passengers_df))
passengers_df.head(10)

193


Unnamed: 0,cancer_type,mutation_id,gene,description,aa_change,dna_change,driver_passenger,og_tsg,targeted_therapy,targeted_therapy_approved
528,lung,OR8B2_A119E,OR8B2,Olfactory receptor family 8 subfamily B member 2,A119E,chr11:124382988_G>T,passenger,unknown,,False
529,lung,OR51A4_R124G,OR51A4,Olfactory receptor family 51 subfamily A member 4,R124G,chr11:4946731_T>C,passenger,unknown,,False
530,lung,OR4F15_V46E,OR4F15,Olfactory receptor family 4 subfamily F member 15,V46E,chr15:101818323_T>A,passenger,unknown,,False
531,lung,OR4L1_V273L,OR4L1,Olfactory receptor family 4 subfamily L member 1,V273L,chr14:20060861_G>T,passenger,unknown,,False
532,lung,OR4K1_S57F,OR4K1,Olfactory receptor family 4 subfamily K member 1,S57F,chr14:19935836_C>T,passenger,unknown,,False
533,lung,OR1L6_P22S,OR1L6,Olfactory receptor family 1 subfamily L member 6,P22S,chr9:122749911_C>T,passenger,unknown,,False
534,lung,OR52A5_L210P,OR52A5,Olfactory receptor family 52 subfamily A member 5,L210P,chr11:5132014_A>G,passenger,unknown,,False
535,lung,OR9K2_L335H,OR9K2,Olfactory receptor family 9 subfamily K member 2,L335H,chr12:55130772_T>A,passenger,unknown,,False
536,lung,OR5T1_G53V,OR5T1,Olfactory receptor family 5 subfamily T member 1,G53V,chr11:56275796_G>T,passenger,unknown,,False
537,lung,OR2T5_S71T,OR2T5,Olfactory receptor family 2 subfamily T member 5,S71T,chr1:248488799_T>A,passenger,unknown,,False


In [5]:
passenger_list = passengers_df['mutation_id'].tolist()
len(passenger_list)

193

## Skin melanomas

#### Drivers

Genes: BRAF, NRAS, HRAS, KRAS, CDKN2A, NF1, TP53

Driver mutations are assigned based on TGCA-SKCM frequencies. This is a simplified version: 
- 50% of melanomas are BRAF mutant. 
- 30% of melanomas are Ras mutant: NRAS (most frequently), HRAS or KRAS. These three are mutually exclusive
- 10-15% of melanomas are CDKN2A, or NF1, or TP53 mutated
- BRAF mutations can occur alone or together with CDKN2A, NF1, TP53 
- Ras mutations can occur alone or together with CDKN2A, NF1, TP53

In [13]:
drivers_skin_dict = {
    'a': ['BRAF_V600E'], 
    'b': ['BRAF_G469A'], 
    'c': ['BRAF_V600E', 'TP53_R248G', 'NF1_R1276P'], 
    'd': ['BRAF_V600E', 'CDKN2A_P81L'], 
    'e': ['BRAF_V600E', 'NF1_R1276P'], 
    'f': ['NRAS_Q61H'], 
    'g': ['NRAS_Q61H', 'CDKN2A_P81L'], 
    'h': ['NRAS_Q61K', 'TP53_Y220C', 'PTEN_D162G'], 
    'i': ['KRAS_G12D', 'CDKN2A_P81L'], 
    'j': ['HRAS_G12V', 'TP53_L145R'],
}

In [14]:
lines = []
ctype = 'a'
variable_mutation_values = {'a': 1, 'b':2, 'c':3}
for v1 in var_items1: 
    for v2 in var_items2: 
        for v3 in var_items3: 
            # Sample type 
            sample_type = '_'.join([v1, v2, v3])
            # Number of mutations to be assigned to the sample
            exp_mut_count = sum([variable_mutation_values[i[1]] for i in [v1, v2, v3]])
            print(sample_type, exp_mut_count)
            
            # Iterate over the 10 random samples in each possible sample type (e.g., 1a_2a_3a)
            for v4 in var_items4: 
                # Get drivers
                sample_drivers = drivers_skin_dict[v4[1]]
                # Get random passengers
                passengers_n = exp_mut_count - len(sample_drivers)
                # Add 1 passenger if none
                passengers_n = 1 if passengers_n == 0 else passengers_n
                sample_passengers = list(np.random.choice(passenger_list, size=passengers_n, replace=False))
                lines.append(pd.DataFrame([[
                    sample_type, 
                    sample_type + '_' + v4,
                    exp_mut_count,
                    ';'.join(sorted(sample_drivers)), 
                    ';'.join(sorted(sample_passengers)), 
                    ';'.join(sorted(sample_drivers+sample_passengers))
                ]]))
melanomas_results = pd.concat(lines)
melanomas_results.columns = [
    'sample_type', 
    'number_mutations',
    'sample', 
    'drivers', 
    'passengers', 
    'total'
]
melanomas_results.head(30)

1a_2a_3a 3
1a_2a_3b 4
1a_2a_3c 5
1a_2b_3a 4
1a_2b_3b 5
1a_2b_3c 6
1a_2c_3a 5
1a_2c_3b 6
1a_2c_3c 7
1b_2a_3a 4
1b_2a_3b 5
1b_2a_3c 6
1b_2b_3a 5
1b_2b_3b 6
1b_2b_3c 7
1b_2c_3a 6
1b_2c_3b 7
1b_2c_3c 8
1c_2a_3a 5
1c_2a_3b 6
1c_2a_3c 7
1c_2b_3a 6
1c_2b_3b 7
1c_2b_3c 8
1c_2c_3a 7
1c_2c_3b 8
1c_2c_3c 9


Unnamed: 0,sample_type,number_mutations,sample,drivers,passengers,total
0,1a_2a_3a,1a_2a_3a_4a,3,BRAF_V600E,OR5L2_P283S;TECTB_R302W,BRAF_V600E;OR5L2_P283S;TECTB_R302W
0,1a_2a_3a,1a_2a_3a_4b,3,BRAF_G469A,FAM71A_S517Y;OR4K15_V270E,BRAF_G469A;FAM71A_S517Y;OR4K15_V270E
0,1a_2a_3a,1a_2a_3a_4c,3,BRAF_V600E;NF1_R1276P;TP53_R248G,CYP17A1_E84K,BRAF_V600E;CYP17A1_E84K;NF1_R1276P;TP53_R248G
0,1a_2a_3a,1a_2a_3a_4d,3,BRAF_V600E;CDKN2A_P81L,STATH_L10F,BRAF_V600E;CDKN2A_P81L;STATH_L10F
0,1a_2a_3a,1a_2a_3a_4e,3,BRAF_V600E;NF1_R1276P,KCNJ3_R499G,BRAF_V600E;KCNJ3_R499G;NF1_R1276P
0,1a_2a_3a,1a_2a_3a_4f,3,NRAS_Q61H,OR13J1_Y123S;OR51L1_P20S,NRAS_Q61H;OR13J1_Y123S;OR51L1_P20S
0,1a_2a_3a,1a_2a_3a_4g,3,CDKN2A_P81L;NRAS_Q61H,OR1L8_P287S,CDKN2A_P81L;NRAS_Q61H;OR1L8_P287S
0,1a_2a_3a,1a_2a_3a_4h,3,NRAS_Q61K;PTEN_D162G;TP53_Y220C,SLC10A2_S335L,NRAS_Q61K;PTEN_D162G;SLC10A2_S335L;TP53_Y220C
0,1a_2a_3a,1a_2a_3a_4i,3,CDKN2A_P81L;KRAS_G12D,GFRAL_A48S,CDKN2A_P81L;GFRAL_A48S;KRAS_G12D
0,1a_2a_3a,1a_2a_3a_4j,3,HRAS_G12V;TP53_L145R,OR51E2_M184I,HRAS_G12V;OR51E2_M184I;TP53_L145R


## Lung cancers

#### Drivers

Genes: TP53, EGFR, KRAS, NFE2L2, KEAP1, CDKN2A, PTEN, CDH10, KMT2D, KMT2C

Driver mutations are assigned based on TGCA-LUSC and TCGA-LUAD frequencies. This is a simplified version: 
- 60-80% of lung cancers are TP53 mutant
- NFE2L2 and KEAP1 are mutually exclusive

In [15]:
drivers_lung_dict = {
    'a': ['TP53_R248G', 'KRAS_G12C', 'NFE2L2 G31R'], 
    'b': ['TP53_R273H', 'EGFR_L858R'], 
    'c': ['TP53_Y220C', 'EGFR_L861Q'], 
    'd': ['TP53_R248G', 'NFE2L2 G31R'], 
    'e': ['TP53_R273H', 'KEAP1_V155F'], 
    'f': ['EGFR_L861Q'], 
    'g': ['TP53_L145R', 'CDKN2A_P81L'], 
    'h': ['TP53_P151S', 'EGFR_L858R', 'PTEN_D162G'], 
    'i': ['KEAP1_C23Y', 'EGFR_L861Q'], 
    'j': ['EGFR_L858R'],
}

In [16]:
lines = []
ctype = 'a'
variable_mutation_values = {'a': 1, 'b':2, 'c':3}
for v1 in var_items1: 
    for v2 in var_items2: 
        for v3 in var_items3: 
            # Sample type 
            sample_type = '_'.join([v1, v2, v3])
            # Number of mutations to be assigned to the sample
            exp_mut_count = sum([variable_mutation_values[i[1]] for i in [v1, v2, v3]])
            print(sample_type, exp_mut_count)
            
            # Iterate over the 10 random samples in each possible sample type (e.g., 1a_2a_3a)
            for v4 in var_items4: 
                # Get drivers
                sample_drivers = drivers_lung_dict[v4[1]]
                # Get random passengers
                passengers_n = exp_mut_count - len(sample_drivers)
                # Add 1 passenger if none
                passengers_n = 1 if passengers_n == 0 else passengers_n
                sample_passengers = list(np.random.choice(passenger_list, size=passengers_n, replace=False))
                lines.append(pd.DataFrame([[
                    sample_type, 
                    sample_type + '_' + v4,
                    exp_mut_count,
                    ';'.join(sorted(sample_drivers)), 
                    ';'.join(sorted(sample_passengers)), 
                    ';'.join(sorted(sample_drivers+sample_passengers))
                ]]))
melanomas_results = pd.concat(lines)
melanomas_results.columns = [
    'sample_type', 
    'number_mutations',
    'sample', 
    'drivers', 
    'passengers', 
    'total'
]
melanomas_results.head(30)

1a_2a_3a 3
1a_2a_3b 4
1a_2a_3c 5
1a_2b_3a 4
1a_2b_3b 5
1a_2b_3c 6
1a_2c_3a 5
1a_2c_3b 6
1a_2c_3c 7
1b_2a_3a 4
1b_2a_3b 5
1b_2a_3c 6
1b_2b_3a 5
1b_2b_3b 6
1b_2b_3c 7
1b_2c_3a 6
1b_2c_3b 7
1b_2c_3c 8
1c_2a_3a 5
1c_2a_3b 6
1c_2a_3c 7
1c_2b_3a 6
1c_2b_3b 7
1c_2b_3c 8
1c_2c_3a 7
1c_2c_3b 8
1c_2c_3c 9


Unnamed: 0,sample_type,number_mutations,sample,drivers,passengers,total
0,1a_2a_3a,1a_2a_3a_4a,3,KRAS_G12C;NFE2L2 G31R;TP53_R248G,OR8B8_P129S,KRAS_G12C;NFE2L2 G31R;OR8B8_P129S;TP53_R248G
0,1a_2a_3a,1a_2a_3a_4b,3,EGFR_L858R;TP53_R273H,TMEM72_D151E,EGFR_L858R;TMEM72_D151E;TP53_R273H
0,1a_2a_3a,1a_2a_3a_4c,3,EGFR_L861Q;TP53_Y220C,CYP17A1_E84K,CYP17A1_E84K;EGFR_L861Q;TP53_Y220C
0,1a_2a_3a,1a_2a_3a_4d,3,NFE2L2 G31R;TP53_R248G,CSN2_A104G,CSN2_A104G;NFE2L2 G31R;TP53_R248G
0,1a_2a_3a,1a_2a_3a_4e,3,KEAP1_V155F;TP53_R273H,OR1J2_W313L,KEAP1_V155F;OR1J2_W313L;TP53_R273H
0,1a_2a_3a,1a_2a_3a_4f,3,EGFR_L861Q,CT47B1_E278K;PDHA2_E335K,CT47B1_E278K;EGFR_L861Q;PDHA2_E335K
0,1a_2a_3a,1a_2a_3a_4g,3,CDKN2A_P81L;TP53_L145R,OR4D10_D309E,CDKN2A_P81L;OR4D10_D309E;TP53_L145R
0,1a_2a_3a,1a_2a_3a_4h,3,EGFR_L858R;PTEN_D162G;TP53_P151S,INSL4_E60K,EGFR_L858R;INSL4_E60K;PTEN_D162G;TP53_P151S
0,1a_2a_3a,1a_2a_3a_4i,3,EGFR_L861Q;KEAP1_C23Y,INSL4_E60K,EGFR_L861Q;INSL4_E60K;KEAP1_C23Y
0,1a_2a_3a,1a_2a_3a_4j,3,EGFR_L858R,OR2M5_M206I;TM4SF20_E127K,EGFR_L858R;OR2M5_M206I;TM4SF20_E127K
