In [1]:
%run lgs03a_data_unification.ipynb

In [176]:
def remove_mutation_from_combination(mut_combination, mutation):
    """Return `mut_combination` with every occurrence of `mutation` dropped.

    `mut_combination` is a ':'-separated string of mutations; the remaining
    mutations keep their order and are joined with ':' again. If nothing is
    left, the result is the empty string.
    """
    remaining = filter(lambda single: single != mutation, mut_combination.split(':'))
    return ':'.join(remaining)


def contains_mutation(mutations, mutation):
    """Tell whether `mutation` is one of the ':'-separated entries of `mutations`.

    The comparison is on whole entries, so 'A1B' does not match 'A10B'.
    """
    return any(single == mutation for single in mutations.split(':'))


def find_genotypes_containing_mutations(df, mutations):
    """Return the rows of `df` whose genotype carries all of `mutations`.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have an 'aa_genotype_pseudo' column holding ':'-separated
        mutation strings.
    mutations : str or iterable of str
        Either a ':'-separated string (split into single mutations here) or
        an iterable of single mutations. Every one of them has to be present
        in a genotype for its row to be kept.

    Returns
    -------
    pandas.DataFrame
        The matching rows of `df`, in their original order and with all
        columns preserved (also when `df` or the result is empty).
    """
    if isinstance(mutations, str):
        mutations = mutations.split(':')
    required = set(mutations)

    # One pass over the column: a genotype matches when the required
    # mutations are a subset of its own ':'-separated entries.
    has_all = df['aa_genotype_pseudo'].apply(
        lambda comb: required.issubset(comb.split(':')))

    # On an empty frame `apply` yields an object-dtype Series, which pandas
    # would not treat as a boolean mask; force the dtype so the row filter
    # keeps the columns intact.
    return df[has_all.astype(bool)]


def get_mutation_in_all_backgrounds(df, mutation, lowest_acceptable_fitness=None):
    """Pair each genotype carrying `mutation` with the same genotype lacking it.

    For every row of `df` whose 'aa_genotype_pseudo' contains `mutation`, the
    "background" is that genotype with `mutation` removed. Rows of `df` whose
    genotype equals one of those backgrounds are returned together with the
    brightness of the matching mutation-carrying genotype.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'aa_genotype_pseudo' (':'-separated mutations, or
        'wt') and 'brightness'. It is not modified.
        Assumes every genotype occurs at most once -- TODO confirm; the
        brightness lookup goes through Series.map on the background index.
    mutation : str
        A single mutation, written as it appears inside the genotype strings.
    lowest_acceptable_fitness : number, optional
        When given (0 included), backgrounds with 'brightness' below this
        value are dropped. Only backgrounds are filtered, not the
        mutation-carrying genotypes. None disables the filter.

    Returns
    -------
    pandas.DataFrame
        Indexed by 'bg_genotype_pseudo' ('' stands for the wild type), with
        columns:
        'bg_brightness'  -- brightness of the background,
        'mut_brightness' -- brightness of background + mutation,
        'mut_effect'     -- mut_brightness - bg_brightness,
        'bg_effect'      -- mut_brightness minus the mut_brightness seen on
                            the wild-type background, or minus the median
                            mut_brightness when that pair is not in the panel.
        Empty when no genotype/background pair is found.
    """
    # Work on a private frame in which the wild type is spelled '' -- the
    # string that removing a lone mutation from its genotype produces. Doing
    # this up front (instead of writing into the caller's frame) keeps `df`
    # untouched and makes the result independent of earlier calls.
    genotypes = df.assign(
        aa_genotype_pseudo=df['aa_genotype_pseudo'].apply(lambda x: '' if x == 'wt' else x))

    containing_mutation = find_genotypes_containing_mutations(genotypes, mutation).copy()
    containing_mutation['background'] = containing_mutation['aa_genotype_pseudo'].apply(
        lambda mut_combination: remove_mutation_from_combination(mut_combination, mutation))
    containing_mutation = containing_mutation.set_index('background')

    backgrounds = genotypes
    # `is not None` so that a threshold of 0 is honoured rather than ignored.
    if lowest_acceptable_fitness is not None:
        backgrounds = backgrounds[backgrounds['brightness'] >= lowest_acceptable_fitness]
    backgrounds = backgrounds[backgrounds['aa_genotype_pseudo'].isin(containing_mutation.index)]

    panel = pd.DataFrame({
        'bg_genotype_pseudo': backgrounds['aa_genotype_pseudo'],
        'bg_brightness': backgrounds['brightness'],
        'mut_brightness': backgrounds['aa_genotype_pseudo'].map(containing_mutation['brightness']),
    }).set_index('bg_genotype_pseudo')
    panel['mut_effect'] = panel['mut_brightness'] - panel['bg_brightness']

    # Reference value: the mutation on the wild-type background when that
    # pair was measured, otherwise the median over all backgrounds.
    if '' in panel.index:
        mut_brightness_in_wt = panel.loc['', 'mut_brightness']
    else:
        mut_brightness_in_wt = panel['mut_brightness'].median()

    panel['bg_effect'] = panel['mut_brightness'] - mut_brightness_in_wt

    return panel

In [177]:
# Every distinct ':'-separated token found in the genotype strings of data_aa,
# leaving out the tokens that contain a '.' or a '*'.
unique_single_mutations = {
    single
    for single in set(flatten([genotype.split(':') for genotype in data_aa.aa_genotype_pseudo]))
    if '.' not in single and '*' not in single
}

In [178]:
from tqdm import tqdm

In [179]:
# gene -> mutation -> panel returned by get_mutation_in_all_backgrounds,
# built from data_aa. Mutations without any genotype/background pair in a
# gene are left out of that gene's dict.
mutations_in_all_backgrounds = {}

for gene in tqdm(['amacGFP', 'cgreGFP', 'ppluGFP', 'amacV14LGFP', 'avGFP'], desc='Genes'):
    
    mutations_in_all_backgrounds[gene] = {}
    
    # Restrict to this gene and to rows with n_mut <= 10.
    data_subset = data_aa[data_aa['gene'] == gene]
    data_subset = data_subset[data_subset['n_mut'] <= 10]
    if gene == 'avGFP':
        lowest_acceptable_fitness = 2.5
    else:
        lowest_acceptable_fitness = gate_borders_log[gene[0:4]][1] # arbitrary threshold: P4-P5 border
        
    for mutation in unique_single_mutations:
        # The threshold above used to be computed and then dropped; pass it on
        # so that backgrounds below it are actually excluded.
        in_all_backgrounds = get_mutation_in_all_backgrounds(
            data_subset, mutation, lowest_acceptable_fitness=lowest_acceptable_fitness)
        if len(in_all_backgrounds) > 0:
            mutations_in_all_backgrounds[gene][mutation] = in_all_backgrounds


Genes:   0%|          | 0/5 [00:00<?, ?it/s][A
Genes:  20%|██        | 1/5 [03:29<13:56, 209.01s/it][A
Genes:  40%|████      | 2/5 [05:57<09:32, 190.75s/it][A
Genes:  60%|██████    | 3/5 [08:44<06:07, 183.76s/it][A
Genes:  80%|████████  | 4/5 [10:08<02:33, 153.81s/it][A
Genes: 100%|██████████| 5/5 [14:46<00:00, 177.30s/it][A


In [160]:
import pickle

In [180]:
# pickle.dump(mutations_in_all_backgrounds, open( 'mutation_effects_by_background_log10.pkl', 'wb' ) )

#### Same analysis, repeated on the scaled data (`data_aa_scaled`)

In [181]:
# gene -> mutation -> panel returned by get_mutation_in_all_backgrounds,
# built from data_aa_scaled. Mutations that yield an empty panel are skipped.
#
# NOTE(review): lowest_acceptable_fitness is computed for every gene but is
# never handed to get_mutation_in_all_backgrounds, so no brightness filter is
# applied here. The values (2.5, gate_borders_log) look like log-scale numbers
# and may not match the units of data_aa_scaled -- confirm before wiring it in.
mutations_in_all_backgrounds_scaled = {}

for gene in tqdm(['amacGFP', 'cgreGFP', 'ppluGFP', 'amacV14LGFP', 'avGFP'], desc='Genes'):

    mutations_in_all_backgrounds_scaled[gene] = {}

    # Rows of this gene, then only those with n_mut <= 10.
    data_subset = (
        data_aa_scaled
        .loc[data_aa_scaled['gene'] == gene]
        .loc[lambda frame: frame['n_mut'] <= 10]
    )

    # arbitrary threshold: P4-P5 border (fixed value for avGFP)
    lowest_acceptable_fitness = 2.5 if gene == 'avGFP' else gate_borders_log[gene[0:4]][1]

    for mutation in unique_single_mutations:
        in_all_backgrounds = get_mutation_in_all_backgrounds(data_subset, mutation)
        if len(in_all_backgrounds) == 0:
            continue
        mutations_in_all_backgrounds_scaled[gene][mutation] = in_all_backgrounds


Genes:   0%|          | 0/5 [00:00<?, ?it/s][A
Genes:  20%|██        | 1/5 [03:24<13:37, 204.36s/it][A
Genes:  40%|████      | 2/5 [06:18<09:46, 195.41s/it][A
Genes:  60%|██████    | 3/5 [09:24<06:24, 192.44s/it][A
Genes:  80%|████████  | 4/5 [10:54<02:41, 161.87s/it][A
Genes: 100%|██████████| 5/5 [15:42<00:00, 188.49s/it][A


In [182]:
# pickle.dump(mutations_in_all_backgrounds_scaled, open( 'mutation_effects_by_background_scaled.pkl', 'wb' ) )