In [1]:
%run lgs00_general_functions.ipynb
%run lgs01_functions_for_getting_aa_pseudopositions.ipynb

In [2]:
%run lgs02_natural_FP_alignment.ipynb

## Data import

### Amino acid genotypes

In [3]:
# Amino-acid genotype -> brightness table for the genes named in the file name;
# log10 of the replicate-mean brightness is stored as 'log_brightness'.
data_aa = pd.read_csv(
    os.path.join(
        data_folder,
        'final_datasets',
        'amacGFP_cgreGFP_ppluGFP2__final_aminoacid_genotypes_to_brightness.csv',
    )
).assign(
    log_brightness=lambda frame: frame['replicates_mean_brightness'].apply(
        lambda brightness: np.log10(brightness)
    )
)

# avGFP table: only the gene label is added here, so the file itself must already
# provide the 'log_brightness' column that is selected below.
av_aa = pd.read_csv(
    os.path.join(
        data_folder,
        'final_datasets',
        'avGFP__rf_aminoacid_genotypes_to_brightness.csv',
    )
).assign(gene='avGFP')

# Keep only the shared columns and stack both tables (original row labels are kept).
cols = ['aa_genotype_native', 'aa_genotype_pseudo', 'gene', 'log_brightness']
data_aa = pd.concat([table[cols] for table in (data_aa, av_aa)])


In [4]:
# Temporarily index by (pseudo genotype, gene) so each gene's wild-type row
# can be looked up directly.
data_aa = data_aa.set_index(['aa_genotype_pseudo','gene'])

# Reference log-brightness of the wild type, per gene.
# NOTE(review): this is a scalar only if each ('wt', gene) key is unique — confirm.
ref_wt_log = {gene : data_aa.loc[('wt',gene), 'log_brightness']
                            for gene in ['amacGFP', 'avGFP', 'cgreGFP', 'ppluGFP']}
# The 'amacV14LGFP' pseudo-gene uses the V14L single mutant of amacGFP as its reference.
ref_wt_log['amacV14LGFP'] = data_aa.loc[('V14L', 'amacGFP'), 'log_brightness']

# Re-index by gene alone to take per-gene minima.
data_aa = data_aa.reset_index().set_index('gene')

# Lowest observed log-brightness per gene (the "0" of the scaled values).
ref_min_log = {gene : data_aa.loc[gene]['log_brightness'].min()
                          for gene in ['amacGFP', 'avGFP', 'cgreGFP', 'ppluGFP']}
# amacV14LGFP reuses the minimum over ALL amacGFP genotypes, not only those carrying V14L.
ref_min_log['amacV14LGFP'] = data_aa.loc['amacGFP']['log_brightness'].min()

# Back to a flat frame; 'gene' becomes an ordinary column again and the index is renumbered.
data_aa = data_aa.reset_index()

#### Get scaled values (0 = minimum value, 1 = wt)

In [5]:
def get_scaled_effects(gene, data):
    """Return a copy of one gene's rows with wild-type-relative and scaled effects added.

    For gene == 'amacV14LGFP' the rows are the amacGFP genotypes whose pseudo
    genotype contains 'V14L'. They are relabelled as gene 'amacV14LGFP', and the
    background mutation (V14L in pseudo numbering, V11L in native numbering) is
    removed from the genotype strings; a genotype consisting only of that
    mutation becomes 'wt'.

    Reads the module-level dictionaries ref_wt_log and ref_min_log.

    Added columns:
        measured_effect_log     log_brightness minus the gene's wild-type level
        scaled_brightness       0 at the gene's minimum, 1 at its wild type
        measured_effect_scaled  scaled_brightness - 1
        n_mut                   number of ':'-separated mutations (0 for 'wt')
    """
    if gene != 'amacV14LGFP':
        df = data[data.gene == gene].copy()
    else:
        df = data[(data.gene == 'amacGFP') & (data.aa_genotype_pseudo.str.contains('V14L'))].copy()
        df['gene'] = 'amacV14LGFP'
        # Same clean-up in both numbering schemes: drop the background mutation
        # together with its separator, then turn a lone background mutation into 'wt'.
        for column, background in (('aa_genotype_pseudo', 'V14L'), ('aa_genotype_native', 'V11L')):
            df[column] = df[column].apply(
                lambda genotype, bg=background: genotype.replace(bg + ':', '')
                                                        .replace(':' + bg, '')
                                                        .replace(bg, 'wt')
            )

    wt_level = ref_wt_log[gene]
    min_level = ref_min_log[gene]

    df['measured_effect_log'] = df['log_brightness'] - wt_level
    df['scaled_brightness'] = (df['log_brightness'] - min_level) / (wt_level - min_level)
    df['measured_effect_scaled'] = df['scaled_brightness'] - 1
    df['n_mut'] = df['aa_genotype_pseudo'].apply(
        lambda genotype: 0 if genotype == 'wt' else genotype.count(':') + 1
    )

    return df

In [6]:
# One frame per gene (plus the amacGFP:V14L background), each with effect columns added.
amac, cgre, pplu, av, amacV14L = (
    get_scaled_effects(gene_name, data_aa)
    for gene_name in ('amacGFP', 'cgreGFP', 'ppluGFP', 'avGFP', 'amacV14LGFP')
)

#### Get effects of single mutations

In [8]:
def get_singlemut_effects(data):
    """Map each single-mutant genotype to its measured effect.

    Only rows with n_mut == 1 are used. Returns a pair of dictionaries keyed by
    'aa_genotype_pseudo': (log-scale effects, scaled effects). If a genotype
    occurs more than once, the last occurrence wins.
    """
    singles = data[data.n_mut == 1]
    genotypes = singles['aa_genotype_pseudo']
    log_effects = {
        genotype: effect
        for genotype, effect in zip(genotypes, singles['measured_effect_log'])
    }
    scaled_effects = {
        genotype: effect
        for genotype, effect in zip(genotypes, singles['measured_effect_scaled'])
    }
    return log_effects, scaled_effects

In [9]:
# Single-mutation effect dictionaries (log scale, scaled) for every background.
(
    (siffects_amacV14L_log, siffects_amacV14L_scaled),
    (siffects_amac_log, siffects_amac_scaled),
    (siffects_pplu_log, siffects_pplu_scaled),
    (siffects_av_log, siffects_av_scaled),
    (siffects_cgre_log, siffects_cgre_scaled),
) = (
    get_singlemut_effects(gene_frame)
    for gene_frame in (amacV14L, amac, pplu, av, cgre)
)

Combine the single-mutation effects measured in each gene into one dataframe (one row per substitution)

In [10]:
def combine_singlemut_effects(scaled=False):
    """Build one table of single-mutation effects across all backgrounds.

    Parameters
    ----------
    scaled : bool, default False
        If truthy, use the module-level ``siffects_*_scaled`` dictionaries,
        otherwise the ``siffects_*_log`` ones.

    Returns
    -------
    pandas.DataFrame
        Indexed by substitution ('aa_genotype_pseudo', after ``omit_wt_state``),
        with columns effect_in_av / _amac / _amacV14L / _cgre / _pplu (NaN where
        the substitution was not measured in that background),
        wt_state_av / _amac / _cgre / _pplu, position and mutation.
        Rows are ordered by (position, mutation) so the result is reproducible.

    Notes
    -----
    ``omit_wt_state`` and ``pseudopos_to_nativeaa`` are defined elsewhere. The
    parsing below assumes substitution keys look like '<position><new residue>'
    (everything but the last character must parse as an int) — confirm against
    ``omit_wt_state``.
    """
    # Pick the source dictionaries explicitly rather than through eval() on
    # constructed variable names.
    if scaled:
        sources = {'amac': siffects_amac_scaled, 'amacV14L': siffects_amacV14L_scaled,
                   'av': siffects_av_scaled, 'cgre': siffects_cgre_scaled,
                   'pplu': siffects_pplu_scaled}
    else:
        sources = {'amac': siffects_amac_log, 'amacV14L': siffects_amacV14L_log,
                   'av': siffects_av_log, 'cgre': siffects_cgre_log,
                   'pplu': siffects_pplu_log}
    effects = {name: omit_wt_state(single_effects) for name, single_effects in sources.items()}

    # Union of all substitutions seen in any background, in a deterministic order.
    all_aasubs = set()
    for gene_effects in effects.values():
        all_aasubs.update(gene_effects.keys())
    all_aasubs = sorted(all_aasubs, key=lambda sub: (int(sub[:-1]), sub[-1]))
    positions = [int(sub[:-1]) for sub in all_aasubs]

    df = pd.DataFrame(index=pd.Index(all_aasubs, name='aa_genotype_pseudo'))
    for name in ('av', 'amac', 'amacV14L', 'cgre', 'pplu'):
        gene_effects = effects[name]
        df['effect_in_' + name] = [gene_effects[sub] if sub in gene_effects else np.nan
                                   for sub in all_aasubs]

    # pseudopos_to_nativeaa[position] is indexed 0..3 in the order av, amac, cgre, pplu.
    for slot, name in enumerate(('av', 'amac', 'cgre', 'pplu')):
        df['wt_state_' + name] = [pseudopos_to_nativeaa[position][slot] for position in positions]

    df['position'] = positions
    df['mutation'] = [sub[-1] for sub in all_aasubs]

    return df

In [11]:
# Log-scale single-mutation effects across backgrounds.
df_effects_singles = combine_singlemut_effects(scaled=False)

In [12]:
# Same table built from the scaled (0 = minimum, 1 = wt) effects.
df_effects_singles_scaled = combine_singlemut_effects(scaled=True)

#### Get expected fitnesses (additive, no epistasis assumption)

In [13]:
def get_expected_effect(muts_str, siffects_dict, min_brightness, max_brightness):
    """Additive (no-epistasis) expectation for a multi-mutant genotype.

    Parameters
    ----------
    muts_str : str
        ':'-separated mutations, e.g. 'A1C:D2E'.
    siffects_dict : mapping
        Effect of each single mutation.
    min_brightness, max_brightness : float
        The summed effect is clipped to this range.

    Returns
    -------
    float
        The clipped sum of the single-mutation effects, or NaN when ``muts_str``
        is not a string (e.g. a missing value) or any mutation is absent from
        ``siffects_dict`` (this includes 'wt' unless the mapping has a 'wt' key).

    Any other error (for example non-numeric effects) is raised instead of
    being silently turned into NaN.
    """
    if not isinstance(muts_str, str):
        return np.nan

    expectation = 0
    for mutation in muts_str.split(':'):
        try:
            expectation += siffects_dict[mutation]
        except KeyError:
            # The effect of this single mutation was never measured.
            return np.nan

    if expectation > max_brightness:
        return max_brightness
    if expectation < min_brightness:
        return min_brightness
    return expectation
    
    
    
def get_expected_effect_df(data, gene, column):
    """Add the additive expected effect of every genotype to ``data`` (in place).

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'aa_genotype_pseudo' and ``column``.
    gene : str
        Short background name ('amac', 'av', ...). It selects the module-level
        dictionary ``siffects_<gene>_log`` / ``siffects_<gene>_scaled`` and, on
        the log scale, the reference ``ref_wt_log[gene + 'GFP']``.
    column : str
        Brightness column. A name containing 'log' produces
        'expected_effect_log'; otherwise a name containing 'scaled' produces
        'expected_effect_scaled'. Any other name leaves ``data`` unchanged.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object. Expectations are clipped to the observed range
        of ``column`` expressed relative to the wild type.
    """
    if 'log' in column:
        suffix = '_log'
        wt_level = ref_wt_log[gene + 'GFP']
    elif 'scaled' in column:
        suffix = '_scaled'
        wt_level = 1  # the wild type is 1 on the scaled axis
    else:
        return data

    min_fluo = data[column].min() - wt_level
    max_fluo = data[column].max() - wt_level
    # Direct namespace lookup instead of eval() on a constructed name.
    sifs = globals()['siffects_' + gene + suffix]
    data['expected_effect' + suffix] = data['aa_genotype_pseudo'].apply(
        lambda genotype: get_expected_effect(genotype, sifs, min_fluo, max_fluo)
    )
    return data

In [14]:
# Additive expectations on the log scale for every background.
amac, pplu, cgre, av, amacV14L = (
    get_expected_effect_df(gene_frame, gene_key, 'log_brightness')
    for gene_frame, gene_key in (
        (amac, 'amac'), (pplu, 'pplu'), (cgre, 'cgre'), (av, 'av'), (amacV14L, 'amacV14L'),
    )
)

In [15]:
# Additive expectations on the scaled axis for every background.
amac, pplu, cgre, av, amacV14L = (
    get_expected_effect_df(gene_frame, gene_key, 'scaled_brightness')
    for gene_frame, gene_key in (
        (amac, 'amac'), (pplu, 'pplu'), (cgre, 'cgre'), (av, 'av'), (amacV14L, 'amacV14L'),
    )
)

#### Get info about extant alignments

In [16]:
def get_quasipos_genotype(gene, length, data):
    """Add a 'quasipos_genotype' column: native genotypes renumbered to quasi-positions.

    Each mutation in 'aa_genotype_native' is rewritten as
    '<quasi-position><new residue>' using the module-level lookup
    nativepos_to_quasipos[native_position][gene]. Native positions at or beyond
    ``length`` are all mapped to one past the quasi-position of position
    ``length - 1`` (this avoids lookup failures on frameshift genotypes).
    'wt' becomes NaN. ``data`` is modified in place and returned.
    """
    def renumber_mutation(mutation):
        native_position = int(mutation[1:-1])
        if native_position >= length:
            # avoid bugs on frameshift genotypes
            quasi_position = nativepos_to_quasipos[length - 1][gene] + 1
        else:
            quasi_position = nativepos_to_quasipos[native_position][gene]
        return str(quasi_position) + mutation[-1]

    def renumber_genotype(genotype):
        if genotype == 'wt':
            return np.nan
        return ':'.join([renumber_mutation(mutation) for mutation in genotype.split(':')])

    data['quasipos_genotype'] = data['aa_genotype_native'].apply(renumber_genotype)
    return data

In [17]:
# (alignment name, protein length) used for each background; both amacGFP
# backgrounds are looked up under 'GFPxm'.
amac, cgre, pplu, av, amacV14L = (
    get_quasipos_genotype(alignment_name, protein_length, gene_frame)
    for alignment_name, protein_length, gene_frame in (
        ('GFPxm', 238, amac),
        ('cgreGFP', 235, cgre),
        ('ppluGFP2', 222, pplu),
        ('avGFP', 238, av),
        ('GFPxm', 238, amacV14L),
    )
)

#### Add ddG predictions (old protocol)

In [18]:
def get_ddGs(data, gene):
    """Attach predicted ddG values to ``data`` (in place) and return it.

    Reads the ddG prediction table from ``data_folder``/protein_structure (its
    first column is used as the index), keeps the rows whose 'gene' equals
    ``gene`` and looks up each 'aa_genotype_pseudo' in that index. Genotypes
    without a prediction get NaN in the new 'ddG_prediction' column.
    """
    predictions_path = os.path.join(data_folder, 'protein_structure',
                                    'avGFP_amacGFP_cgreGFP_ppluGFP2__ddG_predictions.csv')
    all_predictions = pd.read_csv(predictions_path, index_col=0)
    gene_predictions = all_predictions[all_predictions.gene == gene].copy()
    predicted_genotypes = set(gene_predictions.index)

    def lookup_prediction(genotype):
        if genotype in predicted_genotypes:
            return gene_predictions.loc[genotype, 'ddG_prediction']
        return np.nan

    data['ddG_prediction'] = data['aa_genotype_pseudo'].apply(lookup_prediction)
    return data

In [19]:
# NOTE(review): get_ddGs matches on the table's 'gene' column; confirm the table
# really uses the labels 'ppluGFP' and 'amacV14LGFP', otherwise those frames get all-NaN.
amac, cgre, pplu, av, amacV14L = (
    get_ddGs(gene_frame, gene_label)
    for gene_frame, gene_label in (
        (amac, 'amacGFP'),
        (cgre, 'cgreGFP'),
        (pplu, 'ppluGFP'),
        (av, 'avGFP'),
        (amacV14L, 'amacV14LGFP'),
    )
)

#### For clarity, put log values and scaled values into separate datasets, and group data from all genes

In [19]:
# Three parallel column lists: generic names, scaled-value names, log-value names.
# They share the first six columns and differ only in the three value columns,
# listed in matching order so one list can be used to rename another.
cols, cols_scaled, cols_log = (
    ['aa_genotype_pseudo', 'aa_genotype_native', 'gene', 'n_mut',
     'quasipos_genotype', 'ddG_prediction', *value_columns]
    for value_columns in (
        ('brightness', 'measured_effect', 'expected_effect'),
        ('scaled_brightness', 'measured_effect_scaled', 'expected_effect_scaled'),
        ('log_brightness', 'measured_effect_log', 'expected_effect_log'),
    )
)

In [20]:
# Scaled-value view of every background ...
amacV14L_scaled, amac_scaled, av_scaled, pplu_scaled, cgre_scaled = (
    gene_frame[cols_scaled].copy()
    for gene_frame in (amacV14L, amac, av, pplu, cgre)
)

# ... stacked into one table whose columns are renamed positionally to the generic names.
data_aa_scaled = pd.concat(
    [amac_scaled, amacV14L_scaled, cgre_scaled, pplu_scaled, av_scaled]
)
data_aa_scaled.columns = cols

In [21]:
# From here on the per-gene frames keep only the log-value columns
# (the scaled columns are dropped, so the scaled split above must run first).
amacV14L, amac, av, pplu, cgre = (
    gene_frame[cols_log].copy()
    for gene_frame in (amacV14L, amac, av, pplu, cgre)
)

# Stacked log-scale table, columns renamed positionally to the generic names.
data_aa = pd.concat([amac, amacV14L, cgre, pplu, av])
data_aa.columns = cols

#### Get epistasis, n_extant, etc.

In [22]:
# Epistasis = measured effect minus the additive expectation, on both scales.
data_aa['epistasis'], data_aa_scaled['epistasis'] = (
    table['measured_effect'] - table['expected_effect']
    for table in (data_aa, data_aa_scaled)
)

In [23]:
# check_n_mut_extant is defined elsewhere; it is applied to each quasi-position genotype.
data_aa['n_mut_extant'], data_aa_scaled['n_mut_extant'] = (
    table['quasipos_genotype'].apply(check_n_mut_extant)
    for table in (data_aa, data_aa_scaled)
)

In [24]:
# True when any mutated position of the genotype is in buried_pos; NaN for the wild type.
# Mutations whose middle part is not purely digits are ignored.
data_aa['has_buried_mutation'], data_aa_scaled['has_buried_mutation'] = (
    table['aa_genotype_pseudo'].apply(
        lambda genotype: np.nan if genotype == 'wt' else bool(
            buried_pos & {int(mutation[1:-1])
                          for mutation in genotype.split(':')
                          if mutation[1:-1].isdigit()}
        )
    )
    for table in (data_aa, data_aa_scaled)
)

#### Get dataset with only doublemuts

In [25]:
def get_epistasis_type_for_doublemuts(a, b, ab, threshold=0):
    """Classify the epistasis of a double mutant.

    Parameters
    ----------
    a, b : float
        Effects of the two single mutations in the wild-type background.
    ab : float
        Effect of the double mutant.
    threshold : float, default 0
        An effect with absolute value below this is treated as neutral and
        cannot count as a sign change.

    Returns
    -------
    str or float
        'reciprocal_sign' if both mutations change sign between the wild-type
        background and the background carrying the other mutation, 'sign' if
        only one does, otherwise 'positive' when ab > a + b and 'negative' when
        it is not (exactly zero epistasis is therefore reported as 'negative').
        NaN if any of ``a``, ``b`` or ``ab`` is missing.
    """
    # A missing double-mutant effect used to fall through and be labelled
    # 'negative'; treat it like a missing single effect.
    if pd.isna(a) or pd.isna(b) or pd.isna(ab):
        return np.nan

    # Effect of each mutation when the other one is already present.
    a_new = ab - b
    b_new = ab - a

    a_sign = a_new < 0 < a or a < 0 < a_new
    b_sign = b_new < 0 < b or b < 0 < b_new
    a_neutral = (abs(a) < threshold) or (abs(a_new) < threshold)
    b_neutral = (abs(b) < threshold) or (abs(b_new) < threshold)

    a_flips = a_sign and not a_neutral
    b_flips = b_sign and not b_neutral

    if a_flips and b_flips:
        return 'reciprocal_sign'
    if a_flips or b_flips:
        return 'sign'
    return 'positive' if ab - (a + b) > 0 else 'negative'
    

In [26]:
def get_doublemuts(data, scaled=False):
    """Return the double mutants of ``data`` annotated with their single-mutation effects.

    Parameters
    ----------
    data : pandas.DataFrame
        Needs the columns 'n_mut', 'gene', 'aa_genotype_pseudo',
        'aa_genotype_native' and 'measured_effect'.
    scaled : bool, default False
        True selects the module-level ``siffects_<gene>_scaled`` dictionaries,
        anything else the ``siffects_<gene>_log`` ones.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows with n_mut == 2 plus:
        mut1_effect, mut2_effect    effect of each mutation alone (NaN if unmeasured)
        mut1_effect_mut2bg          measured_effect - mut2_effect
        mut2_effect_mut1bg          measured_effect - mut1_effect
        position_pseudo / _native   'pos1:pos2' with the residue letters stripped
    """
    # Equality test kept on purpose: only True (or 1) selects the scaled dictionaries.
    addon = '_scaled' if scaled == True else '_log'

    def single_effect(gene, mutation):
        # 'amacGFP' -> siffects_amac<addon>; looked up by name without eval().
        effects = globals()['siffects_' + gene[:-3] + addon]
        return effects[mutation] if mutation in effects else np.nan

    df = data[data.n_mut == 2].copy()

    # Columns are read by label; positional x[0] / x[1] on a label-indexed row
    # is deprecated in recent pandas.
    genes = list(df['gene'])
    mutation_pairs = [genotype.split(':') for genotype in df['aa_genotype_pseudo']]
    df['mut1_effect'] = [single_effect(gene, pair[0]) for gene, pair in zip(genes, mutation_pairs)]
    df['mut2_effect'] = [single_effect(gene, pair[1]) for gene, pair in zip(genes, mutation_pairs)]
    df['mut1_effect_mut2bg'] = df.measured_effect - df.mut2_effect
    df['mut2_effect_mut1bg'] = df.measured_effect - df.mut1_effect

    def positions_only(genotype):
        return ':'.join([mutation[1:-1] for mutation in genotype.split(':')])

    df['position_pseudo'] = df['aa_genotype_pseudo'].apply(positions_only)
    df['position_native'] = df['aa_genotype_native'].apply(positions_only)

    return df

In [27]:
# Double-mutant tables on the log scale and on the scaled axis.
doublemuts = get_doublemuts(data_aa, scaled=False)
doublemuts_scaled = get_doublemuts(data_aa_scaled, scaled=True)

Add minimal physical distance between pairs of amino acids

In [28]:
def get_distances(gene):
    """Load a gene's residue distance matrix as a flat dictionary.

    Reads '<gene>__minimal_distances_between_aa.csv' from
    ``structure_folder``/residue_distance_matrices (first column as index) and
    returns {'<row label>:<column label>': value} for every cell.
    """
    matrix_path = os.path.join(structure_folder, 'residue_distance_matrices',
                               gene+'__minimal_distances_between_aa.csv')
    matrix = pd.read_csv(matrix_path, index_col=0)
    return {
        str(row_label) + ':' + str(column_label): matrix.loc[row_label, str(column_label)]
        for row_label in matrix.index
        for column_label in matrix.columns
    }

def add_distances(dm):
    """Add a 'distance' column to a double-mutant table (in place) and return it.

    For each row, the gene name without its trailing 'GFP' selects a distance
    dictionary (see ``get_distances``) and 'position_native' ('pos1:pos2') is
    looked up in it; pairs missing from the matrix get NaN. The amacV14L
    background reuses the amacGFP distances. A gene without a distance matrix
    raises KeyError.
    """
    amac_distances = get_distances('amacGFP')
    # Explicit gene -> distances mapping instead of eval() over local variable names.
    distances_by_gene = {
        'amac': amac_distances,
        'amacV14L': amac_distances,
        'cgre': get_distances('cgreGFP'),
        'pplu': get_distances('ppluGFP2'),
        'av': get_distances('avGFP'),
    }

    def pair_distance(gene, position_pair):
        distances = distances_by_gene[gene[:-3]]
        return distances[position_pair] if position_pair in distances else np.nan

    # Columns are read by label; positional x[0] / x[1] on a label-indexed row
    # is deprecated in recent pandas.
    dm['distance'] = [pair_distance(gene, position_pair)
                      for gene, position_pair in zip(dm['gene'], dm['position_native'])]
    return dm

In [29]:
# Only the log-scale double-mutant table receives the distance column.
doublemuts = add_distances(dm=doublemuts)

#### Masks

In [30]:
# Boolean row masks over data_aa (they share data_aa's index).
amacV14L_mask = data_aa['gene']=='amacV14LGFP'
amac_mask = (data_aa['gene']=='amacGFP')
# amacGFP genotypes that do NOT carry the V14L mutation
amacV14V_mask = amac_mask & ~(data_aa['aa_genotype_pseudo'].str.contains('V14L'))
cgre_mask = data_aa['gene']=='cgreGFP'
pplu_mask = data_aa['gene']=='ppluGFP'
av_mask = data_aa['gene']=='avGFP'
wt_mask = data_aa['n_mut']==0
singles_mask = data_aa['n_mut']==1
# has_buried_mutation is NaN for the wild type, so compare with True explicitly
buried_mask = data_aa['has_buried_mutation']==True
# Genotypes containing a '*'. Raw string: '\*' in a normal string literal is an
# invalid escape sequence and triggers a warning on current Python versions.
nonsense_mask = data_aa['aa_genotype_pseudo'].str.contains(r'\*')
# Regex alternation over chromomuts['pseudopos'] (defined elsewhere).
# NOTE(review): this is a substring match — confirm the entries cannot match inside longer tokens.
chromomut_mask = data_aa['aa_genotype_pseudo'].str.contains('|'.join(chromomuts['pseudopos']))

### Nucleotide genotypes

In [31]:
# Nucleotide genotype -> brightness table for the genes named in the file name;
# log10 of the replicate-mean brightness is stored as 'log_brightness'.
data_nt = pd.read_csv(
    os.path.join(
        data_folder,
        'final_datasets',
        'amacGFP_cgreGFP_ppluGFP2__final_nucleotide_genotypes_to_brightness.csv',
    )
).assign(
    log_brightness=lambda frame: frame['replicates_mean_brightness'].apply(
        lambda brightness: np.log10(brightness)
    )
)

# avGFP table: label the gene and strip every 's' character from the nucleotide genotypes.
# As with the amino-acid data, 'log_brightness' must already be present in this file.
data_nt_av = pd.read_csv(
    os.path.join(
        data_folder,
        'final_datasets',
        'avGFP__rf_nucleotide_genotypes_to_brightness.csv',
    )
).assign(
    gene='avGFP',
    nt_genotype=lambda frame: frame['nt_genotype'].apply(
        lambda genotype: genotype.replace('s', '')
    ),
)

cols = ['nt_genotype', 'aa_genotype_native', 'aa_genotype_pseudo', 'gene', 'log_brightness']
data_nt = pd.concat([table[cols] for table in (data_nt, data_nt_av)])

In [32]:
# Same per-gene scaling as for the amino-acid table; the reference levels
# (ref_wt_log / ref_min_log) are the ones computed from the amino-acid data.
amac_nt, cgre_nt, pplu_nt, av_nt, amacV14L_nt = (
    get_scaled_effects(gene_name, data_nt)
    for gene_name in ('amacGFP', 'cgreGFP', 'ppluGFP', 'avGFP', 'amacV14LGFP')
)

In [33]:
# Stack all backgrounds row-wise; the amacV14L rows duplicate amacGFP rows under a new gene label.
data_nt = pd.concat([amac_nt, amacV14L_nt, cgre_nt, pplu_nt, av_nt], axis=0)

In [34]:
# Boolean row masks over data_nt (they share data_nt's index).
amac_mask_nt = data_nt['gene']=='amacGFP'
amacV14L_mask_nt = data_nt['gene']=='amacV14LGFP'
# amacGFP genotypes that do NOT carry V14L, matching amacV14V_mask for the
# amino-acid table. The gene-label masks above are mutually exclusive, so
# `amac_mask_nt & ~amacV14L_mask_nt` excluded nothing; the genotype has to be tested.
amacV14V_mask_nt = amac_mask_nt & ~(data_nt['aa_genotype_pseudo'].str.contains('V14L'))
cgre_mask_nt = data_nt['gene']=='cgreGFP'
pplu_mask_nt = data_nt['gene']=='ppluGFP'
av_mask_nt = data_nt['gene']=='avGFP'
# wild type at the protein level (includes synonymous nucleotide variants)
wt_mask_nt = data_nt['aa_genotype_pseudo'] == 'wt'

In [None]:
# Number of nucleotide mutations: ':'-separated entries, 0 for the wild type.
data_nt['n_nt_mut'] = data_nt['nt_genotype'].apply(
    lambda genotype: genotype.count(':') + 1 if genotype != 'wt' else 0
)