In [6]:
import os
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)

from scipy.stats import spearmanr, pearsonr
from collections import Counter
import logomaker

#from fisher_tools.utils import highlight_differences
#from fisher_tools.sequence_utils import translate_dna#s, calculate_hamming_distance
#from fisher_tools.constants import AMINO_ACID_CHARS

import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.patches as mpatches
plt.rcParams['axes.prop_cycle'] = plt.cycler('color', ['#512b58', '#5b8c85','#f18867', '#50bda1', '#21243d', '#3282b8', '#ff3f98', '#c6f1d6'])

%matplotlib inline

In [7]:
# Directory holding the raw input tables (relative to the notebook).
DATA_LOCATION = '_02_RawData/'

# Read the tab-separated genotype table and replace every missing value
# with an empty string so downstream string handling never sees NaN.
df = pd.read_csv(
    f'{DATA_LOCATION}/amac_cgre_pplu_aa_genotype_indexed_clean_CDF_april2020.txt',
    sep='\t',
).fillna('')

# Alphabetically sorted unique values found in the `gene` column.
GFPs = sorted(set(df.gene))

In [8]:
# Reference sequences keyed by gene name (presumably the wild-type protein
# sequences; every entry ends with '*').
WT_SEQs = {
    'avGFP': 'MSKGEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSALSKDPNEKRDHMVLLEFVTAAGITHGMDELYK*',
    'amacGFP': 'MSKGEELFTGIVPVLIELDGDVHGHKFSVRGEGEGDADYGKLEIKFICTTGKLPVPWPTLVTTLSYGILCFARYPEHMKMNDFFKSAMPEGYIQERTIFFQDDGKYKTRGEVKFEGDTLVNRIELKGMDFKEDGNILGHKLEYNFNSHNVYIMPDKANNGLKVNFKIRHNIEGGGVQLADHYQTNVPLGDGPVLIPINHYLSCQTAISKDRNETRDHMVFLEFFSACGHTHGMDELYK*',
    'cgreGFP': 'MTALTEGAKLFEKEIPYITELEGDVEGMKFIIKGEGTGDATTGTIKAKYICTTGDLPVPWATILSSLSYGVFCFAKYPRHIADFFKSTQPDGYSQDRIISFDNDGQYDVKAKVTYENGTLYNRVTVKGTGFKSNGNILGMRVLYHSPPHAVYILPDRKNGGMKIEYNKAFDVMGGGHQMARHAQFNKPLGAWEEDYPLYHHLTVWTSFGKDPDDDETDHLTIVEVIKAVDLETYR*',
    'ppluGFP': 'MPAMKIECRITGTLNGVEFELVGGGEGTPEQGRMTNKMKSTKGALTFSPYLLSHVMGYGFYHFGTYPSGYENPFLHAINNGGYTNTRIEKYEDGGVLHVSFSYRYEAGRVIGDFKVVGTGFPEDSVIFTDKIIRSNATVEHLHPMGDNVLVGSFARTFSLRDGGYYSFVVDSHMHFKSAIHPSILQNGGPMFAFRRVEELHSNTELGIVEYQHAFKTPIAFA*',
}


# The same four sequences with '-' characters inserted (presumably alignment
# gaps from a multiple-sequence alignment; TODO confirm the source alignment).
WT_SEQs_ALIGNED = {
    'avGFP': 'MSK---GEELFTGVVPILVELDGDVNGHKFSVSGEGEGDATYGKLTLKFICTTGKLPVPWPTLVTTLSYGVQCFSRYPDHMKQHDFFKSAM-PEGYVQERTIFFKDDGNYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNYNSHNVYIMADKQKNGIKVNFKIRHNIEDGSVQLADHYQQNTPIGDGPVLLP---DNHYLSTQSALSKDPNE-KRDHMVLLEFVTAAGITHGMDELYK*',
    'amacGFP': 'MSK---GEELFTGIVPVLIELDGDVHGHKFSVRGEGEGDADYGKLEIKFICTTGKLPVPWPTLVTTLSYGILCFARYPEHMKMNDFFKSAM-PEGYIQERTIFFQDDGKYKTRGEVKFEGDTLVNRIELKGMDFKEDGNILGHKLEYNFNSHNVYIMPDKANNGLKVNFKIRHNIEGGGVQLADHYQTNVPLGDGPVLIP---INHYLSCQTAISKDRNE-TRDHMVFLEFFSACGHTHGMDELYK*',
    'cgreGFP': 'MTALTEGAKLFEKEIPYITELEGDVEGMKFIIKGEGTGDATTGTIKAKYICTTGDLPVPWATILSSLSYGVFCFAKYPRHIA--DFFKSTQ-PDGYSQDRIISFDNDGQYDVKAKVTYENGTLYNRVTVKGTGFKSNGNILGMRVLYHSPPHAVYILPDRKNGGMKIEYNKAFDVMGGGHQMARHAQFNKPLGAWEEDYP---LYHHLTVWTSFGKDPDDDETDHLTIVEVIKAVDL--E---TYR*',
    'ppluGFP': 'MPA-----------MKIECRITGTLNGVEFELVGGGEGTPEQGRMTNKMKSTKGALTFSPYLLSHVMGYGFYHFGTYPSGYE--NPFLHAINNGGYTNTRIEKYEDGGVLHVSFSYRYEAGRVIGDFKVVGTGFPEDSVIFTDKII-RSNATVEHLHPM-GDNVLVGSFARTFSLRDGGYYSFVVDSHMHFKSAIHPSILQNGGPMFAFRRVEELH-----SNTELGIVEYQHAFKTPIAFA----*',
}

In [9]:
def collapse_synonyms(df, protein_column, score_column):
    """Pool the scores of all rows that share the same ``protein_column`` value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing at least ``protein_column`` and ``score_column``.
    protein_column : str
        Column whose values define the groups.
    score_column : str
        Column whose values are collected per group.

    Returns
    -------
    pandas.DataFrame
        Indexed by the unique ``protein_column`` values, with columns
        ``all_scores`` (list of the group's scores), ``score_mean``,
        ``score_std`` (``np.std`` with its default ``ddof=0``) and
        ``n_synonyms`` (number of rows pooled into the group).
    """
    # One row per group; the single column holds the list of pooled scores.
    pooled = (
        df.groupby(protein_column)[[score_column]]
        .agg(list)
        .rename(columns={score_column: 'all_scores'})
    )
    score_lists = pooled['all_scores']

    # Summary statistics are computed list-by-list, in column order.
    return pooled.assign(
        score_mean=[np.mean(scores) for scores in score_lists],
        score_std=[np.std(scores) for scores in score_lists],
        n_synonyms=[len(scores) for scores in score_lists],
    )

In [10]:
def insert_mutations(wt_sq, mutations):
    """Return ``wt_sq`` with the given single-character substitutions applied.

    Parameters
    ----------
    wt_sq : str
        Starting sequence; it is converted with ``str()`` before editing.
    mutations : iterable of str
        Mutation codes of the form ``<char><position><char>``, e.g.
        ``'K10Q'``. The first character is ignored (it is not checked
        against ``wt_sq``), the middle part is parsed as an integer and used
        directly as a 0-based index into the sequence, and the last
        character is the replacement. Empty-string entries mean "no
        mutation" and are skipped, so ``['']``, ``('',)`` and
        ``['', 'C2Y']`` are all accepted.

    Returns
    -------
    str
        The mutated sequence (equal to ``wt_sq`` when there is nothing to
        apply).

    Raises
    ------
    ValueError
        If the middle part of a non-empty code is not an integer.

    Notes
    -----
    No bounds check is performed: a position at or beyond the end of the
    sequence appends the replacement character instead of raising.
    """
    new_sq = str(wt_sq)
    for mut in mutations:
        # A blank code stands for "wild type"; skipping it here (instead of
        # comparing the whole argument to ['']) also handles tuples, arrays
        # and blank entries mixed with real mutations.
        if mut == '':
            continue
        position = int(mut[1:-1])
        aa = mut[-1]
        # Replace exactly one character at index `position`.
        new_sq = new_sq[:position] + aa + new_sq[position + 1:]
    return new_sq