In [192]:
# USER INPUTS

# CSV of input sequences; loaded into the prot_seq table with pandas.read_csv
inFilename = '20210831_igaba_eppcr_input_masterlist.csv'
# CSV that the annotated prot_seq table is written to at the end of the notebook
outFilename = '20211005_igaba_eppcr_mutations.csv'

# 1 = translate each 'DNA Sequence' and compare it with 'Amino Sequence'
#     (result goes into the 'Translation matched' column); any other value skips that cell's work
translateDNA = 1
# 1 = also store the translated sequence in a 'Translated AminoSeq' column
#     (only consulted when translateDNA == 1)
showTranslation = 1

In [193]:
# import dependencies
import pandas as pd

In [194]:
# set reference protein sequence to 602.2
#refseq = 'MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKAMVESGNVQWDVVDVEADFALRAAAEGLLEPLDFSVIQRDKIDPRFVSDHGVGSFLFSFVLGYNEGKLGASKPQDWTALFDTKTYPGKRALYKWPSPGVLELALLADGVPADKLYPLDLDRAFKKLDTIKKDIVWWGGGAQSQQLLASGEVSMGQFWNGRIHALQEDGAPVGVSWKQNLVMADILVVPKGTKNKAAAMKFLASASSAKGQDDFSALTAYAPVNIDSVQRLDLAQVRITADKQKNGIMANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKGGTGGSMSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNWNANLAPNLPTAYVKDQITLDFAYWAKNGPAIATRWNEWLVKGSHHHHHH*'

# 2021-09-08
# Use the "CAG-insert" style sequence:
# The first two "MG" residues are replaced with "S", which is defined as the first residue in the 514.x vector.
# This is done to facilitate linear numbering of the sequence for mutation calls.
# Each entry is compared with refseq position by position (1-based numbering in the mutation calls).
# The trailing '*' is kept as part of the reference; a '*' in an entry is reported as 'STOP'.
refseq = 'SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKAMVESGNVQWDVVDVEADFALRAAAEGLLEPLDFSVIQRDKIDPRFVSDHGVGSFLFSFVLGYNEGKLGASKPQDWTALFDTKTYPGKRALYKWPSPGVLELALLADGVPADKLYPLDLDRAFKKLDTIKKDIVWWGGGAQSQQLLASGEVSMGQFWNGRIHALQEDGAPVGVSWKQNLVMADILVVPKGTKNKAAAMKFLASASSAKGQDDFSALTAYAPVNIDSVQRLDLAQVRITADKQKNGIMANFKIRHNVEDGSVQLADHYQQNTPIGDGPVLLPDNHYLSTQSVLSKDPNEKRDHMVLLEFVTAAGITLGMDELYKGGTGGSMSKGEELFTGVVPILVELDGDVNGHKFSVRGEGEGDATNGKLTLKFICTTGKLPVPWPTLVTTLTYGVQCFSRYPDHMKQHDFFKSAMPEGYVQERTISFKDDGTYKTRAEVKFEGDTLVNRIELKGIDFKEDGNILGHKLEYNWNANLAPNLPTAYVKDQITLDFAYWAKNGPAIATRWNEWLVKGSHHHHHH*'

# Leader sequence that is prepended to every insert sequence to build the 'Igk + Insert Seq' column
igk_leader = 'METDTLLLWVLLLWVPGSTGDR'

In [195]:
# read the csv of sequences into a DataFrame (one row per entry).
# Later cells rely on the columns 'Plate', 'Well', 'DNA Sequence' and 'Amino Sequence'.
prot_seq = pd.read_csv(inFilename)

In [196]:
#prot_seq

In [197]:
# report the table dimensions as (rows, columns)
table_shape = prot_seq.shape
print(table_shape)

# number of entries (rows); used as the loop bound in later cells
numItems = table_shape[0]
print(numItems)

(81, 7)
81


In [198]:
# Add empty placeholder columns for the derived sequences, length and mutations.
# Rows are selected with the same "Plate is not an empty string" mask for every column.
plate_not_blank = prot_seq['Plate'] != ''
for placeholder_col in (
    'Insert Seq',        # formerly called CAG-insert
    'Igk + Insert Seq',
    'Length',
    'Mutations',
):
    prot_seq.loc[plate_not_blank, placeholder_col] = ''
#prot_seq

In [199]:
# derived from /tsanga/code/find_mutations.py
# version 2021-08-18

# Go through the sequences and compare each one to the reference (refseq, 602.2).
# Fills the 'Insert Seq', 'Igk + Insert Seq', 'Length' and 'Mutations' columns.
# It's important to note that the residue numbering is specific to this analysis; it must be
# converted to Jonny's system for interpretation.


def call_mutations(query, reference):
    """Return the substitutions of `query` relative to `reference`.

    Each call is formatted as <reference residue><1-based position><new residue>,
    e.g. 'S99A'; a '*' in `query` is written as 'STOP' (e.g. 'K253STOP').
    Only the positions present in both sequences are compared, so a length
    difference never raises; the caller is responsible for flagging it.
    """
    calls = []
    for position, (ref_res, query_res) in enumerate(zip(reference, query), start=1):
        if query_res != ref_res:
            new_res = 'STOP' if query_res == '*' else query_res
            calls.append(ref_res + str(position) + new_res)
    return calls


insert_seqs = []
igk_insert_seqs = []
seq_lengths = []
mutation_strings = []

for row_label, seq in prot_seq['Amino Sequence'].items():
    if not isinstance(seq, str):
        # An empty CSV cell is read as NaN; leave the derived fields blank instead of crashing.
        print('row {}: no amino acid sequence, left blank'.format(row_label))
        insert_seqs.append('')
        igk_insert_seqs.append('')
        seq_lengths.append('')
        mutation_strings.append('')
        continue

    # make the "CAG-insert" sequence: the first two residues are replaced with 'S'
    cagSeq = 'S' + seq[2:]

    if len(cagSeq) != len(refseq):
        # Substitution calls assume aligned, equal-length sequences; flag anything else
        # so a truncation/extension is not mistaken for a clean result.
        print('row {}: insert length {} differs from reference length {}; '
              'only the overlapping positions were compared'.format(
                  row_label, len(cagSeq), len(refseq)))

    insert_seqs.append(cagSeq)
    igk_insert_seqs.append(igk_leader + cagSeq)   # Igk + Insert sequence
    seq_lengths.append(len(cagSeq))
    mutation_strings.append(', '.join(call_mutations(cagSeq, refseq)))

# Write results back by row position (the lists are in row order), not by matching sequences.
prot_seq['Insert Seq'] = insert_seqs
prot_seq['Igk + Insert Seq'] = igk_insert_seqs
prot_seq['Length'] = seq_lengths
prot_seq['Mutations'] = mutation_strings

In [200]:
# Optional sanity check: translate each DNA sequence and compare it with the supplied
# amino acid sequence.
if translateDNA == 1:
    # translate_v2 is a project-local module; it is only required when translateDNA is on.
    from translate_v2 import translate

    # Translate row by row, in table order.
    translated_seqs = [translate(dnaseq) for dnaseq in prot_seq['DNA Sequence']]

    # Results are written back by row position rather than by a (Plate, Well) lookup, so
    # rows with a repeated or missing Plate/Well still receive their own result.
    # The flag is stored as the strings 'True' / 'False'.
    prot_seq['Translation matched'] = [
        'True' if translatedseq == aminoseq else 'False'
        for translatedseq, aminoseq in zip(translated_seqs, prot_seq['Amino Sequence'])
    ]

    if showTranslation == 1:
        # keep the translated sequence next to the flag for manual inspection
        prot_seq['Translated AminoSeq'] = translated_seqs


In [201]:
# display the annotated table (derived sequences, length, mutation calls and translation check)
prot_seq

Unnamed: 0,Plate,Well,F0,Fmax,DFF,DNA Sequence,Amino Sequence,Insert Seq,Igk + Insert Seq,Length,Mutations,Translation matched,Translated AminoSeq
0,602dot7,control,,,,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102Y, L308R",False,M
1,602dot8,control,,,,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102Y, L308S",False,M
2,602dot9,control,,,,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102Y, F104Y, K253L, L308S",False,M
3,602dot10,control,,,,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102Y, F104Y, L308S",False,M
4,602dot11,control,,,,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102Y, K253L, L308S",False,M
5,M0003,E04,753.0,3500.0,3.65,ATG,MGESINFVSWGGSAQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSAQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSAQDAQKQAWADP...,569,"T13A, S99A, F102Y, F104Y, T129S, K253L, L308S",False,M
6,M0004,B06,1451.0,5184.0,2.57,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDFGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDFGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"Y41F, N52D, S99A, F102Y, T122A, K221M, L308S, ...",False,M
7,M0004,H11,1006.0,3398.0,2.38,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"V58L, S99A, F102Y, F104Y, L308S",False,M
8,M0005,B09,663.0,4213.0,5.35,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S80T, D94G, S99A, F102Y, Y137C, K253L, L308S, ...",False,M
9,M0005,D07,759.0,3146.0,3.14,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102N, F104Y, L308S, R345C",False,M


In [202]:
# Write the annotated table to outFilename.
# NOTE(review): with pandas' default index=True the row index is written as an unnamed first
# column — confirm downstream readers expect that before changing it.
prot_seq.to_csv(outFilename)

In [205]:
# spot-check: show only the rows that belong to plate M0016
on_plate_m0016 = prot_seq["Plate"] == "M0016"
prot_seq[on_plate_m0016]

Unnamed: 0,Plate,Well,F0,Fmax,DFF,DNA Sequence,Amino Sequence,Insert Seq,Igk + Insert Seq,Length,Mutations,Translation matched,Translated AminoSeq
31,M0016,B09,843.0,4971.0,4.9,ATGGGAGAATCGATTAATTTCGTGAGCTGGGGCGGTAGCACCCAGG...,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"A62T, S99A, F102Y, K253L, L308S, I359F, P548Q",True,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...
32,M0016,E08,570.0,5121.0,7.98,ATGGGAGAATCGATTAATTTCGTGAGCTGGGGCGGTAGCACCCAGG...,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"S99A, F102Y, M226I, K253L, L308S, N526S",True,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...
33,M0016,G05,687.0,6143.0,7.94,ATGGGAGAATCGATTAATTTCGTGAGCTGGGGCGGTAGCACCCAGG...,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"D59V, D85A, S99A, F102Y, K253L, L308S, N526S",True,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...
34,M0016,H05,1054.0,8341.0,6.91,ATG,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"D59V, D85A, S99A, F102Y, K253L, L308S, N526S",False,M
35,M0016,H06,557.0,4877.0,7.76,ATGGGAGAATCGATTAATTTCGTGAGCTGGGGCGGTAGCACCCAGG...,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...,SESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLKA...,METDTLLLWVLLLWVPGSTGDRSESINFVSWGGSTQDAQKQAWADP...,569,"D59V, S99A, F102Y, Y137C, K253L, L308S",True,MGESINFVSWGGSTQDAQKQAWADPFSKASGITVVQDGPTDYGKLK...
