This script is for processing the fasta files from PandaSeq and generating peptide lists. 
Filters for peptides which are in the ordered library or matched the pattern of the doped sequences.

In [1]:
import numpy as np
import pandas as pd
import csv
import string
import matplotlib.pyplot as plt
import seaborn as sns
import filecmp
from scipy import stats
import re
import fnmatch


In [2]:
#import the list of DNA sequences that we ordered from Twist
merged = pd.read_csv('../nCov_proteome_15mers+SARS-CoV-1 N+S_encoded FINAL remove flanking DNA.csv') 
merged = merged.rename(columns={'aa':'name'})


In [3]:
def translate(pep_dna):
#Translate peptide to amino acids
    codons_aa = []
    codons_dna = []
    for j in range(0,len(pep_dna),3):   
        codons_dna.append(pep_dna[j:j+3])
    codons_aa = [codon_2_aa[x] for x in codons_dna] # translate
    pep_aa=''.join(codons_aa) #puts letters into string
    return pep_aa

codon_2_aa = {"TTT":"F", "TTC":"F", "TTA":"L", "TTG":"L",
    "TCT":"S", "TCC":"S", "TCA":"S", "TCG":"S",
    "TAT":"Y", "TAC":"Y", "TAA":"*", "TAG":"*",
    "TGT":"C", "TGC":"C", "TGG":"W", "TGA":"*",
    "CTT":"L", "CTC":"L", "CTA":"L", "CTG":"L",
    "CCT":"P", "CCC":"P", "CCA":"P", "CCG":"P",
    "CAT":"H", "CAC":"H", "CAA":"Q", "CAG":"Q",
    "CGT":"R", "CGC":"R", "CGA":"R", "CGG":"R",
    "ATT":"I", "ATC":"I", "ATA":"I", "ATG":"M",
    "ACT":"T", "ACC":"T", "ACA":"T", "ACG":"T",
    "AAT":"N", "AAC":"N", "AAA":"K", "AAG":"K",
    "AGT":"S", "AGC":"S", "AGA":"R", "AGG":"R",
    "GTT":"V", "GTC":"V", "GTA":"V", "GTG":"V",
    "GCT":"A", "GCC":"A", "GCA":"A", "GCG":"A",
    "GAT":"D", "GAC":"D", "GAA":"E", "GAG":"E",
    "GGT":"G", "GGC":"G", "GGA":"G", "GGG":"G"}

#round/allele correspondence of files
plex_dict = {
    '7096':' doped-401 R0 (doped)',
    '7097':' doped-402 R0 (doped)',
    '7098':' doped-404 R0 (doped)',
    '7099':' doped-401 post-R1 ',
    '7100':' doped-402 post-R1 ',
    '7101':' doped-404 post-R1 ',
    '7102':' doped-401 post-R2 ',
    '7103':' doped-402 post-R2 ',
    '7104':' doped-404 post-R2 ',
    '7105':' doped-401 post-R3 ',
    '7106':' doped-402 post-R3 ',
    '7107':' doped-404 post-R3 ',
    '7108':' doped-401 post-R4 ',
    '7109':' doped-402 post-R4 ',
    '7110':' doped-404 post-R4 ',
    '7111':' undoped-401 R0 (undoped)',
    '7112':' undoped-402 R0 (undoped)',
    '7113':' undoped-404 R0 (undoped)',
    '7114':' undoped-401 post-R1 - positive ',
    '7115':' undoped-402 post-R1 - positive ',
    '7116':' undoped-404 post-R1 - positive ',
    '7117':' undoped-401 post-R1 - negative ',
    '7118':' undoped-402 post-R1 - negative ',
    '7119':' undoped-404 post-R1 - negative ',
    '7120':' undoped-401 post-R2 - positive ',
    '7121':' undoped-402 post-R2 - positive ',
    '7122':' undoped-404 post-R2 - positive ',
    '7123':' undoped-401 post-R2 - negative ',
    '7124':' undoped-402 post-R2 - negative ',
    '7125':' undoped-404 post-R2 - negative ',
    '7126':' undoped-401 post-R3 - positive ',
    '7127':' undoped-402 post-R3 - positive ',
    '7128':' undoped-404 post-R3 - positive ',
    '7129':' undoped-401 post-R3 - negative ',
    '7130':' undoped-402 post-R3 - negative ',
    '7131':' undoped-404 post-R3 - negative '
    }


In [4]:
flank5_seq = 'TATTGCTAGCGTTTTGGCAGCT'
flank3_seq_401 = 'GGTGGATCCGGTGGCGGAGAACAAAAATTAATTAGTGAAGAAGATTTAGGCGGTCTAGAAGTTCTGTTCCAGGGGCCCGGTGGCGGGTCCGGCGGT'
flank3_seq_402 = 'GGTGGATCCGGTGGCGGAGAACAAAAATTAATTAGTGAGGAGGACCTTGGCGGTCTAGAAGTTCTGTTCCAGGGGCCCGGTGGCGGGTCCGGCGGT'
flank3_seq_404 = 'GGTGGATCCGGTGGCGGAGAACAAAAATTAATCTCAGAGGAAGATCTTGGCGGTCTAGAAGTTCTGTTCCAGGGGCCCGGTGGCGGGTCCGGCGGT'
flank_stop_splicevar = 'TATTGCTAGCGTTTTGGCAGCTGGATAAGCTGGTGTTTAGCGCTGGTTGCTGTGAGTGCCCGGTGGATCCGGTGGCGGGTCCGGCGGT'#variant seen in Sanger sequence of library for doping into;orig peptide+linker spliced out

def process(plex):
    file_name = 'twist_covid_'+plex+'_ps.fasta'
    dna = []
    aa = []

    number_contigs = 0

    with open(file_name, 'r') as read_file:  
        for line in read_file:
            if line[0] != '>':
                number_contigs +=1
                line = line.strip()
                
                #check if sequence contains the exact flanking region that should and remove:
                flank5 = line.find(flank5_seq)
                flank3_401 = line.find(flank3_seq_401) 
                flank3_402 = line.find(flank3_seq_402)
                flank3_404 = line.find(flank3_seq_404)
                if flank5 != -1 and flank3_401 !=-1:
                    trimmed = line[flank5+len(flank5_seq):flank3_401]
                    dna.append(trimmed)
                elif flank5 != -1 and flank3_402 !=-1:
                    trimmed = line[flank5+len(flank5_seq):flank3_402]
                    dna.append(trimmed)
                elif flank5 != -1 and flank3_404 !=-1:
                    trimmed = line[flank5+len(flank5_seq):flank3_404]
                    dna.append(trimmed)
                elif line.find(flank_stop_splicevar) !=-1:
                    dna.append('*splicevar')

    data = pd.DataFrame({'dna':dna})

    #collapse to unique sequences with counts
    data_uniq = data['dna'].value_counts()
    data_uniq = pd.DataFrame(data_uniq).reset_index()
    data_uniq.columns = ['dna','count']
    
    print('---')
    print('plex:',plex)
    print("number of contigs:",number_contigs)
    print("number of contigs with correct flanking sequence:", len(data))
    print("number of unique contigs:",len(data_uniq))

    return data_uniq

In [5]:
for plex in ["7096", "7097", "7098", "7099", "7100", "7101", "7102", "7103", "7104", "7105", "7106",
             "7107", "7108", "7109", "7110", "7111", "7112", "7113", "7114", "7115", "7116", "7117",
             "7118", "7119", "7120", "7121", "7122", "7123", "7124", "7125", "7126", "7127", "7128",
             "7129", "7130", "7131"]:
    data_uniq = process(plex)
    data_uniq = data_uniq.rename(columns={"count":'count_'+plex_dict[plex]})
    merged = merged.merge(data_uniq, left_on='dna', right_on='dna', how='outer')
    

---
plex: 7096
number of contigs: 349911
number of contigs with correct flanking sequence: 147523
number of unique contigs: 63406
---
plex: 7097
number of contigs: 373245
number of contigs with correct flanking sequence: 165281
number of unique contigs: 68991
---
plex: 7098
number of contigs: 315287
number of contigs with correct flanking sequence: 144975
number of unique contigs: 66583
---
plex: 7099
number of contigs: 322062
number of contigs with correct flanking sequence: 175865
number of unique contigs: 32078
---
plex: 7100
number of contigs: 354255
number of contigs with correct flanking sequence: 177871
number of unique contigs: 25635
---
plex: 7101
number of contigs: 342865
number of contigs with correct flanking sequence: 186120
number of unique contigs: 32909
---
plex: 7102
number of contigs: 338488
number of contigs with correct flanking sequence: 209115
number of unique contigs: 20545
---
plex: 7103
number of contigs: 343478
number of contigs with correct flanking sequence:

In [6]:
#identify peptides that match stop codon-containing NNN peptides from the library for doping into
## pattern: "NNNTAANNNNNNNNNTAGNNNNNNNNNNNNTGANNNNNN"

def check_doped(trimmed):
    if len(trimmed) == 39:
        if trimmed.find('TAA') == 3:
            if trimmed.find('TAG') == 15:
                if trimmed.find('TGA') == 30:
                    return 'match'
                else:
                    return float('NaN')
            else:
                return float('NaN')
        else:
            return float('NaN')
    else:
        return float('NaN')  
    
dna_list = merged['dna']
n = len(dna_list)
doped = ['']*n

for i in range(0,n):
    doped[i] = check_doped(dna_list[i])

merged['doped_match'] = doped


In [7]:
#translate DNA
aa = []
for dna in merged['dna']:
    if len(dna)%3 == 0:
        aa.append(translate(dna))
    else:
        aa.append('***DNA Not Mult of 3')
merged['aa'] = aa
    
merged.to_csv('twist_covid.csv')


In [8]:
#filter for peptides that don't match pattern of doped library or Twist order
print(len(merged))
merged_filtered = merged.dropna(subset=['name', 'doped_match'], how='all')
print(len(merged_filtered))
merged_filtered.to_csv('twist_covid_filtered.csv')


624638
167865
