### Primer search

Loop through all contigs/reads in an assembly FASTA file (MEGAHIT or coronaSPAdes contigs), find known primer sequences, and write the matching contigs/reads to separate files.

Common primers:

- T7 primer
- M13 primers
- SV40 ori
- WPRE
- bGH
- SP6 primer

In [1]:
import ast
import os
import os.path
import pathlib
import re

import pandas as pd

In [2]:
# Disable truncation so displayed DataFrames show every column and every row.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [3]:
# Root directory of the primer / enzyme reference tables (absolute, machine-specific path).
PRIMERS_BASE='/mnt/8TB_0/Data/databases/primers/'
# One CSV per primer collection; each is loaded into its own DataFrame below.
ADDGENE_PRIMERS=PRIMERS_BASE+'Primers_Addgene.csv'
MRC_PRIMERS=PRIMERS_BASE+'Common_Primers_MRC.csv'
OPENWETWARE_PRIMERS=PRIMERS_BASE+'Common_Primers_openwetware.csv'
SIGNAGEN_PRIMERS=PRIMERS_BASE+'Common_Primers_signagen.csv'
UNIVERSAL_PRIMERS=PRIMERS_BASE+'Common_Primers_universal.csv'
ADDITIONAL_PRIMERS=PRIMERS_BASE+'Common_Primers_additional.csv'
SARS_PRIMERS_WIV=PRIMERS_BASE+'SARS_Primers_WIV.csv'
# NOTE(review): the four enzyme paths below are reassigned in the "Enzymes"
# section with an extra 'primers/' path component, so these values are
# overwritten before they are ever read -- confirm which location is correct.
enzymes_comm_iso=PRIMERS_BASE+'enzymes_comm_iso.txt'
enzymes_type2a=PRIMERS_BASE+'enzymes_type2a.txt'
enzymes_type2p=PRIMERS_BASE+'enzymes_type2p.txt'
resitefinder_enzymes=PRIMERS_BASE+'resitefinder_enzymes.txt'

In [4]:
# Load the Addgene primer table (columns shown below: Primer, Sequence, Description, Comment, Source).
df_adg_primers = pd.read_csv(ADDGENE_PRIMERS)

In [5]:
# Preview up to the first 200 Addgene primers.
df_adg_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,3'AOX1,GCAAATGGCATTCTGACATCC,"For Pichia vectors with AOX1 terminator, rever...",,addgene
1,5'AOX1,GACTGGTTCCAATTGACAAGC,"For Pichia vectors with AOX1 promoter, forward...",,addgene
2,35S promoter,CTATCCTTCGCAAGACCCTTC,"CaMV 35S promoter, forward primer",,addgene
3,AC5,ACACAAAGCCGCTCCATCAG,"Drosophila Actin 5C promoter, forward primer",,addgene
4,Alpha-factor,TACTATTGCCAGCATTGCTGC,"Alpha factor signal sequence, forward primer",,addgene
5,Amp-R,ATAATACCGCGCCACATAGC,"5' end of ampicillin resistance gene, reverse ...",,addgene
6,AUG1 Forward,CAATTTACATCTTTATTTATTAACG,"For Pichia vectors with AUG1 promoter, forward...",,addgene
7,AUG1 Reverse,GAAGAGAAAAACATTAGTTGGC,"For Pichia vectors with AUG1 promoter, reverse...",,addgene
8,BGH Reverse,TAGAAGGCACAGTCGAGG,"Bovine growth hormone terminator, reverse primer",,addgene
9,Bglob-intron-F,CTGGTCATCATCCTGCCTTT,"Rabbit beta-globin intron, forward primer",,addgene


In [6]:
#key_primers=['T7','M13 (-21) Forward','M13 (-40)','M13 Reverse','M13/pUC Forward','M13/pUC Reverse',\
#            'SV40pA-R','SV40pro-F','SV40-spliceR','WPRE-R','BGH Reverse','SP6','MMLV-F']

In [7]:
# Load the MRC common-primer table.
df_mrc_primers = pd.read_csv(MRC_PRIMERS)

In [8]:
# Preview up to the first 200 MRC primers.
df_mrc_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,CMV Fwd,CGCAAATGGGCGGTAGGCGTG,At 5' end of MCS in pCMV5 (4),"This is a ""Universal"" primer and should work i...",https://www.dnaseq.co.uk/resources/primers/sta...
1,CMV Rev,CCTCCACCCCATAATATTATAGAAGGACAC,At 3' end of MCS in pCMV5,,https://www.dnaseq.co.uk/resources/primers/sta...
2,M13 Fwd,GTAAAACGACGGCCAGTG,Common to many plasmids (-20 version),This primer does NOT work with Invitrogen Gate...,https://www.dnaseq.co.uk/resources/primers/sta...
3,M13 Fwd(GW),TGTAAAACGACGGCCAGT,Specific for Gateway vectors,,https://www.dnaseq.co.uk/resources/primers/sta...
4,M13 Rev,GGAAACAGCTATGACCATG,Common to many plasmids,There is a base deletion in some pUC18 vectors...,https://www.dnaseq.co.uk/resources/primers/sta...
5,M13 Rev(GW),CAGGAAACAGCTATGACC,Specific for Gateway vectors,,https://www.dnaseq.co.uk/resources/primers/sta...
6,SP6,AGCTATTTAGGTGACACTATAG,Common to many plasmids,Many plasmids diverge outside the core SP6 pro...,https://www.dnaseq.co.uk/resources/primers/sta...
7,T3,AATTAACCCTCACTAAAGGG,Common to many plasmids,,https://www.dnaseq.co.uk/resources/primers/sta...
8,T7,TAATACGACTCACTATAGGG,Common to many plasmids,,https://www.dnaseq.co.uk/resources/primers/sta...
9,T7 Term,TATGCTAGTTATTGCTCAG,Common to many plasmids,,https://www.dnaseq.co.uk/resources/primers/sta...


In [9]:
# Load and preview the OpenWetWare common-primer table.
# Some sequences in this table are lower-case (e.g. VF2, VR); the search helpers upper-case before comparing.
df_opw_primers = pd.read_csv(OPENWETWARE_PRIMERS)
df_opw_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,M13 forward (-20),GTAAAACGACGGCCAGT,M13 forward sequencing primer (-20),,https://openwetware.org/wiki/Common_primer_seq...
1,M13 forward (-47),CGCCAGGGTTTTCCCAGTCACGAC,M13 forward sequencing primer (-47),,https://openwetware.org/wiki/Common_primer_seq...
2,M13 reverse (-24),AACAGCTATGACCATG,M13 reverse sequencing primer: (-24),,https://openwetware.org/wiki/Common_primer_seq...
3,M13 reverse (-48),AGCGGATAACAATTTCACACAGGA,M13 reverse sequencing primer: (-48),,https://openwetware.org/wiki/Common_primer_seq...
4,SP6 primer,ATTTAGGTGACACTATAG,SP6 universal primer,,https://openwetware.org/wiki/Common_primer_seq...
5,VF2,tgccacctgacgtctaagaa,VF2,,https://openwetware.org/wiki/Common_primer_seq...
6,VR,attaccgcctttgagtgagc,VR,,https://openwetware.org/wiki/Common_primer_seq...


In [10]:
# Load the SignaGen primer table, remove leading/trailing newline characters
# from each sequence, and preview it.
# Only '\n' is stripped here; other surrounding whitespace is left in place.
df_signagen_primers = pd.read_csv(SIGNAGEN_PRIMERS)
df_signagen_primers.Sequence = df_signagen_primers.Sequence.str.strip('\n')
df_signagen_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,3’AOX1,GCAAATGGCATTCTGACATCC,"For Pichia vectors with AOX1 terminator, rever...",,signagen
1,5’AOX1,GACTGGTTCCAATTGACAAGC,"For Pichia vectors with AOX1 promoter, forward...",,signagen
2,35S promoter,CTATCCTTCGCAAGACCCTTC,"CaMV 35S promoter, forward primer",,signagen
3,AC5,ACACAAAGCCGCTCCATCAG,"Drosophila Actin 5C promoter, forward primer",,signagen
4,Alpha-factor,TACTATTGCCAGCATTGCTGC,"Alpha factor signal sequence, forward primer",,signagen
5,Amp-R,ATAATACCGCGCCACATAGC,"5′ end of ampicillin resistance gene, reverse ...",,signagen
6,AUG1 Forward,CAATTTACATCTTTATTTATTAACG,"For Pichia vectors with AUG1 promoter, forward...",,signagen
7,AUG1 Reverse,GAAGAGAAAAACATTAGTTGGC,"For Pichia vectors with AUG1 promoter, reverse...",,signagen
8,BGH Reverse,TAGAAGGCACAGTCGAGG,"Bovine growth hormone terminator, reverse primer",,signagen
9,Bglob-intron-F,CTGGTCATCATCCTGCCTTT,"Rabbit beta-globin intron, forward primer",,signagen


In [11]:
# Load the "universal" (16S rRNA) primer table, strip newline characters from
# the sequences, and preview it.
# NOTE(review): the preview shows an embedded space in the 27F sequence
# ('AGAGTTTGATCM TGGCTCAG') and IUPAC ambiguity codes (M, Y, K, S, W, R);
# the plain substring search used later will not expand or ignore those.
df_universal_primers = pd.read_csv(UNIVERSAL_PRIMERS)
df_universal_primers.Sequence = df_universal_primers.Sequence.str.strip('\n')
df_universal_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,8F,AGAGTTTGATCCTGGCTCAG,8F,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
1,27F,AGAGTTTGATCM TGGCTCAG,27F,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
2,U1492R,GGTTACCTTGTTACGACTT,U1492R,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
3,928F,TAAAACTYAAAKGAATTGACGGG,928F,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
4,336R,ACTGCTGCSYCCCGTAGGAGTCT,336R,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
5,1100F,YAACGAGCGCAACCC,1100F,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
6,1100R,GGGTTGCGCTCGTTG,1100R,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
7,337F,GACTCCTACGGGAGGCWGCAG,337F,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
8,907R,CCGTCAATTCCTTTRAGTTT,907R,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA
9,785F,GGATTAGATACCCTGGTA,785F,,https://en.wikipedia.org/wiki/16S_ribosomal_RNA


In [12]:
# Load the additional (miscellaneous) primer table, strip newline characters
# from the sequences, and preview the first rows.
df_additional_primers = pd.read_csv(ADDITIONAL_PRIMERS)
df_additional_primers.Sequence = df_additional_primers.Sequence.str.strip('\n')
df_additional_primers.head()

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,pGEX_3_primer,cctctgacacatgcagctcccgg,pGEX_3_primer,,misc
1,lacZ_a,tcaggctgcgcaactgttgggaagggcgatcggtgcgggcctcttc...,lacZ_a,,misc
2,M13_pUC_fwd_primer,cccagtcacgacgttgtaaaacg,M13_pUC_fwd_primer,,misc
3,M13_forward20_primer,gtaaaacgacggccagt,M13_forward20_primer,,misc
4,M13_reverse_primer,catggtcatagctgtttcc,M13_reverse_primer,,misc


In [13]:
# Load the SARS/WIV primer table, strip newline characters from the sequences,
# and preview up to 80 rows.
df_wiv_primers = pd.read_csv(SARS_PRIMERS_WIV)
df_wiv_primers.Sequence = df_wiv_primers.Sequence.str.strip('\n')
df_wiv_primers.head(n=80)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,URB-28630RS,TGCTTCCCTCTGCGTAGAAGCC,Reverse primer,,Almazan et al. 2006
1,URB-29VS,GCCAACCAACCTCGATCTCTTG,Forward primer,,Almazan et al. 2006
2,URB-1995RS,ATGGCGTCGACAAGACGTAAT,Reverse primer,,Almazan et al. 2006
3,URB-1931VS,ACCACTCAATTCCTGATTTGCA,Forward primer,,Almazan et al. 2006
4,FA,TGACCGCGGCTAATACGATATTAGGTTTTTACCTACCCAGGAAAAG,Primer used for T-cloning of the cDNA fragment...,,Zeng Thesis (2018)
5,F-c1575a,CAGGCCATACTGGAATTACTGGTGACAACGTGGAG,Primer used for T-cloning of the cDNA fragment...,,Zeng Thesis (2018)
6,R-c1575a,GTCACCAGTAATTCCAGTATGGCCTGAACCAATATCAGC,Primer used for T-cloning of the cDNA fragment...,,Zeng Thesis (2018)
7,RA,ATGGTGGCCATTATGGCTCTAAC,Primer used for T-cloning of the cDNA fragment...,,Zeng Thesis (2018)
8,FB,TGTTAGAGCCATAATGGCCACCATC,Primer used for T-cloning of the cDNA fragment...,,Zeng Thesis (2018)
9,RB,CTGTAGCCACGAGGGCCCTAAGTTTTTCCATAGGGAC,Primer used for T-cloning of the cDNA fragment...,,Zeng Thesis (2018)


### Enzymes

In [14]:
# Locations of the restriction-enzyme text tables.
# NOTE(review): PRIMERS_BASE already ends in 'primers/', so these resolve to
# '<PRIMERS_BASE>primers/<file>' and replace the same-named variables assigned
# next to the primer CSV paths (which omit the extra 'primers/' component).
# Confirm which directory is intended and keep a single definition.
enzymes_comm_iso=PRIMERS_BASE+'primers/enzymes_comm_iso.txt'
enzymes_type2a=PRIMERS_BASE+'primers/enzymes_type2a.txt'
enzymes_type2p=PRIMERS_BASE+'primers/enzymes_type2p.txt'
resitefinder_enzymes=PRIMERS_BASE+'primers/resitefinder_enzymes.txt'

In [15]:
def get_enz(fpath, min_len=8):
    """Parse a whitespace-delimited enzyme table into three parallel lists.

    Each usable line has the form ``NAME SITE [description words ...]``.
    ``^`` characters are removed from SITE before its length is tested and
    before it is returned. Lines with fewer than two tokens (blank lines,
    stray words) are skipped instead of raising ``IndexError``.

    Args:
        fpath: Path of the text file to read.
        min_len: Minimum length of SITE (after removing ``^``) for the line
            to be kept.

    Returns:
        ``(enz, eseq, edescr)``: names, sites and descriptions of the kept
        lines. ``edescr`` is ``[]`` when no kept line carries a description
        (the historical behaviour); otherwise it has one entry per kept line,
        with ``'NA'`` standing in for lines without a description so that the
        three lists always stay aligned.
    """
    enz = []
    eseq = []
    edescr = []
    with open(fpath) as fp:
        for line in fp:
            words = line.split()
            if len(words) < 2:
                # Nothing to parse: blank line or a name without a site.
                continue
            site = words[1].replace('^', '')
            if len(site) < min_len:
                continue
            enz.append(words[0])
            eseq.append(site)
            # None marks "no description" so alignment with enz/eseq is kept.
            edescr.append(' '.join(words[2:]) if len(words) > 2 else None)
    if all(descr is None for descr in edescr):
        # No descriptions at all: keep returning an empty list, which callers
        # use as the signal to fill in their own placeholder.
        return enz, eseq, []
    return enz, eseq, ['NA' if descr is None else descr for descr in edescr]

In [16]:
def build_enz_df_from_txt(fname, source='enzymes'):
    """Build a primer-style DataFrame from an enzyme text file.

    Args:
        fname: Path of the enzyme table; must be an existing file.
        source: Value written to the ``Source`` column of every row.

    Returns:
        DataFrame with columns Primer, Sequence, Description, Comment and
        Source, one row per enzyme returned by ``get_enz``. ``Comment`` is
        always ``'NA'``; ``Description`` is ``'NA'`` when ``get_enz`` returns
        no descriptions.
    """
    assert os.path.isfile(fname)
    names, sites, descriptions = get_enz(fname)
    n_rows = len(names)
    if not descriptions:
        descriptions = ['NA'] * n_rows
    column_names = ['Primer', 'Sequence', 'Description', 'Comment', 'Source']
    records = list(zip(names, sites, descriptions, ['NA'] * n_rows, [source] * n_rows))
    return pd.DataFrame(records, columns=column_names)

In [17]:
# Build one DataFrame per enzyme table; Source defaults to 'enzymes'.
df_enzymes_comm_iso = build_enz_df_from_txt(enzymes_comm_iso)
df_enzymes_type2a = build_enz_df_from_txt(enzymes_type2a)
df_enzymes_type2p = build_enz_df_from_txt(enzymes_type2p)

In [18]:
# Same loader for the REsitefinder table, tagged with its own Source value.
df_resitefinder = build_enz_df_from_txt(resitefinder_enzymes,'resitefinder')

In [19]:
# Preview the first rows of the REsitefinder table.
df_resitefinder.head()

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,BglI,GCCNNNNNGGC,,,resitefinder
1,BstXI,CCANNNNNNTGG,,,resitefinder
2,FseI,GGCCGGCC,,,resitefinder
3,NotI,GCGGCCGC,,,resitefinder
4,PshAI,GACNNNNGTC,,,resitefinder


In [20]:
# Stack the three enzyme tables into one frame.
# The original row indices are kept, so index labels repeat across the parts.
# `frames` is reused later for the primer tables.
frames = [df_enzymes_comm_iso, df_enzymes_type2a, df_enzymes_type2p]
df_enzymes = pd.concat(frames)

### search for primers

In [21]:
def search_megahit_file(primer_name, file_name, string_to_search):
    """Find lines of a sequence file that contain a primer (case-insensitive).

    The file is scanned line by line, so a match must lie entirely within one
    line (suitable for files that store each sequence on a single line).

    Args:
        primer_name: Label used only in the progress message.
        file_name: Path of the text file to scan.
        string_to_search: Primer sequence; surrounding whitespace is ignored.

    Returns:
        List of ``"<previous line>\\n<matching line>\\n"`` strings, one per
        matching line. Empty when nothing matches or the query is blank.
    """
    list_of_results = []
    # Normalise the query once instead of once per line.
    needle = string_to_search.strip().upper()
    if not needle:
        # A blank query is a substring of every line; treat it as "no match".
        return list_of_results
    prev_line = ""
    with open(file_name, 'r') as read_obj:
        for line in read_obj:
            # '>' lines are FASTA headers, not sequence: never match on them.
            if not line.startswith('>') and needle in line.upper():
                list_of_results.append(prev_line.strip()+'\n'+line.strip()+'\n')
            prev_line = line
    if list_of_results:
        print(f'primer: {primer_name}, found: {len(list_of_results)}')
    return list_of_results

In [22]:
def parse_line_contig(contig_data, string_to_search, list_of_results, contig_header):
    """Append a contig to the results if it contains the primer.

    The sequence lines are stripped before being joined, so a primer that
    spans a line wrap in the FASTA file is still found (joining the raw lines
    left newline characters inside the sequence and hid such matches).

    Args:
        contig_data: Sequence lines of one contig, in file order.
        string_to_search: Primer sequence; matched case-insensitively after
            stripping surrounding whitespace.
        list_of_results: List to extend; mutated in place.
        contig_header: Header line belonging to the contig.

    Returns:
        ``list_of_results`` (the same object), with
        ``"<header>\\n<sequence on one line>\\n"`` appended on a match.
    """
    contig = ''.join(piece.strip() for piece in contig_data)
    needle = string_to_search.strip().upper()
    # A blank query would match every contig; treat it as "no match".
    if needle and needle in contig.upper():
        list_of_results.append(contig_header.strip()+'\n'+contig+'\n')
    return list_of_results
    

In [23]:
def search_spades_file(primer_name, file_name, string_to_search):
    """Search a FASTA file contig by contig for a primer.

    Lines are grouped under the most recent ``>`` header and each group is
    handed to ``parse_line_contig``, which decides whether it matches.

    Args:
        primer_name: Label used only in the progress message.
        file_name: Path of the FASTA file.
        string_to_search: Primer sequence to look for.

    Returns:
        The result list built up by ``parse_line_contig``.
    """
    hits = []
    header = ''
    sequence_lines = []
    with open(file_name, 'r') as fasta:
        for line_no, line in enumerate(fasta):
            if not line.startswith('>'):
                sequence_lines.append(line)
                continue
            # A header closes the previous record, except on the very first line.
            if line_no > 0:
                hits = parse_line_contig(sequence_lines, string_to_search, hits, header)
                sequence_lines = []
            header = line
        # Flush the record that was still open at end of file.
        hits = parse_line_contig(sequence_lines, string_to_search, hits, header)
        if len(hits) > 0:
            print(f'primer: {primer_name}, found: {len(hits)}')
    return hits

In [24]:
def get_primer_aphanumeric(primers):
    """Return the primer names with every run of non-alphanumeric characters replaced by ``_``."""
    return [re.sub('[^0-9a-zA-Z]+', '_', name) for name in primers]

In [25]:
# Plain-Python lists of sequences and names for every table loaded above.
# Each *_primer_seqs list is positionally aligned with its *_primers list.
adg_primer_seqs=df_adg_primers['Sequence'].tolist()
adg_primers=df_adg_primers['Primer'].tolist()

mrc_primer_seqs=df_mrc_primers['Sequence'].tolist()
mrc_primers=df_mrc_primers['Primer'].tolist()

opw_primer_seqs=df_opw_primers['Sequence'].tolist()
opw_primers=df_opw_primers['Primer'].tolist()

signagen_primer_seqs=df_signagen_primers['Sequence'].tolist()
signagen_primers=df_signagen_primers['Primer'].tolist()

universal_primer_seqs=df_universal_primers['Sequence'].tolist()
universal_primers=df_universal_primers['Primer'].tolist()

additional_primer_seqs=df_additional_primers['Sequence'].tolist()
additional_primers=df_additional_primers['Primer'].tolist()

wiv_primer_seqs=df_wiv_primers['Sequence'].tolist()
wiv_primers=df_wiv_primers['Primer'].tolist()

enzymes_primer_seqs=df_enzymes['Sequence'].tolist()
enzymes_primers=df_enzymes['Primer'].tolist()

resitefinder_primer_seqs=df_resitefinder['Sequence'].tolist()
resitefinder_primers=df_resitefinder['Primer'].tolist()

In [26]:
#nadVcacc_primer="CACCATGCTAAATCTTAATCAAAA"
#nadVHisx6_rev_primer="TCAGTGATGGTGATGGTGGTGCGCAGTTTGAATCTTTTTCG"

In [27]:
def check_dupes(primer_seqs, primers, primers_to_check):
    """Return the sequences that are not already present in other collections.

    The previous implementation ignored ``primers_to_check`` and tested the
    primer *name* against the individual characters of all names, so nothing
    was ever de-duplicated. Sequences are now compared against every sequence
    in ``primers_to_check``, ignoring case and surrounding whitespace.

    Args:
        primer_seqs: Candidate sequences.
        primers: Names aligned with ``primer_seqs``. Kept for interface
            compatibility; only its length matters (pairs are zipped).
        primers_to_check: Iterable of sequence lists already being searched.

    Returns:
        Stripped candidate sequences, in input order, that occur in none of
        the ``primers_to_check`` lists. Non-string entries (e.g. NaN from an
        empty CSV cell) are skipped.
    """
    known = {
        seq.strip().upper()
        for seq_list in primers_to_check
        for seq in seq_list
        if isinstance(seq, str)
    }
    r_seq_parsed = []
    for sp, _name in zip(primer_seqs, primers):
        if not isinstance(sp, str):
            continue
        sp = sp.strip()
        if sp.upper() not in known:
            r_seq_parsed.append(sp)
    return r_seq_parsed

In [28]:
# SignaGen sequences to consider beyond the Addgene / MRC / OpenWetWare sets.
# match_via_primers only checks whether this list is non-empty.
signagen_seq_parsed=check_dupes(signagen_primer_seqs, signagen_primers, [adg_primer_seqs,mrc_primer_seqs,opw_primer_seqs])

In [29]:
# Same check for the universal primer set (its search call is currently commented out in match_via_primers).
universal_seq_parsed=check_dupes(universal_primer_seqs, universal_primers, [adg_primer_seqs,mrc_primer_seqs,opw_primer_seqs])

In [30]:
# Same check for the enzyme recognition sites (its search call is currently commented out in match_via_primers).
enzymes_seq_parsed=check_dupes(enzymes_primer_seqs, enzymes_primers, [adg_primer_seqs,mrc_primer_seqs,opw_primer_seqs])

In [31]:
# Combined table used for the per-contig search (PRIMER_LIST / SEQ_LIST).
# Optional extras that can be added to `frames`: df_universal_primers, df_wiv_primers.
# Other available tables: df_enzymes, df_resitefinder.
# `frames` is re-bound here (it previously held the enzyme tables).
frames = [df_adg_primers, df_mrc_primers, df_opw_primers, df_additional_primers]
df = pd.concat(frames)
len(df)

211

In [32]:
# Keep the first row for each distinct Sequence value.
# NOTE(review): the comparison is exact, so the same primer stored in
# different case (e.g. 'GTAAAACGACGGCCAGT' from OpenWetWare and
# 'gtaaaacgacggccagt' from the additional table) survives as two rows and is
# then counted twice per contig. Consider de-duplicating on an upper-cased,
# stripped key.
df=df.drop_duplicates(['Sequence'])
len(df)

196

In [33]:
# Preview up to the first 200 rows of the combined, de-duplicated table.
df.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,3'AOX1,GCAAATGGCATTCTGACATCC,"For Pichia vectors with AOX1 terminator, rever...",,addgene
1,5'AOX1,GACTGGTTCCAATTGACAAGC,"For Pichia vectors with AOX1 promoter, forward...",,addgene
2,35S promoter,CTATCCTTCGCAAGACCCTTC,"CaMV 35S promoter, forward primer",,addgene
3,AC5,ACACAAAGCCGCTCCATCAG,"Drosophila Actin 5C promoter, forward primer",,addgene
4,Alpha-factor,TACTATTGCCAGCATTGCTGC,"Alpha factor signal sequence, forward primer",,addgene
5,Amp-R,ATAATACCGCGCCACATAGC,"5' end of ampicillin resistance gene, reverse ...",,addgene
6,AUG1 Forward,CAATTTACATCTTTATTTATTAACG,"For Pichia vectors with AUG1 promoter, forward...",,addgene
7,AUG1 Reverse,GAAGAGAAAAACATTAGTTGGC,"For Pichia vectors with AUG1 promoter, reverse...",,addgene
8,BGH Reverse,TAGAAGGCACAGTCGAGG,"Bovine growth hormone terminator, reverse primer",,addgene
9,Bglob-intron-F,CTGGTCATCATCCTGCCTTT,"Rabbit beta-globin intron, forward primer",,addgene


In [34]:
def get_matches(df, primer_list, primer_list_source, read_file, sra, outpath):
    """Search ``read_file`` for each primer and write the hits to one file per primer.

    Cell values are read directly from the frame. The previous
    ``Series.to_string`` approach produced version-dependent padding (the
    leading blank seen in names and file names), a multi-line string when a
    name occurs on several rows, and a placeholder string when a name is
    missing, none of which can match a sequence.

    Reads the module-level ``READ_TYPE`` (selects the search routine and is
    part of the file name) and ``K_LEVEL`` (part of the file name).

    Args:
        df: Primer table with ``Primer`` and ``Sequence`` columns.
        primer_list: Primer names to look up in ``df``; repeated names are
            processed once.
        primer_list_source: Label of the collection, used in file names.
        read_file: Contig/read file to search.
        sra: Accession (or other prefix) used in file names.
        outpath: Output directory prefix; concatenated as-is, so it must end
            with a path separator.
    """
    search = search_spades_file if 'coronaspades' in READ_TYPE else search_megahit_file
    # dict.fromkeys de-duplicates while preserving order.
    for primer in dict.fromkeys(primer_list):
        pname = str(primer).strip()
        matching_reads = []
        # A name may map to zero, one or several rows; search every sequence.
        for seq in df.loc[df['Primer'] == primer, 'Sequence']:
            if isinstance(seq, str) and seq.strip():
                matching_reads.extend(search(pname, read_file, seq))
        if not matching_reads:
            continue
        p = re.sub('[^0-9a-zA-Z]+', '_', pname)
        print(f'Writing {len(matching_reads)} to ..{p}_{primer_list_source}.txt')
        with open(outpath+f'{sra}_{K_LEVEL}_{READ_TYPE}_{p}_{primer_list_source}.txt', "w") as f:
            f.writelines(matching_reads)

In [35]:
def get_megahit_contig_matches(contigs_file_name, primer_list, seq_list, outpath, sra):
    """Record, per contig, which primers occur in it and how many times.

    Writes ``<outpath><sra>_<K_LEVEL>_<READ_TYPE>_primer_names.txt`` with one
    line per contig that has at least one hit, formatted as
    ``<contig id>:<list of 'name: count' strings>``. ``K_LEVEL`` and
    ``READ_TYPE`` are module-level settings.

    Matching is a case-insensitive substring search performed line by line,
    so a hit must lie within a single sequence line.

    Args:
        contigs_file_name: FASTA file to scan.
        primer_list: Primer names, aligned with ``seq_list``.
        seq_list: Primer sequences; blank or non-string entries are ignored.
        outpath: Output directory prefix; must end with a path separator.
        sra: Accession (or other prefix) used in the file name.
    """
    # Normalise every query once rather than for every line of the file.
    queries = []
    for pname, seq in zip(primer_list, seq_list):
        if not isinstance(seq, str):
            continue
        needle = seq.strip().upper()
        if needle:
            queries.append((pname, needle))

    contig_primer_dict = {}
    contig_primers = []
    prev_line = ''
    with open(contigs_file_name, 'r') as read_obj:
        for line in read_obj:
            if line.startswith('>'):
                # New header: store what was collected for the previous contig.
                if contig_primers:
                    contig_primer_dict[prev_line] = contig_primers
                    contig_primers = []
                prev_line = line.split()[0]
            else:
                haystack = line.upper()
                for pname, needle in queries:
                    count = haystack.count(needle)
                    if count:
                        contig_primers.append(f'{pname}: {count}')
    # Store the final contig too; no further header follows it to trigger the
    # store above, so its hits were previously lost.
    if contig_primers:
        contig_primer_dict[prev_line] = contig_primers

    with open(outpath+f'{sra}_{K_LEVEL}_{READ_TYPE}_primer_names.txt', 'w') as fl:
        for key, value in contig_primer_dict.items():
            fl.write(f'{key}:{value}\n')

In [36]:
def write_contig_matches(contigs_file_name, high_graded, hvals, fname_code, outpath, sra):
    """Copy the selected contigs, annotated with their match summary, to a file.

    Output goes to ``<outpath><sra>_<K_LEVEL>_<READ_TYPE>_<fname_code>.txt``
    (``K_LEVEL`` and ``READ_TYPE`` are module-level settings). For every
    selected contig the header is written as ``<header> matches: <summary>``
    followed by its sequence line(s) unchanged.

    Compared with the earlier pairwise line reading, this version
    * identifies a contig by the first whitespace-delimited token of its
      header, so headers without trailing metadata or with a tab separator
      are matched as well;
    * copies every sequence line up to the next header, so wrapped
      (multi-line) FASTA records stay intact and in step;
    * looks the id up in a dict instead of scanning all ids for each line.

    Args:
        contigs_file_name: FASTA file the contigs are read from.
        high_graded: Iterable of contig ids (including the leading ``>``).
        hvals: Iterable of summaries aligned with ``high_graded``.
        fname_code: Suffix used in the output file name.
        outpath: Output directory prefix; must end with a path separator.
        sra: Accession (or other prefix) used in the file name.
    """
    summaries = dict(zip(high_graded, hvals))
    with open(outpath+f'{sra}_{K_LEVEL}_{READ_TYPE}_{fname_code}.txt', 'w') as hg, \
            open(contigs_file_name, 'r') as read_obj:
        copying = False
        for line in read_obj:
            if line.startswith('>'):
                header = line.replace('\n', '')
                tokens = header.split()
                contig_id = tokens[0] if tokens else header
                copying = contig_id in summaries
                if copying:
                    hg.write(f'{header} matches: {summaries[contig_id]}\n')
            elif copying:
                hg.write(line)

In [37]:
# Names and sequences of the combined primer table, positionally aligned;
# consumed by match_by_contig.
PRIMER_LIST=df['Primer'].tolist()
SEQ_LIST=df['Sequence'].tolist()

### Match by contig 

In [38]:
def match_by_contig(read_file, outpath, sra):
    """Run the per-contig primer scan with the combined primer table.

    Prints a progress line, then delegates to ``get_megahit_contig_matches``
    using the module-level ``PRIMER_LIST`` and ``SEQ_LIST``.

    Args:
        read_file: Contigs file to scan.
        outpath: Output directory prefix passed through unchanged.
        sra: Accession (or other prefix) passed through unchanged.
    """
    print(f'match_by_contig: {sra}')
    get_megahit_contig_matches(read_file, PRIMER_LIST, SEQ_LIST, outpath, sra)

### Match via primers

In [39]:
def match_via_primers(read_file, sra, outpath):
    """Run ``get_matches`` once per primer collection.

    Collections searched, in order: Addgene, MRC, OpenWetWare, SignaGen (only
    when ``signagen_seq_parsed`` is non-empty) and the additional set. The
    universal, enzyme and WIV collections are intentionally not searched here.

    Args:
        read_file: Contig/read file to search.
        sra: Accession (or other prefix) used in output file names.
        outpath: Output directory prefix passed through to ``get_matches``.
    """
    print(f'match_via_primers: {sra}')
    collections = (
        (df_adg_primers, adg_primers, 'addgene'),
        (df_mrc_primers, mrc_primers, 'MRC'),
        (df_opw_primers, opw_primers, 'openwetware'),
        (df_signagen_primers, signagen_primers, 'signagen'),
        (df_additional_primers, additional_primers, 'Additional'),
    )
    for table, names, label in collections:
        # SignaGen is only searched when the duplicate check left something.
        if label == 'signagen' and not len(signagen_seq_parsed) > 0:
            continue
        get_matches(table, names, label, read_file, sra, outpath)

### Write summary of 'High Graded' contigs

In [40]:
def write_high_graded(sra, outpath):
    """Read the per-contig primer summary and compute hit statistics.

    Despite its name this function only reads: it parses
    ``<outpath><sra>_<K_LEVEL>_<READ_TYPE>_primer_names.txt`` (``K_LEVEL`` and
    ``READ_TYPE`` are module-level settings), whose lines have the form
    ``<contig id>:<Python list repr of 'name: count' strings>``.

    The list is parsed with ``ast.literal_eval`` and each count is taken from
    the text after the *last* colon, so primer names containing commas,
    colons or quotes no longer break the parse. Lines without a colon (for
    example blank lines) are skipped.

    Args:
        sra: Accession (or other prefix) used in the file name.
        outpath: Directory prefix; must end with a path separator.

    Returns:
        ``(contig_stats, primerslist)`` where ``contig_stats`` maps contig id
        to ``[total hit count, number of list entries]`` and ``primerslist``
        holds, per line, the list text with ``[`` and ``]`` removed (the
        trailing newline is kept, as before).

    Raises:
        FileNotFoundError: If the summary file does not exist.
    """
    contig_stats = {}
    primerslist = []
    with open(outpath+f'{sra}_{K_LEVEL}_{READ_TYPE}_primer_names.txt', 'r') as fl:
        for line in fl:
            if ':' not in line:
                continue
            contig, primers_repr = line.split(':', 1)
            entries = ast.literal_eval(primers_repr.strip())
            # Each entry is '<name>: <count>'; the count follows the last colon.
            counts = sum(int(entry.rsplit(':', 1)[1]) for entry in entries)
            contig_stats[contig] = [counts, len(entries)]
            primerslist.append(primers_repr.replace('[', '').replace(']', ''))
    return contig_stats, primerslist

In [41]:
def get_high_graded(contig_stats, primerslist, min_total=1, min_distinct=1):
    """Select contigs whose primer statistics reach the given thresholds.

    The defaults (at least one hit, at least one entry) reproduce the
    previous hard-coded ``> 0`` tests, i.e. every recorded contig is kept.
    The old comment spoke of "4 or more ... and at least 2 different", which
    the code never enforced; pass ``min_total=4, min_distinct=2`` to get
    that filter.

    Args:
        contig_stats: Mapping of contig id to ``[total hits, entry count]``.
        primerslist: Primer summary strings, aligned with the iteration order
            of ``contig_stats``.
        min_total: Minimum total hit count required.
        min_distinct: Minimum number of primer entries required.

    Returns:
        Dict mapping each selected contig id to
        ``"<stats> <primer summary without newlines>"``.
    """
    high_graded = {}
    for (contig, stats), primers in zip(contig_stats.items(), primerslist):
        if stats[0] >= min_total and stats[1] >= min_distinct:
            high_graded[contig] = str(stats)+' '+str(primers).replace('\n', '')
    return high_graded

In [42]:
def process(read_file, out_path,sra):
    """Run the full primer search for a single contigs file.

    Steps: per-contig scan, per-primer search, statistics, and writing the
    selected contigs to the ``all_primer_gt1`` file.

    Changes from the previous version:
    * ``out_path`` is normalised to end with a path separator, because the
      helpers build file names by plain concatenation (``'out'`` used to
      yield files named ``'out<sra>_...'`` next to the directory);
    * directory creation happens inside the ``try`` so a missing parent is
      reported rather than raised;
    * a missing input is printed, as the sibling workflows do, instead of
      being silently ignored.

    Args:
        read_file: Contigs file to search.
        out_path: Output directory; created if absent (parent must exist).
        sra: Accession (or other prefix) used in output file names.
    """
    # join(..., '') appends the separator only when it is missing.
    out_path = os.path.join(out_path, '')
    try:
        pathlib.Path(out_path).mkdir(exist_ok=True)
        match_by_contig(read_file, out_path, sra)
        match_via_primers(read_file, sra, out_path)
        contig_stats, primerslist=write_high_graded(sra, out_path)
        high_graded=get_high_graded(contig_stats, primerslist)
        write_contig_matches(read_file, high_graded.keys(), high_graded.values(), 'all_primer_gt1', out_path, sra)
    except FileNotFoundError as e:
        # The assembly may not have reached the requested k-mer level.
        print(e)
    return

In [43]:
#read_file='f.fa'
#out_path='n'
#process(read_file, out_path)

In [44]:
def contig_workflow():
    """Run the primer search for every accession in the module-level ``SRAs``.

    For each accession the contigs file is ``BASE_PATH/<sra>/READ_AREA`` and
    results go to ``BASE_PATH/<sra>/primers/``. A missing directory or
    contigs file is printed and the loop continues with the next accession.
    Directory creation now sits inside the ``try``: previously a missing
    accession directory raised from ``mkdir`` and aborted the whole batch.
    """
    for sra in SRAs:
        data_path=BASE_PATH+f'{sra}/'
        read_file=data_path+READ_AREA
        out_path=data_path+'primers/'
        try:
            pathlib.Path(out_path).mkdir(exist_ok=True)
            match_by_contig(read_file, out_path, sra)
            match_via_primers(read_file, sra, out_path)
            contig_stats, primerslist=write_high_graded(sra, out_path)
            high_graded=get_high_graded(contig_stats, primerslist)
            write_contig_matches(read_file, high_graded.keys(), high_graded.values(), 'all_primer_gt1', out_path, sra)
        except FileNotFoundError as e:
            # The assembly may not have reached the requested k-mer level.
            print(e)

In [45]:
def fasta_workflow(data_path, fasta_file):
    """Run the primer search on a single FASTA file.

    Results are written to ``<data_path>/primers/`` with an empty accession
    prefix. Paths are built with ``os.path.join`` so ``data_path`` works with
    or without a trailing separator (plain concatenation turned ``'dir'`` +
    ``'data.fasta'`` into ``'dirdata.fasta'``). Directory creation sits
    inside the ``try`` so a missing ``data_path`` is printed, not raised.

    Args:
        data_path: Directory containing ``fasta_file``.
        fasta_file: File name of the FASTA file, relative to ``data_path``.
    """
    read_file=os.path.join(data_path, fasta_file)
    # The trailing '' keeps the separator that the helpers rely on.
    out_path=os.path.join(data_path, 'primers', '')
    try:
        pathlib.Path(out_path).mkdir(exist_ok=True)
        sra=''
        match_by_contig(read_file, out_path, sra)
        match_via_primers(read_file, sra, out_path)
        contig_stats, primerslist=write_high_graded(sra, out_path)
        high_graded=get_high_graded(contig_stats, primerslist)
        write_contig_matches(read_file, high_graded.keys(), high_graded.values(), 'all_primer_gt1', out_path, sra)
    except FileNotFoundError as e:
        # The input file or its directory is missing.
        print(e)

### Project and Paths

In [46]:
# Project accession; selects the assembly directory below.
PRJ='PRJNA795267'

# Root of the per-accession assembly directories (absolute, machine-specific path).
BASE_PATH=f'/mnt/1TB_0/Data/Assembly/{PRJ}/'

# READ_TYPE is embedded in every output file name; a value containing
# 'coronaspades' makes get_matches use the multi-line FASTA search.
#READ_TYPE='coronaspades_asm' #raw
READ_TYPE='megahit_asm' #raw
#READ_FILE=DATA_PATH+'megahit_default/final.contigs.fa'
#READ_FILE=DATA_PATH+'coronaspades_default/contigs.fasta'
#READ_AREA=f'coronaspades_default/contigs.fasta'
# K_LEVEL is embedded in output file names and selects the intermediate contigs file.
K_LEVEL='k79'
# Contigs file, relative to each accession directory.
READ_AREA=f'megahit_default/intermediate_contigs/{K_LEVEL}.contigs.fa'
#READ_AREA=f'megahit_default/{K_LEVEL}.contigs.fa'
#READ_AREA=f'megahit/Sars_SL3.megahit_asm/intermediate_contigs/{K_LEVEL}.contigs.fa'

### SRA data

In [47]:
# Accessions processed by contig_workflow; each needs a directory of the same name under BASE_PATH.
SRAs=['SRR17497053','SRR17497039','SRR17497077','SRR17497078','SRR17497079','SRR17497080','SRR17497081','SRR17497082','SRR17497083','SRR17497084','SRR17497086']

In [48]:
# Alternative entry point for a single FASTA file:
#fasta_file='data.fasta'
#fasta_workflow('path_to', fasta_file)
# Run the search for every accession listed in SRAs.
contig_workflow()

match_by_contig: SRR17497053
match_via_primers: SRR17497053
primer:  Amp-R, found: 1
Writing 1 to .._Amp_R_addgene.txt
primer:  CAT-R, found: 1
Writing 1 to .._CAT_R_addgene.txt
primer:  LacI-R, found: 1
Writing 1 to .._LacI_R_addgene.txt
primer:  LexA, found: 1
Writing 1 to .._LexA_addgene.txt
primer:  mCherry-F, found: 1
Writing 1 to .._mCherry_F_addgene.txt
primer:  pENTR-R, found: 1
Writing 1 to .._pENTR_R_addgene.txt
primer:  T7 Terminal, found: 2
Writing 2 to .._T7_Terminal_addgene.txt
primer:  M13 Rev, found: 1
Writing 1 to .._M13_Rev_MRC.txt
primer:  T7 Term, found: 2
Writing 2 to .._T7_Term_MRC.txt
primer:  M13 reverse (-24), found: 1
Writing 1 to .._M13_reverse_24__openwetware.txt
primer:  Amp-R, found: 1
Writing 1 to .._Amp_R_signagen.txt
primer:  CAT-R, found: 1
Writing 1 to .._CAT_R_signagen.txt
primer:  LacI-R, found: 1
Writing 1 to .._LacI_R_signagen.txt
primer:  LexA, found: 1
Writing 1 to .._LexA_signagen.txt
primer:  mCherry-F, found: 1
Writing 1 to .._mCherry_F_signa

primer:  T7, found: 2
Writing 2 to .._T7_addgene.txt
primer:  T3, found: 1
Writing 1 to .._T3_MRC.txt
primer:  T7, found: 2
Writing 2 to .._T7_MRC.txt
primer:  pET Upstream, found: 2
Writing 2 to .._pET_Upstream_MRC.txt
primer:  pGEX Fwd, found: 1
Writing 1 to .._pGEX_Fwd_MRC.txt
primer:  pLEXA-C-R, found: 1
Writing 1 to .._pLEXA_C_R_MRC.txt
primer:  M13 reverse (-24), found: 1
Writing 1 to .._M13_reverse_24__openwetware.txt
primer:  Amp-R, found: 1
Writing 1 to .._Amp_R_signagen.txt
primer:  GW-3′, found: 1
Writing 1 to .._GW_3__signagen.txt
primer:  pGEX 5′, found: 1
Writing 1 to .._pGEX_5__signagen.txt
primer:  T7, found: 2
Writing 2 to .._T7_signagen.txt
primer:  T7_promoter, found: 2
Writing 2 to .._T7_promoter_Additional.txt
primer:  T3_promoter, found: 1
Writing 1 to .._T3_promoter_Additional.txt
primer:  SN-specific_P3_Liu_etal, found: 1
Writing 1 to .._SN_specific_P3_Liu_etal_Additional.txt
