### Primer search

Loop through all contigs/reads in a FASTA/FASTQ/SAM file, search each one for known primer sequences, and write the matching contigs/reads to a separate file per primer and primer source.

Common primers:

- T7 primer
- M13 primers
- SV40 ori
- WPRE
- bGH
- SP6 primer

In [1]:
import pandas as pd
import pathlib
import re
import os

In [2]:
# Lift pandas' display limits so DataFrames are shown with every column and row.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, None)

In [34]:
# Primer tables (CSV), one per source.
ADDGENE_PRIMERS='/mnt/1TB_0/Data/databases/Primers_Addgene.csv'
MRC_PRIMERS='/mnt/1TB_0/Data/databases/Common_Primers_MRC.csv'
OPENWETWARE_PRIMERS='/mnt/1TB_0/Data/databases/Common_Primers_openwetware.csv'
SIGNAGEN_PRIMERS='/mnt/1TB_0/Data/databases/Common_Primers_signagen.csv'
# Project and run identifiers; they build DATA_PATH, and SRA is also embedded
# in the output file names.
PRJ='PRJNA605983'
SRA='SRR11092059'
DATA_PATH=f'/mnt/1TB_0/Data/Assembly/{PRJ}/{SRA}/'

# Input to scan: assembled contigs.
READ_TYPE='asm' # tag embedded in output file names; presumably 'raw' when scanning unassembled reads - TODO confirm
READ_FILE=DATA_PATH+'megahit_default/final.contigs.fa'
# Directory that receives one text file per primer with hits.
OUT_PATH=DATA_PATH+'primers/'

In [35]:
# Keep the default contig file when it exists; otherwise switch to the
# SRA-prefixed file name in the same directory.
READ_FILE = (
    READ_FILE
    if os.path.isfile(READ_FILE)
    else DATA_PATH+f'megahit_default/{SRA}_final.contigs.fa'
)

In [36]:
# Create the output directory; exist_ok=True makes re-running this cell harmless.
# Without parents=True the parent directory (DATA_PATH) must already exist.
pathlib.Path(OUT_PATH).mkdir(exist_ok=True)

In [37]:
# Load the Addgene primer table; later cells use its 'Primer' and 'Sequence' columns.
df_adg_primers = pd.read_csv(ADDGENE_PRIMERS)

In [38]:
# Preview the Addgene table (at most the first 200 rows).
df_adg_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,3'AOX1,GCAAATGGCATTCTGACATCC,"For Pichia vectors with AOX1 terminator, rever...",,addgene
1,5'AOX1,GACTGGTTCCAATTGACAAGC,"For Pichia vectors with AOX1 promoter, forward...",,addgene
2,35S promoter,CTATCCTTCGCAAGACCCTTC,"CaMV 35S promoter, forward primer",,addgene
3,AC5,ACACAAAGCCGCTCCATCAG,"Drosophila Actin 5C promoter, forward primer",,addgene
4,Alpha-factor,TACTATTGCCAGCATTGCTGC,"Alpha factor signal sequence, forward primer",,addgene
5,Amp-R,ATAATACCGCGCCACATAGC,"5' end of ampicillin resistance gene, reverse ...",,addgene
6,AUG1 Forward,CAATTTACATCTTTATTTATTAACG,"For Pichia vectors with AUG1 promoter, forward...",,addgene
7,AUG1 Reverse,GAAGAGAAAAACATTAGTTGGC,"For Pichia vectors with AUG1 promoter, reverse...",,addgene
8,BGH Reverse,TAGAAGGCACAGTCGAGG,"Bovine growth hormone terminator, reverse primer",,addgene
9,Bglob-intron-F,CTGGTCATCATCCTGCCTTT,"Rabbit beta-globin intron, forward primer",,addgene


In [39]:
#key_primers=['T7','M13 (-21) Forward','M13 (-40)','M13 Reverse','M13/pUC Forward','M13/pUC Reverse',\
#            'SV40pA-R','SV40pro-F','SV40-spliceR','WPRE-R','BGH Reverse','SP6','MMLV-F']

In [40]:
# Load the MRC primer table; later cells use its 'Primer' and 'Sequence' columns.
df_mrc_primers = pd.read_csv(MRC_PRIMERS)

In [41]:
# Preview the MRC table (at most the first 200 rows).
df_mrc_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comments,Source
0,CMV Fwd,CGCAAATGGGCGGTAGGCGTG,At 5' end of MCS in pCMV5 (4),"This is a ""Universal"" primer and should work i...",https://www.dnaseq.co.uk/resources/primers/sta...
1,CMV Rev,CCTCCACCCCATAATATTATAGAAGGACAC,At 3' end of MCS in pCMV5,,https://www.dnaseq.co.uk/resources/primers/sta...
2,M13 Fwd,GTAAAACGACGGCCAGTG,Common to many plasmids (-20 version),This primer does NOT work with Invitrogen Gate...,https://www.dnaseq.co.uk/resources/primers/sta...
3,M13 Fwd(GW),TGTAAAACGACGGCCAGT,Specific for Gateway vectors,,https://www.dnaseq.co.uk/resources/primers/sta...
4,M13 Rev,GGAAACAGCTATGACCATG,Common to many plasmids,There is a base deletion in some pUC18 vectors...,https://www.dnaseq.co.uk/resources/primers/sta...
5,M13 Rev(GW),CAGGAAACAGCTATGACC,Specific for Gateway vectors,,https://www.dnaseq.co.uk/resources/primers/sta...
6,SP6,AGCTATTTAGGTGACACTATAG,Common to many plasmids,Many plasmids diverge outside the core SP6 pro...,https://www.dnaseq.co.uk/resources/primers/sta...
7,T3,AATTAACCCTCACTAAAGGG,Common to many plasmids,,https://www.dnaseq.co.uk/resources/primers/sta...
8,T7,TAATACGACTCACTATAGGG,Common to many plasmids,,https://www.dnaseq.co.uk/resources/primers/sta...
9,T7 Term,TATGCTAGTTATTGCTCAG,Common to many plasmids,,https://www.dnaseq.co.uk/resources/primers/sta...


In [42]:
# Load the OpenWetWare primer table and preview it (at most the first 200 rows).
# Note: some sequences in this table are lowercase (see the VF2 / VR rows).
df_opw_primers = pd.read_csv(OPENWETWARE_PRIMERS)
df_opw_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,M13 forward (-20),GTAAAACGACGGCCAGT,M13 forward sequencing primer (-20),,https://openwetware.org/wiki/Common_primer_seq...
1,M13 forward (-47),CGCCAGGGTTTTCCCAGTCACGAC,M13 forward sequencing primer (-47),,https://openwetware.org/wiki/Common_primer_seq...
2,M13 reverse (-24),AACAGCTATGACCATG,M13 reverse sequencing primer: (-24),,https://openwetware.org/wiki/Common_primer_seq...
3,M13 reverse (-48),AGCGGATAACAATTTCACACAGGA,M13 reverse sequencing primer: (-48),,https://openwetware.org/wiki/Common_primer_seq...
4,SP6 primer,ATTTAGGTGACACTATAG,SP6 universal primer,,https://openwetware.org/wiki/Common_primer_seq...
5,VF2,tgccacctgacgtctaagaa,VF2,,https://openwetware.org/wiki/Common_primer_seq...
6,VR,attaccgcctttgagtgagc,VR,,https://openwetware.org/wiki/Common_primer_seq...


In [43]:
# Load the Signagen primer table.
df_signagen_primers = pd.read_csv(SIGNAGEN_PRIMERS)
# Trim all surrounding whitespace (not only '\n') so stray spaces or '\r'
# cannot survive into later comparisons; assign by key rather than by
# attribute so the column is updated unambiguously.
df_signagen_primers['Sequence'] = df_signagen_primers['Sequence'].str.strip()
df_signagen_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description,Comment,Source
0,3’AOX1,GCAAATGGCATTCTGACATCC,"For Pichia vectors with AOX1 terminator, rever...",,signagen
1,5’AOX1,GACTGGTTCCAATTGACAAGC,"For Pichia vectors with AOX1 promoter, forward...",,signagen
2,35S promoter,CTATCCTTCGCAAGACCCTTC,"CaMV 35S promoter, forward primer",,signagen
3,AC5,ACACAAAGCCGCTCCATCAG,"Drosophila Actin 5C promoter, forward primer",,signagen
4,Alpha-factor,TACTATTGCCAGCATTGCTGC,"Alpha factor signal sequence, forward primer",,signagen
5,Amp-R,ATAATACCGCGCCACATAGC,"5′ end of ampicillin resistance gene, reverse ...",,signagen
6,AUG1 Forward,CAATTTACATCTTTATTTATTAACG,"For Pichia vectors with AUG1 promoter, forward...",,signagen
7,AUG1 Reverse,GAAGAGAAAAACATTAGTTGGC,"For Pichia vectors with AUG1 promoter, reverse...",,signagen
8,BGH Reverse,TAGAAGGCACAGTCGAGG,"Bovine growth hormone terminator, reverse primer",,signagen
9,Bglob-intron-F,CTGGTCATCATCCTGCCTTT,"Rabbit beta-globin intron, forward primer",,signagen


In [44]:
def search_file(primer_name, file_name, string_to_search):
    """Find the lines of a text file that contain a primer sequence.

    The comparison is case-insensitive and ignores whitespace around
    ``string_to_search``. Every hit is reported together with the line that
    precedes it in the file.

    Parameters
    ----------
    primer_name : str
        Label used only in the summary message printed when hits are found.
    file_name : str or path-like
        Text file that is scanned line by line.
    string_to_search : str
        Sequence to look for.

    Returns
    -------
    list of str
        One entry per matching line, in file order. Each entry is the
        stripped previous line, a newline, the stripped matching line and a
        trailing newline. Empty when nothing matches or the query is blank.
    """
    # Normalise the query once instead of on every line of the file.
    query = string_to_search.strip().upper()
    if not query:
        # An empty string is a substring of every line, so a blank primer
        # would otherwise report the whole file as hits.
        return []

    list_of_results = []
    prev_line = ""
    with open(file_name, 'r') as read_obj:
        for line in read_obj:
            if query in line.upper():
                list_of_results.append(prev_line.strip() + '\n' + line.strip() + '\n')
            prev_line = line
    if list_of_results:
        print(f'primer: {primer_name}, found: {len(list_of_results)}')
    return list_of_results

In [45]:
def get_primer_aphanumeric(primers):
    """Return the primer names with each run of non-alphanumeric characters replaced by one underscore.

    ``primers`` is an iterable of strings; the result is a new list in the
    same order.
    """
    non_alnum_run = re.compile('[^0-9a-zA-Z]+')
    return [non_alnum_run.sub('_', name) for name in primers]

In [46]:
# Pull the 'Sequence' and 'Primer' columns of every primer table into plain lists.
adg_primer_seqs, adg_primers = (
    df_adg_primers[col].tolist() for col in ('Sequence', 'Primer')
)

mrc_primer_seqs, mrc_primers = (
    df_mrc_primers[col].tolist() for col in ('Sequence', 'Primer')
)

opw_primer_seqs, opw_primers = (
    df_opw_primers[col].tolist() for col in ('Sequence', 'Primer')
)

signagen_primer_seqs, signagen_primers = (
    df_signagen_primers[col].tolist() for col in ('Sequence', 'Primer')
)

In [47]:
# No need to search a sequence twice: keep only the Signagen sequences that do
# not already occur in the Addgene, MRC or OpenWetWare tables.
# The comparison ignores case and surrounding whitespace because the file
# search itself is case-insensitive, so such variants would yield identical hits.
known_seqs = {
    str(seq).strip().upper()
    for seq in (*adg_primer_seqs, *mrc_primer_seqs, *opw_primer_seqs)
}
signagen_seq_parsed = [
    seq.strip()
    for seq in signagen_primer_seqs
    # Non-string cells (e.g. NaN for a missing sequence) cannot be searched.
    if isinstance(seq, str) and seq.strip().upper() not in known_seqs
]
print(f'signagen unique: {len(signagen_seq_parsed)}, all signagen {len(signagen_primer_seqs)}')

signagen no duplicated: 0, all signagen 134


In [48]:
def get_matches(df, primer_list, primer_list_source):
    """Search READ_FILE for every listed primer and save the hits per primer.

    For each name in ``primer_list`` the rows of ``df`` whose ``Primer``
    column equals that name are looked up, every distinct ``Sequence`` of
    those rows is searched with ``search_file``, and, when anything was
    found, the hits are written to
    ``OUT_PATH + '{SRA}_{READ_TYPE}_{name}_{primer_list_source}.txt'`` where
    ``name`` is the primer name with runs of non-alphanumeric characters
    replaced by ``_``.

    Parameters
    ----------
    df : pandas.DataFrame
        Primer table with at least the columns ``Primer`` and ``Sequence``.
    primer_list : iterable of str
        Primer names to process.
    primer_list_source : str
        Source tag appended to the output file name.

    Returns
    -------
    None
        Results are written to files under ``OUT_PATH``.
    """
    for primer_name in primer_list:
        rows = df.loc[df['Primer'] == primer_name]
        pname = str(primer_name)
        safe_name = re.sub('[^0-9a-zA-Z]+', '_', pname)

        # Work on the real cell values. Series.to_string() produces a display
        # string: a missing value turns into the text "NaN" and several rows
        # sharing one primer name are joined with newlines, so the text that
        # gets searched is not a primer sequence.
        sequences = dict.fromkeys(
            str(raw).strip() for raw in rows['Sequence'].dropna()
        )

        matching_reads = []
        for seq in sequences:
            if seq:  # a blank sequence would match every line of the file
                matching_reads.extend(search_file(pname, READ_FILE, seq))

        if matching_reads:
            # The context manager closes the file even if writing fails.
            out_name = OUT_PATH+f'{SRA}_{READ_TYPE}_{safe_name}_{primer_list_source}.txt'
            with open(out_name, "w") as out_file:
                out_file.writelines(matching_reads)

In [None]:
# Addgene primers: search READ_FILE and write one file per primer with hits.
get_matches(df_adg_primers, adg_primers, 'addgene')

In [None]:
# MRC primers: search READ_FILE and write one file per primer with hits.
get_matches(df_mrc_primers, mrc_primers, 'MRC')

In [None]:
# OpenWetWare primers: search READ_FILE and write one file per primer with hits.
get_matches(df_opw_primers, opw_primers, 'openwetware')

In [50]:
# Search only the Signagen primers whose sequences are not already covered by
# the other sources; passing the full table would repeat those searches and
# make the de-duplication above pointless.
if len(signagen_seq_parsed)>0:
    df_signagen_unique = df_signagen_primers[
        df_signagen_primers['Sequence'].str.strip().isin(signagen_seq_parsed)
    ]
    get_matches(
        df_signagen_unique,
        df_signagen_unique['Primer'].drop_duplicates().tolist(),
        'signagen',
    )