### Primer search

Loop through all contigs/reads in a sequence file (e.g. FASTA contigs, or FASTQ/SAM reads), find those that contain known primer sequences, and write the matching contigs/reads to a separate file per primer.

Common primers:

- T7 primer
- M13 primers
- SV40 ori
- WPRE
- BGH
- SP6 primer

In [119]:
import pandas as pd
import pathlib
import re
import os

In [31]:
# Lift pandas' display truncation so every column and row is rendered.
for _display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(_display_option, None)

In [120]:
# CSV table of primer names and sequences.
PRIMERS = '/mnt/1TB_0/Data/databases/Primers_Addgene.csv'

# Project and run identifiers; together they select the data directory.
PRJ = 'PRJNA605983'
SRA = 'SRR11092060'
DATA_PATH = '/mnt/1TB_0/Data/Assembly/{}/{}/'.format(PRJ, SRA)

# Input: assembly contigs.
READ_TYPE = 'asm'  # presumably 'raw' is the alternative value -- TODO confirm
READ_FILE = f'{DATA_PATH}megahit_default/final.contigs.fa'
# Directory that receives one text file per primer with matches.
OUT_PATH = f'{DATA_PATH}primers/'

In [121]:
# Keep READ_FILE when it exists; otherwise switch to the SRA-prefixed file name.
READ_FILE = (
    READ_FILE
    if os.path.isfile(READ_FILE)
    else f'{DATA_PATH}megahit_default/{SRA}_final.contigs.fa'
)

In [110]:
# Create the output directory. parents=True also creates any missing
# intermediate directories instead of raising FileNotFoundError, and
# exist_ok=True makes the cell safe to re-run.
pathlib.Path(OUT_PATH).mkdir(parents=True, exist_ok=True)

In [13]:
# Load the primer reference table; later cells read its 'Primer' and 'Sequence' columns.
df_primers = pd.read_csv(PRIMERS)

In [32]:
# Preview the primer table; head(n=200) caps the preview at the first 200 rows.
df_primers.head(n=200)

Unnamed: 0,Primer,Sequence,Description
0,3'AOX1,GCAAATGGCATTCTGACATCC,"For Pichia vectors with AOX1 terminator, rever..."
1,5'AOX1,GACTGGTTCCAATTGACAAGC,"For Pichia vectors with AOX1 promoter, forward..."
2,35S promoter,CTATCCTTCGCAAGACCCTTC,"CaMV 35S promoter, forward primer"
3,AC5,ACACAAAGCCGCTCCATCAG,"Drosophila Actin 5C promoter, forward primer"
4,Alpha-factor,TACTATTGCCAGCATTGCTGC,"Alpha factor signal sequence, forward primer"
5,Amp-R,ATAATACCGCGCCACATAGC,"5' end of ampicillin resistance gene, reverse ..."
6,AUG1 Forward,CAATTTACATCTTTATTTATTAACG,"For Pichia vectors with AUG1 promoter, forward..."
7,AUG1 Reverse,GAAGAGAAAAACATTAGTTGGC,"For Pichia vectors with AUG1 promoter, reverse..."
8,BGH Reverse,TAGAAGGCACAGTCGAGG,"Bovine growth hormone terminator, reverse primer"
9,Bglob-intron-F,CTGGTCATCATCCTGCCTTT,"Rabbit beta-globin intron, forward primer"


In [34]:
# Subset of primer names of particular interest; each entry is a value
# intended to be matched against the 'Primer' column of the primer table.
key_primers = [
    'T7',
    'M13 (-21) Forward',
    'M13 (-40)',
    'M13 Reverse',
    'M13/pUC Forward',
    'M13/pUC Reverse',
    'SV40pA-R',
    'SV40pro-F',
    'SV40-spliceR',
    'WPRE-R',
    'BGH Reverse',
    'SP6',
    'MMLV-F',
]

In [122]:
def search_file(primer_name, file_name, string_to_search):
    """Find every line of a text file that contains a given sequence.

    The comparison is case-insensitive and ignores leading/trailing
    whitespace on ``string_to_search``. The file is scanned one line at a
    time, so a sequence that is wrapped across two lines is not detected, and
    only the orientation given is searched (no reverse complement).

    Args:
        primer_name: Label used only in the summary message that is printed
            when at least one match is found.
        file_name: Path of the text file to scan.
        string_to_search: Sequence to look for.

    Returns:
        A list with one string per matching line, formatted as
        ``"<previous line>\\n<matching line>\\n"`` with both lines stripped.
        For a match on the first line the "previous line" part is empty.
        An empty or whitespace-only ``string_to_search`` yields ``[]``.
    """
    # Normalise the query once instead of on every line of the file.
    query = string_to_search.strip().upper()
    list_of_results = []
    if not query:
        # "" is a substring of every string, so without this guard an empty
        # query would report every line of the file as a match.
        return list_of_results

    prev_line = ""
    with open(file_name, 'r') as read_obj:
        for line in read_obj:
            if query in line.upper():
                # Keep the preceding line too (the header line in a
                # one-sequence-per-line FASTA file).
                list_of_results.append(prev_line.strip() + '\n' + line.strip() + '\n')
            prev_line = line

    if list_of_results:
        print(f'primer: {primer_name}, found: {len(list_of_results)}')
    return list_of_results

In [123]:
# Pull the sequence and name columns out of the primer table as plain lists.
primer_seqs, primers = (
    df_primers[column].tolist() for column in ('Sequence', 'Primer')
)

In [124]:
# Primer names with every run of non-alphanumeric characters collapsed to '_'.
primers_alph = [re.sub('[^0-9a-zA-Z]+', '_', name) for name in primers]

In [125]:
# Spot-check the sanitised names of the first and third primers.
primers_alph[0], primers_alph[2]

('3_AOX1', '35S_promoter')

In [126]:
# Show the first primer sequence and the number of sequences loaded.
primer_seqs[0], len(primer_seqs)

('GCAAATGGCATTCTGACATCC', 134)

In [127]:
def get_matches(primer_list):
    """Search READ_FILE for each named primer and save the matching records.

    For every name in ``primer_list`` the rows of ``df_primers`` whose
    'Primer' value equals that name are looked up, and each of their
    sequences is searched for in ``READ_FILE`` with ``search_file``. When
    anything matched, the results are written to
    ``{OUT_PATH}{SRA}_{READ_TYPE}_{name}.txt``, where every run of
    non-alphanumeric characters in the name is replaced by '_'.

    Names that are absent from ``df_primers`` (or have no usable sequence)
    are skipped. Uses the module-level ``df_primers``, ``READ_FILE``,
    ``OUT_PATH``, ``SRA`` and ``READ_TYPE``.

    Args:
        primer_list: Iterable of primer names as they appear in
            ``df_primers['Primer']``.
    """
    for primer_name in primer_list:
        rows = df_primers.loc[df_primers['Primer'] == primer_name]

        # Read the cell values directly. Series.to_string() is a display
        # helper: it can pad the value with whitespace (the leading space then
        # leaks into the printed name and the file name), it joins duplicate
        # rows into one multi-line string that can never match, and for an
        # unknown name it returns the repr of an empty Series.
        sequences = [str(value).strip() for value in rows['Sequence'].dropna()]
        # Drop blanks and repeated sequences while preserving order.
        sequences = list(dict.fromkeys(seq for seq in sequences if seq))
        if not sequences:
            continue

        pname = str(primer_name).strip()
        safe_name = re.sub('[^0-9a-zA-Z]+', '_', pname)

        matching_reads = []
        for seq in sequences:
            matching_reads.extend(search_file(pname, READ_FILE, seq))

        if matching_reads:
            # The context manager closes the file even if the write fails.
            with open(OUT_PATH + f'{SRA}_{READ_TYPE}_{safe_name}.txt', "w") as out_file:
                out_file.writelines(matching_reads)

In [None]:
# Run the search for every primer name in the primer table.
get_matches(primers)

primer:  CMV Forward, found: 1
primer:  GAL1, found: 1


In [None]:
# Optional: uncomment to search only the key_primers subset instead.
#get_matches(key_primers)