In [38]:
import pandas as pd

In [39]:
dfs = pd.read_html('https://www.neb.com/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities')

In [40]:
dfs

[                        Recognition Sequence               Enzyme
 0                                    AA/CGTT                 AclI
 1                                    A/AGCTT  HindIII-HF® HindIII
 2                                    AAT/ATT        SspI-HF® SspI
 3                                      /AATT                MluCI
 4                                    A/CATGT                 PciI
 5                                    A/CCGGT        AgeI-HF® AgeI
 6                                ACCTGC(4/8)          BspMI BfuAI
 7                                   A/CCWGGT                SexAI
 8                                    A/CGCGT        MluI-HF® MluI
 9                               ACGGC(12/14)                BceAI
 10                                     A/CGT             HpyCH4IV
 11                                    ACN/GT            HpyCH4III
 12                  (10/15)ACNNNNGTAYC(12/7)                 BaeI
 13                   (9/12)ACNNNNNCTCC(10/7)                B

In [41]:
pd.__version__

'0.20.3'

In [42]:
df = dfs[0]
df.head()

Unnamed: 0,Recognition Sequence,Enzyme
0,AA/CGTT,AclI
1,A/AGCTT,HindIII-HF® HindIII
2,AAT/ATT,SspI-HF® SspI
3,/AATT,MluCI
4,A/CATGT,PciI


In [43]:
iupac_codes = pd.read_html('https://www.bioinformatics.org/sms/iupac.html', skiprows=1, index_col=0)
iupac_codes

[                            1
 0                            
 A                     Adenine
 C                    Cytosine
 G                     Guanine
 T (or U)  Thymine (or Uracil)
 R                      A or G
 Y                      C or T
 S                      G or C
 W                      A or T
 K                      G or T
 M                      A or C
 B                 C or G or T
 D                 A or G or T
 H                 A or C or T
 V                 A or C or G
 N                    any base
 . or -                    gap,      1              2
 0                    
 A  Ala        Alanine
 C  Cys       Cysteine
 D  Asp  Aspartic Acid
 E  Glu  Glutamic Acid
 F  Phe  Phenylalanine
 G  Gly        Glycine
 H  His      Histidine
 I  Ile     Isoleucine
 K  Lys         Lysine
 L  Leu        Leucine
 M  Met     Methionine
 N  Asn     Asparagine
 P  Pro        Proline
 Q  Gln      Glutamine
 R  Arg       Arginine
 S  Ser         Serine
 T  Thr      Threonine
 V  V

In [44]:
degenerate_nucleotides = iupac_codes[0]
degenerate_nucleotides.columns 
degenerate_nucleotides

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
A,Adenine
C,Cytosine
G,Guanine
T (or U),Thymine (or Uracil)
R,A or G
Y,C or T
S,G or C
W,A or T
K,G or T
M,A or C


In [45]:
degenerate_nucleotides.index = degenerate_nucleotides.index.map(lambda x: x[0])
degenerate_nucleotides

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
A,Adenine
C,Cytosine
G,Guanine
T,Thymine (or Uracil)
R,A or G
Y,C or T
S,G or C
W,A or T
K,G or T
M,A or C


In [46]:
degenerate_nucleotides = degenerate_nucleotides.drop(['.', "A", "C", 'G', 'T'], errors='ignore')
degenerate_nucleotides

Unnamed: 0_level_0,1
0,Unnamed: 1_level_1
R,A or G
Y,C or T
S,G or C
W,A or T
K,G or T
M,A or C
B,C or G or T
D,A or G or T
H,A or C or T
V,A or C or G


In [47]:
degenerate_nucleotides['letters'] = degenerate_nucleotides[1].str.replace(' or ', '')
degenerate_nucleotides.loc['N', 'letters'] = 'ACGT'
degenerate_nucleotides

Unnamed: 0_level_0,1,letters
0,Unnamed: 1_level_1,Unnamed: 2_level_1
R,A or G,AG
Y,C or T,CT
S,G or C,GC
W,A or T,AT
K,G or T,GT
M,A or C,AC
B,C or G or T,CGT
D,A or G or T,AGT
H,A or C or T,ACT
V,A or C or G,ACG


In [48]:
degenerate_map = degenerate_nucleotides['letters'].to_dict()
degenerate_map

{'B': 'CGT',
 'D': 'AGT',
 'H': 'ACT',
 'K': 'GT',
 'M': 'AC',
 'N': 'ACGT',
 'R': 'AG',
 'S': 'GC',
 'V': 'ACG',
 'W': 'AT',
 'Y': 'CT'}

In [49]:
df.head()

Unnamed: 0,Recognition Sequence,Enzyme
0,AA/CGTT,AclI
1,A/AGCTT,HindIII-HF® HindIII
2,AAT/ATT,SspI-HF® SspI
3,/AATT,MluCI
4,A/CATGT,PciI


In [50]:
df['Recognition Sequence'] = df['Recognition Sequence'].str.replace('/', '')
df['Recognition Sequence'] = df['Recognition Sequence'].str.strip('()/-1234567890')
df.head(10)

Unnamed: 0,Recognition Sequence,Enzyme
0,AACGTT,AclI
1,AAGCTT,HindIII-HF® HindIII
2,AATATT,SspI-HF® SspI
3,AATT,MluCI
4,ACATGT,PciI
5,ACCGGT,AgeI-HF® AgeI
6,ACCTGC,BspMI BfuAI
7,ACCWGGT,SexAI
8,ACGCGT,MluI-HF® MluI
9,ACGGC,BceAI


In [51]:
df['non_degenerate'] = df['Recognition Sequence']
df.head()

Unnamed: 0,Recognition Sequence,Enzyme,non_degenerate
0,AACGTT,AclI,AACGTT
1,AAGCTT,HindIII-HF® HindIII,AAGCTT
2,AATATT,SspI-HF® SspI,AATATT
3,AATT,MluCI,AATT
4,ACATGT,PciI,ACATGT


In [52]:
for degenerate_letter, replacement_letters in degenerate_map.items():
    df['Recognition Sequence'] = df['Recognition Sequence'].str.replace(degenerate_letter, f'[{replacement_letters}]')
df.head(10)

Unnamed: 0,Recognition Sequence,Enzyme,non_degenerate
0,AACGTT,AclI,AACGTT
1,AAGCTT,HindIII-HF® HindIII,AAGCTT
2,AATATT,SspI-HF® SspI,AATATT
3,AATT,MluCI,AATT
4,ACATGT,PciI,ACATGT
5,ACCGGT,AgeI-HF® AgeI,ACCGGT
6,ACCTGC,BspMI BfuAI,ACCTGC
7,ACC[AT]GGT,SexAI,ACCWGGT
8,ACGCGT,MluI-HF® MluI,ACGCGT
9,ACGGC,BceAI,ACGGC


In [55]:
from Bio import SeqIO
import re 

records = SeqIO.parse('/Users/olgabot/Downloads/Mononegav_254genomes_sequence_100517.fasta', 'fasta')
records


lines = []

for record in records:

    for i, (sequence, enzyme, nondegenerate_sequence) in df.iterrows():
        if not sequence in record:
            pass
        else:
            nondegenerate_found = re.findall(nondegenerate_sequence, str(record.seq))
            degenerate_found = re.findall(sequence, str(record.seq))
            line = [record.id, enzyme, sequence, len(degenerate_found), nondegenerate_sequence, len(nondegenerate_found)]
            lines.append(line)
#             print(record.id, enzyme, sequence, len(found))

found_restriction_sites = pd.DataFrame(
    lines, columns=['virus_name', 'enzyme', 'restriction_sequence', 'count', 
                    'restriction_sequence_nondegenerate', 'restriction_sequence_nondegenerate_count'])  
found_restriction_sites.head()

Unnamed: 0,virus_name,enzyme,restriction_sequence,count,restriction_sequence_nondegenerate,restriction_sequence_nondegenerate_count
0,gi|9626945|ref|NC_001498.1|,AclI,AACGTT,1,AACGTT,1
1,gi|9626945|ref|NC_001498.1|,HindIII-HF® HindIII,AAGCTT,4,AAGCTT,4
2,gi|9626945|ref|NC_001498.1|,SspI-HF® SspI,AATATT,2,AATATT,2
3,gi|9626945|ref|NC_001498.1|,MluCI,AATT,73,AATT,73
4,gi|9626945|ref|NC_001498.1|,PciI,ACATGT,5,ACATGT,5


In [56]:
found_restriction_sites.head(10)

Unnamed: 0,virus_name,enzyme,restriction_sequence,count,restriction_sequence_nondegenerate,restriction_sequence_nondegenerate_count
0,gi|9626945|ref|NC_001498.1|,AclI,AACGTT,1,AACGTT,1
1,gi|9626945|ref|NC_001498.1|,HindIII-HF® HindIII,AAGCTT,4,AAGCTT,4
2,gi|9626945|ref|NC_001498.1|,SspI-HF® SspI,AATATT,2,AATATT,2
3,gi|9626945|ref|NC_001498.1|,MluCI,AATT,73,AATT,73
4,gi|9626945|ref|NC_001498.1|,PciI,ACATGT,5,ACATGT,5
5,gi|9626945|ref|NC_001498.1|,AgeI-HF® AgeI,ACCGGT,4,ACCGGT,4
6,gi|9626945|ref|NC_001498.1|,BspMI BfuAI,ACCTGC,8,ACCTGC,8
7,gi|9626945|ref|NC_001498.1|,BceAI,ACGGC,9,ACGGC,9
8,gi|9626945|ref|NC_001498.1|,HpyCH4IV,ACGT,14,ACGT,14
9,gi|9626945|ref|NC_001498.1|,SpeI-HF® SpeI,ACTAGT,1,ACTAGT,1


In [58]:
enzyme_counts = found_restriction_sites.groupby('enzyme')['count'].sum()
enzyme_counts.sort_values().head()

enzyme
AscI             1
SrfI             4
NotI-HF® NotI    4
AsiSI            5
FseI             7
Name: count, dtype: int64

In [60]:
enzyme_counts = found_restriction_sites.groupby('enzyme')['restriction_sequence_nondegenerate_count'].sum()
enzyme_counts.sort_values().head()

enzyme
AscI             1
SrfI             4
NotI-HF® NotI    4
AsiSI            5
FseI             7
Name: restriction_sequence_nondegenerate_count, dtype: int64