In [18]:
import pandas as pd

dataframes = pd.read_html('https://www.neb.com/tools-and-resources/selection-charts/alphabetized-list-of-recognition-specificities')
dataframes

[                        Recognition Sequence               Enzyme
 0                                    AA/CGTT                 AclI
 1                                    A/AGCTT  HindIII HindIII-HF®
 2                                    AAT/ATT        SspI SspI-HF®
 3                                      /AATT        MluCI Tsp509I
 4                                    A/CATGT                 PciI
 5                                    A/CCGGT        AgeI AgeI-HF®
 6                                ACCTGC(4/8)          BspMI BfuAI
 7                                   A/CCWGGT                SexAI
 8                                    A/CGCGT        MluI MluI-HF®
 9                               ACGGC(12/14)                BceAI
 10                                     A/CGT             HpyCH4IV
 11                                    ACN/GT            HpyCH4III
 12                  (10/15)ACNNNNGTAYC(12/7)                 BaeI
 13                   (9/12)ACNNNNNCTCC(10/7)                B

In [20]:
restriction_enzymes = dataframes[0]
print(restriction_enzymes.shape)
restriction_enzymes.head()

(240, 2)


Unnamed: 0,Recognition Sequence,Enzyme
0,AA/CGTT,AclI
1,A/AGCTT,HindIII HindIII-HF®
2,AAT/ATT,SspI SspI-HF®
3,/AATT,MluCI Tsp509I
4,A/CATGT,PciI


All restriction endonuclease recognition specificities available from New England Biolabs are listed below. For enzymes that recognize non-palindromic sequences, the complementary sequence of each strand is listed. For example, CCTC(7/6) and (6/7)GAGG both represent an MnlI (NEB #R0163) site.

All recognition sequences are written 5´ to 3´ using the single letter code nomenclature with the point of cleavage indicated by a "/".

Numbers in parentheses indicate point of cleaveage for non-palindromic enzymes.

For example, GGTCTC(1/5) indicates cleavage at:

    5´ ...GGTCTCN/...3´
    3´ ...CCAGAGNNNNN/...5´
  
How to show an image:
![](https://www.neb.com/~/media/NebUs/Page%20Images/Tools%20and%20Resources/charts/singlelettercode.jpg?device=modal)

Get only the palindromic sequences, the ones that don't have parentheses

In [24]:
restriction_enzymes['Recognition Sequence'].str.contains('\(') 

0      False
1      False
2      False
3      False
4      False
5      False
6       True
7      False
8      False
9       True
10     False
11     False
12      True
13      True
14     False
15     False
16      True
17      True
18     False
19     False
20     False
21     False
22     False
23     False
24      True
25     False
26     False
27     False
28      True
29     False
       ...  
210    False
211    False
212    False
213    False
214     True
215    False
216     True
217    False
218    False
219     True
220    False
221    False
222    False
223    False
224    False
225    False
226    False
227    False
228     True
229    False
230    False
231    False
232    False
233    False
234    False
235    False
236    False
237    False
238    False
239    False
Name: Recognition Sequence, Length: 240, dtype: bool

The `~` means "not" so it reverses the True/False from above

In [26]:
~restriction_enzymes['Recognition Sequence'].str.contains('\(')

0       True
1       True
2       True
3       True
4       True
5       True
6      False
7       True
8       True
9      False
10      True
11      True
12     False
13     False
14      True
15      True
16     False
17     False
18      True
19      True
20      True
21      True
22      True
23      True
24     False
25      True
26      True
27      True
28     False
29      True
       ...  
210     True
211     True
212     True
213     True
214    False
215     True
216    False
217     True
218     True
219    False
220     True
221     True
222     True
223     True
224     True
225     True
226     True
227     True
228    False
229     True
230     True
231     True
232     True
233     True
234     True
235     True
236     True
237     True
238     True
239     True
Name: Recognition Sequence, Length: 240, dtype: bool

In [28]:
palindromic = restriction_enzymes.loc[~restriction_enzymes['Recognition Sequence'].str.contains('\(')]
print(palindromic.shape)
palindromic.head()

(175, 2)


Unnamed: 0,Recognition Sequence,Enzyme
0,AA/CGTT,AclI
1,A/AGCTT,HindIII HindIII-HF®
2,AAT/ATT,SspI SspI-HF®
3,/AATT,MluCI Tsp509I
4,A/CATGT,PciI


The brackets mean if ANYTHING within the brackets matches, return True

In [44]:
palindromic['Recognition Sequence'].str.contains('[BDHKMNRSVWY]')

0      False
1      False
2      False
3      False
4      False
5      False
7       True
8      False
10     False
11      True
14      True
15     False
18     False
19     False
20     False
21     False
22     False
23     False
25     False
26     False
27     False
29     False
31     False
33     False
34      True
35      True
37     False
38      True
40      True
41     False
       ...  
206     True
207     True
208     True
209     True
210     True
211     True
212     True
213     True
215    False
217    False
218    False
220    False
221    False
222     True
223     True
224    False
225    False
226    False
227    False
229    False
230    False
231    False
232    False
233    False
234    False
235    False
236     True
237     True
238     True
239     True
Name: Recognition Sequence, Length: 175, dtype: bool

In [47]:
palindromic_acgt = palindromic.loc[~palindromic['Recognition Sequence'].str.contains('[BDHKMRSVWYN]')]
print(palindromic_acgt.shape)
palindromic_acgt.head(10)

(100, 2)


Unnamed: 0,Recognition Sequence,Enzyme
0,AA/CGTT,AclI
1,A/AGCTT,HindIII HindIII-HF®
2,AAT/ATT,SspI SspI-HF®
3,/AATT,MluCI Tsp509I
4,A/CATGT,PciI
5,A/CCGGT,AgeI AgeI-HF®
8,A/CGCGT,MluI MluI-HF®
10,A/CGT,HpyCH4IV
15,A/CTAGT,SpeI SpeI-HF®
18,A/GATCT,BglII


In [48]:
palindromic_acgt['sequence_no_slash'] = palindromic_acgt['Recognition Sequence'].str.replace('/', '')
print(palindromic_acgt.shape)
palindromic_acgt.head()

(100, 3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Recognition Sequence,Enzyme,sequence_no_slash
0,AA/CGTT,AclI,AACGTT
1,A/AGCTT,HindIII HindIII-HF®,AAGCTT
2,AAT/ATT,SspI SspI-HF®,AATATT
3,/AATT,MluCI Tsp509I,AATT
4,A/CATGT,PciI,ACATGT


In [61]:
from Bio import SeqIO

records = SeqIO.parse('/Users/olgabot/Downloads/sequence.fasta', 'fasta')
records

for record in records:
    for i, (original, enzyme, sequence) in palindromic_acgt.iterrows():
        if sequence in record:
            pass
#             print(f'{sequence} found!')
        else:
            print(f'{sequence} not found in {record.id}!\tEnzyme: {enzyme}')

ACGCGT not found in EU224440.2!	Enzyme: MluI MluI-HF®
AGCGCT not found in EU224440.2!	Enzyme: AfeI
CCTCAGC not found in EU224440.2!	Enzyme: Nb.BbvCI
CCTGCAGG not found in EU224440.2!	Enzyme: SbfI SbfI-HF®
CGTACG not found in EU224440.2!	Enzyme: BsiWI BsiWI-HF®
GCCCGGGC not found in EU224440.2!	Enzyme: SrfI
GCGATCGC not found in EU224440.2!	Enzyme: AsiSI
GCGGCCGC not found in EU224440.2!	Enzyme: NotI NotI-HF®
GGCCGGCC not found in EU224440.2!	Enzyme: FseI
GGCGCGCC not found in EU224440.2!	Enzyme: AscI
GTTTAAAC not found in EU224440.2!	Enzyme: PmeI
TACGTA not found in EU224440.2!	Enzyme: SnaBI
TGCGCA not found in EU224440.2!	Enzyme: FspI


There's a difference between:

* Parentheses: `()`
* Brackets: `[]`
* Curly braces: `{}`