### Load packages

In [1]:
import pandas as pd
import re
from urllib.request import urlopen

In [2]:
# Download the plain-text file from Project Gutenberg and decode its bytes
# as UTF-8 into a single string.
with urlopen('https://www.gutenberg.org/files/84/84-0.txt') as response:
    frankenstein = str(response.read(), encoding='utf-8')

In [3]:
# Preview the first 100 characters of the downloaded text.
# The leading '\ufeff' in the output is the byte-order mark, which the plain
# 'utf-8' codec keeps as part of the decoded string.
frankenstein[:100]

"\ufeff\r\nProject Gutenberg's Frankenstein, by Mary Wollstonecraft (Godwin) Shelley\r\n\r\nThis eBook is for th"

### Find words containing "natural" using a regular expression

Find the words in which "natural" is preceded by at least one other character (for example "unnatural" or "supernatural"), and count them.

In [4]:
# `\S+natural` = one or more non-whitespace characters immediately followed by
# "natural", matched case-insensitively. Each match ends at "natural", so any
# characters after it are not part of the returned string, and a bare
# "natural" with nothing in front of it is not matched.
match_list = re.findall(
    pattern=r'\S+natural',
    string=frankenstein,
    flags=re.IGNORECASE,
)

In [5]:
# Wrap the list of matches in a one-column table (column name: 'words')
# and display it.
match_dictionary = dict(words=match_list)
match_df = pd.DataFrame(data=match_dictionary)
match_df

Unnamed: 0,words
0,supernatural
1,supernatural
2,unnatural
3,unnatural
4,supernatural
5,unnatural
6,supernatural


In [6]:
# Count how many times each distinct matched string occurs.
match_df.groupby('words').size()

words
supernatural    4
unnatural       3
dtype: int64

### Find all occurrences of the Pribnow box in the E. coli genome

Then empirically determine the frequencies of the three most variable bases in the Pribnow box.

In [7]:
# Download the E. coli K-12 FASTA file and decode its bytes as UTF-8 into a
# single string (the file is kept exactly as served; nothing is stripped).
with urlopen('https://raw.githubusercontent.com/biodatascience/datasci611/gh-pages/data/ecoli_k12.fasta') as response:
    ecoli = str(response.read(), encoding='utf-8')

In [8]:
# Pattern, left to right:
#   TA...T     six characters: T, A, any three characters, T
#   .{8,12}?   8 to 12 further characters, as few as possible (non-greedy)
#   [AG]TG     ATG or GTG
# NOTE(review): `.` does not match a newline, so if `ecoli` still contains the
# FASTA header and line breaks, sites that span a line end are not found --
# confirm against the downloaded file.
pribnow_list = re.findall(
    pattern=r'TA...T.{8,12}?[AG]TG',
    string=ecoli,
)

In [9]:
# Number of non-overlapping matches returned by re.findall.
len(pribnow_list)

5156

In [10]:
# Build one column per position of the six-character box, plus the spacer length.
# Each entry of pribnow_list is expected to be a full match of
# 'TA...T.{8,12}?[AG]TG': characters 0-5 are the box, the last 3 characters are
# the [AG]TG tail, and everything in between is the spacer.
base_labels = ['T0', 'A1', 'T2', 'A3', 'A4', 'T5']
pribnow_dictionary = {
    label: [site[position] for site in pribnow_list]
    for position, label in enumerate(base_labels)
}
# Spacer length = characters left after dropping the 6-character box and the
# 3-character tail.
pribnow_dictionary['spacer'] = [len(site[6:-3]) for site in pribnow_list]

pribnow_df = pd.DataFrame(pribnow_dictionary)
pribnow_df.head()

Unnamed: 0,T0,A1,T2,A3,A4,T5,spacer
0,T,A,T,T,C,T,8
1,T,A,G,T,T,T,9
2,T,A,C,C,A,T,10
3,T,A,T,G,T,T,10
4,T,A,C,G,G,T,9


In [11]:
# example string slicing
a = 'abcdefghijk'
a[3:-3]

'defgh'

In [12]:
# Report, for every base column, the share of rows holding each letter
# (largest share first); for the spacer column, report the median length.
total_sites = pribnow_df.shape[0]
for column in pribnow_df.columns:
    print(f'Column {column}:')
    if column == 'spacer':
        print(pribnow_df[column].median())
    else:
        letter_counts = (
            pribnow_df[[column]]
            .groupby(column)
            .size()
            .sort_values(ascending=False)
        )
        # Dividing the counts by the number of rows turns them into fractions.
        print(letter_counts / total_sites)
    print('\n')

Column T0:
T0
T    1.0
dtype: float64


Column A1:
A1
A    1.0
dtype: float64


Column T2:
T2
T    0.314973
A    0.310318
C    0.258728
G    0.115981
dtype: float64


Column A3:
A3
C    0.283165
T    0.273274
G    0.232739
A    0.210822
dtype: float64


Column A4:
A4
T    0.298487
G    0.267649
A    0.221490
C    0.212374
dtype: float64


Column T5:
T5
T    1.0
dtype: float64


Column spacer:
10.0


