# Text Processing Services

### standard python 3.6 library

[read the docs]()


> In this notebook we will explore the standard library for preprocessing text.

In [1]:
import datetime

d = datetime.datetime(2010, 7, 4, 12, 15, 58)
'{:%Y-%m-%d %H:%M:%S}'.format(d)

'2010-07-04 12:15:58'

In [2]:
today = datetime.datetime.now()
f_today = '{:%Y-%m-%d %H:%M:%S}'.format(today)
f_today

'2019-09-04 14:05:33'

In [3]:
date, time = f_today.split(' ')
date

'2019-09-04'

In [None]:
time

## Template strings

* Template strings provide simpler string substitutions as described in PEP 292.

`class string.Template(template)`

> The constructor takes a single argument which is the template string.

`substitute(mapping, **kwds)`

> Performs the template substitution, returning a new string.

`safe_substitute(mapping, **kwds)`

> Like substitute(), except that if placeholders are missing from mapping and kwds, instead of raising a KeyError exception, the original placeholder will appear in the resulting string intact.

In [1]:
from string import Template

s = Template('$who likes $what')
s.substitute(who='tim', what='kung pao')

'tim likes kung pao'

In [2]:
d = dict(who='tim')
Template('Give $who $100').substitute(d)

ValueError: Invalid placeholder in string: line 1, col 11

In [3]:
Template('$who likes $what').safe_substitute(d)

'tim likes $what'

In [4]:
Template('$who likes $100').safe_substitute(d)

'tim likes $100'

In [5]:
Template.flags

<RegexFlag.IGNORECASE: 2>

In [6]:
import re

re.split(r'\W+', 'Words, words, words.')

['Words', 'words', 'words', '']

In [7]:
re.split(r'(\W+)', 'Words, words, words.')

['Words', ', ', 'words', ', ', 'words', '.', '']

In [8]:
re.split(
    pattern=r'\W+',
    string='Words, words, words.',
    maxsplit=1
)

['Words', 'words, words.']

In [9]:
 re.split('[a-f]+', '0a3B9', flags=re.IGNORECASE)

['0', '3', '9']

In [10]:
re.split(r'(\W+)', '...words, words...')

['', '...', 'words', ', ', 'words', '...', '']

In [11]:
re.sub(
    pattern=r'def\s+([a-zA-Z_][a-zA-Z_0-9]*)\s*\(\s*\):',
    repl=r'static PyObject*\npy_\1(void)\n{',
    string='def myfunc():'
)

'static PyObject*\npy_myfunc(void)\n{'

In [12]:
TEXT = 'dog is chucho chucho'
FIND = 'chucho'

pattern = re.compile(FIND)
s = pattern.search(TEXT)
if s:
    a, b = s.span()
    match = s.string[a:b]
    one_liner = s.string[s.start():s.end()]

    print(f'text        : { s.string }\n' # access to the string.
          f'match       : { match }\n' # one way to get the text from the match.
          f'another way : { one_liner }\n' # another way to extract the match in text.
          f'location    : { s.span() }\n' # pattern match method.
          f'text length : { s.endpos }') # position methods.

text        : dog is chucho chucho
match       : chucho
another way : chucho
location    : (7, 13)
text length : 20


In [220]:
def seach_for(word, from_text):
    # word :  compile a regular expression pattern, returning a pattern object.
    pattern = re.compile(word)
    token = pattern.search(from_text)
    
    if token:
        match = token.string[token.start():token.end()]
        text = token.string.replace(match, '[{}]'.format(match.upper()))
        
        print(f'text        : { text }\n'
              f'match       : { match }\n'
              f'location    : { token.span() }\n'
              f'text length : { token.endpos }\n')

In [221]:
doc = ('Scan through string looking for the first'
       ' location where this regular expression produces a match')

seach_for(word='regular', from_text=doc)

text        : Scan through string looking for the first location where this [REGULAR] expression produces a match
match       : regular
location    : (62, 69)
text length : 97



In [222]:
docs = ['Scan through chucho looking for',
        'through chucho looking for the',
        'through chucho looking for the first chucho']

for doc in docs: seach_for('chucho', doc)

text        : Scan through [CHUCHO] looking for
match       : chucho
location    : (13, 19)
text length : 31

text        : through [CHUCHO] looking for the
match       : chucho
location    : (8, 14)
text length : 30

text        : through [CHUCHO] looking for the first [CHUCHO]
match       : chucho
location    : (8, 14)
text length : 43



In [None]:
# You could use enumerate to get the index.
# It makes the code both simpler and more efficient,
# as the linear search of index is avoided.
# I would also suggest unpacking t to (word, tag) to improve readability.

for ndx, (word, tag) in enumerate(wsj_list):
    if word in wordlist and tag == 'VN':
        print wsj[ndx-1:ndx+1], ndx

In [119]:
# {'id': 0, 'loc': [(13, 19)], 'endpos': 31, 'match': 'string'}

D = dict.fromkeys({'id': 0}, {'loc': [], 'endpos': 0, 'match': ''})
D

{'id': {'loc': [], 'endpos': 0, 'match': ''}}

In [120]:
D['id'] = 1
D

{'id': 1}

In [118]:
D['id']['endpos'] = 23
D

{'id': {'loc': [], 'endpos': 23, 'match': ''}, 'loc': [(3, 5)], 'endpos': 23}

In [117]:
D['loc'] = [(3, 5)]
D

{'id': {'loc': [], 'endpos': 0, 'match': ''}, 'loc': [(3, 5)], 'endpos': 23}

In [238]:
from collections import Counter

def remove_duplicates(text):

    # Split text string separated by space.
    # Joins two adjacent elements in iterable way.
    # Uses the Counter method have strings as key and
    # their frequencies as value.
    # Returns a joined adjacent of elements.

    text = text.split(' ')
    for idx in range(0, len(text)):
        text[idx] = ''.join(text[idx])
        print(f'* step (1): {text[idx]}')
        print(f'* step (1.1): {text}')
    unique = Counter(text)
    print(f'* step (3) : Counter(0_SPACE.join(list[text[i,,]])) -> {unique}')
    unique_join = ' '.join(unique.keys())
    print(f'* step (4) : 1_SPACE.str.join(Counter.keys()) -> {unique_join}')

In [239]:
docs = [
    'Python Python Python Python is great and Java is not so great',
    'Python Python Python is awesome Python',
    'Python is a language close to human language.',
    'PythonPythonPython. Python:, yep said it three times',
    'Python Python Python Python Python Python Python Python',
    'Python Python programming! and yeah Python Python.']

for doc in docs: print(remove_duplicates(doc))

* step (1): Python
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): Python
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): Python
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): Python
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): is
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): great
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): and
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great', 'and', 'Java', 'is', 'not', 'so', 'great']
* step (1): Java
* step (1.1): ['Python', 'Python', 'Python', 'Python', 'is', 'great

In [143]:
docs2 = ['Scan through string looking for',
        'Scan looking for the', # TRACK ISSUE: When a string doesnt have the word it duplicates the id
        'through string looking for the',
        'string looking string string string',
        'string looking for the first string']


from itertools import groupby

In [209]:
from typing import Iterable, Dict, List, Sequence

def pattern_search(word: Sequence[str],
                   texts: Iterable[str]) -> Iterable[Dict]:
    '''Word pattern finder from a list of texts.

    * key[`loc`] -> tuple[s]:
        If multiple matches found: group (tuple)s, per index.
    '''
    pattern = re.compile(word)
    temp_list = list()
    for idx, text in enumerate(texts):
        for token in pattern.finditer(text):
            matches = {
                'id': idx,
                'loc': [(loc.span()) for loc in pattern.finditer(text)],
                'endpos': token.endpos,
                'match': token.string[token.start():token.end()]
            }
        temp_list.append(matches)
    return [i[0] for i in groupby(temp_list)]

In [210]:
docs = [
    'Scan through string looking for',
    'through string looking for the',
    'string looking string string string',
    'string looking for the first string',
    # DEBUG: this string doesnt have the word it duplicates the id.
    'Scan looking for the'
]

matches = pattern_search(word='string', texts=docs)
for match in matches:
    print(match)

{'id': 0, 'loc': [(13, 19)], 'endpos': 31, 'match': 'string'}
{'id': 1, 'loc': [(8, 14)], 'endpos': 30, 'match': 'string'}
{'id': 2, 'loc': [(0, 6), (15, 21), (22, 28), (29, 35)], 'endpos': 35, 'match': 'string'}
{'id': 3, 'loc': [(0, 6), (29, 35)], 'endpos': 35, 'match': 'string'}


In [219]:
for match in matches:
    for k, v in match.items():
        print('{} --> {}'.format(k, v))
    print('\n')

id --> 0
loc --> [(13, 19)]
endpos --> 31
match --> string


id --> 1
loc --> [(8, 14)]
endpos --> 30
match --> string


id --> 2
loc --> [(0, 6), (15, 21), (22, 28), (29, 35)]
endpos --> 35
match --> string


id --> 3
loc --> [(0, 6), (29, 35)]
endpos --> 35
match --> string




In [208]:
for match in matches:
    print(match['id'])

0
2
3
4


In [19]:
m = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", "Malcolm Reynolds")
m.group('first_name')

'Malcolm'

In [20]:
from typing import Iterator

def fib(n: int) -> Iterator[int]:
    a, b = 0, 1
    while a < n:
        yield a
        a, b = b, a + b

In [37]:
from typing import Any, get_type_hints, Sequence, Iterable, Generic, KT, VT, Dict

In [30]:
get_type_hints(iter_seach_for)

{'word': str, 'texts': list}

In [39]:
Sequence is str

False

In [40]:
from typing import Iterable, List

def even(numbers: Iterable[int]) -> List[int]:
    return list(n for n in numbers if n % 2 == 0)

In [None]:
even()