# Working with Text

In [5]:
%matplotlib inline

Libraries for I/O

In [6]:
import os
import glob

Libraries for numerics

In [7]:
import numpy as np
import pandas as pd
import scipy.stats as stats

Libraries for plotting

In [8]:
import matplotlib.pyplot as plt
import seaborn as sns

Libraries for string manipulation

In [9]:
import string
import re

Libraries for functional programming

In [10]:
from functools import reduce, partial
import itertools as it
import operator as op
import toolz as tz
import toolz.curried as c

## String methods

In [11]:
s = "  Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n"

### Removing leading and trailing whitespace

In [7]:
s.strip()

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

In [8]:
s.lstrip()

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n'

In [9]:
s.rstrip()

'  Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

In [10]:
s = s.strip()

### Changing case

In [11]:
s.lower()

'avoid taking unnecessary gambles. lucky numbers: 12, 15, 23, 28, 37'

In [12]:
s.upper()

'AVOID TAKING UNNECESSARY GAMBLES. LUCKY NUMBERS: 12, 15, 23, 28, 37'

In [13]:
s.title()

'Avoid Taking Unnecessary Gambles. Lucky Numbers: 12, 15, 23, 28, 37'

### Checking conditions

In [14]:
s.startswith('Avoid')

True

In [15]:
s.endswith('37')

True

In [16]:
s.isalpha()

False

In [17]:
s.isnumeric()

False

In [18]:
s.isspace()

False

In [19]:
s.isprintable()

True

### Counting and indexing

In [20]:
s.count('a')

3

In [21]:
s.count('gambles')

1

In [22]:
s.find('gambles')

25

In [23]:
s[27:]

'mbles. Lucky numbers: 12, 15, 23, 28, 37'

In [24]:
s.find('foobar')

-1

In [25]:
s.index('gambles')

25

In [26]:
# Unlike `find`, `index` raises ValueError when the substring is absent,
# so catch it and show the message instead of letting the cell fail.
try:
    s.index('foobar')
except ValueError as err:
    print(err)

substring not found


### Splitting and joining

In [27]:
s.split()

['Avoid',
 'taking',
 'unnecessary',
 'gambles.',
 'Lucky',
 'numbers:',
 '12,',
 '15,',
 '23,',
 '28,',
 '37']

In [28]:
s.split(':')

['Avoid taking unnecessary gambles. Lucky numbers', ' 12, 15, 23, 28, 37']

In [29]:
'-'.join(s.split())

'Avoid-taking-unnecessary-gambles.-Lucky-numbers:-12,-15,-23,-28,-37'

### Replacing

In [30]:
s.replace('gambles', 'risk')

'Avoid taking unnecessary risk. Lucky numbers: 12, 15, 23, 28, 37'

### Translating

In [31]:
table = str.maketrans(string.ascii_lowercase, string.ascii_uppercase, string.punctuation)
s.translate(table)

'AVOID TAKING UNNECESSARY GAMBLES LUCKY NUMBERS 12 15 23 28 37'

In [32]:
table = str.maketrans('', '', string.punctuation)
s.translate(table)

'Avoid taking unnecessary gambles Lucky numbers 12 15 23 28 37'

In [1]:
ord('a')

97

In [2]:
chr(97)

'a'

**Exercise: Caesar Cipher**

A Caesar cipher with offset $k$ converts each letter into the letter $k$ positions further down the alphabet, wrapping around if this goes past `z`. Non-letter characters (digits, spaces, punctuation) are left intact. For instance, with an offset of 3, `abcXYZ` is encoded as `defABC`. Write a function `encode(k, s)` where `k` is the offset and `s` is the string to be encoded. Then write a `decode(k, s)` function that decodes an encrypted string. Test both on the fortune.

In [33]:
def encode(k, s):
    """Encode `s` with a Caesar cipher of offset `k`.

    ASCII letters are shifted `k` positions along the alphabet, wrapping
    around at the end and preserving case. Every other character (digits,
    whitespace, punctuation, non-ASCII letters) is left unchanged.

    Parameters
    ----------
    k : int
        Offset. May be negative or larger than the alphabet size; it is
        reduced modulo 26 first.
    s : str
        Text to encode.

    Returns
    -------
    str
        The encoded text.
    """
    lower = string.ascii_lowercase
    upper = string.ascii_uppercase
    # Without this reduction, slicing with |k| >= 26 yields an empty string
    # plus the whole alphabet, i.e. silently no shift at all.
    k %= len(lower)
    table = str.maketrans(
        lower + upper,
        lower[k:] + lower[:k] + upper[k:] + upper[:k])
    return s.translate(table)

In [34]:
encode(3, 'abcXYZ')

'defABC'

In [35]:
def decode(k, s):
    """Decode text `s` that was Caesar-encoded with offset `k`.

    Decoding is the same as encoding with the opposite offset.
    """
    reverse_offset = -k
    return encode(reverse_offset, s)

In [36]:
code = encode(3, s)

In [37]:
code

'Dyrlg wdnlqj xqqhfhvvdub jdpeohv. Oxfnb qxpehuv: 12, 15, 23, 28, 37'

In [38]:
decode(3, code)

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

## Counting words

To count words, we typically do the following preprocessing:
    
- Convert to lower (or upper) case
- Remove punctuation
- Split on blank space
- Count each word in list

In [4]:
s

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

### Preprocessing

In [13]:
words = s.lower().translate(str.maketrans('','',string.punctuation)).split()

### Using a Counter (bag)

In [41]:
from collections import Counter

In [42]:
Counter(words)

Counter({'12': 1,
         '15': 1,
         '23': 1,
         '28': 1,
         '37': 1,
         'avoid': 1,
         'gambles': 1,
         'lucky': 1,
         'numbers': 1,
         'taking': 1,
         'unnecessary': 1})

### Using a dictionary

In [43]:
# Tally word occurrences in a plain dict: bump the count for words already
# seen, start at 1 for new ones.
counter = {}
for word in words:
    if word in counter:
        counter[word] += 1
    else:
        counter[word] = 1

In [44]:
counter

{'12': 1,
 '15': 1,
 '23': 1,
 '28': 1,
 '37': 1,
 'avoid': 1,
 'gambles': 1,
 'lucky': 1,
 'numbers': 1,
 'taking': 1,
 'unnecessary': 1}

### Using a `defaultdict`

In [45]:
from collections import defaultdict

In [46]:
d = defaultdict(int)

In [47]:
for word in words:
    d[word] += 1

In [48]:
d

defaultdict(int,
            {'12': 1,
             '15': 1,
             '23': 1,
             '28': 1,
             '37': 1,
             'avoid': 1,
             'gambles': 1,
             'lucky': 1,
             'numbers': 1,
             'taking': 1,
             'unnecessary': 1})

### Using a functional pipe

In [49]:
# Same preprocessing as above, expressed as a left-to-right pipeline:
# lower-case -> drop punctuation -> split on whitespace -> count.
tz.pipe(
    s,
    str.lower,
    lambda text: text.translate(str.maketrans('', '', string.punctuation)),
    str.split,
    tz.frequencies
)

{'12': 1,
 '15': 1,
 '23': 1,
 '28': 1,
 '37': 1,
 'avoid': 1,
 'gambles': 1,
 'lucky': 1,
 'numbers': 1,
 'taking': 1,
 'unnecessary': 1}

### Modification for collection of strings

In [50]:
ss = [s, s, s]

In [51]:
ss

['Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37',
 'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37',
 'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37']

In [52]:
# Pipeline over a collection of strings: each step is mapped over the items,
# and `mapcat` flattens the per-string word lists into one stream to count.
tz.pipe(
    ss,
    c.map(str.lower),
    c.map(lambda text: text.translate(str.maketrans('', '', string.punctuation))),
    c.mapcat(str.split),
    tz.frequencies
)

{'12': 3,
 '15': 3,
 '23': 3,
 '28': 3,
 '37': 3,
 'avoid': 3,
 'gambles': 3,
 'lucky': 3,
 'numbers': 3,
 'taking': 3,
 'unnecessary': 3}

## String to vector

To analyze text, we typically need to convert it to a vector format. There are several ways to do so. Here we show the most straightforward method, known as one-hot encoding.

### One hot character encoding

We first encode the string 'abcabc' as the vector [0,1,2,0,1,2]. For one-hot encoding, we next convert this to the one-hot encoded matrix

```python
array([[1, 0, 0],
       [0, 1, 0],
       [0, 0, 1],
       [1, 0, 0],
       [0, 1, 0],
       [0, 0, 1]])
```

In [15]:
idx = 0
index = {}
for ch in s:
    if not ch in index:
        index[ch] = idx
        idx += 1

In [16]:
index

{'\n': 30,
 ' ': 0,
 ',': 25,
 '.': 20,
 '1': 23,
 '2': 24,
 '3': 27,
 '5': 26,
 '7': 29,
 '8': 28,
 ':': 22,
 'A': 1,
 'L': 21,
 'a': 7,
 'b': 18,
 'c': 13,
 'd': 5,
 'e': 12,
 'g': 10,
 'i': 4,
 'k': 8,
 'l': 19,
 'm': 17,
 'n': 9,
 'o': 3,
 'r': 15,
 's': 14,
 't': 6,
 'u': 11,
 'v': 2,
 'y': 16}

In [14]:
s

'  Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37\n'

#### Categorical encoding

In [55]:
nchars = len(index)

In [56]:
vs = np.array([index[ch] for ch in s])

In [57]:
vs

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  3,  9, 10,  5, 11,  9,  9, 12,
       13, 12, 14, 14,  7, 15, 16,  5, 10,  7, 17, 18, 19, 12, 14, 20,  5,
       21, 11, 13,  8, 16,  5,  9, 11, 17, 18, 12, 15, 14, 22,  5, 23, 24,
       25,  5, 23, 26, 25,  5, 24, 27, 25,  5, 24, 28, 25,  5, 27, 29])

#### One-hot encoding

In [58]:
# One row per character of the encoded string, one column per distinct character.
n, p = len(vs), len(index)
i = np.arange(n)
m = np.zeros((n, p), dtype='int')
# Fancy indexing: set column vs[r] to 1 in every row r.
m[i, vs] = 1
m

array([[1, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       [0, 0, 1, ..., 0, 0, 0],
       ..., 
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 1, 0, 0],
       [0, 0, 0, ..., 0, 0, 1]])

#### Reverse index lookup

In [59]:
# Invert the mapping: integer code -> character.
reverse_index = {code: ch for ch, code in index.items()}

In [60]:
''.join(reverse_index[v] for v in vs)

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

### One hot encoding for words.

In [61]:
words = ' '.join([s,s]).lower().translate(str.maketrans('', '', string.punctuation)).split()

In [62]:
# Map each distinct word to an integer code, numbered in order of first appearance.
index = {}
for word in words:
    # The current size of the mapping is the next unused code;
    # setdefault leaves already-seen words untouched.
    index.setdefault(word, len(index))
pos = len(index)  # next free code, i.e. the number of distinct words

#### Categorical encoding

In [63]:
ws = np.array([index[word] for word in words])

In [64]:
ws

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10,  0,  1,  2,  3,  4,  5,
        6,  7,  8,  9, 10])

#### One-hot encoding

In [65]:
# One row per word of the encoded text, one column per distinct word.
n, p = len(ws), len(index)
i = np.arange(n)
m = np.zeros((n, p), dtype='int')
# Fancy indexing: set column ws[r] to 1 in every row r.
m[i, ws] = 1
m

array([[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1]])

#### Reverse lookup

In [66]:
# Invert the mapping: integer code -> word.
reverse_index = {code: word for word, code in index.items()}

In [67]:
' '.join(reverse_index[w] for w in ws)

'avoid taking unnecessary gambles lucky numbers 12 15 23 28 37 avoid taking unnecessary gambles lucky numbers 12 15 23 28 37'

## Regular expressions

In [68]:
s

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

### Literal match

In [69]:
re.findall(r'gambles', s)

['gambles']

### Wildcard `.` and quantifiers `{m,n}`, `+`, `*`

In [70]:
# Regex cheat sheet:
#   .         any single character except a newline
#   +         one or more of the preceding item (* is zero or more)
#   [abc]e    'ae', 'be' or 'ce'
#   [a-z]*e   zero or more lowercase letters followed by 'e' (likewise [0-9]*e for digits)
#   [0-9]+\.  one or more digits followed by a literal '.'; quantifiers are greedy by default
#   [^abc]    any character other than 'a', 'b' or 'c' (a leading ^ inside brackets negates the set)
re.findall(r'gam.les', s)

['gambles']

In [71]:
re.findall(r'g.*s', s)

['g unnecessary gambles. Lucky numbers']

### Non-greedy quantifier.

In [72]:
re.findall(r'g.*?s', s)

['g unneces', 'gambles']

### Special characters

In [73]:
re.findall(r'\bg.*?s\b', s)

['gambles']

In [74]:
re.findall(r'\b\w+?\b', s)

['Avoid',
 'taking',
 'unnecessary',
 'gambles',
 'Lucky',
 'numbers',
 '12',
 '15',
 '23',
 '28',
 '37']

In [75]:
re.findall(r'\b\d+?\b', s)

['12', '15', '23', '28', '37']

In [76]:
re.findall(r'\b[a-zA-Z]+?\b', s)

['Avoid', 'taking', 'unnecessary', 'gambles', 'Lucky', 'numbers']

### Begin and end anchors

In [77]:
re.findall(r'\w+', s)

['Avoid',
 'taking',
 'unnecessary',
 'gambles',
 'Lucky',
 'numbers',
 '12',
 '15',
 '23',
 '28',
 '37']

In [78]:
re.findall(r'^\w+', s)   #begin

['Avoid']

In [79]:
re.findall(r'\w+$', s)   #end

['37']

### Capture groups

In [80]:
pat = r'\b(\d)(\d)?\b'

In [81]:
re.findall(pat, s)

[('1', '2'), ('1', '5'), ('2', '3'), ('2', '8'), ('3', '7')]

### Using search and match objects

In [82]:
re.search(pat, s)

<_sre.SRE_Match object; span=(49, 51), match='12'>

In [83]:
m = re.search(pat, s)

In [84]:
m.string

'Avoid taking unnecessary gambles. Lucky numbers: 12, 15, 23, 28, 37'

In [85]:
m.group()

'12'

In [86]:
m.groups()

('1', '2')

### Replacement using capture groups

In [87]:
rep = r'\2\1'
re.sub(pat, rep, s)

'Avoid taking unnecessary gambles. Lucky numbers: 21, 51, 32, 82, 73'

### Using compiled patterns

In [88]:
pat = re.compile(r'\b[a-zA-Z]+?\b')
pat.findall(s)

['Avoid', 'taking', 'unnecessary', 'gambles', 'Lucky', 'numbers']

In [None]:
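# The r'' prefix marks a raw string: backslashes are kept literally instead of starting an escape sequence.
# So the pattern r'\b\d' is the same string as
# '\\b\\d'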