# Introducing Pandas String Operations

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns

In [3]:
# Multiplying a NumPy array by a scalar is applied to every element at once.
x = np.array([2, 3, 5, 7, 11, 13])
2 * x

array([ 4,  6, 10, 14, 22, 26])

In [4]:
# Plain Python has to visit the strings one by one to capitalize them.
data = ['peter', 'Paul', 'MARY', 'gUIDO']
list(map(str.capitalize, data))

['Peter', 'Paul', 'Mary', 'Guido']

In [8]:
# The same names as in the previous cell, with a None inserted to stand in for
# a missing value, wrapped in a Series so the .str accessor can be used on it.
data = ['peter', 'Paul', None, 'MARY', 'gUIDO']
names = pd.Series(data)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [9]:
# The .str accessor capitalizes every string in one call and passes the
# missing entry (index 2) through unchanged.
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

---

# Tables of Pandas String Methods

In [10]:
# Example Series of full names used by the cells that follow.
monte = pd.Series([
    'Graham Chapman',
    'John Cleese',
    'Terry Gilliam',
    'Eric Idle',
    'Terry Jones',
    'Michael Palin',
])

---Methods similar to Python string methods---

In [11]:
# Lower-case every name; the result is a Series of strings.
monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [13]:
# capitalize() upper-cases only the first character of each whole string and
# lower-cases the rest, so the surnames come out in lowercase.
monte.str.capitalize()

0    Graham chapman
1       John cleese
2     Terry gilliam
3         Eric idle
4       Terry jones
5     Michael palin
dtype: object

In [14]:
# Number of characters in each name (the space counts); yields integers.
monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

In [15]:
# Boolean mask: True where the name begins with an uppercase 'T'.
monte.str.startswith('T')

0    False
1    False
2     True
3    False
4     True
5    False
dtype: bool

In [16]:
# With no separator given, each name is split on whitespace, producing a
# Series whose elements are lists of words.
monte.str.split()

0    [Graham, Chapman]
1       [John, Cleese]
2     [Terry, Gilliam]
3         [Eric, Idle]
4       [Terry, Jones]
5     [Michael, Palin]
dtype: object

---Methods using regular expressions---

In [20]:
# Pull the first run of ASCII letters (the capture group) out of each element;
# for these names that is the first name. The pattern is written as a raw
# string, the usual convention for regular expressions, so that a backslash
# added later is not interpreted as a Python string escape.
monte.str.extract(r'([A-Za-z]+)')

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


In [19]:
# Return, for each element, the list of matches of the pattern. Because the
# pattern is anchored with ^ and $, an element yields either its whole string
# or an empty list: it matches only if it starts with a character other than
# an uppercase vowel and ends with a character other than a lowercase vowel.
monte.str.findall(r'^[^AEIOU].*[^aeiou]$')

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

---Miscellaneous methods and vectorized item access and slicing---

In [22]:
# Slice syntax on the .str accessor is applied to each element: this keeps
# the first three characters of every name.
monte.str[0:3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [27]:
# Method form of per-element slicing: characters 0 up to (not including) 3.
monte.str.slice(start=0, stop=3)

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [29]:
# With only a start position, each name is kept from character 5 to its end.
monte.str.slice(start=5)

0    m Chapman
1       Cleese
2      Gilliam
3         Idle
4        Jones
5     el Palin
dtype: object

In [30]:
# Split each name into words, then take the last word of each list
# (the surname for these names).
monte.str.split().str.get(-1)

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

In [31]:
# Split each name into words, then take the first word of each list
# (the first name for these names).
monte.str.split().str.get(0)

0     Graham
1       John
2      Terry
3       Eric
4      Terry
5    Michael
dtype: object

---Indicator variables---

In [33]:
# Pair each name with a '|'-delimited string of single-letter codes.
full_monte = pd.DataFrame({
    'name': monte,
    'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D'],
})
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [39]:
# Expand the '|'-delimited codes in the info column into one 0/1 indicator
# column per distinct code.
dummies = full_monte['info'].str.get_dummies(sep='|')
dummies

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


The `get_dummies()` method lets us quickly split these indicator variables out into a DataFrame.

In [47]:
# Place the indicator columns side by side with the original columns;
# rows are aligned on the shared index.
pd.concat([dummies, full_monte], axis='columns')

Unnamed: 0,A,B,C,D,name,info
0,0,1,1,1,Graham Chapman,B|C|D
1,0,1,0,1,John Cleese,B|D
2,1,0,1,0,Terry Gilliam,A|C
3,0,1,0,1,Eric Idle,B|D
4,0,1,1,0,Terry Jones,B|C
5,0,1,1,1,Michael Palin,B|C|D
