# 6 String Manipulation

In [1]:
import pandas as pd

## Vectorized String Method

### Uppercase

In [2]:
s = pd.Series(['Apple', 'Banana', 'Orange'])
print(s)
print(s.str.upper())

0     Apple
1    Banana
2    Orange
dtype: object
0     APPLE
1    BANANA
2    ORANGE
dtype: object


### Lowercase

In [3]:
s = pd.Series(['Apple', 'Banana', 'Orange'])
print(s)
print(s.str.lower())

0     Apple
1    Banana
2    Orange
dtype: object
0     apple
1    banana
2    orange
dtype: object


### String Length

In [4]:
s = pd.Series(['Apple', 'Banana', 'Orange'])
print(s)
print(s.str.len())

0     Apple
1    Banana
2    Orange
dtype: object
0    5
1    6
2    6
dtype: int64


### Trimming Whitespace

In [10]:
s = pd.Index(['none', '   left', 'right   ', '   both   '])
print(s)
# strip trims both sides
print(s.str.strip())
# lstrip trims left side
print(s.str.lstrip())
# rstrip trims right side
print(s.str.rstrip())

Index(['none', '   left', 'right   ', '   both   '], dtype='object')
Index(['none', 'left', 'right', 'both'], dtype='object')
Index(['none', 'left', 'right   ', 'both   '], dtype='object')
Index(['none', '   left', 'right', '   both'], dtype='object')


### Index of Found Substring

In [16]:
# find
# Returns lowest indexes in each strings
# Returns -1 on failure
s = pd.Series(['Apple', 'Banana', 'Orange'])
print(s)
print(s.str.find('a'))

0     Apple
1    Banana
2    Orange
dtype: object
0   -1
1    1
2    2
dtype: int64


In [17]:
# index
# Returns lowest indexes in each strings
# Raises ValueError on failure
s = pd.Series(['Apple', 'Banana', 'Orange'])
print(s)
# print(s.str.index('a')) #Uncomment this line will raise ValueError

0     Apple
1    Banana
2    Orange
dtype: object


### Replacing

In [23]:
s = pd.Series(['Apple', 'Banana', 'Orange'])
print(s)
print(s.str.replace('a', '@'))

0     Apple
1    Banana
2    Orange
dtype: object
0     Apple
1    B@n@n@
2    Or@nge
dtype: object


### Splitting 

In [18]:
s = pd.Series(['a_b_c', 'd_e'])
print(s)
print(s.str.split('_'))

0    a_b_c
1      d_e
dtype: object
0    [a, b, c]
1       [d, e]
dtype: object


In [19]:
s = pd.Series(['a_b_c', 'd_e'])
print(s)
print(s.str.split('_', expand=True)) # Expanding into DataFrame

0    a_b_c
1      d_e
dtype: object
   0  1     2
0  a  b     c
1  d  e  None


### Joining

In [22]:
s = pd.Series([['a', 'b', 'c'], ['d', 'e']])
print(s)
print(s.str.join('_'))

0    [a, b, c]
1       [d, e]
dtype: object
0    a_b_c
1      d_e
dtype: object


## Vectorized String Method with Regular Expressions

### Counting Matched Pattern

In [24]:
pattern = r'[a-z][0-9]'
s = pd.Series(['1', 'a2', '3b4', 'c5d6'])
print(s)
print(s.str.count(pattern))

0       1
1      a2
2     3b4
3    c5d6
dtype: object
0    0
1    1
2    1
3    2
dtype: int64


### Finding Pattern

In [27]:
# Contains relies on re.search, and matches any part of the string
pattern = r'[a-z][0-9]'
s = pd.Series(['1', 'a2', '3b4', 'c5d6'])
print(s)
print(s.str.contains(pattern))

0       1
1      a2
2     3b4
3    c5d6
dtype: object
0    False
1     True
2     True
3     True
dtype: bool


In [28]:
# Match relies on re.match, and matches only the beginning of the string
pattern = r'[a-z][0-9]'
s = pd.Series(['1', 'a2', '3b4', 'c5d6'])
print(s)
print(s.str.match(pattern))

0       1
1      a2
2     3b4
3    c5d6
dtype: object
0    False
1     True
2    False
3     True
dtype: bool


In [29]:
# Finding all occurrences of pattern
pattern = r'[a-z][0-9]'
s = pd.Series(['1', 'a2', '3b4', 'c5d6'])
print(s)
print(s.str.findall(pattern))

0       1
1      a2
2     3b4
3    c5d6
dtype: object
0          []
1        [a2]
2        [b4]
3    [c5, d6]
dtype: object


### Extracting Matched Pattern

In [39]:
# Extract groups from the first matched patterns
pattern='(?P<col_0>[a-z])(?P<col_1>[0-9])'
s = pd.Series(['1', 'a2', '3b4', 'c5d6'])
print(s)
print(s.str.extract(pattern, expand=True))

0       1
1      a2
2     3b4
3    c5d6
dtype: object
  col_0 col_1
0   NaN   NaN
1     a     2
2     b     4
3     c     5


In [37]:
# Extract groups from all matched patterns
pattern='(?P<col_0>[a-z])(?P<col_1>[0-9])'
s = pd.Series(['1', 'a2', '3b4', 'c5d6'])
print(s)
print(s.str.extractall(pattern))

0       1
1      a2
2     3b4
3    c5d6
dtype: object
        col_0 col_1
  match            
1 0         a     2
2 0         b     4
3 0         c     5
  1         d     6
