In [1]:
import pandas as pd
import numpy as np

### 3.11.1 Pandas 字符串操作简介

In [5]:
# Plain Python: capitalize every name with a list comprehension.
data = ['peter', 'Paul', 'MARY', 'gUIDO']

[name.capitalize() for name in data]

['Peter', 'Paul', 'Mary', 'Guido']

In [7]:
# Expected to fail: None has no .capitalize(), so the plain-Python
# comprehension cannot cope with a missing value.

data2 = ['peter', 'Paul', None, 'MARY', 'gUIDO']

# Iterate over data2 (the list that contains None), not data. The error is
# caught and printed so the cell demonstrates the failure without halting
# a full top-to-bottom run.
try:
    [s.capitalize() for s in data2]
except AttributeError as err:
    print(f"AttributeError: {err}")

AttributeError: 'NoneType' object has no attribute 'capitalize'

In [8]:
# Wrap the list (including its None entry) in a Series, then display it.
names = pd.Series(data2)
names

0    peter
1     Paul
2     None
3     MARY
4    gUIDO
dtype: object

In [10]:
# Vectorized capitalize through the .str accessor; the missing entry stays None.
names.str.capitalize()

0    Peter
1     Paul
2     None
3     Mary
4    Guido
dtype: object

### 3.11.2 Pandas 字符串方法列表

#### 1. 与 Python 字符串方法相似的方法

In [11]:
# Six sample full names held in a Series of strings.
monte = pd.Series([
    'Graham Chapman', 'John Cleese', 'Terry Gilliam',
    'Eric Idle', 'Terry Jones', 'Michael Palin',
])

In [13]:
# Returns a Series of strings: every name converted to lowercase.

monte.str.lower()

0    graham chapman
1       john cleese
2     terry gilliam
3         eric idle
4       terry jones
5     michael palin
dtype: object

In [14]:
# Returns numeric values: the length of each name.

monte.str.len()

0    14
1    11
2    13
3     9
4    11
5    13
dtype: int64

#### 2. 使用正则表达式的方法

提取元素前面的连续字母作为每个人的名字

In [15]:
# The capture group grabs the leading run of consecutive ASCII letters in each element.
monte.str.extract('([A-Za-z]+)')

Unnamed: 0,0
0,Graham
1,John
2,Terry
3,Eric
4,Terry
5,Michael


找出所有开头和结尾都是辅音字母的名字

In [17]:
# ^ and $ anchor the whole string: the first character must not be an uppercase
# vowel and the last character must not be a lowercase vowel.
monte.str.findall(r"^[^AEIOU].*[^aeiou]$")

0    [Graham Chapman]
1                  []
2     [Terry Gilliam]
3                  []
4       [Terry Jones]
5     [Michael Palin]
dtype: object

#### 3. 其他字符串方法

In [20]:
# (1) Vectorized item access and slicing: the first three characters of each name

monte.str[0: 3]

0    Gra
1    Joh
2    Ter
3    Eri
4    Ter
5    Mic
dtype: object

In [21]:
# Get each person's last name: split on whitespace, then take the final piece.

monte.str.split().str[-1]

0    Chapman
1     Cleese
2    Gilliam
3       Idle
4      Jones
5      Palin
dtype: object

(2) 指标变量

In [22]:
# Pair each name with a '|'-delimited string of codes in an 'info' column, then display the frame.
full_monte = pd.DataFrame({'name': monte,
                          'info': ['B|C|D', 'B|D', 'A|C', 'B|D', 'B|C', 'B|C|D']})
full_monte

Unnamed: 0,name,info
0,Graham Chapman,B|C|D
1,John Cleese,B|D
2,Terry Gilliam,A|C
3,Eric Idle,B|D
4,Terry Jones,B|C
5,Michael Palin,B|C|D


In [23]:
# Split the 'info' codes on '|' and expand them into one 0/1 indicator column per code.
full_monte['info'].str.get_dummies(sep="|")

Unnamed: 0,A,B,C,D
0,0,1,1,1
1,0,1,0,1
2,1,0,1,0
3,0,1,0,1
4,0,1,1,0
5,0,1,1,1


### 3.11.3 案例：食谱数据库

In [None]:
# NOTE(review): absolute, machine-specific path; consider a configurable data directory.
path = r"E:\Python\Python for Data Analysis\datasets\usda_food\database.json"

# The file is JSON, so parse it with read_json; read_csv would treat the
# JSON text as comma-separated rows.
try:
    recipes = pd.read_json(path)
except ValueError as e:
    # Report a parse failure instead of raising; `recipes` stays undefined in that case.
    print("Value Error: ", e)

In [None]:
with open("")