# <center>第7章 数据清洗和准备</center>



## 7.1 处理缺失数据

>fillna(value=None, method=None, axis=None, inplace=False),value可以是数字，字典，Series或者DataFrame

>dropna(axis=0, how='any', thresh=None, subset=None, inplace=False),how只能取'any'或'all'其中之一(且不能与thresh同时指定),axis=0表示丢弃存在缺失值的行。

In [None]:
import pandas as pd 
import numpy as np

In [None]:
# Detect missing values in a Series: isnull() returns a boolean mask.
data = pd.Series(['aardvark', 'artichoke', np.nan, 'avocado'])
data.isnull()

# Build a DataFrame with gaps: the two inner dicts cover different index
# labels, so cells without a value come out as NaN.
data = pd.DataFrame({'language': {2001: 'Java', 2003: np.nan, 2004: 'vt'},
                     'hello': {2001: 'bb', 2002: 'djsl', 2003: 'ssss'}})

# `how` accepts exactly one string: 'any' drops every row/column that
# contains a missing value, 'all' drops only those that are entirely missing.
# (`thresh` is left out on purpose: recent pandas rejects `how` and `thresh`
# being passed together.)
data.dropna(axis=1, how='any')
data.dropna(axis=1, how='all')

# Fill gaps by propagating neighbouring values forward / backward.
data.ffill()
data.bfill()

# A dict fills column by column (keys are column names), so no axis is given.
data.fillna({'language': 'unknown', 'hello': 'missing'})

# Fill every remaining gap with a constant, modifying `data` in place.
data.fillna(value='a', inplace=True)

## 7.2 数据转换

### <font color="#00dddd">1.重复数据</font>

>pd.DataFrame.duplicated(subset=None,keep='first'),检查各行是否存在重复值;keep可取'first'、'last'或False(布尔值,不是字符串)
DataFrame.drop_duplicates(subset=None,keep='first',inplace=False),去除重复项，默认比较所有列(subset=None)

In [2]:
import pandas as pd 
import numpy as np

In [5]:
# Seven rows; only the last one repeats an earlier (k1, k2) combination.
data = pd.DataFrame({
    'k1': ['one', 'two', 'one', 'two', 'one', 'two', 'two'],
    'k2': [1, 1, 2, 3, 3, 4, 4],
})

# Boolean Series flagging rows that repeat an earlier row.
data.duplicated()

# Restrict the comparison to a subset of columns.
data.duplicated(subset=['k1'])

# Drop repeated rows; with no subset given, all columns are compared.
data.drop_duplicates()
data.drop_duplicates(subset=['k1', 'k2'])

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

0    False
1    False
2     True
3     True
4     True
5     True
6     True
dtype: bool

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


### <font color='#00dddd'>2.利用函数或者映射进行数据转换</font>

>map是Series才有的方法，作用于Series的每一个元素，传入的转换方法可以是函数、字典或者Series。applymap则是DataFrame作用于每一个元素的方法

In [3]:
# Raw food names (mixed capitalisation) with their weights in ounces.
data = pd.DataFrame({
    'food': [
        'bacon', 'pulled pork', 'bacon', 'Pastrami', 'corned beef',
        'Bacon', 'pastrami', 'honey ham', 'nova lox',
    ],
    'ounces': [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
})

# Lookup table food -> source animal, built from an animal -> foods grouping.
meet_to_animal = {
    food: animal
    for animal, foods in {
        'pig': ['bacon', 'pulled pork', 'honey ham'],
        'cow': ['pastrami', 'corned beef'],
        'salmon': ['nova lox'],
    }.items()
    for food in foods
}

# Normalise the case first so every food matches a key of the lookup table.
data['food'] = data['food'].str.lower()
print(data)

# Series.map accepts a dict (a Series works much the same way).
data['type'] = data['food'].map(meet_to_animal)

# A single function can do the normalisation and the lookup in one pass.
data['food'].map(lambda food: meet_to_animal[food.lower()])

          food  ounces
0        bacon     4.0
1  pulled pork     3.0
2        bacon    12.0
3     pastrami     6.0
4  corned beef     7.5
5        bacon     8.0
6     pastrami     3.0
7    honey ham     5.0
8     nova lox     6.0


0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object


### <font color='#00dddd'>3.替换值-'replace'</font>

>DataFrame.replace(to_replace,value,method,inplace=False),method是在没有传入value时的填充方法,有'ffill','bfill',None等

In [None]:
import pandas as pd 
import numpy as np

In [1]:
# Sample readings containing several sentinel values.
data = pd.Series([99, -1, 3, 4, -999, -1000])
print(data)

# Replace a single value.
data.replace(to_replace=-999, value=np.nan)

# Replace several values with the same substitute.
data.replace(to_replace=[99, -999], value=np.nan)

# Give each value its own substitute by passing a parallel list.
data.replace(to_replace=[99, -999], value=[np.nan, 0])

# A dict maps each old value to its new value; a new Series is returned and
# `data` itself is left untouched.
dic = {99: 4, -1: 33, -1000: np.nan}
data.replace(dic)

# Note: Series/DataFrame.replace is not the same as Series.str.replace; the
# latter performs element-wise substring replacement on strings.

0      99
1      -1
2       3
3       4
4    -999
5   -1000
dtype: int64


0      4.0
1     33.0
2      3.0
3      4.0
4   -999.0
5      NaN
dtype: float64


### <font color='#00dddd'>4.重命名轴索引</font>

>重命名轴索引,轴标签也可以通过函数映射进行转换，从而得到一个新的标签

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 3x4 frame holding 0..11, labelled by row and column names.
data = pd.DataFrame(
    np.arange(12).reshape(3, 4),
    index=['ohio', 'colorabo', 'new york'],
    columns=['one', 'two', 'three', 'four'],
)


def transform(x):
    """Return the first four characters of *x* in title case."""
    return x[:4].title()


# Like a Series, an axis Index has a map() method; assigning the result back
# changes the row labels in place.
data.index = data.index.map(transform)

# The same mapping works on the columns (not assigned here, so `data` keeps
# its original column labels).
data.columns.map(transform)

# rename() produces a transformed copy instead of modifying the original.
# It accepts functions such as str.upper / str.title, or a dict of
# old -> new labels.
data.rename(index=str.title, columns=str.upper)


### <font color='#00dddd'>5.离散化和面元划分</font>

>pd.cut是按数值分组,而pd.qcut是按分位数分组

pd.cut(x,bins,right=True,labels=None)

In [None]:
import pandas as pd 
import numpy as np

In [2]:
# Nine random ages in 1..76 (randint excludes the upper bound).
age = np.random.randint(1, 77, 9)
bins = [1, 33, 55, 77]

# include_lowest=True closes the first interval on the left; without it an
# age of exactly 1 falls outside (1, 33] and is binned as NaN (code -1).
cat = pd.cut(age, bins=bins, right=True, include_lowest=True)
cat.codes       # integer bin number of each age
cat.categories  # the bin intervals

# Count per bin. The Categorical's own method is used because the top-level
# pd.value_counts helper is deprecated in recent pandas.
cat.value_counts()

# Custom bin names: an integer `bins` asks for that many equal-width bins.
group = ['high', 'median', 'low', 'none']
b = pd.cut(np.random.randn(20), 4, labels=group)
b.codes
b.categories
b.value_counts()

# qcut bins by sample quantiles instead of by value; q=4 gives quartiles.
data = np.random.randn(1000)
cats = pd.qcut(data, q=4)

# Custom quantile edges (fractions between 0 and 1, inclusive).
interval = [0, 0.2, 0.4, 0.8, 1]
cats = pd.qcut(data, interval)

### <font color='#00dddd'>6.检测和过滤异常值</font>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# 1000 x 4 frame of standard-normal samples.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

# Values in one column whose absolute value exceeds 3.
col = data[2]
col[np.abs(col) > 3]

# Element-wise mask of outliers, reused for both selections below.
outliers = np.abs(data) > 3

# Rows containing at least one outlier. `axis` is passed by keyword because
# recent pandas no longer accepts it positionally.
data[outliers.any(axis=1)]

# Columns containing at least one outlier. The mask is indexed by column
# label, so it must go through .loc on the column axis; data[mask] would
# treat it as a row filter.
data.loc[:, outliers.any(axis=0)]

### <font color='#00dddd'>7.排列和随机取样</font>

>DataFrame.sample(n=None,axis=0,replace=False,weights=None)

In [None]:
import pandas as pd
import numpy as np
# 5x4 frame holding 0..19.
df = pd.DataFrame(np.arange(20).reshape(5, 4))

# A random ordering of the five row positions.
sample = np.random.permutation(5)
sample

# Reorder the rows by position; take() and iloc[] are equivalent here.
# iloc is an indexer, so it is subscripted with [] rather than called.
df.take(sample)
df.iloc[sample]

# Draw 3 rows at random, without replacement.
df.sample(3)

# Sampling with replacement allows more draws than there are rows.
df.sample(6, replace=True)


### <font color='#00dddd'>8.计算哑变量</font>

>pd.get_dummies(data,prefix=None)

In [None]:
import pandas as pd
import numpy as np

>可以与面元划分相结合

In [6]:
# Six rows with a categorical `key` column and a running counter.
df = pd.DataFrame({
    'key': list('abcdbc'),
    'data1': np.arange(6),
})

# One indicator column per distinct key.
pd.get_dummies(df['key'])

# Prefix the indicator column names.
pd.get_dummies(df['key'], prefix='keys')

Unnamed: 0,a,b,c,d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,0,1,0,0
5,0,0,1,0


Unnamed: 0,keys_a,keys_b,keys_c,keys_d
0,1,0,0,0
1,0,1,0,0
2,0,0,1,0
3,0,0,0,1
4,0,1,0,0
5,0,0,1,0


[2 1 1 4 2 0 2 4 1 3]
   (-0.004, 1.0]  (1.0, 2.0]  (2.0, 3.0]  (3.0, 4.0]
0              0           1           0           0
1              1           0           0           0
2              1           0           0           0
3              0           0           0           1
4              0           1           0           0
5              1           0           0           0
6              0           1           0           0
7              0           0           0           1
8              1           0           0           0
9              0           0           1           0



## 7.3 字符串操作

- Python内置的字符串方法

In [None]:
# Sample inputs: one string and a list of four short strings.
a = "jfdlsjfs"
b = "jfdl djfd avdd ffff".split()

len(a)
a.split("f")

# join() puts the separator between the items of any iterable of strings;
# a string iterates character by character.
".".join(a)
"/".join(b)

# Locating a substring:
a.index("f")  # raises ValueError when the substring is absent
a.find("]")   # returns -1 when the substring is absent

### <font color='#00dddd'>1.正则表达式</font>

>分为三个模块--模式匹配、替换以及拆分

In [None]:
import re

# Split on runs of whitespace. The pattern is a raw string so that the
# backslash in \s reaches the regex engine instead of being read as a
# (invalid) string escape.
text = "foo    bar\t baz  \tqux"
re.split(r'\s+', text)

# Compile the pattern first. When the same expression is applied to many
# strings, compiling it once with re.compile is recommended to save CPU time.
regex = re.compile(r'\s+')
regex.split(text)

# findall, search, match
text = '''Dave dave@google.com
Steve steve@gmail.com
Rob rob@gmail.com
Ryan ryan@yahoo.com'''
pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z]{2,4}'
regex = re.compile(pattern, flags=re.I)
regex.findall(text)

# search returns only the first match, as a match object.
a = regex.search(text)
# start and end are methods; they must be called to get the offsets.
a.start()
a.end()

# sub returns a new string with every match replaced (text is not modified).
regex.sub('REDACTED', text)

# Parentheses split each match into groups.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'
regex1 = re.compile(pattern, flags=re.I)
regex1.findall(text)

# \1, \2, \3 in the replacement refer to the captured groups.
regex1.sub(r'Username: \1, Domain: \2, Suffix: \3', text)


### <font color='#00dddd'>2.pandas的矢量化函数</font>

>进行字符串的规整工作。Series.map在存在缺失值时会报错

In [5]:
import pandas as pd
import numpy as np
import re

In [6]:
# E-mail addresses keyed by name; one entry is missing.
data = pd.Series({
    'Dave': 'dave@google.com',
    'Steve': 'steve@gmail.com',
    'Rob': 'rob@gmail.com',
    'Wes': np.nan,
})
data.isnull()

# Three capture groups: user name, domain, suffix.
pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\.([A-Z]{2,4})'

# The .str accessor applies the regex element-wise; the missing entry is
# returned as NaN rather than raising.
data.str.findall(pattern, flags=re.I)

Dave     False
Steve    False
Rob      False
Wes       True
dtype: bool

Dave     [(dave, google, com)]
Steve    [(steve, gmail, com)]
Rob        [(rob, gmail, com)]
Wes                        NaN
dtype: object

**下表介绍了更多的Pandas字符串方法**

![图片](https://upload-images.jianshu.io/upload_images/7178691-a634364ed6d5d5c5.png?imageMogr2/auto-orient/strip|imageView2/2/format/webp)