In [2]:
import numpy as np
import pandas as pd

from pandas import Series, DataFrame

In [8]:
df = DataFrame(np.random.randint(0, 150, (6, 3)), 
               index=pd.MultiIndex.from_product([['C', 'B', 'A'], ['期中', '期末']]),
               columns=['Python', 'Math', 'English'])
df

Unnamed: 0,Unnamed: 1,Python,Math,English
C,期中,79,20,31
C,期末,32,99,143
B,期中,19,148,89
B,期末,15,44,28
A,期中,52,113,91
A,期末,137,95,104


In [14]:
df.iloc[0: 5, 1:2]

Unnamed: 0,Unnamed: 1,Math
C,期中,20
C,期末,99
B,期中,148
B,期末,44
A,期中,113


In [21]:
# 正序切片不会报错
df.sort_index(axis=0, level=0).loc['A': 'B']

Unnamed: 0,Unnamed: 1,Python,Math,English
A,期中,52,113,91
A,期末,137,95,104
B,期中,19,148,89
B,期末,15,44,28


In [22]:
# 乱序的index切片会报错
df.loc['B': 'A']

UnsortedIndexError: 'Key length (1) was greater than MultiIndex lexsort depth (0)'

## 删除重复元素

In [24]:
df1 = DataFrame({'color': ['red', 'black', 'red', 'yellow', 'red']})
df1

Unnamed: 0,color
0,red
1,black
2,red
3,yellow
4,red


In [28]:
# 查看重复元素
df1.duplicated().sum()

2

In [32]:
# 删除重复数据
df1.drop_duplicates(inplace=True)
df1

Unnamed: 0,color
0,red
1,black
3,yellow


## 映射

### 替换

In [34]:
df = DataFrame(np.random.randint(0, 150, (6, 3)), index=list('abcdef'), 
               columns=['Python', 'Math', 'English'])
df

Unnamed: 0,Python,Math,English
a,90,129,46
b,129,144,65
c,11,81,3
d,40,37,56
e,94,60,65
f,82,102,1


In [36]:
mp = {65: -100, 129: -200}

# 全部替换
df.replace(mp)

Unnamed: 0,Python,Math,English
a,90,-200,46
b,-200,144,-100
c,11,81,3
d,40,37,56
e,94,60,-100
f,82,102,1


### 新建一列

In [38]:
# 增加一列Java， 其值比Python列少10
df['Java'] = df['Python'].map(lambda x: x - 10)
df

Unnamed: 0,Python,Math,English,Java
a,90,129,46,80
b,129,144,65,119
c,11,81,3,1
d,40,37,56,30
e,94,60,65,84
f,82,102,1,72


In [40]:
# 定义筛选条件
def convert(value):
    if value < 60:
        return '不及格'
    if (value >= 60) & (value < 100):
        return '一般'
    if (value >= 100) & (value < 130):
        return '良好'
    if (value >= 130) & (value < 150):
        return '优秀'

In [42]:
# 根据条件添加数据
df['level'] = df['English'].map(convert)
df

Unnamed: 0,Python,Math,English,Java,level
a,90,129,46,80,不及格
b,129,144,65,119,一般
c,11,81,3,1,不及格
d,40,37,56,30,不及格
e,94,60,65,84,一般
f,82,102,1,72,不及格


In [44]:
# transform 和 map 类似
df['level2'] = df['Math'].transform(convert)
df

Unnamed: 0,Python,Math,English,Java,level,level2
a,90,129,46,80,不及格,良好
b,129,144,65,119,一般,优秀
c,11,81,3,1,不及格,一般
d,40,37,56,30,不及格,不及格
e,94,60,65,84,一般,一般
f,82,102,1,72,不及格,良好


### 替换索引

In [48]:
df.rename({'level': '英语等级'}, axis=1, inplace=True)
df

Unnamed: 0,Python,Math,English,Java,英语等级,level2
a,90,129,46,80,不及格,良好
b,129,144,65,119,一般,优秀
c,11,81,3,1,不及格,一般
d,40,37,56,30,不及格,不及格
e,94,60,65,84,一般,一般
f,82,102,1,72,不及格,良好


In [49]:
# 数据和字典进行比较，有则替换，如果没有也不会报错
df.rename({'a': 'A', 'b': 'B', 'g': 'G'})

Unnamed: 0,Python,Math,English,Java,英语等级,level2
A,90,129,46,80,不及格,良好
B,129,144,65,119,一般,优秀
c,11,81,3,1,不及格,一般
d,40,37,56,30,不及格,不及格
e,94,60,65,84,一般,一般
f,82,102,1,72,不及格,良好


## 异常值检测过滤

In [102]:
df.min(axis=1)

a    46
b    65
c     1
d    30
e    60
f     1
dtype: int64

In [57]:
df1 = DataFrame(np.random.randint(0, 150, [6, 3]),
                index=pd.MultiIndex.from_product([['a', 'b', 'c'], ['期中', '期末']]),
                columns=['Python', 'Math', 'English']
               )
df1

Unnamed: 0,Unnamed: 1,Python,Math,English
a,期中,101,45,95
a,期末,26,26,96
b,期中,61,108,14
b,期末,125,70,82
c,期中,86,102,17
c,期末,109,110,132


In [59]:
df1.min(axis=0, level=0)

Unnamed: 0,Python,Math,English
a,26,26,95
b,61,70,14
c,86,102,17


In [111]:
# 求解方差, std, 求解平均值mean
# 异常值标准：
# df - mean > 3*std
df2 = DataFrame(np.random.randn(100000, 3), columns=['Python', 'Java', 'PHP'])
df2.head()

Unnamed: 0,Python,Java,PHP
0,0.54614,0.466815,0.029392
1,0.183451,0.839903,-0.399041
2,0.206188,0.449464,-1.318918
3,0.68615,-0.600173,-0.895407
4,0.714139,0.484717,1.931289


In [112]:
v_mean = df2.mean()
v_mean

Python   -0.001819
Java      0.003528
PHP      -0.001258
dtype: float64

In [113]:
v_std = df2.std()
v_std

Python    1.002261
Java      1.002765
PHP       0.999431
dtype: float64

In [114]:
cond = (df2 - v_mean) > v_std * 3
cond.sum()

Python    140
Java      118
PHP       123
dtype: int64

In [115]:
cond = cond.any(axis=1)

In [116]:
df2[cond]

Unnamed: 0,Python,Java,PHP
496,-0.353445,-0.878357,3.208070
515,0.663584,-0.080987,3.319823
775,3.156177,-1.611657,-0.971565
1525,0.217295,3.471857,0.099466
1674,-1.474350,3.051869,1.606687
1806,3.250636,-0.793959,-0.352930
2145,-0.525991,0.286310,3.125183
2321,-0.092307,3.169947,-0.877134
2388,-0.687854,3.122491,0.845703
2571,3.031008,0.004589,1.281277


In [120]:
# 选取不满足条件的索引
index = df2[cond].index
# 删除不满足条件的数据
df2.drop(labels=index, inplace=True)
df2

Unnamed: 0,Python,Java,PHP
0,0.546140,0.466815,0.029392
1,0.183451,0.839903,-0.399041
2,0.206188,0.449464,-1.318918
3,0.686150,-0.600173,-0.895407
4,0.714139,0.484717,1.931289
5,-0.378451,-0.411125,0.001086
6,0.436973,-1.375490,1.487678
7,-0.081864,-0.408206,-1.182210
8,-0.378248,0.279761,1.123659
9,-1.552444,0.117663,0.092789


In [121]:
cond = (df2 - v_mean) > v_std * 3
cond.sum()

Python    0
Java      0
PHP       0
dtype: int64