# <font color=steelblue>NaN - means Not a Number</font>

In [1]:
import numpy as np
import pandas as pd

In [2]:
# np.nan is an ordinary Python float; type(n) below reports `float`.
n = np.nan
type(n)

float

In [3]:
m = 1
# Adding NaN to a number yields NaN.
m + n

nan

***
<font color=orange size=4>NaN in Series</font>

In [4]:
# Series labelled A-E with one missing value at 'C'.
# The NaN entry makes the whole Series float64 even though the rest are ints.
s1 = pd.Series(
    [1, 2, np.nan, 3, 4],
    index=list('ABCDE'),
)
s1

A    1.0
B    2.0
C    NaN
D    3.0
E    4.0
dtype: float64

In [5]:
# Boolean mask: True where the value is NaN (only label 'C' here).
s1.isnull()

A    False
B    False
C     True
D    False
E    False
dtype: bool

In [6]:
# Returns a new Series with the NaN entry (label 'C') removed.
s1.dropna()

A    1.0
B    2.0
D    3.0
E    4.0
dtype: float64

***
<font color=orange size=4>NaN in DataFrame</font>

In [7]:
# Four rows with NaN in different positions; the last row is entirely NaN.
df = pd.DataFrame(
    [
        [1, 2, 3],
        [np.nan, 5, 6],
        [7, np.nan, 9],
        [np.nan, np.nan, np.nan],
    ]
)
df

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0
3,,,


In [8]:
# Boolean mask: True where a value is present, False where it is NaN.
df.notnull()

Unnamed: 0,0,1,2
0,True,True,True
1,False,True,True
2,True,False,True
3,False,False,False


In [9]:
# Keep only the rows that have at least 2 non-NaN values.
# `how` is deliberately omitted: pandas treats `how` and `thresh` as mutually
# exclusive (recent versions raise TypeError when both are passed), and
# `thresh=2` alone produces the rows shown below.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,,5.0,6.0
2,7.0,,9.0


In [10]:
# Fill NaN column by column: each dict key is a column label and its value
# is the fill value used for that column.
df.fillna(value={0: 0, 1: 1, 2: 2})

Unnamed: 0,0,1,2
0,1.0,2.0,3.0
1,0.0,5.0,6.0
2,7.0,1.0,9.0
3,0.0,1.0,2.0


# <font color=steelblue>多级Index</font>

In [11]:
# Two-level index: outer labels '1'/'2', inner labels 'a'/'b'/'c'.
# The values are random draws with no seed set, so they change on every run.
s2 = pd.Series(
    data=np.random.randn(6),
    index=[list('111222'), list('abcabc')],
)
s2

1  a    0.586184
   b   -0.944968
   c   -0.997697
2  a    1.206130
   b    1.580011
   c    0.676228
dtype: float64

In [12]:
print(type(s2))
# Interesting: selecting by label versus selecting by position.
# A first-level label returns the sub-Series for that group.
print(type(s2['1']))
# Positional access goes through .iloc explicitly; a bare integer key in []
# on a non-integer index relies on a positional fallback that recent pandas
# versions deprecate.
print(type(s2.iloc[1]))

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>
<class 'numpy.float64'>


In [13]:
# A Series with a multi-level index can be sliced like a two-dimensional object:
# take every first-level key, and only 'a' on the second level.
s2[:,'a']

1    0.586184
2    1.206130
dtype: float64

In [14]:
# Converting between a multi-level-index Series and a DataFrame:
# unstack() turns the inner index level ('a', 'b', 'c') into columns.
df1 = s2.unstack()
df1

Unnamed: 0,a,b,c
1,0.586184,-0.944968,-0.997697
2,1.20613,1.580011,0.676228


In [15]:
# Unstacking the transposed frame gives a Series again: its columns become the
# first index level and its index becomes the second level.
df1.T.unstack()

1  a    0.586184
   b   -0.944968
   c   -0.997697
2  a    1.206130
   b    1.580011
   c    0.676228
dtype: float64

In [16]:
# 4x4 frame whose rows and columns both carry a two-level index.
df2 = pd.DataFrame(
    data=np.arange(16).reshape(4, 4),
    index=[list('aabb'), [1, 2, 1, 2]],
    columns=[['BJ', 'BJ', 'SH', 'GZ'], [8, 9, 8, 8]],
)
df2

Unnamed: 0_level_0,Unnamed: 1_level_0,BJ,BJ,SH,GZ
Unnamed: 0_level_1,Unnamed: 1_level_1,8,9,8,8
a,1,0,1,2,3
a,2,4,5,6,7
b,1,8,9,10,11
b,2,12,13,14,15


# <font color=steelblue>Mapping和Replace</font>

***
<font color=orange size=5>mapping</font>

In [17]:
# City table: one row per city, labelled A-C.
df3 = pd.DataFrame(
    {
        '城市': ['北京', '上海', '广州'],
        '人口': [1000, 2000, 1500],
    },
    index=list('ABC'),
)
df3

Unnamed: 0,城市,人口
A,北京,1000
B,上海,2000
C,广州,1500


In [18]:
# Work on a copy: plain assignment (`tmp_df = df3`) only creates a second name
# for the same frame, so adding a column would silently modify df3 as well.
tmp_df = df3.copy()
# The new Series gets the default index 0,1,2, which shares no labels with
# A,B,C; column assignment aligns on the index, so every GDP value is NaN.
tmp_df['GDP'] = pd.Series([1000,2000,1500])
tmp_df

Unnamed: 0,城市,人口,GDP
A,北京,1000,
B,上海,2000,
C,广州,1500,


In [19]:
# When the frame's index is not 0,1,2,..., the Series index must be given
# explicitly so that the values line up with rows A,B,C.
# Copy first so the new column is not written into df3 through an alias.
tmp_df = df3.copy()
tmp_df['GDP'] = pd.Series([1000,2000,1500],index=['A','B','C'])
tmp_df

Unnamed: 0,城市,人口,GDP
A,北京,1000,1000
B,上海,2000,2000
C,广州,1500,1500


In [20]:
# Copy so that df3 itself is left untouched (plain assignment would alias it).
tmp_df = df3.copy()
# Look up each city name in the dict to obtain its GDP value.
gdp_map = {'北京':1000,'上海':2000,'广州':1500}
tmp_df['GDP'] = tmp_df['城市'].map(gdp_map)
tmp_df

Unnamed: 0,城市,人口,GDP
A,北京,1000,1000
B,上海,2000,2000
C,广州,1500,1500


***
<font color=orange size=5>replace</font>

In [21]:
# Integer Series 0..4 with the default 0..4 index.
s3 = pd.Series(np.arange(5))
s3

0    0
1    1
2    2
3    3
4    4
dtype: int64

In [22]:
# List form: the i-th old value is replaced by the i-th new value.
s3.replace(to_replace=[1, 2, 3], value=[10, 20, 30])

0     0
1    10
2    20
3    30
4     4
dtype: int64

In [23]:
# Dict form: each key is an old value and its dict value is the replacement.
s3.replace(to_replace={1: 10, 2: 20, 3: 30})

0     0
1    10
2    20
3    30
4     4
dtype: int64