In [1]:
# 5.2 Essential Functionality（主要功能

#1 Reindexing（重新索引）

import numpy as np
import pandas as pd
from pandas import Series, DataFrame



In [12]:
# Use reindex

obj = pd.Series([4.5, 7.2, -5.3, 3.6], index=['d', 'b', 'a', 'c'])
print(obj)

obj2 = obj.reindex(['a', 'b', 'c', 'd', 'e'])
print(obj2)


d    4.5
b    7.2
a   -5.3
c    3.6
dtype: float64
a   -5.3
b    7.2
c    3.6
d    4.5
e    NaN
dtype: float64


In [14]:
# Can use method: ffill
obj3 = pd.Series(['bule', 'purple', 'yellow'], index=[0, 2, 4])
print(obj3)

print('---')
obj4 = obj3.reindex(range(6), method='ffill')
print(obj3)    #notice that obj3 no change
print('---')
print(obj4)


0      bule
2    purple
4    yellow
dtype: object
---
0      bule
2    purple
4    yellow
dtype: object
---
0      bule
1      bule
2    purple
3    purple
4    yellow
5    yellow
dtype: object


In [21]:
frame = pd.DataFrame(np.arange(9).reshape(3, 3),
                     index=['a', 'c', 'd'],
                     columns=['Ohio', 'Texas', 'California'])

print(frame)

frame2 = frame.reindex(['a', 'b', 'c', 'd'])
print(frame2)

# Change columns index
states = ['Texas', 'Utah', 'California']
frame3 = frame.reindex(columns=states)
print(frame3)

print('---')
frame.loc[['a', 'b', 'c', 'd'], states] 
# not work
# Message: Passing list-likes to .loc or [] with any missing label will raise
# KeyError in the future, you can use .reindex() as an alternative.
frame.index.name='Index'
frame.columns.name='State'
print(frame)

   Ohio  Texas  California
a     0      1           2
c     3      4           5
d     6      7           8
   Ohio  Texas  California
a   0.0    1.0         2.0
b   NaN    NaN         NaN
c   3.0    4.0         5.0
d   6.0    7.0         8.0
   Texas  Utah  California
a      1   NaN           2
c      4   NaN           5
d      7   NaN           8
---
State  Ohio  Texas  California
Index                         
a         0      1           2
c         3      4           5
d         6      7           8


Passing list-likes to .loc or [] with any missing label will raise
KeyError in the future, you can use .reindex() as an alternative.

See the documentation here:
https://pandas.pydata.org/pandas-docs/stable/indexing.html#deprecate-loc-reindex-listlike
  return self._getitem_tuple(key)


In [29]:
#5.2.2 Dropping Entries from an Axis

# Use: drop
obj = pd.Series(np.arange(5.), index=['a', 'b', 'c', 'd', 'e'])
print(obj)

print('--')
new_obj = obj.drop('c')
print(new_obj)

print('--')
obj1= obj.drop(['d', 'c'])
print(obj)    # notice that obj is not changed, just return a new object
print('--')
print(obj1)

# notice that obj is changed, directly change the object
obj.drop('c', inplace=True)
obj

a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
--
a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64
--
a    0.0
b    1.0
c    2.0
d    3.0
e    4.0
dtype: float64
--
a    0.0
b    1.0
e    4.0
dtype: float64


a    0.0
b    1.0
d    3.0
e    4.0
dtype: float64

In [24]:
data = pd.DataFrame(np.arange(16).reshape(4, 4),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])
data

Unnamed: 0,one,two,three,four
Ohio,0,1,2,3
Colorado,4,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [26]:
data.drop(['Colorado', 'Ohio'])

Unnamed: 0,one,two,three,four
Utah,8,9,10,11
New York,12,13,14,15


In [27]:
data.drop('two', axis=1)

Unnamed: 0,one,three,four
Ohio,0,2,3
Colorado,4,6,7
Utah,8,10,11
New York,12,14,15


In [28]:
data.drop(['two', 'four'], axis='columns')

Unnamed: 0,one,three
Ohio,0,2
Colorado,4,6
Utah,8,10
New York,12,14


In [38]:
# 5.2.3 Indexing, Selection, and Filtering

obj = pd.Series(np.arange(4.), index=['a', 'b', 'c', 'd'])

print(obj)
print('---')

print(obj['b'])
print(obj[1])
print(obj[2:4])
print(obj[['b', 'a', 'd']])
print(obj[[1, 3]])
print(obj[obj < 2])
print(obj['b':'c'])  # notice that include c collumn, not like python slice

obj['b':'c'] = 5   # notice that include c collumn
print(obj)
print()

a    0.0
b    1.0
c    2.0
d    3.0
dtype: float64
---
1.0
1.0
c    2.0
d    3.0
dtype: float64
b    1.0
a    0.0
d    3.0
dtype: float64
b    1.0
d    3.0
dtype: float64
a    0.0
b    1.0
dtype: float64
b    1.0
c    2.0
dtype: float64
a    0.0
b    5.0
c    5.0
d    3.0
dtype: float64



In [43]:
data = pd.DataFrame(np.arange(16).reshape((4, 4)),
                    index=['Ohio', 'Colorado', 'Utah', 'New York'],
                    columns=['one', 'two', 'three', 'four'])

print(data)

print(data['two'])

print(data[['three', 'one']])

print(data[:2])

d1 = data[data['three'] > 5]
print(d1)

print(data < 5)

data[data < 5] = 0
print(data)

          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
Ohio         1
Colorado     5
Utah         9
New York    13
Name: two, dtype: int32
          three  one
Ohio          2    0
Colorado      6    4
Utah         10    8
New York     14   12
          one  two  three  four
Ohio        0    1      2     3
Colorado    4    5      6     7
          one  two  three  four
Colorado    4    5      6     7
Utah        8    9     10    11
New York   12   13     14    15
            one    two  three   four
Ohio       True   True   True   True
Colorado   True  False  False  False
Utah      False  False  False  False
New York  False  False  False  False
          one  two  three  four
Ohio        0    0      0     0
Colorado    0    5      6     7
Utah        8    9     10    11
New York   12   13     14    15


In [44]:
#5.2.3 Selection with loc and iloc
data

Unnamed: 0,one,two,three,four
Ohio,0,0,0,0
Colorado,0,5,6,7
Utah,8,9,10,11
New York,12,13,14,15


In [46]:
data.loc['Colorado', ['two', 'three']]

two      5
three    6
Name: Colorado, dtype: int32

In [47]:
data.iloc[2, [3, 0, 1]]

four    11
one      8
two      9
Name: Utah, dtype: int32

In [48]:
data.iloc[2]

one       8
two       9
three    10
four     11
Name: Utah, dtype: int32

In [49]:
data.iloc[[1, 2], [3, 0, 1]]

Unnamed: 0,four,one,two
Colorado,7,0,5
Utah,11,8,9


In [50]:
data.loc[:'Utah', 'two']

Ohio        0
Colorado    5
Utah        9
Name: two, dtype: int32

In [51]:
data.iloc[:, :3][data.three > 5]

Unnamed: 0,one,two,three
Colorado,0,5,6
Utah,8,9,10
New York,12,13,14


In [None]:
5 Arithmetic and Data Alignment (算数和数据对齐)


In [52]:
s1 = pd.Series([7.3, -2.5, 3.4, 1.5], index=['a', 'c', 'd', 'e'])
s2 = pd.Series([2.1, 3.6, -1.5, 4, 3.1], index=['a', 'c', 'e', 'f', 'g'])
s1 + s2

a    9.4
c    1.1
d    NaN
e    0.0
f    NaN
g    NaN
dtype: float64

In [55]:
df1 = pd.DataFrame(np.arange(9.).reshape((3, 3)), columns=list('bcd'),
                   index=['Ohio', 'Texas', 'Colorado'])

df2 = pd.DataFrame(np.arange(12.).reshape((4, 3)), columns=list('bde'),
                   index=['Utah', 'Ohio', 'Texas', 'Oregon'])

print(df1)
print('---')
print(df2)
print('---')
print(df1+df2)

            b    c    d
Ohio      0.0  1.0  2.0
Texas     3.0  4.0  5.0
Colorado  6.0  7.0  8.0
---
          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
---
            b   c     d   e
Colorado  NaN NaN   NaN NaN
Ohio      3.0 NaN   6.0 NaN
Oregon    NaN NaN   NaN NaN
Texas     9.0 NaN  12.0 NaN
Utah      NaN NaN   NaN NaN


In [57]:
df1 = pd.DataFrame({'A': [1, 2]})
df2 = pd.DataFrame({'B': [3, 4]})

print(df1)
print('---')
print(df2)
print('---')
print(df1+df2)
print('---')
print(df1-df2)

   A
0  1
1  2
---
   B
0  3
1  4
---
    A   B
0 NaN NaN
1 NaN NaN
---
    A   B
0 NaN NaN
1 NaN NaN


In [62]:
# Page 150 Arithmetic methods with fill values (带填充值的算数方法)

df1 = pd.DataFrame(np.arange(12.).reshape((3, 4)), 
                   columns=list('abcd'))

df2 = pd.DataFrame(np.arange(20.).reshape((4, 5)), 
                   columns=list('abcde'))

df2.loc[1, 'b'] = np.nan

print(df1)
print('---')
print(df2)
print('---')
print(df1+df2)


     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
---
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
---
      a     b     c     d   e
0   0.0   2.0   4.0   6.0 NaN
1   9.0   NaN  13.0  15.0 NaN
2  18.0  20.0  22.0  24.0 NaN
3   NaN   NaN   NaN   NaN NaN
---
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   5.0
1   9.0   6.0  13.0  15.0  10.0
2  18.0  20.0  22.0  24.0  15.0
3  16.0  17.0  18.0  19.0  20.0
---
     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
---
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
---


In [63]:
print(df1.add(df2, fill_value = 0))  # <- df1,df2 not change, return new copy
print(df1.add(df2, fill_value = 1))
print('---')
print(df1)    
print('---')
print(df2)
print('---')

      a     b     c     d     e
0   0.0   2.0   4.0   6.0   4.0
1   9.0   5.0  13.0  15.0   9.0
2  18.0  20.0  22.0  24.0  14.0
3  15.0  16.0  17.0  18.0  19.0
      a     b     c     d     e
0   0.0   2.0   4.0   6.0   5.0
1   9.0   6.0  13.0  15.0  10.0
2  18.0  20.0  22.0  24.0  15.0
3  16.0  17.0  18.0  19.0  20.0
---
     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
---
      a     b     c     d     e
0   0.0   1.0   2.0   3.0   4.0
1   5.0   NaN   7.0   8.0   9.0
2  10.0  11.0  12.0  13.0  14.0
3  15.0  16.0  17.0  18.0  19.0
---


In [65]:
df2 = 1 / df1
# samse
df3 = df1.rdiv(1)

print(df1)    
print('---')
print(df2)
print('---')
print(df3)   # same as df2
print('---')

# Table 5-5 
add, radd; sub,rsub; div,rdiv; floordiv,rfloordiv; mul, rmul, pow, rpow

     a    b     c     d
0  0.0  1.0   2.0   3.0
1  4.0  5.0   6.0   7.0
2  8.0  9.0  10.0  11.0
---
       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909
---
       a         b         c         d
0    inf  1.000000  0.500000  0.333333
1  0.250  0.200000  0.166667  0.142857
2  0.125  0.111111  0.100000  0.090909
---


In [66]:
df1.reindex(columns=df2.columns, fill_value=0)

Unnamed: 0,a,b,c,d
0,0.0,1.0,2.0,3.0
1,4.0,5.0,6.0,7.0
2,8.0,9.0,10.0,11.0


In [69]:
#Operations between DataFrame and Series

arr = np.arange(12.).reshape((3, 4))

print(arr)
print('---')
print(arr[0])
print('---')
print( arr - arr[0])   #broadcasting

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]
---
[0. 1. 2. 3.]
---
[[0. 0. 0. 0.]
 [4. 4. 4. 4.]
 [8. 8. 8. 8.]]


In [72]:
frame = pd.DataFrame(np.arange(12.).reshape((4, 3)),
                     columns=list('bde'),
                    index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print( frame)
print('---')

series = frame.iloc[0]
print(series)
print('---')
print(frame - series)

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
---
b    0.0
d    1.0
e    2.0
Name: Utah, dtype: float64
---
          b    d    e
Utah    0.0  0.0  0.0
Ohio    3.0  3.0  3.0
Texas   6.0  6.0  6.0
Oregon  9.0  9.0  9.0


In [73]:
series2 = pd.Series(range(3), index=['b', 'e', 'f'])
frame + series2

Unnamed: 0,b,d,e,f
Utah,0.0,,3.0,
Ohio,3.0,,6.0,
Texas,6.0,,9.0,
Oregon,9.0,,12.0,


In [75]:
series3 = frame['d']
print(series3)

Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64


In [79]:
print(frame)
print(series3)
print('---')
print(frame.sub(series3, axis='index'))
# same
print(frame.sub(series3, axis = 0))
# vs
print(frame.sub(series3, axis = 1))

          b     d     e
Utah    0.0   1.0   2.0
Ohio    3.0   4.0   5.0
Texas   6.0   7.0   8.0
Oregon  9.0  10.0  11.0
Utah       1.0
Ohio       4.0
Texas      7.0
Oregon    10.0
Name: d, dtype: float64
---
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0
          b    d    e
Utah   -1.0  0.0  1.0
Ohio   -1.0  0.0  1.0
Texas  -1.0  0.0  1.0
Oregon -1.0  0.0  1.0
        Ohio  Oregon  Texas  Utah   b   d   e
Utah     NaN     NaN    NaN   NaN NaN NaN NaN
Ohio     NaN     NaN    NaN   NaN NaN NaN NaN
Texas    NaN     NaN    NaN   NaN NaN NaN NaN
Oregon   NaN     NaN    NaN   NaN NaN NaN NaN


In [None]:
6 Function Application and Mapping (函数应用和映射)

In [85]:
frame = pd.DataFrame(np.random.randn(4, 3), columns=list('bde'), 
                     index=['Utah', 'Ohio', 'Texas', 'Oregon'])
print(frame)
print('------')

print(np.abs(frame))
print('-------')

f = lambda x: x.max() - x.min()

fd1 = frame.apply(f)
# vs.
fd2 = frame.apply(f, axis='columns')
print(fd1)
print('-------')
print(fd2)

               b         d         e
Utah    0.610896  0.652326 -0.782994
Ohio   -0.609226  0.518462  0.303001
Texas   0.103609 -0.200211 -0.794206
Oregon -0.282736 -0.100765  0.021823
------
               b         d         e
Utah    0.610896  0.652326  0.782994
Ohio    0.609226  0.518462  0.303001
Texas   0.103609  0.200211  0.794206
Oregon  0.282736  0.100765  0.021823
-------
b    1.220123
d    0.852537
e    1.097207
dtype: float64
-------
Utah      1.435319
Ohio      1.127688
Texas     0.897815
Oregon    0.304558
dtype: float64


In [91]:
def f(x): 
    return pd.Series([x.min(), x.max()], index=['min', 'max'])

print(frame)
print('-------')

print(frame.apply(f))


format = lambda x: '%.2f' % x

print('-------')
print(frame.applymap(format) )

               b         d         e
Utah    0.610896  0.652326 -0.782994
Ohio   -0.609226  0.518462  0.303001
Texas   0.103609 -0.200211 -0.794206
Oregon -0.282736 -0.100765  0.021823
-------
            b         d         e
min -0.609226 -0.200211 -0.794206
max  0.610896  0.652326  0.303001
-------
            b      d      e
Utah     0.61   0.65  -0.78
Ohio    -0.61   0.52   0.30
Texas    0.10  -0.20  -0.79
Oregon  -0.28  -0.10   0.02


In [None]:
7 Sorting and Ranking （排序）
按row或column index来排序的话，可以用sort_index方法，会返回一个新的object

sort_index

sort_values

In [93]:
s1 = pd.Series(range(4), index=['d', 'a', 'b', 'c'])

s2 = s1.sort_index()  # return a new object s2

print(s1)   # s1 still not sorted
print(s2)


d    0
a    1
b    2
c    3
dtype: int64
a    1
b    2
c    3
d    0
dtype: int64


In [97]:
# Sorting for dataframe

fd1 = pd.DataFrame(np.arange(8).reshape((2, 4)),
                     index=['BB', 'AA'],
                     columns=['d', 'a', 'b', 'c'])

fd2 = fd1.sort_index()
# vs. 
fd3 = fd1.sort_index(axis=1)
# vs. 
fd4 = fd1.sort_index(axis=1,ascending=False)

print(fd1)
print('--')
print(fd2)
print('--')
print(fd3)
print('--')
print(fd4)

    d  a  b  c
BB  0  1  2  3
AA  4  5  6  7
--
    d  a  b  c
AA  4  5  6  7
BB  0  1  2  3
--
    a  b  c  d
BB  1  2  3  0
AA  5  6  7  4
--
    d  c  b  a
BB  0  3  2  1
AA  4  7  6  5


In [99]:
# Sort series
obj = pd.Series([4, 7, -3, 2])
obj.sort_values()

obj = pd.Series([4, np.nan, 7, np.nan, -3, 2])
obj.sort_values()

4   -3.0
5    2.0
0    4.0
2    7.0
1    NaN
3    NaN
dtype: float64

In [103]:
frame = pd.DataFrame({'b': [4, 7, -3, 2], 'a': [0, 1, 0, 1]})
print(frame)

fd1 = frame.sort_values(by='b')
fd2 = frame.sort_values(by=['a', 'b'])

print('---')
print(fd1)
print('---')
print(fd2)

   b  a
0  4  0
1  7  1
2 -3  0
3  2  1
---
   b  a
2 -3  0
3  2  1
0  4  0
1  7  1
---
   b  a
2 -3  0
0  4  0
3  2  1
1  7  1


In [None]:
ranking（排名）是给有效的数据分配数字。
rank方法能用于series和DataFrame，rank方法默认会给每个group一个mean rank
（平均排名）。rank 表示在这个数在原来的Series中排第几名，
有相同的数，取其排名平均（默认）作为值：

In [106]:
obj = pd.Series([7, -5, 7, 4, 2, 0, 4])
print(obj)

print(obj.sort_values())

print(obj.rank())

# 在obj中，4和4的排名是第4名和第五名，取平均得4.5。
# 7和7的排名分别是第六名和第七名，则其排名取平均得6.5。

0    7
1   -5
2    7
3    4
4    2
5    0
6    4
dtype: int64
1   -5
5    0
4    2
3    4
6    4
0    7
2    7
dtype: int64
0    6.5
1    1.0
2    6.5
3    4.5
4    3.0
5    2.0
6    4.5
dtype: float64


In [110]:
frame = pd.DataFrame({'b': [4.3, 7, -3, 2],
                      'a': [0, 1, 0, 1],
                      'c': [-2, 5, 8, -2.5]})

print(frame)
print('---')

fd = frame.rank(axis='columns') # columns表示列与列之间的排序)
print(fd) 

     b  a    c
0  4.3  0 -2.0
1  7.0  1  5.0
2 -3.0  0  8.0
3  2.0  1 -2.5
---
     b    a    c
0  3.0  2.0  1.0
1  3.0  1.0  2.0
2  1.0  2.0  3.0
3  3.0  2.0  1.0


In [None]:
8 Axis Indexes with Duplicate Labels (有重复label的轴索引)
我们看到的所有例子都有unique axis labels(index values),唯一的轴标签（索引值）。
一些pandas函数（reindex）,需要label是唯一的，但这并是不强制的。

In [111]:
obj = pd.Series(range(5), index=['a', 'a', 'b', 'b', 'c'])
obj.index.is_unique

False

In [113]:
df = pd.DataFrame(np.random.randn(4, 3), index=['a', 'a', 'b', 'b'])

print(df)
print('---')
print(df.loc['b'])

          0         1         2
a -0.196904  0.555696  0.065237
a  0.094582  0.015055  0.133386
b  0.154799 -0.427429  0.193470
b  1.742300 -1.201848 -0.044421
---
          0         1         2
b  0.154799 -0.427429  0.193470
b  1.742300 -1.201848 -0.044421
