In [2]:
import pandas as pd
import numpy as np

## 1. Pandas 基础

### 1.1 创建数据

In [3]:
# Build a Series from a plain Python list; the np.nan entry stands in for a
# missing value, and the result is displayed as the cell output.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [4]:
# Six consecutive calendar days starting on 2013-01-01; used below as the
# row index of the demo frame.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [5]:
# Demo frame: 6 dates x 4 columns of standard-normal noise.
# A locally seeded generator is used so that Restart Kernel -> Run All
# reproduces exactly the same values in every cell that displays or slices df.
rng = np.random.default_rng(42)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,0.590805,-0.687192,-0.109357,0.771107
2013-01-02,-1.483135,-1.13039,-1.615244,0.55695
2013-01-03,1.030111,0.013849,0.47487,-2.125574
2013-01-04,1.128438,-1.296163,-0.325883,-1.552199
2013-01-05,-1.78249,-1.33911,-0.208431,-0.808711
2013-01-06,-0.107094,0.819046,-0.605576,2.186797


In [6]:
# (rows, columns) of the frame.
df.shape

(6, 4)

In [9]:
# Build a frame from a dict of column specifications. Scalar entries are
# repeated on every row; the Series in 'C' supplies the four-row index.
df2 = pd.DataFrame({
    'A': 1.0,                                                  # float scalar
    'B': pd.Timestamp('20130102'),                             # timestamp scalar
    'C': pd.Series(1, index=list(range(4)), dtype='float32'),  # float32 Series, index 0..3
    'D': np.full(4, 3, dtype='int32'),                         # int32 array of four 3s
    'E': pd.Categorical(['test', 'train', 'test', 'train']),   # categorical labels
    'F': 'foo',                                                # string scalar
})

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 1.2 观察数据

In [14]:
# Preview the top of the frame (five rows when no count is given).
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,0.590805,-0.687192,-0.109357,0.771107
2013-01-02,-1.483135,-1.13039,-1.615244,0.55695
2013-01-03,1.030111,0.013849,0.47487,-2.125574
2013-01-04,1.128438,-1.296163,-0.325883,-1.552199
2013-01-05,-1.78249,-1.33911,-0.208431,-0.808711


In [11]:
# Preview the last three rows.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,1.128438,-1.296163,-0.325883,-1.552199
2013-01-05,-1.78249,-1.33911,-0.208431,-0.808711
2013-01-06,-0.107094,0.819046,-0.605576,2.186797


In [15]:
# Row labels: the DatetimeIndex built with pd.date_range.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# Column labels.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# Underlying values as a NumPy array; every column is float, so the result is
# a single float array.
df.to_numpy()

array([[ 0.59080496, -0.68719181, -0.10935684,  0.77110746],
       [-1.48313496, -1.13038966, -1.61524356,  0.55695049],
       [ 1.03011122,  0.01384865,  0.47487004, -2.12557389],
       [ 1.12843774, -1.29616316, -0.32588316, -1.55219937],
       [-1.78248972, -1.33911016, -0.20843091, -0.80871077],
       [-0.10709377,  0.8190459 , -0.60557633,  2.18679651]])

In [18]:
# df2 mixes floats, timestamps, ints and strings, so the array falls back to
# dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [19]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.103894,-0.603327,-0.39827,-0.161938
std,1.265481,0.861426,0.694424,1.619317
min,-1.78249,-1.33911,-1.615244,-2.125574
25%,-1.139125,-1.25472,-0.535653,-1.366327
50%,0.241856,-0.908791,-0.267157,-0.12588
75%,0.920285,-0.161411,-0.134125,0.717568
max,1.128438,0.819046,0.47487,2.186797


In [20]:
# Transpose: columns become rows and the dates become column labels.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,0.590805,-1.483135,1.030111,1.128438,-1.78249,-0.107094
B,-0.687192,-1.13039,0.013849,-1.296163,-1.33911,0.819046
C,-0.109357,-1.615244,0.47487,-0.325883,-0.208431,-0.605576
D,0.771107,0.55695,-2.125574,-1.552199,-0.808711,2.186797


### 1.3 数据索引

In [21]:
# Redisplay the full frame as a reference for the indexing examples that follow.
df

Unnamed: 0,A,B,C,D
2013-01-01,0.590805,-0.687192,-0.109357,0.771107
2013-01-02,-1.483135,-1.13039,-1.615244,0.55695
2013-01-03,1.030111,0.013849,0.47487,-2.125574
2013-01-04,1.128438,-1.296163,-0.325883,-1.552199
2013-01-05,-1.78249,-1.33911,-0.208431,-0.808711
2013-01-06,-0.107094,0.819046,-0.605576,2.186797


In [25]:
# Select a single column by name; the result is a Series named 'B'.
df['B']

2013-01-01   -0.687192
2013-01-02   -1.130390
2013-01-03    0.013849
2013-01-04   -1.296163
2013-01-05   -1.339110
2013-01-06    0.819046
Freq: D, Name: B, dtype: float64

In [26]:
# Slicing with [] selects rows by position: here the first three rows.
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,0.590805,-0.687192,-0.109357,0.771107
2013-01-02,-1.483135,-1.13039,-1.615244,0.55695
2013-01-03,1.030111,0.013849,0.47487,-2.125574


In [29]:
# Label-based row slice: unlike positional slicing, both endpoints are
# included, so this returns the rows for 2013-01-01 through 2013-01-03.
df.loc['2013-01-01':'2013-01-03']

Unnamed: 0,A,B,C,D
2013-01-01,0.590805,-0.687192,-0.109357,0.771107
2013-01-02,-1.483135,-1.13039,-1.615244,0.55695
2013-01-03,1.030111,0.013849,0.47487,-2.125574


In [34]:
# Select one row by its label (the first date); the result is a Series indexed
# by the column names.
df.loc[dates[0]]

A    0.590805
B   -0.687192
C   -0.109357
D    0.771107
Name: 2013-01-01 00:00:00, dtype: float64

In [36]:
# All rows, restricted to columns 'A' and 'B' by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,0.590805,-0.687192
2013-01-02,-1.483135,-1.13039
2013-01-03,1.030111,0.013849
2013-01-04,1.128438,-1.296163
2013-01-05,-1.78249,-1.33911
2013-01-06,-0.107094,0.819046


In [38]:
# Label slice on rows (both endpoints included) combined with a list of
# column labels.
df.loc['20130102':'20130104', ['A','C']]

Unnamed: 0,A,C
2013-01-02,-1.483135,-1.615244
2013-01-03,1.030111,0.47487
2013-01-04,1.128438,-0.325883


In [39]:
# Select a row by integer position: position 3 is the fourth row (2013-01-04).
df.iloc[3]

A    1.128438
B   -1.296163
C   -0.325883
D   -1.552199
Name: 2013-01-04 00:00:00, dtype: float64

In [40]:
# Positional slices exclude the end: row positions 3-4 and column positions 0-1.
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,1.128438,-1.296163
2013-01-05,-1.78249,-1.33911


In [41]:
# Explicit lists of integer positions: rows 1, 2, 4 and columns 0, 2.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-1.483135,-1.615244
2013-01-03,1.030111,0.47487
2013-01-05,-1.78249,-0.208431
