![pandas](assets/pandas.png)

In [1]:
import pandas as pd
import numpy as np

## 1. Pandas 基础

### 1.1 创建数据

In [2]:
# Build a Series from a plain list; the np.nan entry marks a missing value,
# and the resulting dtype is float64.
s = pd.Series(data=[1, 3, 5, np.nan, 6, 8])
s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

In [7]:
# Six consecutive calendar days starting 2013-01-01; used below as a row index.
# The daily frequency is spelled out rather than left to the default.
dates = pd.date_range(start='20130101', periods=6, freq='D')
dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [8]:
# 6x4 frame of standard-normal samples, indexed by `dates`, columns A-D.
# The generator is seeded so that Restart & Run All rebuilds the same frame
# every time instead of drawing fresh random numbers on each run.
# NOTE: outputs recorded before the seed was introduced will differ from a
# fresh run until the notebook is re-executed.
rng = np.random.default_rng(0)
df = pd.DataFrame(rng.standard_normal((6, 4)), index=dates, columns=list('ABCD'))
df

Unnamed: 0,A,B,C,D
2013-01-01,-0.149426,-0.62577,-1.716888,1.248758
2013-01-02,-0.240625,1.162671,0.662846,0.788367
2013-01-03,1.283688,-1.709946,-0.714698,0.583882
2013-01-04,-1.958146,-1.08913,-0.25445,-0.39475
2013-01-05,0.86306,-1.118246,-1.151417,-1.084437
2013-01-06,-1.721927,-1.022905,-0.995096,-0.155186


In [11]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(6, 4)

In [12]:
# Build a frame from a dict whose values have different kinds and dtypes.
# Scalar entries (A, B, F) are repeated on every row; the length-4 entries
# (C, D, E) determine the four rows.
df2 = pd.DataFrame(
    {
        'A': 1.0,                                                   # float scalar
        'B': pd.Timestamp('20130102'),                              # datetime scalar
        'C': pd.Series(1, index=list(range(4)), dtype='float32'),   # float32 Series, index 0..3
        'D': np.full(4, 3, dtype='int32'),                          # int32 array of four 3s
        'E': pd.Categorical(["test", "train", "test", "train"]),    # categorical column
        'F': 'foo',                                                 # string scalar
    }
)

df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2013-01-02,1.0,3,test,foo
1,1.0,2013-01-02,1.0,3,train,foo
2,1.0,2013-01-02,1.0,3,test,foo
3,1.0,2013-01-02,1.0,3,train,foo


### 1.2 观察数据

In [13]:
# Preview the top of the frame; with no argument, head() shows the first five rows.
df.head()

Unnamed: 0,A,B,C,D
2013-01-01,-0.149426,-0.62577,-1.716888,1.248758
2013-01-02,-0.240625,1.162671,0.662846,0.788367
2013-01-03,1.283688,-1.709946,-0.714698,0.583882
2013-01-04,-1.958146,-1.08913,-0.25445,-0.39475
2013-01-05,0.86306,-1.118246,-1.151417,-1.084437


In [14]:
# Last three rows of the frame.
df.tail(3)

Unnamed: 0,A,B,C,D
2013-01-04,-1.958146,-1.08913,-0.25445,-0.39475
2013-01-05,0.86306,-1.118246,-1.151417,-1.084437
2013-01-06,-1.721927,-1.022905,-0.995096,-0.155186


In [15]:
# Row labels: the DatetimeIndex that was passed as `index=dates` when building df.
df.index

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

In [16]:
# Column labels of the frame.
df.columns

Index(['A', 'B', 'C', 'D'], dtype='object')

In [17]:
# Values as a plain NumPy array; the index and column labels are not part of the result.
df.to_numpy()

array([[-0.14942612, -0.62576979, -1.71688763,  1.24875823],
       [-0.24062523,  1.16267133,  0.66284567,  0.78836657],
       [ 1.28368809, -1.70994585, -0.71469757,  0.58388228],
       [-1.95814636, -1.08912951, -0.25445005, -0.39474987],
       [ 0.86305987, -1.11824553, -1.15141684, -1.08443748],
       [-1.72192673, -1.02290463, -0.99509583, -0.15518624]])

In [18]:
# Same conversion on the mixed-dtype frame: the columns share no common dtype,
# so the resulting array has dtype=object.
df2.to_numpy()

array([[1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1.0, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [19]:
# Per-column summary statistics: count, mean, std, min, quartiles and max.
df.describe()

Unnamed: 0,A,B,C,D
count,6.0,6.0,6.0,6.0
mean,-0.320563,-0.733887,-0.69495,0.164439
std,1.315733,0.991844,0.822295,0.862034
min,-1.958146,-1.709946,-1.716888,-1.084437
25%,-1.351601,-1.110967,-1.112337,-0.334859
50%,-0.195026,-1.056017,-0.854897,0.214348
75%,0.609938,-0.725053,-0.369512,0.737245
max,1.283688,1.162671,0.662846,1.248758


In [20]:
# Transpose: columns A-D become the rows and the dates become the column labels.
df.T

Unnamed: 0,2013-01-01,2013-01-02,2013-01-03,2013-01-04,2013-01-05,2013-01-06
A,-0.149426,-0.240625,1.283688,-1.958146,0.86306,-1.721927
B,-0.62577,1.162671,-1.709946,-1.08913,-1.118246,-1.022905
C,-1.716888,0.662846,-0.714698,-0.25445,-1.151417,-0.995096
D,1.248758,0.788367,0.583882,-0.39475,-1.084437,-0.155186


### 1.3 数据索引

In [22]:
# Select a single column by label; the result is a Series named "B".
df['B']

2013-01-01   -0.625770
2013-01-02    1.162671
2013-01-03   -1.709946
2013-01-04   -1.089130
2013-01-05   -1.118246
2013-01-06   -1.022905
Freq: D, Name: B, dtype: float64

In [25]:
# Slicing with [] and integer bounds selects rows by position
# (here the first three; the end bound is excluded).
df[0:3]

Unnamed: 0,A,B,C,D
2013-01-01,-0.149426,-0.62577,-1.716888,1.248758
2013-01-02,-0.240625,1.162671,0.662846,0.788367
2013-01-03,1.283688,-1.709946,-0.714698,0.583882


In [33]:
# Label-based row lookup: the date string is matched against the DatetimeIndex
# and the matching row comes back as a Series.
df.loc['2013-01-01']

A   -0.149426
B   -0.625770
C   -1.716888
D    1.248758
Name: 2013-01-01 00:00:00, dtype: float64

In [34]:
# Same row as a date-string lookup, but using the first entry of `dates` as the label.
df.loc[dates[0]]

A   -0.149426
B   -0.625770
C   -1.716888
D    1.248758
Name: 2013-01-01 00:00:00, dtype: float64

In [35]:
# Every row, restricted to columns A and B selected by label.
df.loc[:, ['A', 'B']]

Unnamed: 0,A,B
2013-01-01,-0.149426,-0.62577
2013-01-02,-0.240625,1.162671
2013-01-03,1.283688,-1.709946
2013-01-04,-1.958146,-1.08913
2013-01-05,0.86306,-1.118246
2013-01-06,-1.721927,-1.022905


In [38]:
# Label slice on the rows combined with a list of column labels.
# With .loc both endpoints of the slice are included (2013-01-02 through 2013-01-04).
df.loc['20130102':'20130104', ['A', 'C']]

Unnamed: 0,A,C
2013-01-02,-0.240625,0.662846
2013-01-03,1.283688,-0.714698
2013-01-04,-1.958146,-0.25445


In [39]:
# Position-based lookup: the row at integer position 3 (the fourth row, 2013-01-04).
df.iloc[3]

A   -1.958146
B   -1.089130
C   -0.254450
D   -0.394750
Name: 2013-01-04 00:00:00, dtype: float64

In [40]:
# Positional slices on both axes: rows 3-4 and columns 0-1 (end positions are excluded).
df.iloc[3:5, 0:2]

Unnamed: 0,A,B
2013-01-04,-1.958146,-1.08913
2013-01-05,0.86306,-1.118246


In [41]:
# Explicit lists of integer positions: rows 1, 2 and 4, columns 0 and 2.
df.iloc[[1, 2, 4], [0, 2]]

Unnamed: 0,A,C
2013-01-02,-0.240625,0.662846
2013-01-03,1.283688,-0.714698
2013-01-05,0.86306,-1.151417
