In [1]:
import pandas
import numpy

In [2]:
# Build a Series from a plain list; the NaN entry makes the resulting dtype float64.
s = pandas.Series(
    data=[1, 3, 6, numpy.nan, 44, 1],
)
print(s)

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64


In [3]:
# Six consecutive calendar days starting at 2016-01-01, used below as a row index.
datas = pandas.date_range(
    start='20160101',
    periods=6,
)
print(datas)

DatetimeIndex(['2016-01-01', '2016-01-02', '2016-01-03', '2016-01-04',
               '2016-01-05', '2016-01-06'],
              dtype='datetime64[ns]', freq='D')


In [4]:
# `index` supplies the row labels and `columns` supplies the column labels.
# NOTE(review): the random values are unseeded, so they differ on every run.
df = pandas.DataFrame(
    data=numpy.random.randn(6, 4),
    index=datas,
    columns=['a', 'b', 'c', 'd'],
)
print(df)

                   a         b         c         d
2016-01-01 -1.869362  0.077074  1.231011 -1.832309
2016-01-02  1.139228 -1.411883 -0.092928 -0.246048
2016-01-03  0.638512 -0.677155  1.878026 -0.266875
2016-01-04  1.035957 -0.400885  2.215768  0.823981
2016-01-05 -0.022262  0.266871 -1.156377  0.969492
2016-01-06 -1.608790 -0.202388 -0.050817  0.311565


In [5]:
# No index/columns are given, so rows and columns get the default integer labels.
df1 = pandas.DataFrame(data=numpy.arange(12).reshape(3, 4))
print(df1)

   0  1   2   3
0  0  1   2   3
1  4  5   6   7
2  8  9  10  11


In [7]:
# DataFrame built from a dict: each key becomes a column, and scalar values
# are repeated on every row.
df2 = pandas.DataFrame({
    'A': 1,
    'B': pandas.Timestamp('20130102'),
    'C': pandas.Series(1, index=list(range(4)), dtype='float32'),
    'D': numpy.array([3] * 4, dtype='int32'),
    'E': pandas.Categorical(['test', 'train', 'test', 'train']),
    'F': "foo",
})
print(df2)

   A          B    C  D      E    F
0  1 2013-01-02  1.0  3   test  foo
1  1 2013-01-02  1.0  3  train  foo
2  1 2013-01-02  1.0  3   test  foo
3  1 2013-01-02  1.0  3  train  foo


In [8]:
df2.dtypes  # data type of each column

A             int64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object

In [9]:
df2.index  # row labels (the index), not the column names

Int64Index([0, 1, 2, 3], dtype='int64')

In [10]:
df2.columns  # column labels, not the row names

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

In [11]:
df2.values  # all values as a NumPy array, without the row and column labels

array([[1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo'],
       [1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'test', 'foo'],
       [1, Timestamp('2013-01-02 00:00:00'), 1.0, 3, 'train', 'foo']],
      dtype=object)

In [12]:
df2.describe()  # summary statistics per column; non-numeric columns are skipped

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


In [13]:
df2.T  # transpose; the row and column labels are swapped as well

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00,2013-01-02 00:00:00
C,1.0,1.0,1.0,1.0
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


In [14]:
df2.sort_index(axis=1, ascending=False)  # sort by column labels; ascending=False gives descending order

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1.0,2013-01-02,1
1,foo,train,3,1.0,2013-01-02,1
2,foo,test,3,1.0,2013-01-02,1
3,foo,train,3,1.0,2013-01-02,1


In [15]:
df2.sort_index(axis=0, ascending=False)  # sort by row labels in descending order

Unnamed: 0,A,B,C,D,E,F
3,1,2013-01-02,1.0,3,train,foo
2,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
0,1,2013-01-02,1.0,3,test,foo


In [16]:
df2.sort_values(by='E')  # sort the rows by the values in column E

Unnamed: 0,A,B,C,D,E,F
0,1,2013-01-02,1.0,3,test,foo
2,1,2013-01-02,1.0,3,test,foo
1,1,2013-01-02,1.0,3,train,foo
3,1,2013-01-02,1.0,3,train,foo
