# 创建 Series（一维带索引的序列）

In [1]:
import pandas as pd
import numpy as np
# A small Series of integers with one missing entry (np.nan) in position 3.
s = pd.Series(data=[1, 3, 6, np.nan, 44, 41])

In [2]:
# Display the Series; the output below shows the values stored as float64,
# with the missing entry rendered as NaN.
s

0     1.0
1     3.0
2     6.0
3     NaN
4    44.0
5    41.0
dtype: float64

# 创建日期索引（DatetimeIndex）

In [3]:
# Six consecutive timestamps beginning on 2018-07-01.
dates = pd.date_range(start='20180701', periods=6)

In [4]:
# Display the DatetimeIndex: six daily entries from 2018-07-01 to 2018-07-06.
dates

DatetimeIndex(['2018-07-01', '2018-07-02', '2018-07-03', '2018-07-04',
               '2018-07-05', '2018-07-06'],
              dtype='datetime64[ns]', freq='D')

# 创建自定义行索引和列名的 DataFrame

In [5]:
# 6x4 frame of standard-normal samples, indexed by `dates`, columns a-d.
# A locally seeded RandomState is used instead of the unseeded global
# np.random state, so a fresh "Restart & Run All" produces the same values
# every time. Re-execute the cell to refresh the displayed table.
df = pd.DataFrame(
    np.random.RandomState(0).randn(6, 4),
    index=dates,
    columns=['a', 'b', 'c', 'd'],
)

In [6]:
# Display the 6x4 frame: rows are labelled by `dates`, columns by a-d.
df

Unnamed: 0,a,b,c,d
2018-07-01,0.891318,0.288762,-0.264948,-0.194423
2018-07-02,-1.205095,-0.40537,0.053271,-0.706912
2018-07-03,0.179788,1.999196,1.759414,1.071318
2018-07-04,-0.375869,-0.459862,1.7984,-1.816501
2018-07-05,-0.629967,-0.190398,-0.416971,-0.256503
2018-07-06,0.300752,1.410693,-0.047278,-0.054342


# 未指定行索引和列名的 DataFrame（使用默认的整数标签）

In [7]:
# The integers 0..11 laid out as 3 rows x 4 columns; no index or column
# labels are supplied here.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))

In [8]:
# Display df1; as the output shows, rows and columns both carry 0-based
# integer labels.
df1

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


# 用字典创建 DataFrame（每个键对应一列，各列可以有不同的数据类型）

In [9]:
# Build a frame from a dict: each key becomes a column, and each column is
# created from a different kind of value (scalar, Timestamp, Series, ndarray,
# Categorical, string).
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20180707'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='int32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)

In [10]:
# Display df2; the output shows the single-valued entries (A, B, F) repeated
# on each of the 4 rows.
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-07-07,1,3,test,foo
1,1.0,2018-07-07,1,3,train,foo
2,1.0,2018-07-07,1,3,test,foo
3,1.0,2018-07-07,1,3,train,foo


# 输出所有列类型

In [11]:
# List the dtype of every column of df2 (one entry per column).
df2.dtypes

A           float64
B    datetime64[ns]
C             int32
D             int32
E          category
F            object
dtype: object

# 输出行索引（index）

In [12]:
# Row labels of df2 (the integers 0..3 in the output below).
df2.index

Int64Index([0, 1, 2, 3], dtype='int64')

# 输出所有列名称

In [13]:
# Column labels of df2.
df2.columns

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object')

# 用数组表示所有值

In [14]:
# All cell values as a 2-D NumPy array; because the columns have mixed types,
# the resulting array has dtype=object (see output).
# NOTE(review): recent pandas documentation recommends DataFrame.to_numpy()
# over .values — confirm the installed pandas version supports it before switching.
df2.values

array([[1.0, Timestamp('2018-07-07 00:00:00'), 1, 3, 'test', 'foo'],
       [1.0, Timestamp('2018-07-07 00:00:00'), 1, 3, 'train', 'foo'],
       [1.0, Timestamp('2018-07-07 00:00:00'), 1, 3, 'test', 'foo'],
       [1.0, Timestamp('2018-07-07 00:00:00'), 1, 3, 'train', 'foo']], dtype=object)

# 对数值列进行描述性统计：计数、均值、标准差、最小值、分位数、最大值等

In [15]:
# Summary statistics (count, mean, std, min, quartiles, max); as the output
# shows, only the numeric columns A, C and D are included.
df2.describe()

Unnamed: 0,A,C,D
count,4.0,4.0,4.0
mean,1.0,1.0,3.0
std,0.0,0.0,0.0
min,1.0,1.0,3.0
25%,1.0,1.0,3.0
50%,1.0,1.0,3.0
75%,1.0,1.0,3.0
max,1.0,1.0,3.0


# 转置

In [16]:
# Swap rows and columns: the column labels A-F become the row labels.
df2.transpose()

Unnamed: 0,0,1,2,3
A,1,1,1,1
B,2018-07-07 00:00:00,2018-07-07 00:00:00,2018-07-07 00:00:00,2018-07-07 00:00:00
C,1,1,1,1
D,3,3,3,3
E,test,train,test,train
F,foo,foo,foo,foo


# 按索引排序：axis=0 按行索引排序，axis=1 按列名排序；ascending=False 表示降序排列

In [17]:
# Reorder the columns by their labels in descending order (F, E, ..., A).
df2.sort_index(axis='columns', ascending=False)

Unnamed: 0,F,E,D,C,B,A
0,foo,test,3,1,2018-07-07,1.0
1,foo,train,3,1,2018-07-07,1.0
2,foo,test,3,1,2018-07-07,1.0
3,foo,train,3,1,2018-07-07,1.0


In [18]:
# Reorder the rows by their index labels in descending order (3, 2, 1, 0).
df2.sort_index(axis='index', ascending=False)

Unnamed: 0,A,B,C,D,E,F
3,1.0,2018-07-07,1,3,train,foo
2,1.0,2018-07-07,1,3,test,foo
1,1.0,2018-07-07,1,3,train,foo
0,1.0,2018-07-07,1,3,test,foo


# 按某一列的值排序（此处按 E 列）

In [19]:
# Order the rows by the values in column E, ascending ("test" rows before
# "train" rows in the output).
df2.sort_values(by=['E'], ascending=True)

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-07-07,1,3,test,foo
2,1.0,2018-07-07,1,3,test,foo
1,1.0,2018-07-07,1,3,train,foo
3,1.0,2018-07-07,1,3,train,foo
