### pandas介绍：
  * numpy：序列化的矩阵，类似列表（按位置取值）
  * pandas：类似字典（按标签取值）

In [2]:
import pandas as pd
import numpy as np

In [4]:
# Build a Series from a plain list; the NaN entry makes the whole Series float64.
s = pd.Series(data=[1, 2, 6, np.nan, 44, 1])
s

0     1.0
1     2.0
2     6.0
3     NaN
4    44.0
5     1.0
dtype: float64

In [6]:
# Six consecutive calendar days starting at 2018-03-03.
dates = pd.date_range(start='20180303', periods=6)
dates

DatetimeIndex(['2018-03-03', '2018-03-04', '2018-03-05', '2018-03-06',
               '2018-03-07', '2018-03-08'],
              dtype='datetime64[ns]', freq='D')

In [7]:
# 6x4 frame of standard-normal samples, indexed by the dates built earlier.
df = pd.DataFrame(
    data=np.random.randn(6, 4),
    index=dates,
    columns=list('abcd'),
)
df

Unnamed: 0,a,b,c,d
2018-03-03,1.761004,1.648836,-0.65691,0.535489
2018-03-04,-0.363756,1.480507,0.354068,-1.358791
2018-03-05,-0.349576,0.088785,1.165699,1.152263
2018-03-06,-1.153368,0.309072,0.836402,0.478812
2018-03-07,0.952556,-1.973463,1.178855,-0.152371
2018-03-08,0.120752,1.646287,-0.434218,0.761825


In [29]:
# Without index/columns arguments pandas falls back to 0..n-1 labels on both axes.
df1 = pd.DataFrame(np.arange(12).reshape(3, 4))
df1

Unnamed: 0,0,1,2,3
0,0,1,2,3
1,4,5,6,7
2,8,9,10,11


In [4]:
# Build a frame from a dict: each key becomes a column, and scalar values
# ('A', 'B', 'F') are broadcast to the length of the array-like columns.
df2 = pd.DataFrame(
    {
        'A': 1.,
        'B': pd.Timestamp('20180303'),
        'C': pd.Series(1, index=[0, 1, 2, 3], dtype='float32'),
        'D': np.array([3, 3, 3, 3], dtype='int32'),
        'E': pd.Categorical(["test", "train", "test", "train"]),
        'F': 'foo',
    }
)
df2

Unnamed: 0,A,B,C,D,E,F
0,1.0,2018-03-03,1.0,3,test,foo
1,1.0,2018-03-03,1.0,3,train,foo
2,1.0,2018-03-03,1.0,3,test,foo
3,1.0,2018-03-03,1.0,3,train,foo


In [5]:
# Inspect df2 from several angles; each of the first six views is followed
# by a space, a newline and a blank line, exactly as print(x, '\n') would emit.
print(df2.dtypes, end=' \n\n')      # per-column dtypes
print(df2.index, end=' \n\n')       # row labels
print(df2.columns, end=' \n\n')     # column labels
print(df2.values, end=' \n\n')      # underlying values as an array
print(df2.describe(), end=' \n\n')  # summary statistics
print(df2.T, end=' \n\n')           # transpose
# Columns in descending label order.
print(df2.sort_index(axis='columns', ascending=False))
# Rows ordered by the values of column 'E'.
print(df2.sort_values('E'))


A           float64
B    datetime64[ns]
C           float32
D             int32
E          category
F            object
dtype: object 

Int64Index([0, 1, 2, 3], dtype='int64') 

Index(['A', 'B', 'C', 'D', 'E', 'F'], dtype='object') 

[[1.0 Timestamp('2018-03-03 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2018-03-03 00:00:00') 1.0 3 'train' 'foo']
 [1.0 Timestamp('2018-03-03 00:00:00') 1.0 3 'test' 'foo']
 [1.0 Timestamp('2018-03-03 00:00:00') 1.0 3 'train' 'foo']] 

         A    C    D
count  4.0  4.0  4.0
mean   1.0  1.0  3.0
std    0.0  0.0  0.0
min    1.0  1.0  3.0
25%    1.0  1.0  3.0
50%    1.0  1.0  3.0
75%    1.0  1.0  3.0
max    1.0  1.0  3.0 

                     0                    1                    2  \
A                    1                    1                    1   
B  2018-03-03 00:00:00  2018-03-03 00:00:00  2018-03-03 00:00:00   
C                    1                    1                    1   
D                    3                    3                   

### pandas选择数据：

In [15]:
# Date-indexed 6x4 frame holding 0..23 row by row; used by the selection examples.
dates = pd.date_range(start='20180303', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,8,9,10,11
2018-03-06,12,13,14,15
2018-03-07,16,17,18,19
2018-03-08,20,21,22,23


In [119]:
# Column 'A' via key lookup, then the same column via attribute access.
print(df['A'], end=' \n\n')
print(df.A, end=' \n\n')
# First three rows by position.
print(df[:3], end=' \n\n')
# Rows between two date labels.
print(df['2018-03-04':'2018-03-7'])

2018-03-03     0
2018-03-04     4
2018-03-05     8
2018-03-06    12
2018-03-07    16
2018-03-08    20
Freq: D, Name: A, dtype: int64 

2018-03-03     0
2018-03-04     4
2018-03-05     8
2018-03-06    12
2018-03-07    16
2018-03-08    20
Freq: D, Name: A, dtype: int64 

            A  B   C   D
2018-03-03  0  1   2   3
2018-03-04  4  5   6   7
2018-03-05  8  9  10  11 

             A   B   C   D
2018-03-04   4   5   6   7
2018-03-05   8   9  10  11
2018-03-06  12  13  14  15
2018-03-07  16  17  18  19


In [115]:
# Select by label with .loc
# A single row label returns that row as a Series.
print(df.loc['20180303'], end=' \n\n')
# Row label plus a list of column labels narrows the row to those columns.
print(df.loc['20180303', ['A', 'B']])

A    0
B    1
C    2
D    3
Name: 2018-03-03 00:00:00, dtype: int64 

A    0
B    1
Name: 2018-03-03 00:00:00, dtype: int64


In [70]:
# Select by integer position with .iloc: rows 3-4, columns 1-2.
print(df.iloc[3:5, 1:3])

             B   C
2018-03-06  13  14
2018-03-07  17  18


In [113]:
# Mixed selection: rows and columns addressed by a mix of position and label.
# `.ix` was deprecated in pandas 0.20 and removed in 1.0, so each lookup is
# spelled out with the explicit .iloc (position) / .loc (label) indexers.

# First three rows by position, columns 'A' and 'C' by label.
print(df.iloc[:3].loc[:, ['A', 'C']])
# Rows by date-label slice (both endpoints included), columns by label.
print(df.loc['20180303':'20180305', ['A', 'C']])
# Rows by date-label slice, columns 0 and 2 by position.
print(df.loc['20180303':'20180305'].iloc[:, [0, 2]])

            A   C
2018-03-03  0   2
2018-03-04  4   6
2018-03-05  8  10
            A   C
2018-03-03  0   2
2018-03-04  4   6
2018-03-05  8  10
            A   C
2018-03-03  0   2
2018-03-04  4   6
2018-03-05  8  10


* 竖的标签（行）：只能选单行（如 df.loc['20180303']），或用切片 df['2018-03-04':'2018-03-7']
* 横的标签（列）：只能用列表 ['A','B']，不能直接写 df['A':'C']；列切片需写成 df.loc[:, 'A':'C']

In [122]:
# Boolean indexing: show the full frame, then only the rows whose 'A' value exceeds 8.
print(df)
print(df[df['A'] > 8])

             A   B   C   D
2018-03-03   0   1   2   3
2018-03-04   4   5   6   7
2018-03-05   8   9  10  11
2018-03-06  12  13  14  15
2018-03-07  16  17  18  19
2018-03-08  20  21  22  23
             A   B   C   D
2018-03-06  12  13  14  15
2018-03-07  16  17  18  19
2018-03-08  20  21  22  23


### pandas 设置值：

In [177]:
# Fresh date-indexed 6x4 frame (values 0..23) for the assignment examples.
dates = pd.date_range(start='20180303', periods=6)
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,8,9,10,11
2018-03-06,12,13,14,15
2018-03-07,16,17,18,19
2018-03-08,20,21,22,23


In [178]:
# Overwrite the single cell at row position 2, column position 2.
df.iloc[2, 2] = 1111
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,8,9,1111,11
2018-03-06,12,13,14,15
2018-03-07,16,17,18,19
2018-03-08,20,21,22,23


In [179]:
# Overwrite the cell addressed by row label 2018-03-06 and column label 'B'.
df.loc['20180306', 'B'] = 2222
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,8,9,1111,11
2018-03-06,12,2222,14,15
2018-03-07,16,17,18,19
2018-03-08,20,21,22,23


In [180]:
# Zero out column 'A' wherever its value exceeds 4.
# A single .loc call with (row mask, column label) writes into df itself.
# The chained form df.A[mask] = 0 assigns through an intermediate Series,
# which pandas flags as chained assignment and which leaves df unchanged
# when Copy-on-Write is active.
df.loc[df['A'] > 4, 'A'] = 0
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,0,9,1111,11
2018-03-06,0,2222,14,15
2018-03-07,0,17,18,19
2018-03-08,0,21,22,23


In [188]:
# Rebuild the frame, then zero every row whose 'A' value exceeds 4.
# A boolean Series used as the key selects whole rows, so all columns are set.
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df[df['A'] > 4] = 0
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,0,0,0,0
2018-03-06,0,0,0,0
2018-03-07,0,0,0,0
2018-03-08,0,0,0,0


In [191]:
# Add an all-NaN column 'F' and a column 'E' built from a Series that shares
# the frame's date index.
df['F'] = np.nan
df['E'] = pd.Series([1, 2, 3, 4, 5, 6], index=dates)
# Print only after both columns exist, so a single run shows the complete frame.
print(df)

            A  B  C  D   F  E
2018-03-03  0  1  2  3 NaN  1
2018-03-04  4  5  6  7 NaN  2
2018-03-05  0  0  0  0 NaN  3
2018-03-06  0  0  0  0 NaN  4
2018-03-07  0  0  0  0 NaN  5
2018-03-08  0  0  0  0 NaN  6


### pandas处理丢失数据：

In [194]:
# Fresh 6x4 integer frame (values 0..23) for the missing-data examples.
df = pd.DataFrame(
    np.arange(24).reshape(6, 4),
    index=dates,
    columns=list('ABCD'),
)
df

Unnamed: 0,A,B,C,D
2018-03-03,0,1,2,3
2018-03-04,4,5,6,7
2018-03-05,8,9,10,11
2018-03-06,12,13,14,15
2018-03-07,16,17,18,19
2018-03-08,20,21,22,23


In [197]:
# Punch two holes into the frame: (row 0, column 'B') and (row 1, column 'C').
# NaN is a float, so the two integer columns are cast to float64 up front
# instead of relying on pandas silently upcasting during the assignment
# (that implicit upcast is deprecated since pandas 2.1).
df = df.astype({'B': 'float64', 'C': 'float64'})
df.iloc[0, 1] = np.nan
df.iloc[1, 2] = np.nan
df

Unnamed: 0,A,B,C,D
2018-03-03,0,,2.0,3
2018-03-04,4,5.0,,7
2018-03-05,8,9.0,10.0,11
2018-03-06,12,13.0,14.0,15
2018-03-07,16,17.0,18.0,19
2018-03-08,20,21.0,22.0,23


In [207]:
# Drop every row containing at least one NaN (how is 'any' or 'all'; 'any' is the default).
print(df.dropna(axis=0, how='any'))
# Drop only those columns in which every value is NaN.
print(df.dropna(axis=1, how='all'))
# Replace each NaN with 0.
print(df.fillna(value=0))
# Boolean mask marking the missing cells.
print(df.isnull())
# True when at least one cell is missing. Reducing over the underlying array
# gives a single boolean directly, without a redundant `== True` comparison
# and without depending on how NumPy reduces a DataFrame.
print(df.isnull().values.any())


             A     B     C   D
2018-03-05   8   9.0  10.0  11
2018-03-06  12  13.0  14.0  15
2018-03-07  16  17.0  18.0  19
2018-03-08  20  21.0  22.0  23
             A     B     C   D
2018-03-03   0   NaN   2.0   3
2018-03-04   4   5.0   NaN   7
2018-03-05   8   9.0  10.0  11
2018-03-06  12  13.0  14.0  15
2018-03-07  16  17.0  18.0  19
2018-03-08  20  21.0  22.0  23
             A     B     C   D
2018-03-03   0   0.0   2.0   3
2018-03-04   4   5.0   0.0   7
2018-03-05   8   9.0  10.0  11
2018-03-06  12  13.0  14.0  15
2018-03-07  16  17.0  18.0  19
2018-03-08  20  21.0  22.0  23
                A      B      C      D
2018-03-03  False   True  False  False
2018-03-04  False  False   True  False
2018-03-05  False  False  False  False
2018-03-06  False  False  False  False
2018-03-07  False  False  False  False
2018-03-08  False  False  False  False
True


### pandas导入导出：

In [215]:
# Load the CSV into a DataFrame, show it, then save the same frame as a pickle.
data = pd.read_csv('Student.csv')
print(data)
data.to_pickle('student.pickle')

    Student ID   name  age  grender
0         1100      A    21  Female
1         1101      B    22    Male
2         1102      C    12  Female
3         1103     ZL    32    Male
4         1104    ZYJ    16  Female
5         1105     JL    18  Female
6         1106    DOG    16  Female
7         1107    CAT    24  Female
8         1108      D    25    Male
9         1109      E    26    Male
10        1110      F    26    Male
11        1111      G    19  Female
12        1112      H    17    Male
13        1113      I    21    Male
14        1114  KEELY    12  Female
