# Pandas基础

In [4]:
import pandas as pd

## pandas功能

<ol>
<li>具有按轴自动或显式数据对齐功能</li>
<li>集成时间序列功能</li>
<li>既能处理时间序列数据也能处理非时间序列数据</li>
<li>数学运算和约简</li>
<li>灵活处理缺失数据</li>
<li>支持常见数据库中的关系型运算（如合并 merge、连接 join）</li>
</ol>

In [5]:
from pandas import DataFrame,Series

## Series:索引在左，值在右

In [6]:
# Build a Series from a plain list; with no index given, pandas supplies 0..N-1.
obj = Series(data=[4, 7, -5, 3])
obj

0    4
1    7
2   -5
3    3
dtype: int64

In [7]:
# The data held by the Series, exposed as a NumPy array.
obj.values

array([ 4,  7, -5,  3], dtype=int64)

In [8]:
# The index object holding the row labels (the default integer labels here).
obj.index

Int64Index([0, 1, 2, 3], dtype='int64')

In [9]:
# Same values as before, but labelled with explicit string keys.
obj2 = Series(data=[4, 7, -5, 3], index=list('abcd'))
obj2

a    4
b    7
c   -5
d    3
dtype: int64

In [10]:
# The labels passed through index= are stored as the Series' index.
obj2.index

Index([u'a', u'b', u'c', u'd'], dtype='object')

In [12]:
# Select several entries by label; .loc makes the label-based lookup explicit.
obj2.loc[['a', 'b']]

a    4
b    7
dtype: int64

In [13]:
# Boolean-mask filtering: keep the strictly positive entries, labels preserved.
obj2.loc[obj2 > 0]

a    4
b    7
d    3
dtype: int64

In [14]:
# Membership on a Series is decided by its index labels, not its values.
'b' in obj2.index

True

In [15]:
# A dict maps directly onto a Series: keys become the index, values the data.
sdata = dict(ohio=35000, texas=71000, oregon=16000, utah=5000)
obj3 = Series(data=sdata)
obj3

ohio      35000
oregon    16000
texas     71000
utah       5000
dtype: int64

In [16]:
# Labels to index by: 'california' is not a key of sdata, and 'utah' is left out.
states = ['california','ohio','oregon','texas']

In [18]:
# Passing index= together with a dict selects and orders entries by those labels;
# a label with no matching dict key ends up as NaN.
obj4 = Series(data=sdata, index=states)
obj4

california      NaN
ohio          35000
oregon        16000
texas         71000
dtype: float64

In [20]:
# Flag missing entries; the method form is equivalent to pd.isnull(obj4).
obj4.isnull()

california     True
ohio          False
oregon        False
texas         False
dtype: bool

In [21]:
# Flag present entries; the method form is equivalent to pd.notnull(obj4).
obj4.notnull()

california    False
ohio           True
oregon         True
texas          True
dtype: bool

In [22]:
# Arithmetic aligns on index labels; a label missing from either operand yields NaN.
obj3.add(obj4)

california       NaN
ohio           70000
oregon         32000
texas         142000
utah             NaN
dtype: float64

In [23]:
# Name the Series itself; the repr then ends with "Name: population".
obj4.name = 'population'

In [24]:
# Name the index as well; it is printed as a header line above the labels.
obj4.index.name = 'state'

In [25]:
# Display the Series so its name and its index name show up in the repr.
obj4

state
california      NaN
ohio          35000
oregon        16000
texas         71000
Name: population, dtype: float64

## DataFrame:表格型数据结构

In [28]:
# Column-oriented input: each key is a column name and each list holds that
# column's values (all lists share the same length).
data = dict(
    state=['ohio', 'ohio', 'ohio', 'nevada', 'nevada'],
    year=[2000, 2001, 2002, 2001, 2002],
    pop=[1.5, 1.7, 3.6, 2.4, 2.9],
)
frame = DataFrame(data=data)
frame

Unnamed: 0,pop,state,year
0,1.5,ohio,2000
1,1.7,ohio,2001
2,3.6,ohio,2002
3,2.4,nevada,2001
4,2.9,nevada,2002


## 指定列顺序

In [29]:
# columns= fixes the left-to-right column order of the resulting frame.
frame = DataFrame(data=data, columns=['year', 'state', 'pop'])
frame

Unnamed: 0,year,state,pop
0,2000,ohio,1.5
1,2001,ohio,1.7
2,2002,ohio,3.6
3,2001,nevada,2.4
4,2002,nevada,2.9


## 指定index

In [30]:
# index= replaces the default integer row labels with the given ones.
frame = DataFrame(
    data=data,
    columns=['year', 'state', 'pop'],
    index=['one', 'two', 'three', 'four', 'five'],
)
frame

Unnamed: 0,year,state,pop
one,2000,ohio,1.5
two,2001,ohio,1.7
three,2002,ohio,3.6
four,2001,nevada,2.4
five,2002,nevada,2.9


## 获取列

In [31]:
# Dict-style access: the column comes back as a Series carrying the frame's row labels.
frame['state']

one        ohio
two        ohio
three      ohio
four     nevada
five     nevada
Name: state, dtype: object

In [32]:
# Attribute-style access; only usable when the column name is a valid Python
# identifier that does not collide with an existing DataFrame attribute or method.
frame.year

one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64

## 获取行

In [33]:
# Fetch one row by its label. `.ix` was deprecated in pandas 0.20 and removed in
# 1.0, so the label-based `.loc` accessor is used instead.
frame.loc['three']

year     2002
state    ohio
pop       3.6
Name: three, dtype: object

## 创建新列

In [37]:
# Assigning a scalar to a new column name creates the column and fills every row with it.
frame['debt'] = 16.5
frame

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,16.5
two,2001,ohio,1.7,16.5
three,2002,ohio,3.6,16.5
four,2001,nevada,2.4,16.5
five,2002,nevada,2.9,16.5


In [38]:
# Derive a boolean column: True where the row's state is 'ohio'.
frame['eastern'] = frame['state'].eq('ohio')
frame

Unnamed: 0,year,state,pop,debt,eastern
one,2000,ohio,1.5,16.5,True
two,2001,ohio,1.7,16.5,True
three,2002,ohio,3.6,16.5,True
four,2001,nevada,2.4,16.5,False
five,2002,nevada,2.9,16.5,False


## 删除列

In [39]:
# `del` removes the column from the frame in place.
# NOTE(review): re-running this cell by itself raises KeyError, since 'eastern' is already gone.
del frame['eastern']
frame

Unnamed: 0,year,state,pop,debt
one,2000,ohio,1.5,16.5
two,2001,ohio,1.7,16.5
three,2002,ohio,3.6,16.5
four,2001,nevada,2.4,16.5
five,2002,nevada,2.9,16.5
