In [2]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np

## Series

In [None]:
obj = Series([4,7,-5,3])
print(obj)
print(obj.values)
print(obj.index)

In [5]:
obj2 = Series([4,7,-5,3], index=['d','b','a','c'])
print(obj2)
print(obj2.index)

d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')


In [6]:
# 索引取值
print(obj2['a'])
print(obj2['d'])
print(obj2[['c','a','d']])


-5
4
c    3
a   -5
d    4
dtype: int64


In [8]:
# 使用numpy运算
# 布尔型数组过滤
print(obj2[obj2>0])
# 标量乘法
print(obj2*2)
# 数学函数
print(np.exp(obj2))

d    4
b    7
c    3
dtype: int64
d     8
b    14
a   -10
c     6
dtype: int64
d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64


In [9]:
# 将其看成有序字典
print('b' in obj2)
print('e' in obj2)

True
False


In [10]:
# 根据字典创建
sdata = {'a':1,'b':2}
obj3 = Series(sdata)
print(obj3)

a    1
b    2
dtype: int64


In [12]:
ind = ['a','b','c']
obj4 = Series(sdata, index=ind)
print(obj4)

a    1.0
b    2.0
c    NaN
dtype: float64


In [14]:
# nan表示缺失或NA值
# 检测缺失数据: treating None, NaN or NA as null
print(pd.isnull(obj4))
print(pd.isna(obj4))

a    False
b    False
c     True
dtype: bool
a    False
b    False
c     True
dtype: bool


In [15]:
print(pd.notnull(obj4))
print(pd.notna(obj4))

a     True
b     True
c    False
dtype: bool
a     True
b     True
c    False
dtype: bool


In [17]:
print(obj4.isnull)
print(obj4.isna)

<bound method Series.isnull of a    1.0
b    2.0
c    NaN
dtype: float64>
<bound method Series.isna of a    1.0
b    2.0
c    NaN
dtype: float64>


In [18]:
# 在算术运算中自动对齐不同索引的数据
print(obj3+obj4)

a    2.0
b    4.0
c    NaN
dtype: float64


In [19]:
# series对象及其索引的name属性
obj4.name = 'zhangsan'
obj4.index.name = 'zhangsan_index'
print(obj4)

zhangsan_index
a    1.0
b    2.0
c    NaN
Name: zhangsan, dtype: float64


In [20]:
# 修改索引
obj4.index = ['aa','bb','cc']
print(obj4)

aa    1.0
bb    2.0
cc    NaN
Name: zhangsan, dtype: float64


## DataFrame

In [21]:
data = {'state':['Ohio','Ohio','Ohio','Nevada','Nevada'],
        'year':[2000,2001,2002,2001,2002],
        'pop':[1.5,1.7,3.6,2.4,2.9]}
# 会自动排序
frame = DataFrame(data)
print(frame)

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9


In [22]:
# 指定列的顺序
print(DataFrame(data, columns=['year','state','pop']))


   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9


In [23]:
frame2 = DataFrame(data,
                   columns=['year','state','pop','debt'],
                   index=['one','two','three','four','five'])
print(frame2)
print(frame2.columns)

       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [24]:
# 获取列，这是一个series
print(frame2['state'])
print(frame2.year)

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2001
five     2002
Name: year, dtype: int64


In [26]:
# 获取行，这是一个series
print(frame2.loc['three'])


year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object


In [28]:
# 修改列
frame2['debt'] = 16.5
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5


In [29]:
frame2['debt'] = np.arange(5)
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5     0
two    2001    Ohio  1.7     1
three  2002    Ohio  3.6     2
four   2001  Nevada  2.4     3
five   2002  Nevada  2.9     4


In [30]:
# 使用series修改列，会自动匹配dataframe的索引
val = Series([-1.2,-1.5,-1.7], index=['two','four','five'])
frame2['debt'] = val
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7


In [31]:
# 创建一个新列
frame2['eastern'] = frame2.state == 'Ohio'
print(frame2)

       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False


In [32]:
# 删除列
del frame2['eastern']
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7


In [33]:
# 嵌套字典
pop = {
    'Nevada': {2001:2.4,2002:2.9},
    'Ohio': {2000:1.5,2001:1.7,2002:3.6}
}
frame3 = DataFrame(pop)
print(frame3)

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5


In [34]:
# 转置
print(frame3.T)


        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5


In [35]:
# 同时指定索引
print(DataFrame(pop, index=[2001,2002,2003]))

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2003     NaN   NaN


In [36]:
pdata = {'Ohio': frame3['Ohio'][:-1],
         'Nevada': frame3['Nevada'][:2]}
print(DataFrame(pdata))

      Ohio  Nevada
2001   1.7     2.4
2002   3.6     2.9


In [37]:
frame3.index.name='year'
frame3.columns.name='state'
print(frame3)

state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5


In [38]:
print(frame3.values)
print(frame2.values)

[[2.4 1.7]
 [2.9 3.6]
 [nan 1.5]]
[[2000 'Ohio' 1.5 nan]
 [2001 'Ohio' 1.7 -1.2]
 [2002 'Ohio' 3.6 nan]
 [2001 'Nevada' 2.4 -1.5]
 [2002 'Nevada' 2.9 -1.7]]


## 索引对象

In [39]:
obj = Series(range(3), index=['a','b','c'])
index = obj.index
print(index)
print(index[1:])

Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')


In [40]:
# Index对象不可变
index[1] = 'd'

TypeError: Index does not support mutable operations

In [41]:
index = pd.Index(np.arange(3))
obj2 = Series([1.5,-2.5,0], index=index)
print(obj2)
print(obj2.index is index)

0    1.5
1   -2.5
2    0.0
dtype: float64
True


In [43]:
# Index也类似一个固定大小的集合
print(frame3)
print('Ohio' in frame3.columns)
print(2003 in frame3.index)
print(frame3.index.is_unique)

state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5
True
False
True
