In [192]:
import pandas as pd
import numpy as np

## 数据结构

### Series

In [193]:
# A Series built from a plain list gets a default integer index (0..n-1).
obj = pd.Series([4, 7, -5, 3])
# Printed in order: the Series itself (index on the left, values on the
# right), then its underlying values, then its index.
print(obj, obj.values, obj.index, sep='\n')

# Same values, this time with explicit string labels as the index.
obj2 = pd.Series([4, 7, -5, 3], index=['d', 'b', 'a', 'c'])
print(obj2, obj2.index, sep='\n')

print(obj2.loc['a'])  # read a single value by its label
obj2.loc['d'] = 6  # overwrite the value stored under an existing label
print(obj2.loc[['c', 'a', 'd']])  # select several labels with a list

0    4
1    7
2   -5
3    3
dtype: int64
[ 4  7 -5  3]
RangeIndex(start=0, stop=4, step=1)
d    4
b    7
a   -5
c    3
dtype: int64
Index(['d', 'b', 'a', 'c'], dtype='object')
-5
c    3
a   -5
d    6
dtype: int64


In [194]:
# Boolean mask: keep only the entries whose value is positive.
print(obj2.loc[obj2 > 0])
# Scalar arithmetic and NumPy functions are applied element-wise and the
# index labels are carried over to the result.
print(obj2.mul(2), np.exp(obj2), sep='\n')

# Membership is tested against the index labels.
print('b' in obj2.index)

# Build a Series from a dict: the dict keys become the index.
sdata = {
    'Ohio': 35000,
    'Texas': 71000,
    'Oregon': 16000,
    'Utah': 5000,
}
obj3 = pd.Series(sdata)
print(obj3)

d    6
b    7
c    3
dtype: int64
d    12
b    14
a   -10
c     6
dtype: int64
d     403.428793
b    1096.633158
a       0.006738
c      20.085537
dtype: float64
True
Ohio      35000
Texas     71000
Oregon    16000
Utah       5000
dtype: int64


In [195]:
# Build a Series from the dict while dictating the index explicitly.
# 'California' has no entry in sdata, so its value shows up as NaN.
states = ['California', 'Ohio', 'Oregon', 'Texas']
obj4 = pd.Series(sdata, index=states)
print(obj4)
# Element-wise missing / not-missing flags.
print(obj4.isna(), obj4.notna(), sep='\n')
# Arithmetic aligns on index labels; labels present on only one side give NaN.
print(obj3.add(obj4))

# Both the Series and its index can carry a name, shown when printing.
obj4.index.name = 'state'
obj4.name = 'population'
print(obj4)

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64
California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool
California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool
California         NaN
Ohio           70000.0
Oregon         32000.0
Texas         142000.0
Utah               NaN
dtype: float64
state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64


In [196]:

# Assigning to .index relabels the existing Series; the values are untouched.
obj.index = pd.Index(['Bob', 'Steve', 'Jeff', 'Ryan'])
print(obj)

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64


### DataFrame

In [None]:
# A dict of equal-length lists: each key becomes one DataFrame column.
data = {
    'state': ['Ohio', 'Ohio', 'Ohio', 'Nevada', 'Nevada', 'Nevada'],
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data)
# Printed in order: the whole frame, its first five rows, its last three rows.
print(frame, frame.head(5), frame.tail(3), sep='\n')

    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
    state  year  pop
0    Ohio  2000  1.5
1    Ohio  2001  1.7
2    Ohio  2002  3.6
3  Nevada  2001  2.4
4  Nevada  2002  2.9
    state  year  pop
3  Nevada  2001  2.4
4  Nevada  2002  2.9
5  Nevada  2003  3.2
   year   state  pop
0  2000    Ohio  1.5
1  2001    Ohio  1.7
2  2002    Ohio  3.6
3  2001  Nevada  2.4
4  2002  Nevada  2.9
5  2003  Nevada  3.2
       year   state  pop debt
one    2000    Ohio  1.5  NaN
two    2001    Ohio  1.7  NaN
three  2002    Ohio  3.6  NaN
four   2001  Nevada  2.4  NaN
five   2002  Nevada  2.9  NaN
six    2003  Nevada  3.2  NaN
Index(['state', 'year', 'pop'], dtype='object')


In [None]:
# `columns` fixes the order in which the columns appear.
print(pd.DataFrame(data, columns=['year', 'state', 'pop']))

# Specify both the column order and the row labels. 'debt' is not a key of
# `data`, so that column is created and filled with NaN.
frame2 = pd.DataFrame(
    data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)
print(frame2)

# Column labels of the original frame.
print(frame.columns)

In [198]:
# Pull out single columns; each comes back as a Series that keeps the
# frame's row labels.
print(frame2['state'], frame2['year'], sep='\n')

# Pull out a single row by its label.
print(frame2.loc['three'])

one        Ohio
two        Ohio
three      Ohio
four     Nevada
five     Nevada
six      Nevada
Name: state, dtype: object
one      2000
two      2001
three    2002
four     2001
five     2002
six      2003
Name: year, dtype: int64
year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object


In [199]:
# Assigning a scalar fills the whole column with that value.
frame2['debt'] = 16.5
print(frame2)
# Assigning an array replaces the column element by element; the array
# length has to match the number of rows.
frame2['debt'] = np.arange(len(frame2), dtype=float)
print(frame2)

       year   state  pop  debt
one    2000    Ohio  1.5  16.5
two    2001    Ohio  1.7  16.5
three  2002    Ohio  3.6  16.5
four   2001  Nevada  2.4  16.5
five   2002  Nevada  2.9  16.5
six    2003  Nevada  3.2  16.5
       year   state  pop  debt
one    2000    Ohio  1.5   0.0
two    2001    Ohio  1.7   1.0
three  2002    Ohio  3.6   2.0
four   2001  Nevada  2.4   3.0
five   2002  Nevada  2.9   4.0
six    2003  Nevada  3.2   5.0


In [200]:
# Assigning a Series aligns it on the row labels: rows that have no matching
# label in `val` end up as NaN.
val = pd.Series([-1.2, -1.5, -1.7], index=['two', 'four', 'five'])
frame2['debt'] = val
print(frame2)

# Assigning to a new key creates a column; here a boolean flag per row.
frame2['eastern'] = frame2['state'].eq('Ohio')
print(frame2)
# `del` removes the column again.
del frame2['eastern']
print(frame2.columns)

       year   state  pop  debt
one    2000    Ohio  1.5   NaN
two    2001    Ohio  1.7  -1.2
three  2002    Ohio  3.6   NaN
four   2001  Nevada  2.4  -1.5
five   2002  Nevada  2.9  -1.7
six    2003  Nevada  3.2   NaN
       year   state  pop  debt  eastern
one    2000    Ohio  1.5   NaN     True
two    2001    Ohio  1.7  -1.2     True
three  2002    Ohio  3.6   NaN     True
four   2001  Nevada  2.4  -1.5    False
five   2002  Nevada  2.9  -1.7    False
six    2003  Nevada  3.2   NaN    False
Index(['year', 'state', 'pop', 'debt'], dtype='object')


In [201]:
# Nested dict: outer keys become columns, inner keys become row labels.
pop = {
    'Nevada': {2001: 2.4, 2002: 2.9},
    'Ohio': {2000: 1.5, 2001: 1.7, 2002: 3.6},
}
frame3 = pd.DataFrame(pop)
# The frame followed by its transpose (rows and columns swapped).
print(frame3, frame3.transpose(), sep='\n')

# A dict of Series also works as input; the slices below are positional
# (all but the last row, and the first two rows).
pdata = {
    'Ohio': frame3['Ohio'].iloc[:-1],
    'Nevada': frame3['Nevada'].iloc[:2],
}
print(pd.DataFrame(pdata))

# Name the column axis and the row axis; the names show up when printing.
frame3.columns.name = 'state'
frame3.index.name = 'year'
print(frame3)

# .values exposes the data as a two-dimensional ndarray.
print(frame3.values, frame3.values.dtype, sep='\n')
# With columns of differing types the array falls back to dtype object.
print(frame2.values, frame2.values.dtype, sep='\n')

      Nevada  Ohio
2001     2.4   1.7
2002     2.9   3.6
2000     NaN   1.5
        2001  2002  2000
Nevada   2.4   2.9   NaN
Ohio     1.7   3.6   1.5
      Ohio  Nevada
2001   1.7     2.4
2002   3.6     2.9
state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5
[[2.4 1.7]
 [2.9 3.6]
 [nan 1.5]]
float64
[[2000 'Ohio' 1.5 nan]
 [2001 'Ohio' 1.7 -1.2]
 [2002 'Ohio' 3.6 nan]
 [2001 'Nevada' 2.4 -1.5]
 [2002 'Nevada' 2.9 -1.7]
 [2003 'Nevada' 3.2 nan]]
object


DataFrame构造函数的有效输入可以有：

- 2D ndarray：数据的矩阵，行和列的标签是可选参数
- 数组、列表和元组构成的字典：每个序列成为DataFrame的一列，所有序列必须长度相等
- NumPy结构化/记录化数组：与数组构成的字典一致
- Series构成的字典：每个值成为一列，每个Series的索引联合起来形成结果的行索引，也可以显式地传递索引
- 字典构成的字典：每个内部字典成为一列，键联合起来形成结果的行索引
- 字典或Series构成的列表：列表中的一个元素形成DataFrame的一行，字典键或Series索引联合起来形成DataFrame的列标签
- 列表或元组构成的列表：与二维数组的情况一致
- 其他DataFrame：如果不显式传递索引，则会使用原DataFrame的索引
- NumPy MaskedArray：和二维数组的情况类似，但隐藏值会在结果中成为缺失值

### 索引对象

In [202]:
obj = pd.Series(range(3), index=['a', 'b', 'c'])
index = obj.index
print(obj, index, sep='\n')
# An Index can be sliced and indexed by position like an array.
print(index[1:], index[1], sep='\n')
# An Index cannot be modified, so an item assignment such as
# index[1] = 'd' is not allowed; this immutability makes it safer to
# share one Index between data structures.
lables = pd.Index(np.arange(3))  # build an Index object directly
print(lables)
obj2 = pd.Series([1.5, -2.5, 0], index=lables)
print(obj2)
# Check whether the Series holds the very same Index object that was passed in.
print(obj2.index is lables)

a    0
b    1
c    2
dtype: int64
Index(['a', 'b', 'c'], dtype='object')
Index(['b', 'c'], dtype='object')
b
Index([0, 1, 2], dtype='int64')
0    1.5
1   -2.5
2    0.0
dtype: float64
True


In [203]:
# Besides being array-like, an Index also behaves like a fixed-size set.
print(frame3)
# Membership tests: first against the column labels, then the row labels.
print('Ohio' in frame3.columns, 2003 in frame3.index, sep='\n')
# Unlike a Python set, an Index may hold duplicate labels.
dup_lables = pd.Index(['foo', 'foo', 'bar', 'bar'])
print(dup_lables)

state  Nevada  Ohio
year               
2001      2.4   1.7
2002      2.9   3.6
2000      NaN   1.5
True
False
Index(['foo', 'foo', 'bar', 'bar'], dtype='object')


In [204]:
# Remove every name created in this section from the namespace.
del (
    obj, obj2, obj3, obj4,
    frame, frame2, frame3,
    data, sdata, states, pop, pdata,
    val, index, lables, dup_lables,
)