In [2]:
from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity="all"

In [22]:
import numpy as np

import pandas as pd

# Show the installed pandas version so readers know which release produced the recorded outputs.
pd.__version__

'1.2.1'

# Pandas对象简介


## pandas的Series对象
Pandas的Series对象是一个**带索引**数据构成的一维数组
NumPy数组通过隐式定义的整数索引获取数值，而Pandas的Series对象用一种显式定义的索引与数值关联。

### Series是通用的Numpy数组

In [None]:
# Build a Series from a plain list; no index is given, so pandas supplies
# a default integer one.
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0])
data
# The stored values as a NumPy array.
data.values
# The index object attached to the values.
data.index

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64

array([0.25, 0.5 , 0.75, 1.  ])

RangeIndex(start=0, stop=4, step=1)

### Series是特殊的字典

In [5]:
# A dict maps directly onto a Series: the keys become the index labels.
population_dict = {
    "California": 38332521,
    "Texas": 26448193,
    "New York": 19651127,
    "Florida": 19552860,
    "Illinois": 12882135,
}
population = pd.Series(data=population_dict)
population

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
dtype: int64

### 创建Series对象

In [None]:
# Series from a list: the index defaults to an integer sequence.
pd.Series(data=[2, 4, 6])

0    2
1    4
2    6
dtype: int64

In [None]:
# Series from a scalar: the value is repeated for every label in the index.
pd.Series(data=5, index=[100, 200, 300])

100    5
200    5
300    5
dtype: int64

In [None]:
# Series from a dict: the dict keys become the index labels.
pd.Series(data={2: "a", 1: "b", 3: "c"})

2    a
1    b
3    c
dtype: object

In [None]:
# With an explicit index, only the dict entries whose keys appear in that
# index are kept (here 3 and 2, in that order).
pd.Series(data={2: "a", 1: "b", 3: "c"}, index=[3, 2])

## Pandas的DataFrame对象

### DataFrame是通用的NumPy数组

In [7]:

# Per-state area values, keyed by the same state names as `population`.
area_dict = {
    "California": 423967,
    "Texas": 695662,
    "New York": 141297,
    "Florida": 170312,
    "Illinois": 149995,
}
area = pd.Series(data=area_dict)
area
# Combine the two Series into one DataFrame, one column per Series.
states = pd.DataFrame(data={"population": population, "area": area})
states
# Row labels of the frame.
states.index
# Column labels of the frame.
states.columns

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
dtype: int64

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


Index(['California', 'Texas', 'New York', 'Florida', 'Illinois'], dtype='object')

Index(['population', 'area'], dtype='object')

### DataFrame是特殊的字典
在NumPy二维数组中，`data[0]`返回第一行；而在DataFrame中，`data['col0']`返回的是第一列。

In [13]:
# Dictionary-style access by column name returns that column as a Series.
states["population"]
# Row access by index label goes through the .loc indexer.
states.loc["Texas"]

California    38332521
Texas         26448193
New York      19651127
Florida       19552860
Illinois      12882135
Name: population, dtype: int64

population    26448193
area            695662
Name: Texas, dtype: int64

### 创建DataFrame对象

In [14]:
# Build a one-column DataFrame from a single Series object.
pd.DataFrame(data=population, columns=["population"])

Unnamed: 0,population
California,38332521
Texas,26448193
New York,19651127
Florida,19552860
Illinois,12882135


In [16]:
# Build a DataFrame from a list of dicts; keys missing from a record are
# filled with NaN ("not a number").
data = [dict(a=i, b=2 * i) for i in range(3)]
pd.DataFrame(data=data)
pd.DataFrame(data=[dict(a=1, b=2), dict(b=3, c=4)])

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [18]:
# Build a DataFrame from a dict of Series objects, one column per Series.
pd.DataFrame(data={"population": population, "area": area})

Unnamed: 0,population,area
California,38332521,423967
Texas,26448193,695662
New York,19651127,141297
Florida,19552860,170312
Illinois,12882135,149995


In [23]:
# Build a DataFrame from a two-dimensional NumPy array, naming the rows and
# columns explicitly.
pd.DataFrame(
    data=np.random.rand(3, 2),
    index=["a", "b", "c"],
    columns=["foo", "bar"],
)

Unnamed: 0,foo,bar
a,0.920428,0.397977
b,0.363318,0.030885
c,0.11177,0.589904


In [25]:
# Build a DataFrame from a NumPy structured array; the field names are used
# as the column names.
A = np.zeros(shape=3, dtype=[("A", "i8"), ("B", "f8")])
pd.DataFrame(data=A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


## Pandas的Index对象
可以将Index对象看作一个**不可变数组或有序集合**
- 当看作不可变数组时，可以进行切片操作
- 当看作有序集合时，可以进行集合操作，如并集、交集、对称差集

In [29]:
ind = pd.Index(data=[2, 3, 5, 7, 11])
# Treated as an immutable array: element access and slicing work as on an array.
ind[1]
ind[::2]
# Array-style descriptive attributes are available as well.
ind.size
ind.shape
ind.ndim
ind.dtype

3

Int64Index([2, 5, 11], dtype='int64')

5

(5,)

1

dtype('int64')

In [32]:
# Treat the Index as an ordered set.
indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])
# Use the named set methods rather than the &, | and ^ operators: on Index
# objects those operators are deprecated as set operations in pandas 1.x
# (they emit a FutureWarning) and act as element-wise logical operators in
# pandas 2.0+, so they would no longer produce set results.
indA.intersection(indB)          # elements present in both
indA.union(indB)                 # elements present in either
indA.symmetric_difference(indB)  # elements present in exactly one

  after removing the cwd from sys.path.


Int64Index([3, 5, 7], dtype='int64')

Int64Index([1, 2, 3, 5, 7, 9, 11], dtype='int64')

Int64Index([1, 2, 9, 11], dtype='int64')

# 数据取值与选择

## Series数据选择方法

### 将Series看作字典

In [38]:
data = pd.Series(data=[0.25, 0.5, 0.75, 1.0], index=["a", "b", "c", "d"])
data
# Dictionary-style operations: lookup by key, membership test, keys and items.
data["b"]
"a" in data
data.keys()
list(data.items())
# Assigning to a label that does not exist yet extends the Series in place.
data["e"] = 1.25
data

a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64

0.5

True

Index(['a', 'b', 'c', 'd'], dtype='object')

[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]

a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64

### 将Series看作一维数组
当使用显式索引作切片时，结果**包含**最后一个索引；使用隐式索引作切片时，结果**不包含**最后一个索引

In [43]:
# Slice by explicit labels.
data["a":"c"]
# Slice by implicit integer position.
data[0:2]
# Boolean mask.
data[(data > 0.3) & (data < 0.8)]
# Fancy indexing with a list of labels.
data[["a", "c"]]

a    0.25
b    0.50
c    0.75
dtype: float64

a    0.25
b    0.50
dtype: float64

b    0.50
c    0.75
dtype: float64

a    0.25
c    0.75
dtype: float64

### 索引器：loc、iloc和ix（ix已在pandas 1.0中移除）

In [45]:
data = pd.Series(data=["a", "b", "c"], index=[1, 3, 5])
data
# Plain indexing with an integer uses the explicit index labels...
data[1]
# ...whereas plain slicing with integers uses the implicit positions.
data[1:3]


1    a
3    b
5    c
dtype: object

'a'

3    b
5    c
dtype: object

In [46]:
# Indexers are not methods of the Series; they are attributes that expose a
# slicing interface.
# With .loc, both single-item access and slicing use the explicit index labels.
data.loc[1]
data.loc[1:3]

'a'

1    a
3    b
dtype: object

In [48]:
# With .iloc, both single-item access and slicing use the implicit integer positions.

data.iloc[1]
data.iloc[1:3]
# A third indexer, ix, was a hybrid of the two used mainly with DataFrame
# objects; it has been removed from pandas (1.0 and later), so use loc or
# iloc instead.

'b'

3    b
5    c
dtype: object

## DataFrame数据选择方法

### 将DataFrame看作字典

In [50]:

# Two Series sharing the same state-name index.
area = pd.Series(
    data={
        "California": 423967,
        "Texas": 695662,
        "New York": 141297,
        "Florida": 170312,
        "Illinois": 149995,
    }
)
pop = pd.Series(
    data={
        "California": 38332521,
        "Texas": 26448193,
        "New York": 19651127,
        "Florida": 19552860,
        "Illinois": 12882135,
    }
)
# Combine them into one DataFrame with one column per Series.
data = pd.DataFrame(data={"area": area, "pop": pop})
data

Unnamed: 0,area,pop
California,423967,38332521
Texas,695662,26448193
New York,141297,19651127
Florida,170312,19552860
Illinois,149995,12882135
