In [1]:
import pandas as pd

In [2]:
from pandas import Series, DataFrame

## Series

Series类似于一维数组（numpy.array），但每个元素都带有一个索引标签（index）。

### index

In [21]:
# Start from a Series with the default integer index, then replace the
# index with name labels by assigning to the .index attribute.
obj = pd.Series(data=[4, 7, -5, 3])
obj.index = 'Bob Steve Jeff Ryan'.split()
obj

Bob      4
Steve    7
Jeff    -5
Ryan     3
dtype: int64

In [3]:
# Series whose index labels are supplied at construction time.
obj2 = pd.Series(data=[4, 7, -5, 3], index=list('dbac'))

In [4]:
# Select the second element by position. `.iloc` is explicitly positional;
# a bare integer key (obj2[1]) on a string-labelled index depends on a
# positional fallback that pandas has deprecated.
obj2.iloc[1]

7

In [5]:
# Look up a single value by its index label.
obj2['a']

-5

In [6]:
# A list of labels selects several values, returned in the order requested.
obj2[['c', 'a', 'd']]

c    3
a   -5
d    4
dtype: int64

### 筛选大于0

In [7]:
# Boolean mask: keep only the entries whose value is greater than zero.
obj2[obj2 > 0]

d    4
b    7
c    3
dtype: int64

### 运算

In [8]:
# Arithmetic is applied element-wise; the index labels are preserved.
obj2 * 2

d     8
b    14
a   -10
c     6
dtype: int64

exp，高等数学里以自然常数e为底的指数函数

In [9]:
import numpy as np
# NumPy functions accept a Series directly; the result shown below is a
# float64 Series that keeps the same index labels.
np.exp(obj2)

d      54.598150
b    1096.633158
a       0.006738
c      20.085537
dtype: float64

### 创建字典

In [10]:
# Plain dict keyed by state name; used below to build Series objects.
sdata = dict(Ohio=35000, Texas=71000, Oregon=16000, Utah=5000)

In [11]:
# Confirm that sdata is a plain Python dict.
type(sdata)

dict

还可以直接用现有的dict来创建series：

In [13]:
# Build a Series straight from the dict: the keys become the index labels
# and the values become the data.
obj3 = pd.Series(data=sdata)
obj3

Ohio      35000
Oregon    16000
Texas     71000
Utah       5000
dtype: int64

### 利用index排序

在旧版pandas中，由dict创建的series的index是排好序的keys（较新版本的pandas会保留dict的插入顺序）。我们也可以通过index参数传入一个自己想要的顺序：

In [15]:
# Passing index= fixes the row order explicitly: values are looked up in
# sdata by label and laid out in the order given here.
states = [
    'Utah',
    'Ohio',
    'Oregon',
    'Texas',
]
obj4 = pd.Series(data=sdata, index=states)
obj4

Utah       5000
Ohio      35000
Oregon    16000
Texas     71000
dtype: int64

### 缺失值

缺失值：
pandas中的isnull和notnull函数可以用来检测缺失数据：

In [16]:
# 'California' has no entry in sdata, so its value comes out as NaN (and the
# Series becomes float64); 'Utah' is not requested and is therefore left out.
states = [
    'California',
    'Ohio',
    'Oregon',
    'Texas',
]
obj4 = pd.Series(data=sdata, index=states)
obj4

California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
dtype: float64

In [17]:
# Module-level function: True where the value is missing (NaN).
pd.isnull(obj4)

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [18]:
# Complement of isnull: True where a value is present.
pd.notnull(obj4)

California    False
Ohio           True
Oregon         True
Texas          True
dtype: bool

In [19]:
# The same missing-value check is also available as a Series method.
obj4.isnull()

California     True
Ohio          False
Oregon        False
Texas         False
dtype: bool

In [None]:
# Cheat sheet of DataFrame missing-value helpers.
# NOTE(review): `df` is not defined by any earlier cell shown and this cell is
# unexecuted (In [None]) — define a DataFrame named `df` before running it.
df.isnull()  # True where a value is missing, False otherwise
df.isnull().sum()  # number of missing values in each column
df.dropna()  # drop every row that contains a missing value
df.dropna(axis = 1)  # drop every column that contains a missing value
df.dropna(how = 'all')  # drop only the rows in which all values are missing
df.dropna(thresh = 4)  # keep only the rows that have at least 4 non-missing values
df.dropna(subset = ['C'])  # drop the rows that have a missing value in column 'C'

### name属性

Series自身和它的index都有一个叫name的属性，这个能和其他pandas的函数进行整合：

In [20]:
# Label the Series itself and its index; both names show up in the printed
# output ("state" above the index, "Name: population" in the footer).
obj4.name = 'population'
obj4.index.name = 'state'
obj4

state
California        NaN
Ohio          35000.0
Oregon        16000.0
Texas         71000.0
Name: population, dtype: float64

## DataFrame

DataFrame表示一个长方形表格，并包含排好序的列，每一列都可以是不同的数值类型（数字，字符串，布尔值）。DataFrame有行索引和列索引（row index, column index）；   
可以类比Excel表格；   

### 构建DataFrame

构建一个dataframe的方法，用一个dict，dict里的值是list：  

dataframe也会像series一样，自动给数据赋index；在旧版pandas中列会按列名排序（如下面的输出所示），较新版本的pandas则保留dict中键的插入顺序。

In [22]:
# Column-oriented input: each key is a column name and each list holds that
# column's values, one entry per row.
data = {
    'state': ['Ohio'] * 3 + ['Nevada'] * 3,
    'year': [2000, 2001, 2002, 2001, 2002, 2003],
    'pop': [1.5, 1.7, 3.6, 2.4, 2.9, 3.2],
}
frame = pd.DataFrame(data=data)
frame

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002
5,3.2,Nevada,2003


In [23]:
# head() returns the first rows of a DataFrame (five here) — a quick way to
# peek at what a large table contains, used constantly in data analysis.
frame.head(n=5)

Unnamed: 0,pop,state,year
0,1.5,Ohio,2000
1,1.7,Ohio,2001
2,3.6,Ohio,2002
3,2.4,Nevada,2001
4,2.9,Nevada,2002


In [24]:
# columns= controls the order in which the columns are laid out.
pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop'],
)

Unnamed: 0,year,state,pop
0,2000,Ohio,1.5
1,2001,Ohio,1.7
2,2002,Ohio,3.6
3,2001,Nevada,2.4
4,2002,Nevada,2.9
5,2003,Nevada,3.2


In [26]:
# A requested column that does not exist in the data ('debt') is still
# created, filled entirely with missing values. index= supplies row labels.
frame2 = pd.DataFrame(
    data=data,
    columns=['year', 'state', 'pop', 'debt'],
    index=['one', 'two', 'three', 'four', 'five', 'six'],
)

In [32]:
# Display frame2; the 'debt' column is empty because `data` has no such key.
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,
two,2001,Ohio,1.7,
three,2002,Ohio,3.6,
four,2001,Nevada,2.4,
five,2002,Nevada,2.9,
six,2003,Nevada,3.2,


In [28]:
# Show the column labels.
frame2.columns

Index(['year', 'state', 'pop', 'debt'], dtype='object')

### 提取某一列

注意：frame2[column]能应对任何列名，但frame2.column的情况下，列名必须是有效的python变量名才行。   
返回的series有DataFrame中同样的index，而且name属性也是对应的。

In [35]:
# Bracket syntax returns the column as a Series; it carries the frame's row
# labels and is named after the column.
frame2['pop']

one      1.5
two      1.7
three    3.6
four     2.4
five     2.9
six      3.2
Name: pop, dtype: float64

In [40]:
# Attribute-style access to the 'debt' column. Every value is NaN (dtype
# object) because 'debt' was requested via columns= but is not a key of `data`.
frame2.debt

one      NaN
two      NaN
three    NaN
four     NaN
five     NaN
six      NaN
Name: debt, dtype: object

### 提取某一行

In [41]:
# Rows are selected through the .loc accessor using the row's index label
# (.iloc is the counterpart for integer positions).
frame2.loc['three']

year     2002
state    Ohio
pop       3.6
debt      NaN
Name: three, dtype: object

### 列值赋值

In [42]:
# Assigning a scalar to a column broadcasts it to every row.
frame2['debt'] = 16.5
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,16.5
two,2001,Ohio,1.7,16.5
three,2002,Ohio,3.6,16.5
four,2001,Nevada,2.4,16.5
five,2002,Nevada,2.9,16.5
six,2003,Nevada,3.2,16.5


In [43]:
# np.arange(6.) yields the floats 0.0 through 5.0 — one value for each of
# frame2's six rows — which replace the previous contents of 'debt'.
frame2['debt'] = np.arange(6.)
frame2

Unnamed: 0,year,state,pop,debt
one,2000,Ohio,1.5,0.0
two,2001,Ohio,1.7,1.0
three,2002,Ohio,3.6,2.0
four,2001,Nevada,2.4,3.0
five,2002,Nevada,2.9,4.0
six,2003,Nevada,3.2,5.0


## 利用pandas的DataFrame实现数据聚合（group）

In [7]:
import pandas as pd
from numpy.random import seed
from numpy.random import rand
from numpy.random import randint
import numpy as np

# Seed NumPy's global random generator before drawing any values so that the
# random 'Price' and 'Number' columns are reproducible from run to run.
seed(42)

# randint's upper bound is exclusive, so randint(1, 10) draws integers from
# 1 to 9 inclusive — the same range as the deprecated random_integers(1, 9).
df = pd.DataFrame({'Weather': ['cold', 'hot', 'cold', 'hot', 'cold', 'hot', 'cold'],
                   'Food': ['soup', 'soup', 'icecream', 'chololate', 'soup', 'icecream', 'chololate'],
                   'Price': 10 * rand(7),
                   'Number': randint(1, 10, size=(7,))})
print(df)

# Group the rows by the value of the 'Weather' column.
weather_group = df.groupby('Weather')
print(weather_group)

# Iterating over a GroupBy yields (group key, sub-DataFrame) pairs;
# number the groups starting from 1 for the printed header.
for i, (name, group) in enumerate(weather_group, start=1):
    print("Group" + str(i) + name)
    print(group)

print("first group\n", weather_group.first())  # first row of each group
print("last group\n", weather_group.last())  # last row of each group
# 'Food' holds strings, so restrict the mean to the numeric columns;
# without numeric_only=True recent pandas raises on the string column.
print("mean group\n", weather_group.mean(numeric_only=True))

# Grouping by several columns produces one group per (Weather, Food) pair.
wf_group = df.groupby(['Weather', 'Food'])
print("WF Groups:", wf_group.groups)

        Food  Number     Price Weather
0       soup       8  3.745401    cold
1       soup       5  9.507143     hot
2   icecream       4  7.319939    cold
3  chololate       8  5.986585     hot
4       soup       8  1.560186    cold
5   icecream       3  1.559945     hot
6  chololate       6  0.580836    cold
<pandas.core.groupby.DataFrameGroupBy object at 0x000002C9258D0BE0>
Group1cold
        Food  Number     Price Weather
0       soup       8  3.745401    cold
2   icecream       4  7.319939    cold
4       soup       8  1.560186    cold
6  chololate       6  0.580836    cold
Group2hot
        Food  Number     Price Weather
1       soup       5  9.507143     hot
3  chololate       8  5.986585     hot
5   icecream       3  1.559945     hot
first group
          Food  Number     Price
Weather                        
cold     soup       8  3.745401
hot      soup       5  9.507143
last group
               Food  Number     Price
Weather                             
cold     chololate   

  del sys.path[0]


## DataFrame的串联与附加操作