In [1]:
import numpy as np
import pandas as pd

In [2]:
# Show the installed pandas version; the recorded run of this notebook reports 1.0.1.
pd.__version__

'1.0.1'

# 创建Series数据类型

从列表创建

In [4]:
# Build a Series from a plain Python list; with no index given, pandas
# assigns the default 0..n-1 integer labels.
arr = list(range(5))
s1 = pd.Series(data=arr)
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64

从 NumPy ndarray 创建

In [8]:
# Seed NumPy's global generator so the random values below are the same on
# every Restart & Run All (numpy itself is imported in the top import cell).
np.random.seed(0)

# Five samples drawn from the standard normal distribution.
n = np.random.randn(5)

# Label each value explicitly instead of using the default integer index.
index = ['a', 'b', 'c', 'd', 'e']
s2 = pd.Series(n, index=index)
s2

a    1.454590
b    0.787113
c   -0.351009
d    0.726983
e   -0.941259
dtype: float64

从字典创建

In [9]:
# Dict keys become the index labels and dict values become the data.
d = dict(zip('abcde', range(1, 6)))
s3 = pd.Series(data=d)
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

# Series基本操作

修改Series索引

In [10]:
# Print the Series first so the original integer index is visible for comparison.
print(s1)

# Assigning to .index relabels the existing Series in place; the values are unchanged.
s1.index = list('ABCDE')
s1

0    0
1    1
2    2
3    3
4    4
dtype: int64


A    0
B    1
C    2
D    3
E    4
dtype: int64

Series纵向拼接

In [11]:
# Stack s1 underneath s3 (vertical concatenation, labels kept as-is).
# pd.concat replaces Series.append, which was deprecated in pandas 1.4 and
# removed in pandas 2.0; on older versions the result is the same.
s4 = pd.concat([s3, s1])
s4

a    1
b    2
c    3
d    4
e    5
A    0
B    1
C    2
D    3
E    4
dtype: int64

Series按指定索引删除元素

In [12]:
# Print s4 before the removal for comparison.
print(s4)
# drop returns a new Series without the label 'e'; rebind s4 to that result.
s4 = s4.drop(labels='e')
s4

a    1
b    2
c    3
d    4
e    5
A    0
B    1
C    2
D    3
E    4
dtype: int64


a    1
b    2
c    3
d    4
A    0
B    1
C    2
D    3
E    4
dtype: int64

Series修改指定索引元素

In [13]:
# Overwrite the value stored under label 'A' using explicit label-based indexing.
s4.loc['A'] = 6
s4

a    1
b    2
c    3
d    4
A    6
B    1
C    2
D    3
E    4
dtype: int64

Series按指定索引查找元素

In [14]:
# Look up the value stored under label 'B'.
s4.loc['B']

1

Series切片操作

In [15]:
# Positional slice: the first three elements, end-exclusive.
s4.iloc[:3]

a    1
b    2
c    3
dtype: int64

# Series运算

In [18]:
# Redisplay s3, one of the two operands used in the arithmetic cells that follow.
s3

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [19]:
# Redisplay s4, the other operand used in the arithmetic cells that follow.
s4

a    1
b    2
c    3
d    4
A    6
B    1
C    2
D    3
E    4
dtype: int64

Series加法运算

In [16]:
# Element-wise addition aligned on index labels (operator form of s4.add(s3)).
# Labels present in only one operand produce NaN, so the result is float64.
s4 + s3

A    NaN
B    NaN
C    NaN
D    NaN
E    NaN
a    2.0
b    4.0
c    6.0
d    8.0
e    NaN
dtype: float64

Series减法运算

In [17]:
# Element-wise subtraction aligned on index labels (operator form of s4.sub(s3)).
# Labels present in only one operand produce NaN.
s4 - s3

A    NaN
B    NaN
C    NaN
D    NaN
E    NaN
a    0.0
b    0.0
c    0.0
d    0.0
e    NaN
dtype: float64

Series乘法运算

In [20]:
# Element-wise multiplication aligned on index labels (operator form of s4.mul(s3)).
# Labels present in only one operand produce NaN.
s4 * s3

A     NaN
B     NaN
C     NaN
D     NaN
E     NaN
a     1.0
b     4.0
c     9.0
d    16.0
e     NaN
dtype: float64

Series除法运算

In [21]:
# Element-wise true division aligned on index labels (operator form of s4.div(s3)).
# Labels present in only one operand produce NaN.
s4 / s3

A    NaN
B    NaN
C    NaN
D    NaN
E    NaN
a    1.0
b    1.0
c    1.0
d    1.0
e    NaN
dtype: float64

Series求中位数

In [22]:
# Median of the values in s4.
s4.median()

3.0

Series求和

In [23]:
# Sum of all values in s4.
s4.sum()

26

Series求最大值

In [24]:
# Largest value in s4.
s4.max()

6

Series求最小值

In [25]:
# Smallest value in s4.
s4.min()

1

# 创建DataFrame数据类型

通过Numpy数组创建DataFrame

In [26]:
# Six consecutive daily timestamps starting from the moment the cell runs,
# used as the row index.
dates = pd.date_range('today', periods=6)
columns = ['A', 'B', 'C', 'D']

# One standard-normal sample per (row, column) cell: a 6 x 4 array.
num_arr = np.random.randn(6, 4)

df1 = pd.DataFrame(data=num_arr, index=dates, columns=columns)
df1

Unnamed: 0,A,B,C,D
2020-04-16 15:29:30.166822,0.360079,0.285664,-0.060731,0.505283
2020-04-17 15:29:30.166822,0.940173,0.734535,0.360792,-1.2904
2020-04-18 15:29:30.166822,0.73926,-0.442352,-0.801418,0.809669
2020-04-19 15:29:30.166822,0.057202,-1.756656,-1.706042,0.490946
2020-04-20 15:29:30.166822,-0.821865,0.246446,1.449582,-0.521915
2020-04-21 15:29:30.166822,-0.498585,-1.895869,0.955869,-0.908044


通过字典（每个键对应一个列表）创建DataFrame

In [27]:
# Column-oriented input: each key is a column name and each list holds that
# column's values, one per row. np.nan marks the two missing ages.
data = {
    'animal': ['cat', 'cat', 'snake', 'dog', 'dog', 'cat', 'snake', 'cat', 'dog', 'dog'],
    'age': [2.5, 3, 0.5, np.nan, 5, 2, 4.5, np.nan, 7, 3],
    'visits': [1, 3, 2, 3, 2, 3, 1, 1, 2, 1],
    'priority': ['yes', 'yes', 'no', 'yes', 'no', 'no', 'no', 'yes', 'no', 'no'],
}

# One row label per record.
labels = list('abcdefghij')

df2 = pd.DataFrame(data=data, index=labels)
df2

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


查看DataFrame的数据类型

In [28]:
# Per-column dtypes of df2.
df2.dtypes

animal       object
age         float64
visits        int64
priority     object
dtype: object

# DataFrame基本操作

In [29]:
# Preview the first five rows (five is head()'s default).
df2.head(n=5)

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no


In [31]:
# Preview the last three rows.
df2.tail(n=3)

Unnamed: 0,animal,age,visits,priority
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


In [33]:
# Row labels of df2.
df2.index

Index(['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'], dtype='object')

In [34]:
# Column labels of df2.
df2.columns

Index(['animal', 'age', 'visits', 'priority'], dtype='object')

In [35]:
# Underlying data as a NumPy array; the mixed column types yield dtype=object.
df2.to_numpy()

array([['cat', 2.5, 1, 'yes'],
       ['cat', 3.0, 3, 'yes'],
       ['snake', 0.5, 2, 'no'],
       ['dog', nan, 3, 'yes'],
       ['dog', 5.0, 2, 'no'],
       ['cat', 2.0, 3, 'no'],
       ['snake', 4.5, 1, 'no'],
       ['cat', nan, 1, 'yes'],
       ['dog', 7.0, 2, 'no'],
       ['dog', 3.0, 1, 'no']], dtype=object)

In [36]:
# Summary statistics; in the recorded output only the numeric columns
# (age, visits) are summarised, and age's count of 8 excludes its two NaN values.
df2.describe()

Unnamed: 0,age,visits
count,8.0,10.0
mean,3.4375,1.9
std,2.007797,0.875595
min,0.5,1.0
25%,2.375,1.0
50%,3.0,2.0
75%,4.625,2.75
max,7.0,3.0


转置

In [37]:
# Swap rows and columns (same result as the .T shorthand).
df2.transpose()

Unnamed: 0,a,b,c,d,e,f,g,h,i,j
animal,cat,cat,snake,dog,dog,cat,snake,cat,dog,dog
age,2.5,3,0.5,,5,2,4.5,,7,3
visits,1,3,2,3,2,3,1,1,2,1
priority,yes,yes,no,yes,no,no,no,yes,no,no


按列排序

In [38]:
# Sort rows by the age column in ascending order; the recorded output shows
# rows with a missing age placed last.
df2.sort_values(by='age', ascending=True)

Unnamed: 0,animal,age,visits,priority
c,snake,0.5,2,no
f,cat,2.0,3,no
a,cat,2.5,1,yes
b,cat,3.0,3,yes
j,dog,3.0,1,no
g,snake,4.5,1,no
e,dog,5.0,2,no
i,dog,7.0,2,no
d,dog,,3,yes
h,cat,,1,yes


切片

In [39]:
# An integer slice inside [] selects rows by position, end-exclusive:
# here the rows at positions 1 and 2 (labels 'b' and 'c').
df2[1:3]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


In [40]:
# Select a single column by name; the result is a Series named 'age'.
df2['age']

a    2.5
b    3.0
c    0.5
d    NaN
e    5.0
f    2.0
g    4.5
h    NaN
i    7.0
j    3.0
Name: age, dtype: float64

In [41]:
# Attribute-style access to the same column; the recorded output matches df2['age'].
df2.age

a    2.5
b    3.0
c    0.5
d    NaN
e    5.0
f    2.0
g    4.5
h    NaN
i    7.0
j    3.0
Name: age, dtype: float64

In [42]:
# Passing a list of column names returns a DataFrame containing those
# columns in the listed order.
selected_columns = ['age', 'animal']
df2[selected_columns]

Unnamed: 0,age,animal
a,2.5,cat
b,3.0,cat
c,0.5,snake
d,,dog
e,5.0,dog
f,2.0,cat
g,4.5,snake
h,,cat
i,7.0,dog
j,3.0,dog


In [43]:
# Select the 2nd and 3rd rows by integer position (end-exclusive slice).
df2.iloc[1:3]

Unnamed: 0,animal,age,visits,priority
b,cat,3.0,3,yes
c,snake,0.5,2,no


副本拷贝

In [44]:
# Work on an independent copy so later cells can modify df3 without touching
# df2, letting the same dataset feed several separate workflows.
df3 = df2.copy(deep=True)
df3

Unnamed: 0,animal,age,visits,priority
a,cat,2.5,1,yes
b,cat,3.0,3,yes
c,snake,0.5,2,no
d,dog,,3,yes
e,dog,5.0,2,no
f,cat,2.0,3,no
g,snake,4.5,1,no
h,cat,,1,yes
i,dog,7.0,2,no
j,dog,3.0,1,no


判断DataFrame元素是否为空

In [45]:
# Boolean mask of the same shape as df3: True where a value is missing.
# isna() is the same method as isnull() under its other name.
df3.isna()

Unnamed: 0,animal,age,visits,priority
a,False,False,False,False
b,False,False,False,False
c,False,False,False,False
d,False,True,False,False
e,False,False,False,False
f,False,False,False,False
g,False,False,False,False
h,False,True,False,False
i,False,False,False,False
j,False,False,False,False


添加列数据

In [46]:
# Add a running row number as a new column. The count comes from df3 itself
# instead of a hard-coded 10, so the cell still works if the number of rows
# changes. Building the Series on df3.index pairs each number with its row label.
num = pd.Series(range(len(df3)), index=df3.index)

df3['No.'] = num
df3

Unnamed: 0,animal,age,visits,priority,No.
a,cat,2.5,1,yes,0
b,cat,3.0,3,yes,1
c,snake,0.5,2,no,2
d,dog,,3,yes,3
e,dog,5.0,2,no,4
f,cat,2.0,3,no,5
g,snake,4.5,1,no,6
h,cat,,1,yes,7
i,dog,7.0,2,no,8
j,dog,3.0,1,no,9


根据DataFrame的下标值进行更改

In [47]:
# .iat addresses one cell by integer position. Row 1 / column 1 is the 2nd
# row and 2nd column (label 'b', column 'age'): its value changes 3.0 -> 2.0.
row_pos, col_pos = 1, 1
df3.iat[row_pos, col_pos] = 2
df3

Unnamed: 0,animal,age,visits,priority,No.
a,cat,2.5,1,yes,0
b,cat,2.0,3,yes,1
c,snake,0.5,2,no,2
d,dog,,3,yes,3
e,dog,5.0,2,no,4
f,cat,2.0,3,no,5
g,snake,4.5,1,no,6
h,cat,,1,yes,7
i,dog,7.0,2,no,8
j,dog,3.0,1,no,9


根据DataFrame的标签对数据进行修改

In [48]:
# Label-based single-cell assignment: set the age of row 'f' to 1.5.
df3.at['f', 'age'] = 1.5
df3

Unnamed: 0,animal,age,visits,priority,No.
a,cat,2.5,1,yes,0
b,cat,2.0,3,yes,1
c,snake,0.5,2,no,2
d,dog,,3,yes,3
e,dog,5.0,2,no,4
f,cat,1.5,3,no,5
g,snake,4.5,1,no,6
h,cat,,1,yes,7
i,dog,7.0,2,no,8
j,dog,3.0,1,no,9


求平均值

In [49]:
# Column-wise mean of the numeric columns only. df3 also holds text columns
# (animal, priority); numeric_only=True excludes them explicitly instead of
# relying on pandas dropping them silently, which raises TypeError on pandas >= 2.0.
df3.mean(numeric_only=True)

age       3.25
visits    1.90
No.       4.50
dtype: float64

DataFrame 中任意列做求和操作

In [50]:
# Select one column and sum its values.
df3['visits'].sum()

19

# 字符串操作

将字符串转化为小写字母

In [51]:
# Sample text data with one missing entry (np.nan) to show how .str methods treat it.
string = pd.Series(
    ['A', 'B', 'C', 'Aaba', 'Baca', np.nan, 'CABA', 'dog', 'cat']
)
print(string)
# Vectorised lower-casing; the recorded output shows the NaN entry left as NaN.
string.str.lower()

0       A
1       B
2       C
3    Aaba
4    Baca
5     NaN
6    CABA
7     dog
8     cat
dtype: object


0       a
1       b
2       c
3    aaba
4    baca
5     NaN
6    caba
7     dog
8     cat
dtype: object

将字符串转化为大写字母

In [52]:
# Vectorised upper-casing; the recorded output shows the NaN entry left as NaN.
string.str.upper()

0       A
1       B
2       C
3    AABA
4    BACA
5     NaN
6    CABA
7     DOG
8     CAT
dtype: object

# DataFrame缺失值操作

对缺失值进行填充

In [53]:
# Work on a copy so df3 keeps its missing values for the following cells.
df4 = df3.copy(deep=True)
print(df4)
# Replace every missing value with 3. The result is a new DataFrame that is
# only displayed here; df4 itself is not reassigned.
df4.fillna(3)

  animal  age  visits priority  No.
a    cat  2.5       1      yes    0
b    cat  2.0       3      yes    1
c  snake  0.5       2       no    2
d    dog  NaN       3      yes    3
e    dog  5.0       2       no    4
f    cat  1.5       3       no    5
g  snake  4.5       1       no    6
h    cat  NaN       1      yes    7
i    dog  7.0       2       no    8
j    dog  3.0       1       no    9


Unnamed: 0,animal,age,visits,priority,No.
a,cat,2.5,1,yes,0
b,cat,2.0,3,yes,1
c,snake,0.5,2,no,2
d,dog,3.0,3,yes,3
e,dog,5.0,2,no,4
f,cat,1.5,3,no,5
g,snake,4.5,1,no,6
h,cat,3.0,1,yes,7
i,dog,7.0,2,no,8
j,dog,3.0,1,no,9


删除存在缺失值的行

In [54]:
# Work on a copy so df3 keeps its missing values.
df5 = df3.copy(deep=True)
print(df5)
# Drop every row that contains at least one NaN. The result is a new
# DataFrame that is only displayed here; df5 itself is not reassigned.
df5.dropna(axis=0, how='any')

  animal  age  visits priority  No.
a    cat  2.5       1      yes    0
b    cat  2.0       3      yes    1
c  snake  0.5       2       no    2
d    dog  NaN       3      yes    3
e    dog  5.0       2       no    4
f    cat  1.5       3       no    5
g  snake  4.5       1       no    6
h    cat  NaN       1      yes    7
i    dog  7.0       2       no    8
j    dog  3.0       1       no    9


Unnamed: 0,animal,age,visits,priority,No.
a,cat,2.5,1,yes,0
b,cat,2.0,3,yes,1
c,snake,0.5,2,no,2
e,dog,5.0,2,no,4
f,cat,1.5,3,no,5
g,snake,4.5,1,no,6
i,dog,7.0,2,no,8
j,dog,3.0,1,no,9


DataFrame 按指定列合并（merge）

In [55]:
# Two small frames that share a 'key' column but overlap only on 'foo2'.
left = pd.DataFrame({'key': ['foo1', 'foo2'], 'one': [1, 2]})
right = pd.DataFrame({'key': ['foo2', 'foo3'], 'two': [4, 5]})

print(left)
print(right)

# Inner join on 'key' (the default join type): only 'foo2' appears in both
# frames, so the result has a single row.
left.merge(right, how='inner', on='key')

    key  one
0  foo1    1
1  foo2    2
    key  two
0  foo2    4
1  foo3    5


Unnamed: 0,key,one,two
0,foo2,2,4


# DataFrame文件操作

CSV文件写入

In [56]:
# Write df3 to animal.csv in the current working directory. The row labels
# are written too, as the first column of the file.
df3.to_csv(path_or_buf='animal.csv')
print('写入成功.')

写入成功.


CSV文件读取

In [57]:
# Read the file back. No index_col is given, so the saved row labels return
# as an ordinary column (shown as "Unnamed: 0") under a new 0..n-1 index.
df_animal = pd.read_csv(filepath_or_buffer='animal.csv')
df_animal

Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority,No.
0,a,cat,2.5,1,yes,0
1,b,cat,2.0,3,yes,1
2,c,snake,0.5,2,no,2
3,d,dog,,3,yes,3
4,e,dog,5.0,2,no,4
5,f,cat,1.5,3,no,5
6,g,snake,4.5,1,no,6
7,h,cat,,1,yes,7
8,i,dog,7.0,2,no,8
9,j,dog,3.0,1,no,9


Excel写入操作

In [58]:
# Write df3 to the worksheet 'Sheet1' of animal.xlsx in the current working directory.
df3.to_excel(excel_writer='animal.xlsx', sheet_name='Sheet1')
print("写入成功.")

写入成功.


Excel读取操作

In [59]:
# Read worksheet 'Sheet1' back. index_col=None keeps the saved row labels as
# an ordinary column, and cells containing the text 'NA' are read as missing.
pd.read_excel(
    'animal.xlsx',
    sheet_name='Sheet1',
    index_col=None,
    na_values=['NA'],
)

Unnamed: 0.1,Unnamed: 0,animal,age,visits,priority,No.
0,a,cat,2.5,1,yes,0
1,b,cat,2.0,3,yes,1
2,c,snake,0.5,2,no,2
3,d,dog,,3,yes,3
4,e,dog,5.0,2,no,4
5,f,cat,1.5,3,no,5
6,g,snake,4.5,1,no,6
7,h,cat,,1,yes,7
8,i,dog,7.0,2,no,8
9,j,dog,3.0,1,no,9
