# series
## series的创建

In [None]:
import pandas as pd
# Default construction: no index is given, so the labels are 0..n-1.
s = pd.Series(data=[1, 2, 3, 4, 5])
print(s)

# Custom index: one label per value.
s = pd.Series(data=[10, 2, 3, 4, 5], index=list("ABCDE"))
print(s)

# Same data and labels, this time with a name attached to the Series.
s = pd.Series(data=[10, 2, 3, 4, 5], index=list("ABCDE"), name="月份")
print(s)

# Building a Series from an existing Series plus an index keeps only those labels.
s1 = pd.Series(data=s, index=["A", "C"])
print(s1)

0    1
1    2
2    3
3    4
4    5
dtype: int64
A    10
B     2
C     3
D     4
E     5
dtype: int64
A    10
B     2
C     3
D     4
E     5
Name: 月份, dtype: int64
A    10
C     3
Name: 月份, dtype: int64


In [None]:
# Create a Series from a dict: the keys become the index, the values the data.
s = pd.Series(dict(a=1, b=2, c=3, d=4, e=5))
print(s)

# Re-using the Series with an explicit index keeps only the listed labels.
s1 = pd.Series(data=s, index=["a", "c"])
print(s1)

a    1
b    2
c    3
d    4
e    5
dtype: int64
a    1
c    3
dtype: int64


## series的属性
index：Series的索引对象

values:Series的值

dtype或dtypes:Series的元素类型

shape:Series的形状

ndim:Series的维度

size: Series的元素个数

name:Series的名称

loc[] 显式索引，按标签索引或切片

iloc[] 隐式索引，按位置索引或切片

at[] 使用标签访问单个元素

iat[] 使用位置访问单个元素

In [None]:
# Index labels, then the underlying value array.
print(s.index, s.values, sep="\n")
# Shape, number of dimensions and element count on one line.
print(f"{s.shape} {s.ndim} {s.size}")
# Rename the Series in place; everything printed afterwards carries this name.
s.name = "test"
print(f"{s.dtype} {s.name}")
# loc: explicit (label-based) slice; the end label 'c' is included.
print(s.loc["a":"c"])
# iloc: implicit (position-based) slice; the end position 3 is excluded.
print(s.iloc[1:3])
# at: a single element by label; slices are not supported, the label must match exactly.
print(s.at["a"])
# iat: a single element by integer position.
print(s.iat[0])

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')
[1 2 3 4 5]
(5,) 1 5
int64 test
a    1
b    2
c    3
Name: test, dtype: int64
b    2
c    3
Name: test, dtype: int64
1
1


In [None]:
# Accessing data
# Plain positional access such as s[0] is left out on purpose: it is easy to
# confuse with label lookup.
print(s["c"])
# Boolean mask: keep only the values smaller than 3.
print(s[s.lt(3)])
# Assigning to a label that does not exist yet adds a new element.
s["f"] = 6
print(s.head(5))  # first 5 rows (5 is also the default)
print(s.tail(5))  # last 5 rows (5 is also the default)
print(s.tail(1))  # just the last row

3
a    1
b    2
Name: test, dtype: int64
a    1
b    2
c    3
d    4
e    5
Name: test, dtype: int64
b    2
c    3
d    4
e    5
f    6
Name: test, dtype: int64
f    6
Name: test, dtype: int64


### series的常见用法

In [None]:
#常见函数
import numpy as np
# Sample data with two missing entries: np.nan and None both end up as NaN,
# and the Series is stored as float64.
s = pd.Series(
    data=[10, 2, np.nan, None, 3, 4, 5],
    index=list("ABCDEFG"),
    name="data",
)
print(s)

A    10.0
B     2.0
C     NaN
D     NaN
E     3.0
F     4.0
G     5.0
Name: data, dtype: float64


In [None]:
s.head() #Returns the first 5 rows by default; pass a number to change how many

Unnamed: 0,data
A,10.0
B,2.0
C,
D,
E,3.0


In [None]:
s.tail() #Returns the last 5 rows by default; pass a number to change how many

Unnamed: 0,data
C,
D,
E,3.0
F,4.0
G,5.0


In [None]:
#Summary statistics in one call: count, mean, std, min, quartiles and max
s.describe()

Unnamed: 0,data
count,5.0
mean,4.8
std,3.114482
min,2.0
25%,3.0
50%,4.0
75%,5.0
max,10.0


In [None]:
#Number of non-missing elements: the two NaN entries of s are not counted
s.count()

np.int64(5)

In [None]:
# Index labels: keys() and the index attribute print the same Index.
print(s.keys(), s.index, sep="\n")

Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')
Index(['A', 'B', 'C', 'D', 'E', 'F', 'G'], dtype='object')


In [None]:
#Check for missing values: True where the element is NaN
print(s.isna())
# The bare last expression is rendered by the notebook in addition to the
# printed text above, so the same mask appears twice in the output.
s.isna()

A    False
B    False
C     True
D     True
E    False
F    False
G    False
Name: data, dtype: bool


Unnamed: 0,data
A,False
B,False
C,True
D,True
E,False
F,False
G,False


In [None]:
#Membership test: True where the element equals one of the listed values
s.isin([4,5,6])

Unnamed: 0,data
A,False
B,False
C,False
D,False
E,False
F,True
G,True


In [None]:
# Individual statistics. The NaN entries are skipped, so every figure is
# computed over the five non-missing values.
# (The bare s.describe() call that used to open this cell was removed: its
# result was discarded and never displayed.)
print(s.mean())    # mean
print(s.sum())     # sum
print(s.std())     # standard deviation
print(s.var())     # variance
print(s.max())     # maximum
print(s.min())     # minimum
print(s.median())  # median

4.8
24.0
3.1144823004794877
9.700000000000001
10.0
2.0
4.0


In [None]:
# Quantile (here the 25th percentile).
# No prior sort is needed: sort_values() returns a new Series instead of
# sorting in place, so the discarded call that used to precede this line had
# no effect on the result.
s.quantile(0.25)

np.float64(3.0)

In [None]:
# Add a second 4 (label F already holds 4) so that one value occurs more often than the rest.
s['H'] = 4
s.mode()#Mode: the value(s) that occur most frequently

Unnamed: 0,data
0,4.0


In [None]:
print(s.value_counts())#Number of occurrences of each distinct value (the NaN entries do not appear in the result)

data
4.0     2
10.0    1
2.0     1
3.0     1
5.0     1
Name: count, dtype: int64


In [None]:
# De-duplication.
# A cell only displays its last expression automatically, so the intermediate
# results are printed explicitly instead of being silently discarded.
print(s.drop_duplicates())  # Series with repeated values removed (first occurrence kept)
print(s.unique())           # NumPy array of the distinct values (not a list)
s.nunique()                 # number of distinct values; NaN is not counted

5

In [None]:
# Sorting, either by label or by value.
# A cell only displays its last expression automatically, so the first result
# is printed explicitly instead of being silently discarded.
print(s.sort_index())  # sorted by index label
s.sort_values()        # sorted by value; NaN entries are placed last

Unnamed: 0,data
B,2.0
E,3.0
F,4.0
H,4.0
G,5.0
A,10.0
C,
D,


# DataFrame

## DataFrame的创建方式

In [None]:
import pandas as pd
import numpy as np

# Build a DataFrame from Series objects: each Series becomes one column.
# It is kept under its own name; previously it was stored in `df` and then
# immediately overwritten below, so it could never be inspected.
s1 = pd.Series([1,2,3,4,5])
s2 = pd.Series([6,7,8,9,10])
df_from_series = pd.DataFrame({"第1列":s1,"第2列":s2})

# Build a DataFrame from a dict of lists.
# `columns` selects and orders the dict keys that are kept: "id" is not
# listed, so it does not appear in the resulting frame. Edit that list to
# change which columns are shown.
df = pd.DataFrame(
    {
        "id":[1,2,3,4,5],
        "name":["tom","jack","alice","bob","allen"],
        "age":[15,17,28,26,38],
        "score":[60.5,80,30.6,70,83.5]
    },
    index=[1,2,3,4,5],
    columns=["name","age","score"],
)
df

Unnamed: 0,name,age,score
1,tom,15,60.5
2,jack,17,80.0
3,alice,28,30.6
4,bob,26,70.0
5,allen,38,83.5


In [None]:
# Inspect the types.
print(type(df))

# A single column is a Series.
# `df` only has the columns name/age/score at this point, so looking up
# "第1列" raised a KeyError; use a column that actually exists.
type(df["name"])

<class 'pandas.core.frame.DataFrame'>


KeyError: '第1列'

In [None]:
# DataFrame attributes: row index, column labels, and the raw values array.
print("行索引", df.index, sep="\n")
print("列标签", df.columns, sep="\n")
print("值", df.values, sep="\n")

行索引
Index([1, 2, 3, 4, 5], dtype='int64')
列标签
Index(['name', 'age', 'score'], dtype='object')
值
[['tom' 15 60.5]
 ['jack' 17 80.0]
 ['alice' 28 30.6]
 ['bob' 26 70.0]
 ['allen' 38 83.5]]


In [None]:
# Basic structural facts about the frame.
print('维度:',df.ndim)      # number of dimensions
print('形状:',df.shape)     # (rows, columns)
# The comma between the label and the value was missing here, which made the
# whole cell fail with a SyntaxError.
print('元素个数:',df.size)  # total number of cells
print('数据类型:')
print(df.dtypes)            # dtype of each column

维度： 2
数据类型：
name      object
age        int64
score    float64
dtype: object


In [None]:
# Transpose: rows become columns and columns become rows.
print(df.transpose())
# After transposing, the former column labels form the row index.
print(df.transpose().index)

          1     2      3     4      5
name    tom  jack  alice   bob  allen
age      15    17     28    26     38
score  60.5  80.0   30.6  70.0   83.5
Index(['name', 'age', 'score'], dtype='object')


In [None]:
# Selecting a single row.
# loc looks the row up by its index label, iloc by its zero-based position
# (implicit index). With the index 1..5, label 4 and position 3 are the same row.
print(
    df.loc[4],
    df.iloc[3],
    sep="\n",
)

name      bob
age        26
score    70.0
Name: 4, dtype: object
name      bob
age        26
score    70.0
Name: 4, dtype: object


In [None]:
# Selecting a single column: the ":" in the row slot takes every row.
print(
    df.loc[:, "name"],  # by column label
    df.iloc[:, 0],      # by column position
    sep="\n",
)

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object


In [None]:
# Single element, addressed four ways: by label (at / loc) and by position (iat / iloc).
print(
    df.at[3, "score"],
    df.iat[2, 1],
    df.loc[3, "score"],
    df.iloc[2, 1],
    sep="\n",
)

30.6
28
30.6
28


In [None]:
# One column, three spellings.
print(
    df["name"],     # bracket access -> Series
    df.name,        # attribute access -> Series
    df[["name"]],   # list with one label -> one-column DataFrame
    sep="\n",
)

1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
1      tom
2     jack
3    alice
4      bob
5    allen
Name: name, dtype: object
    name
1    tom
2   jack
3  alice
4    bob
5  allen


In [None]:
#Select several columns at once by passing a list of labels; the result is a DataFrame
print(df[['name','score']])

    name  score
1    tom   60.5
2   jack   80.0
3  alice   30.6
4    bob   70.0
5  allen   83.5


In [None]:
# Peek at part of the data: the first two rows, then the last three.
print(df.head(n=2))
print(df.tail(n=3))

   name  age  score
1   tom   15   60.5
2  jack   17   80.0
    name  age  score
3  alice   28   30.6
4    bob   26   70.0
5  allen   38   83.5


In [None]:
# Filtering rows with boolean masks.
# A cell only displays its last expression automatically, so the earlier
# results are printed explicitly instead of being silently discarded.
print(df[df.score>70])                    # single condition

# Combine conditions with & and wrap each comparison in parentheses.
print(df[(df.score>70)&(df.age<20)])      # attribute-style column access
df[(df['score']>70)&(df.age<20)]          # bracket-style column access, same rows

Unnamed: 0,name,age,score
2,jack,17,80.0


In [None]:
# Random sampling: draw 3 rows from the frame.
# random_state pins the draw so that re-running the notebook shows the same
# rows; remove it to get a different sample on every run.
df.sample(3, random_state=42)

Unnamed: 0,name,age,score
5,allen,38,83.5
4,bob,26,70.0
2,jack,17,80.0


In [None]:
# Rebuild df with six rows: rows 1 and 2 are identical and three rows share
# age 15, which gives the duplicate-handling examples something to work on.
df = pd.DataFrame(
    data=dict(
        id=[1, 2, 3, 4, 5, 6],
        name=["tom", "tom", "jack", "alice", "bob", "allen"],
        age=[15, 15, 15, 20, 26, 38],
        score=[60.5, 60.5, 80, 30.6, 70, 83.5],
    ),
    # Column labels to keep, in display order; "id" is not listed and is left out.
    columns=["name", "age", "score"],
    index=[1, 2, 3, 4, 5, 6],
)

In [None]:
print(df.isin(['jack',20]))  #Element-wise membership test: True where the cell equals one of the given values

    name    age  score
1  False  False  False
2  False  False  False
3   True  False  False
4  False   True  False
5  False  False  False
6  False  False  False


In [None]:
print(df.isna()) #Element-wise missing-value check: True where a cell is missing

    name    age  score
1  False  False  False
2  False  False  False
3  False  False  False
4  False  False  False
5  False  False  False
6  False  False  False


In [None]:
# Descriptive statistics on single columns.
print(df["score"].sum())           # total
print(df["score"].max())           # largest score
print(df["age"].min())             # smallest age
print(df["score"].mean())          # mean
print(df["score"].median())        # median
print(df["score"].mode())          # most frequent value(s), returned as a Series
print(df["score"].std())           # standard deviation
print(df["score"].var())           # variance
print(df["score"].quantile(0.25))  # 25th percentile

385.1
83.5
15
64.18333333333334
65.25
0    60.5
Name: score, dtype: float64
19.037375519400424
362.4216666666666
60.5


In [None]:
# Summary statistics for the numeric columns (age and score); the text column name is not included.
print(df.describe())

             age      score
count   6.000000   6.000000
mean   21.500000  64.183333
std     9.181503  19.037376
min    15.000000  30.600000
25%    15.000000  60.500000
50%    17.500000  65.250000
75%    24.500000  77.500000
max    38.000000  83.500000


In [None]:
# Number of non-missing values in each column.
print(df.count())

name     6
age      6
score    6
dtype: int64


In [None]:
print(df.value_counts()) #Number of occurrences of each distinct row (all columns taken together)

name   age  score
tom    15   60.5     2
alice  20   30.6     1
allen  38   83.5     1
bob    26   70.0     1
jack   15   80.0     1
Name: count, dtype: int64


In [None]:
# Drop rows that repeat an earlier row in every column (row 2 repeats row 1).
print(df.drop_duplicates())

    name  age  score
1    tom   15   60.5
3   jack   15   80.0
4  alice   20   30.6
5    bob   26   70.0
6  allen   38   83.5


In [None]:
# Flag rows whose age value already appeared in an earlier row; only the age column is compared.
print(df.duplicated(subset=['age']))

1    False
2     True
3     True
4    False
5    False
6    False
dtype: bool


In [None]:
print(df.replace(15,30)) #Replace every 15 with 30; a new frame is printed, df itself is not reassigned

    name  age  score
1    tom   30   60.5
2    tom   30   60.5
3   jack   30   80.0
4  alice   20   30.6
5    bob   26   70.0
6  allen   38   83.5


In [None]:
# Cumulative operations run down each column by default; pass axis=1 to run
# across each row instead.
# A cell only displays its last expression automatically, so the first result
# is printed explicitly instead of being silently discarded.
print(df.cumsum())  # cumulative sum
df.cummax()         # cumulative maximum

Unnamed: 0,name,age,score
1,tom,15,60.5
2,tom,15,60.5
3,tom,15,80.0
4,tom,20,80.0
5,tom,26,80.0
6,tom,38,83.5


In [None]:
#Sorting
print(df.sort_index(ascending=False))#Sort by index label, here in descending order

    name  age  score
6  allen   38   83.5
5    bob   26   70.0
4  alice   20   30.6
3   jack   15   80.0
2    tom   15   60.5
1    tom   15   60.5


In [None]:
# Sort the rows by the score column (lowest score first).
print(df.sort_values(by='score'))

    name  age  score
4  alice   20   30.6
1    tom   15   60.5
2    tom   15   60.5
5    bob   26   70.0
3   jack   15   80.0
6  allen   38   83.5


In [None]:
# Rebuild df once more; the last score is now 80 (was 83.5), so two rows tie
# on the highest score.
# NOTE(review): the saved output under this cell shows a sorted frame although
# the cell contains no display statement. It is presumably left over from an
# earlier version of the cell; re-run to refresh it.
df = pd.DataFrame(
    data=dict(
        id=[1, 2, 3, 4, 5, 6],
        name=["tom", "tom", "jack", "alice", "bob", "allen"],
        age=[15, 15, 15, 20, 26, 38],
        score=[60.5, 60.5, 80, 30.6, 70, 80],
    ),
    # Column labels to keep, in display order; "id" is not listed and is left out.
    columns=["name", "age", "score"],
    index=[1, 2, 3, 4, 5, 6],
)

    name  age  score
4  alice   20   30.6
1    tom   15   60.5
2    tom   15   60.5
5    bob   26   70.0
6  allen   38   80.0
3   jack   15   80.0


In [None]:
# Top / bottom n rows: the first argument is how many rows to return, and
# `columns` lists the columns to order by (score first, then age).
# A cell only displays its last expression automatically, so the first result
# is printed explicitly instead of being silently discarded.
print(df.nlargest(2,columns=['score','age']))
df.nsmallest(2,columns=['score','age'])

Unnamed: 0,name,age,score
4,alice,20,30.6
1,tom,15,60.5
