In [35]:
import numpy as np
import pandas as pd

### Series

#### 创建

In [2]:
ser = pd.Series(np.random.randn(5))
ser

0    0.814743
1   -1.414246
2   -0.557643
3   -1.754314
4    0.914817
dtype: float64

In [3]:
ser.index

RangeIndex(start=0, stop=5, step=1)

In [4]:
ser = pd.Series(np.random.randn(5),index=['a','b','c','d','e'])
ser

a    2.094285
b   -2.127441
c    0.565257
d    0.876615
e    0.512639
dtype: float64

In [5]:
ser.index # 查看索引

Index(['a', 'b', 'c', 'd', 'e'], dtype='object')

In [6]:
ser.dtype # 查看值的数据类型

dtype('float64')

In [7]:
# 字典传入
d = {'a':0,'b':1,'c':2}
d

{'a': 0, 'b': 1, 'c': 2}

In [8]:
pd.Series(d)

a    0
b    1
c    2
dtype: int64

In [9]:
pd.Series(d,index=['b','a','e','c'])

b    1.0
a    0.0
e    NaN
c    2.0
dtype: float64

In [11]:
# 以标量形式传入
pd.Series(5,index=['a','b','c','d','e'])

a    5
b    5
c    5
d    5
e    5
dtype: int64

In [13]:
# 数组
s = pd.Series([5,3.14,True,-3])

In [14]:
s.values

array([5, 3.14, True, -3], dtype=object)

### Series数组特性

In [15]:
ser

a    2.094285
b   -2.127441
c    0.565257
d    0.876615
e    0.512639
dtype: float64

In [16]:
ser[0]

2.094284513630553

In [17]:
ser[:3]

a    2.094285
b   -2.127441
c    0.565257
dtype: float64

In [18]:
ser[ser>0]

a    2.094285
c    0.565257
d    0.876615
e    0.512639
dtype: float64

In [20]:
ser[ser>ser.median()]

a    2.094285
d    0.876615
dtype: float64

In [21]:
ser[[4,3,1]]

e    0.512639
d    0.876615
b   -2.127441
dtype: float64

#### Series字典特性

In [22]:
ser

a    2.094285
b   -2.127441
c    0.565257
d    0.876615
e    0.512639
dtype: float64

In [23]:
ser['a']

2.094284513630553

In [24]:
ser['e']=12

In [25]:
ser

a     2.094285
b    -2.127441
c     0.565257
d     0.876615
e    12.000000
dtype: float64

In [26]:
ser['z']

KeyError: 'z'

In [28]:
ser.get('z',-1)

-1

In [29]:
ser.get('a',-1)

2.094284513630553

In [30]:
'z'in ser

False

In [31]:
'b' in ser

True

### 矢量化操作

In [32]:
ser+ser

a     4.188569
b    -4.254881
c     1.130514
d     1.753230
e    24.000000
dtype: float64

In [33]:
ser*2

a     4.188569
b    -4.254881
c     1.130514
d     1.753230
e    24.000000
dtype: float64

In [35]:
# 标签对齐
ser[1:]+ser[:-1]

a         NaN
b   -4.254881
c    1.130514
d    1.753230
e         NaN
dtype: float64

### Series属性

In [36]:
s = pd.Series(np.random.randn(5),name='something')
s

0    0.453354
1   -1.368879
2   -1.904687
3   -1.643453
4    0.146560
Name: something, dtype: float64

In [38]:
s.name

'something'

In [41]:
s1 = s.rename('different')

In [42]:
s1

0    0.453354
1   -1.368879
2   -1.904687
3   -1.643453
4    0.146560
Name: different, dtype: float64

In [44]:
id(s)

89883368

In [45]:
id(s1)

89883984

 ## DataFrame

### 创建

In [46]:
d = pd.Series([1,2,3],index=['a','b','c'],name='one')
d

a    1
b    2
c    3
Name: one, dtype: int64

In [47]:
df = pd.DataFrame(d)
df

Unnamed: 0,one
a,1
b,2
c,3


In [2]:
# 字典创建
d = {'one':pd.Series([1,2,3],index=['a','b','c']),
    'two':pd.Series([1,2,3,4],index=['a','b','c','d'])}

In [3]:
df1 = pd.DataFrame(d)
df1

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [4]:
# 二维数组创建
pd.DataFrame(np.random.rand(3,2),
            index = ['a','b','c'],
            columns=['one','two'])

Unnamed: 0,one,two
a,0.764133,0.356558
b,0.36345,0.529898
c,0.136266,0.917746


In [5]:
pd.DataFrame(d,columns=['one','three'])

Unnamed: 0,one,three
a,1,
b,2,
c,3,


In [6]:
df1

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [7]:
# 修改名称 mapper方式
d1 = {'one':'A','two':'B'}
df1.rename(mapper=d1,axis=1)

Unnamed: 0,A,B
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [8]:
d2 = {'d':'A'}
df1.rename(mapper=d2,axis=0)

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
A,,4


In [9]:
# 直接全部修改
df1.rename(index=d2,columns=d1)

Unnamed: 0,A,B
a,1.0,1
b,2.0,2
c,3.0,3
A,,4


### DataFrame的列操作

In [10]:
df1['one']

a    1.0
b    2.0
c    3.0
d    NaN
Name: one, dtype: float64

In [62]:
df1['three'] = df1['one']*df1['two']
df1

Unnamed: 0,one,two,three
a,1.0,1,1.0
b,2.0,2,4.0
c,3.0,3,9.0
d,,4,


In [63]:
df1['flag'] = df1['one']>2
df1

Unnamed: 0,one,two,three,flag
a,1.0,1,1.0,False
b,2.0,2,4.0,False
c,3.0,3,9.0,True
d,,4,,False


### 删除或移除

In [64]:
del df1['two']
df1

Unnamed: 0,one,three,flag
a,1.0,1.0,False
b,2.0,4.0,False
c,3.0,9.0,True
d,,,False


In [65]:
df1.pop('three')

a    1.0
b    4.0
c    9.0
d    NaN
Name: three, dtype: float64

In [66]:
# 索引对齐
A = pd.Series([2,4,6],index=[1,2,3])
B = pd.Series([1,3,5],index=[0,1,2])

In [67]:
A+B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [68]:
A.add(B,fill_value=1)

0    2.0
1    5.0
2    9.0
3    7.0
dtype: float64

In [11]:
df1

Unnamed: 0,one,two
a,1.0,1
b,2.0,2
c,3.0,3
d,,4


In [12]:
# insert将数组插入指定位置
df1.insert(1,'bar',df1['one'])

In [13]:
df1

Unnamed: 0,one,bar,two
a,1.0,1.0,1
b,2.0,2.0,2
c,3.0,3.0,3
d,,,4


In [14]:
df1.T # 转置

Unnamed: 0,a,b,c,d
one,1.0,2.0,3.0,
bar,1.0,2.0,3.0,
two,1.0,2.0,3.0,4.0


## 数据读写

In [18]:
df = pd.read_table('./data/student_grade.txt')
df

  """Entry point for launching an IPython kernel.


FileNotFoundError: [Errno 2] File b'./data/student_grade.txt' does not exist: b'./data/student_grade.txt'

In [80]:
df = pd.read_csv('./data/student_grade.txt',sep='\t')

In [78]:
df

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
0,杨璐,131,143,144,418,1
1,王雪,131,135,144,410,2
2,韩林霖,127,139,142,408,3
3,沙龙逸,123,148,136,407,4
4,李鉴学,126,135,140,401,5
5,韩雨萌,129,133,138,400,6
6,刘帅,116,143,140,399,7
7,康惠雯,114,142,139,395,8
8,刘钰婷,115,139,135,389,9
9,林世博,116,142,129,387,10


In [19]:
# 创建数据
columns = ['年龄','星座','身高']
data = [
    [30,'白羊座','180cm'],
    [27,'金牛座','170cm'],
    [32,'双子座','175cm'],
    [25,'巨蟹座','170cm'],
    [28,'狮子座','160cm'],
    [31,'双鱼座','165cm']  
]

index = ['A','B','C','D','E','F']
df = pd.DataFrame(data=data,index=index,columns=columns)
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


#### 保存文件

In [20]:
df.to_csv('model.csv')

In [84]:
df.to_csv('model1.csv',encoding='gbk') # 设置编码

In [85]:
df.to_csv('model2.csv',encoding='gbk',columns=['星座','身高']) # 保存指定列

In [86]:
df.to_csv('model3.csv',encoding='gbk',header=False)  # 不保存列名

In [87]:
df.to_csv('model4.csv',encoding='gbk',header=[1,2,3]) # 指定列名

In [88]:
df.to_csv('model5.csv',encoding='gbk',index=False) # 不保存索引

In [89]:
df.to_csv('model6.csv',encoding='gbk',sep='\t') # 设置分隔符 

In [90]:
df.to_excel('明星数据.xlsx',sheet_name='model')

### 读取数据

In [23]:
pd.read_csv('model.csv')

Unnamed: 0.1,Unnamed: 0,年龄,星座,身高
0,A,30,白羊座,180cm
1,B,27,金牛座,170cm
2,C,32,双子座,175cm
3,D,25,巨蟹座,170cm
4,E,28,狮子座,160cm
5,F,31,双鱼座,165cm


In [24]:
pd.read_csv('model.csv',index_col=0) # 删除默认索引

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [96]:
pd.read_csv('model.csv',header=None,index_col=0)

Unnamed: 0_level_0,1,2,3
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [97]:
pd.read_csv('model3.csv',index_col=0,encoding='gbk',names=['年龄','星座','身高'])

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [99]:
pd.read_csv('model6.csv',index_col=0,encoding='gbk',sep='\t')

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [101]:
pd.read_excel('明星数据.xlsx',sheet_name='model')

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


#### 重置索引

In [22]:
df = pd.read_csv('model.csv',index_col=0)
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [105]:
df.reset_index(drop=True)

Unnamed: 0,年龄,星座,身高
0,30,白羊座,180cm
1,27,金牛座,170cm
2,32,双子座,175cm
3,25,巨蟹座,170cm
4,28,狮子座,160cm
5,31,双鱼座,165cm


### 重新索引

In [21]:
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [107]:
df.(index=['B','C','D'],columns=['年龄','星座'])

Unnamed: 0,年龄,星座
B,27,金牛座
C,32,双子座
D,25,巨蟹座


#### 替换索引

In [108]:
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [110]:
df.set_index('星座',drop=False)

Unnamed: 0_level_0,年龄,星座,身高
星座,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
白羊座,30,白羊座,180cm
金牛座,27,金牛座,170cm
双子座,32,双子座,175cm
巨蟹座,25,巨蟹座,170cm
狮子座,28,狮子座,160cm
双鱼座,31,双鱼座,165cm


In [111]:
df.set_index('身高',append=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,年龄,星座
Unnamed: 0_level_1,身高,Unnamed: 2_level_1,Unnamed: 3_level_1
A,180cm,30,白羊座
B,170cm,27,金牛座
C,175cm,32,双子座
D,170cm,25,巨蟹座
E,160cm,28,狮子座
F,165cm,31,双鱼座


#### 类型转换

In [112]:
# 查看数据表基本信息
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6 entries, A to F
Data columns (total 3 columns):
年龄    6 non-null int64
星座    6 non-null object
身高    6 non-null object
dtypes: int64(1), object(2)
memory usage: 352.0+ bytes


In [113]:
df.dtypes

年龄     int64
星座    object
身高    object
dtype: object

In [114]:
df.年龄.dtypes

dtype('int64')

In [117]:
df.get_dtype_counts()

int64     1
object    2
dtype: int64

In [118]:
df.年龄.astype('float32')

A    30.0
B    27.0
C    32.0
D    25.0
E    28.0
F    31.0
Name: 年龄, dtype: float32

#### 查看数据信息

In [119]:
grade = pd.read_csv('./data/student_grade.txt',sep='\t')
grade

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
0,杨璐,131,143,144,418,1
1,王雪,131,135,144,410,2
2,韩林霖,127,139,142,408,3
3,沙龙逸,123,148,136,407,4
4,李鉴学,126,135,140,401,5
5,韩雨萌,129,133,138,400,6
6,刘帅,116,143,140,399,7
7,康惠雯,114,142,139,395,8
8,刘钰婷,115,139,135,389,9
9,林世博,116,142,129,387,10


In [120]:
# 查看几条数据信息
grade.head() # 默认查看前5条

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
0,杨璐,131,143,144,418,1
1,王雪,131,135,144,410,2
2,韩林霖,127,139,142,408,3
3,沙龙逸,123,148,136,407,4
4,李鉴学,126,135,140,401,5


In [121]:
grade.head(1)

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
0,杨璐,131,143,144,418,1


In [122]:
grade.tail() #默认查看后5条

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
63,赵森,90,29,64,183,64
64,满朝升,78,45,47,170,65
65,李忠浩,86,32,46,164,66
66,侯禹志,75,23,34,132,67
67,尹鸿涛,66,23,34,123,68


In [123]:
grade.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 68 entries, 0 to 67
Data columns (total 6 columns):
姓名     68 non-null object
语文     68 non-null int64
数学     68 non-null int64
英语     68 non-null int64
总分     68 non-null int64
班名次    68 non-null int64
dtypes: int64(5), object(1)
memory usage: 3.3+ KB


In [124]:
# 查看数据形状
grade.shape

(68, 6)

In [125]:
grade.shape[0]

68

In [126]:
grade.shape[1]

6

In [127]:
grade.index

RangeIndex(start=0, stop=68, step=1)

In [128]:
grade.columns

Index(['姓名', '语文', '数学', '英语', '总分', '班名次'], dtype='object')

In [129]:
grade.values

array([['杨璐', 131, 143, 144, 418, 1],
       ['王雪', 131, 135, 144, 410, 2],
       ['韩林霖', 127, 139, 142, 408, 3],
       ['沙龙逸', 123, 148, 136, 407, 4],
       ['李鉴学', 126, 135, 140, 401, 5],
       ['韩雨萌', 129, 133, 138, 400, 6],
       ['刘帅', 116, 143, 140, 399, 7],
       ['康惠雯', 114, 142, 139, 395, 8],
       ['刘钰婷', 115, 139, 135, 389, 9],
       ['林世博', 116, 142, 129, 387, 10],
       ['张希', 123, 130, 134, 387, 11],
       ['徐冲', 122, 124, 139, 385, 12],
       ['苑宇飞', 118, 136, 131, 385, 13],
       ['卢一凡', 121, 123, 139, 383, 14],
       ['张瑞鑫', 126, 115, 139, 380, 15],
       ['范作鑫', 121, 127, 131, 379, 16],
       ['裴子翔', 111, 139, 128, 378, 17],
       ['武传禹', 119, 129, 130, 378, 18],
       ['任雪桐', 124, 108, 144, 376, 19],
       ['刘姗', 124, 128, 122, 374, 20],
       ['王柏坤', 121, 123, 128, 372, 21],
       ['赵永刚', 116, 131, 122, 369, 22],
       ['张馨月大', 114, 124, 122, 360, 23],
       ['张曦月', 116, 123, 119, 358, 24],
       ['胡丁文', 116, 122, 118, 356, 25],
       ['赵美欣',

### 索引器

#### 基于标签的索引

In [130]:
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [131]:
df.loc['C','星座']

'双子座'

In [133]:
df.loc['B','身高']

'170cm'

In [135]:
# 同时索引出BC的身高和星座
df.loc['B':'C','星座':'身高']

Unnamed: 0,星座,身高
B,金牛座,170cm
C,双子座,175cm


In [136]:
df1 = df.reset_index()
df1

Unnamed: 0,index,年龄,星座,身高
0,A,30,白羊座,180cm
1,B,27,金牛座,170cm
2,C,32,双子座,175cm
3,D,25,巨蟹座,170cm
4,E,28,狮子座,160cm
5,F,31,双鱼座,165cm


In [137]:
df1.loc[1:2,'星座':'身高']

Unnamed: 0,星座,身高
1,金牛座,170cm
2,双子座,175cm


In [139]:
df.loc['A'::2,:] # 所有列

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
C,32,双子座,175cm
E,28,狮子座,160cm


In [140]:
df.loc['A'::2]

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
C,32,双子座,175cm
E,28,狮子座,160cm


In [141]:
df.loc[:,'年龄':'星座']# 所有行

Unnamed: 0,年龄,星座
A,30,白羊座
B,27,金牛座
C,32,双子座
D,25,巨蟹座
E,28,狮子座
F,31,双鱼座


In [142]:
df[::-1]

Unnamed: 0,年龄,星座,身高
F,31,双鱼座,165cm
E,28,狮子座,160cm
D,25,巨蟹座,170cm
C,32,双子座,175cm
B,27,金牛座,170cm
A,30,白羊座,180cm


In [144]:
df.loc[['A','C','D'],['年龄','身高']]

Unnamed: 0,年龄,身高
A,30,180cm
C,32,175cm
D,25,170cm


In [145]:
df.loc[['A','C','C','A']]

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
C,32,双子座,175cm
C,32,双子座,175cm
A,30,白羊座,180cm


#### 基于位置的索引

In [146]:
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [147]:
df.iloc[2,1]

'双子座'

In [148]:
df.iloc[:4,:2]

Unnamed: 0,年龄,星座
A,30,白羊座
B,27,金牛座
C,32,双子座
D,25,巨蟹座


In [149]:
df.iloc[:4:2,1:]

Unnamed: 0,星座,身高
A,白羊座,180cm
C,双子座,175cm


In [150]:
df

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm


In [151]:
df.iloc[[0,2,3,5],[0,1]]

Unnamed: 0,年龄,星座
A,30,白羊座
C,32,双子座
D,25,巨蟹座
F,31,双鱼座


#### 新增/删除

In [29]:
df['血型']=['A','AB','O','B','B','A']
df

Unnamed: 0,年龄,星座,身高,年龄的对数,血型
A,30,白羊座,180cm,3.401197,A
B,27,金牛座,170cm,3.295837,AB
C,32,双子座,175cm,3.465736,O
D,25,巨蟹座,170cm,3.218876,B
E,28,狮子座,160cm,3.332205,B
F,31,双鱼座,165cm,3.433987,A


In [30]:
df['年龄的对数']=np.log(df.年龄)
df

Unnamed: 0,年龄,星座,身高,年龄的对数,血型
A,30,白羊座,180cm,3.401197,A
B,27,金牛座,170cm,3.295837,AB
C,32,双子座,175cm,3.465736,O
D,25,巨蟹座,170cm,3.218876,B
E,28,狮子座,160cm,3.332205,B
F,31,双鱼座,165cm,3.433987,A


In [33]:
df.loc['G']=[20,'射手座','172cm',1,'O']

In [34]:
df

Unnamed: 0,年龄,星座,身高,年龄的对数,血型
A,30,白羊座,180cm,3.4012,A
B,27,金牛座,170cm,3.29584,AB
C,32,双子座,175cm,3.46574,O
D,25,巨蟹座,170cm,3.21888,B
E,28,狮子座,160cm,3.3322,B
F,31,双鱼座,165cm,3.43399,A
G,20,射手座,172cm,1.0,O


In [160]:
df.loc['G','年龄的对数'] = np.log(df.loc['G','年龄'])

In [161]:
df

Unnamed: 0,年龄,星座,身高,血型,年龄的对数
A,30,白羊座,180cm,A,3.401197
B,27,金牛座,170cm,AB,3.295837
C,32,双子座,175cm,O,3.465736
D,25,巨蟹座,170cm,B,3.218876
E,28,狮子座,160cm,B,3.332205
F,31,双鱼座,165cm,A,3.433987
G,20,射手座,172cm,O,2.995732


In [162]:
df.pop('年龄的对数')

A    3.401197
B    3.295837
C    3.465736
D    3.218876
E    3.332205
F    3.433987
G    2.995732
Name: 年龄的对数, dtype: float64

In [163]:
df

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,AB
C,32,双子座,175cm,O
D,25,巨蟹座,170cm,B
E,28,狮子座,160cm,B
F,31,双鱼座,165cm,A
G,20,射手座,172cm,O


In [164]:
# drop
df.drop('G')

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,AB
C,32,双子座,175cm,O
D,25,巨蟹座,170cm,B
E,28,狮子座,160cm,B
F,31,双鱼座,165cm,A


In [165]:
df.drop('血型',axis=1)

Unnamed: 0,年龄,星座,身高
A,30,白羊座,180cm
B,27,金牛座,170cm
C,32,双子座,175cm
D,25,巨蟹座,170cm
E,28,狮子座,160cm
F,31,双鱼座,165cm
G,20,射手座,172cm


In [166]:
df.drop(index=['E','G'])

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,AB
C,32,双子座,175cm,O
D,25,巨蟹座,170cm,B
F,31,双鱼座,165cm,A


In [169]:
df.drop(columns=['星座','血型'])

Unnamed: 0,年龄,身高
A,30,180cm
B,27,170cm
C,32,175cm
D,25,170cm
E,28,160cm
F,31,165cm
G,20,172cm


#### 更新数据

In [170]:
df

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,AB
C,32,双子座,175cm,O
D,25,巨蟹座,170cm,B
E,28,狮子座,160cm,B
F,31,双鱼座,165cm,A
G,20,射手座,172cm,O


In [171]:
df.loc['D','身高'] = '180cm'

In [172]:
df

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,AB
C,32,双子座,175cm,O
D,25,巨蟹座,180cm,B
E,28,狮子座,160cm,B
F,31,双鱼座,165cm,A
G,20,射手座,172cm,O


In [173]:
df.iloc[4,0]=30

In [174]:
df

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,AB
C,32,双子座,175cm,O
D,25,巨蟹座,180cm,B
E,30,狮子座,160cm,B
F,31,双鱼座,165cm,A
G,20,射手座,172cm,O


In [175]:
df.血型 = 'A'

In [176]:
df

Unnamed: 0,年龄,星座,身高,血型
A,30,白羊座,180cm,A
B,27,金牛座,170cm,A
C,32,双子座,175cm,A
D,25,巨蟹座,180cm,A
E,30,狮子座,160cm,A
F,31,双鱼座,165cm,A
G,20,射手座,172cm,A


In [181]:
# 全部代码行输出
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity='all'

#### isin组合

In [183]:
s = pd.Series(np.arange(5),index=np.arange(5)[::-1],dtype='int64')
s

4    0
3    1
2    2
1    3
0    4
dtype: int64

In [184]:
s.isin([2,4,6])

4    False
3    False
2     True
1    False
0     True
dtype: bool

In [185]:
s[s.isin([2,4,6])]

2    2
0    4
dtype: int64

In [187]:
s.index.isin([2,4,6])

array([ True, False,  True, False, False])

In [188]:
s[s.index.isin([2,4,6])]

4    0
2    2
dtype: int64

In [189]:
df1 = pd.DataFrame({'A':[1,2,3,4],
                   'B':['a','b','f','n'],
                   'C':['a','n','a','n']})
df1

Unnamed: 0,A,B,C
0,1,a,a
1,2,b,n
2,3,f,a
3,4,n,n


In [190]:
values = ['a','b',1,3]
df1.isin(values)

Unnamed: 0,A,B,C
0,True,True,True
1,False,True,False
2,True,False,True
3,False,False,False


In [192]:
df1.isin({'A':[1,4],'C':['c']}) # 内部值需要列表形式

Unnamed: 0,A,B,C
0,True,False,False
1,False,False,False
2,False,False,False
3,True,False,False


In [195]:
value = {'B':['a','b'],'C':['a','c'],'A':[1,3]}

In [199]:
row = df1.isin(value).all(axis=1)

In [200]:
df1[row]

Unnamed: 0,A,B,C
0,1,a,a


In [204]:
col = df1.isin(value).any()

In [206]:
col

A    True
B    True
C    True
dtype: bool

In [207]:
df1.loc[:,col]

Unnamed: 0,A,B,C
0,1,a,a
1,2,b,n
2,3,f,a
3,4,n,n


#### 随机抽样

In [208]:
s = pd.Series([0,1,2,3,4,5])
s

0    0
1    1
2    2
3    3
4    4
5    5
dtype: int64

In [212]:
s.sample() # 默认抽一个

2    2
dtype: int64

In [213]:
s.sample(n=3) # 无放回抽样

0    0
4    4
1    1
dtype: int64

In [216]:
s.sample(n=6)

1    1
0    0
2    2
5    5
4    4
3    3
dtype: int64

In [217]:
s.sample(frac=0.7) # 返回百分比，不能和n连用

5    5
1    1
4    4
0    0
dtype: int64

In [218]:
s.sample(n=6,replace=True) # 有放回抽样

2    2
0    0
5    5
5    5
5    5
4    4
dtype: int64

In [227]:
w = [0,0,0.1,0,0.8,0.1]  # 设置权重
s.sample(n=3,weights=w)

5    5
4    4
2    2
dtype: int64

In [228]:
df2 = pd.DataFrame({'col1':[9,8,7,6],'weight':[0.5,0.4,0.1,0]})
df2

Unnamed: 0,col1,weight
0,9,0.5
1,8,0.4
2,7,0.1
3,6,0.0


In [230]:
df2.sample(n=3,weights='weight') # 行采样

Unnamed: 0,col1,weight
0,9,0.5
1,8,0.4
2,7,0.1


In [234]:
df2.sample(n=1,axis=1) # 列采样

Unnamed: 0,col1
0,9
1,8
2,7
3,6


In [238]:
df2.sample(n=3,random_state=1) # 重现随机结果

Unnamed: 0,col1,weight
3,6,0.0
2,7,0.1
0,9,0.5


### 描述性统计

In [253]:
grade.head()

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
0,杨璐,131,143,144,418,1
1,王雪,131,135,144,410,2
2,韩林霖,127,139,142,408,3
3,沙龙逸,123,148,136,407,4
4,李鉴学,126,135,140,401,5


In [254]:
# 求和
grade.语文.sum()

7344

In [255]:
# 求平均
grade.数学.mean()

102.82352941176471

In [257]:
# 求数量
grade.班名次.count()

68

In [258]:
# max min
grade.语文.max()

131

In [259]:
grade.英语.min()

34

In [260]:
# 乘积
grade.语文.prod()

963770320257286144

In [261]:
# 众数
grade.数学.mode()

0    123
1    139
dtype: int64

In [263]:
# 数值计数，频数
grade.语文.value_counts().head()

116    5
114    4
105    3
101    3
121    3
Name: 语文, dtype: int64

In [265]:
# std.var
grade.英语.std()

30.289646524302064

In [266]:
grade.英语.var()

917.4626865671642

In [267]:
# describe查看基本统计指标
grade.describe()

Unnamed: 0,语文,数学,英语,总分,班名次
count,68.0,68.0,68.0,68.0,68.0
mean,108.0,102.823529,109.0,319.823529,34.5
std,14.519159,33.395835,30.289647,73.782241,19.77372
min,66.0,21.0,34.0,123.0,1.0
25%,100.5,85.75,100.25,286.75,17.75
50%,111.5,111.5,119.0,343.5,34.5
75%,118.0,127.25,130.25,378.0,51.25
max,131.0,148.0,144.0,418.0,68.0


In [268]:
grade.describe(include='object')

Unnamed: 0,姓名
count,68
unique,68
top,郭娜
freq,1


In [269]:
grade.describe(include='all')

Unnamed: 0,姓名,语文,数学,英语,总分,班名次
count,68,68.0,68.0,68.0,68.0,68.0
unique,68,,,,,
top,郭娜,,,,,
freq,1,,,,,
mean,,108.0,102.823529,109.0,319.823529,34.5
std,,14.519159,33.395835,30.289647,73.782241,19.77372
min,,66.0,21.0,34.0,123.0,1.0
25%,,100.5,85.75,100.25,286.75,17.75
50%,,111.5,111.5,119.0,343.5,34.5
75%,,118.0,127.25,130.25,378.0,51.25


In [270]:
# value_counts
grade.语文.value_counts().head()

116    5
114    4
105    3
101    3
121    3
Name: 语文, dtype: int64

In [271]:
# 不同值出现的频率
grade.语文.value_counts(normalize=True)

116    0.073529
114    0.058824
105    0.044118
101    0.044118
121    0.044118
118    0.044118
112    0.044118
107    0.029412
109    0.029412
110    0.029412
99     0.029412
115    0.029412
126    0.029412
131    0.029412
123    0.029412
124    0.029412
103    0.029412
85     0.014706
86     0.014706
87     0.014706
79     0.014706
83     0.014706
82     0.014706
94     0.014706
78     0.014706
75     0.014706
66     0.014706
90     0.014706
127    0.014706
95     0.014706
96     0.014706
97     0.014706
98     0.014706
106    0.014706
108    0.014706
111    0.014706
113    0.014706
117    0.014706
119    0.014706
122    0.014706
129    0.014706
Name: 语文, dtype: float64

In [272]:
# 升序
grade.语文.value_counts(ascending=True)

129    1
122    1
119    1
117    1
113    1
111    1
108    1
106    1
98     1
97     1
96     1
95     1
127    1
90     1
66     1
75     1
78     1
94     1
82     1
83     1
79     1
87     1
86     1
85     1
103    2
124    2
123    2
131    2
126    2
115    2
99     2
110    2
109    2
107    2
112    3
118    3
121    3
101    3
105    3
114    4
116    5
Name: 语文, dtype: int64

In [274]:
# 分箱，等距分箱
grade.语文.value_counts(bins=5)

(105.0, 118.0]    28
(118.0, 131.0]    15
(92.0, 105.0]     15
(79.0, 92.0]       6
(65.934, 79.0]     4
Name: 语文, dtype: int64

In [275]:
# 默认不对空值进行计数
grade.语文[grade.语文<80]=np.nan

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [276]:
grade.语文

0     131.0
1     131.0
2     127.0
3     123.0
4     126.0
5     129.0
6     116.0
7     114.0
8     115.0
9     116.0
10    123.0
11    122.0
12    118.0
13    121.0
14    126.0
15    121.0
16    111.0
17    119.0
18    124.0
19    124.0
20    121.0
21    116.0
22    114.0
23    116.0
24    116.0
25    118.0
26    112.0
27    109.0
28    114.0
29    110.0
      ...  
38    115.0
39    101.0
40     98.0
41    103.0
42     96.0
43    105.0
44    118.0
45    101.0
46    112.0
47    109.0
48    107.0
49     94.0
50    107.0
51    106.0
52    105.0
53    101.0
54     85.0
55     95.0
56     97.0
57     99.0
58     83.0
59     87.0
60     82.0
61      NaN
62     99.0
63     90.0
64      NaN
65     86.0
66      NaN
67      NaN
Name: 语文, Length: 68, dtype: float64

In [278]:
grade.语文.value_counts(dropna=False) # 统计空值的数量

 116.0    5
NaN       4
 114.0    4
 118.0    3
 105.0    3
 101.0    3
 121.0    3
 112.0    3
 131.0    2
 110.0    2
 124.0    2
 103.0    2
 109.0    2
 107.0    2
 115.0    2
 99.0     2
 126.0    2
 123.0    2
 129.0    1
 111.0    1
 119.0    1
 127.0    1
 122.0    1
 86.0     1
 113.0    1
 90.0     1
 108.0    1
 98.0     1
 96.0     1
 94.0     1
 106.0    1
 85.0     1
 95.0     1
 97.0     1
 83.0     1
 87.0     1
 82.0     1
 117.0    1
Name: 语文, dtype: int64