# pandas功能

In [4]:
import pandas as pd

## 基本功能

In [6]:
# 重新索引
s=pd.Series([10,20,30],index=['a','b','c'])
s.reindex(['b','c','a'])

b    20
c    30
a    10
dtype: int64

In [7]:
s.reindex(['b','c','a','d'])

b    20.0
c    30.0
a    10.0
d     NaN
dtype: float64

In [8]:
s.reindex(['b','c','a','d'],fill_value=0)

b    20
c    30
a    10
d     0
dtype: int64

In [9]:
# 丢弃指定轴上的项
df=pd.DataFrame([[1,2],[3,4]],index=['a','b'],columns=['c','d'])
df

Unnamed: 0,c,d
a,1,2
b,3,4


In [11]:
# 删除行
df.drop('a')

Unnamed: 0,c,d
b,3,4


In [12]:
# 删除列
df.drop('c',axis=1)

Unnamed: 0,d
a,2
b,4


In [13]:
df.drop(columns=['c'])

Unnamed: 0,d
a,2
b,4


In [14]:
# 原地修改
df.drop('a',inplace=True)

In [15]:
df

Unnamed: 0,c,d
b,3,4


### 索引、取值、过滤

In [16]:
df=pd.DataFrame([[1,2],[3,4]],index=['a','b'],columns=['c','d'])

In [17]:
df

Unnamed: 0,c,d
a,1,2
b,3,4


In [19]:
df['c']

a    1
b    3
Name: c, dtype: int64

In [20]:
# 标签索引
df.loc['a']

c    1
d    2
Name: a, dtype: int64

In [21]:
df.iloc[0]

c    1
d    2
Name: a, dtype: int64

In [22]:
df=pd.read_csv('../../data/titanic.csv')

In [23]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [24]:
# 取第一行
df.loc[0]

PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                               22.0
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

In [25]:
# 取出[0:1)行的指定列
df.loc[0:1,'Name':'Age']

Unnamed: 0,Name,Sex,Age
0,"Braund, Mr. Owen Harris",male,22.0
1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0


In [26]:
df.iloc[0:1,0:2]

Unnamed: 0,PassengerId,Survived
0,1,0


In [29]:
# 条件过滤
df[df['Age']>70]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.75,,Q
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0,A23,S
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.775,,S


In [31]:
df[(df['Age']>70)& (df['Sex']=='female')]

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked


In [32]:
# 算数运算和数据对齐
s1 = pd.Series([1, 2, 3], index=['a', 'b', 'c'])
s2 = pd.Series([10, 20], index=['b', 'c'])
s1

a    1
b    2
c    3
dtype: int64

In [33]:
s2

b    10
c    20
dtype: int64

In [34]:
s1+s2

a     NaN
b    12.0
c    23.0
dtype: float64

In [35]:
# 填充缺失值
s1.add(s2,fill_value=0)

a     1.0
b    12.0
c    23.0
dtype: float64

In [41]:
# 函数应用和映射
df[['Age']].apply(sum)


Age   NaN
dtype: float64

In [42]:
s = pd.Series([1, 2, 3])
s.map(lambda x: x * 10)

0    10
1    20
2    30
dtype: int64

In [48]:
# 排序和排名
df.sort_index(axis=1)

Unnamed: 0,Age,Cabin,Embarked,Fare,Name,Parch,PassengerId,Pclass,Sex,SibSp,Survived,Ticket
0,22.0,,S,7.2500,"Braund, Mr. Owen Harris",0,1,3,male,1,0,A/5 21171
1,38.0,C85,C,71.2833,"Cumings, Mrs. John Bradley (Florence Briggs Th...",0,2,1,female,1,1,PC 17599
2,26.0,,S,7.9250,"Heikkinen, Miss. Laina",0,3,3,female,0,1,STON/O2. 3101282
3,35.0,C123,S,53.1000,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",0,4,1,female,1,1,113803
4,35.0,,S,8.0500,"Allen, Mr. William Henry",0,5,3,male,0,0,373450
...,...,...,...,...,...,...,...,...,...,...,...,...
886,27.0,,S,13.0000,"Montvila, Rev. Juozas",0,887,2,male,0,0,211536
887,19.0,B42,S,30.0000,"Graham, Miss. Margaret Edith",0,888,1,female,0,1,112053
888,,,S,23.4500,"Johnston, Miss. Catherine Helen ""Carrie""",2,889,3,female,1,0,W./C. 6607
889,26.0,C148,C,30.0000,"Behr, Mr. Karl Howell",0,890,1,male,0,1,111369


In [49]:
df.sort_values(by='Age', ascending=False)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
630,631,1,1,"Barkworth, Mr. Algernon Henry Wilson",male,80.0,0,0,27042,30.0000,A23,S
851,852,0,3,"Svensson, Mr. Johan",male,74.0,0,0,347060,7.7750,,S
493,494,0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
96,97,0,1,"Goldschmidt, Mr. George B",male,71.0,0,0,PC 17754,34.6542,A5,C
116,117,0,3,"Connors, Mr. Patrick",male,70.5,0,0,370369,7.7500,,Q
...,...,...,...,...,...,...,...,...,...,...,...,...
859,860,0,3,"Razi, Mr. Raihed",male,,0,0,2629,7.2292,,C
863,864,0,3,"Sage, Miss. Dorothy Edith ""Dolly""",female,,8,2,CA. 2343,69.5500,,S
868,869,0,3,"van Melkebeke, Mr. Philemon",male,,0,0,345777,9.5000,,S
878,879,0,3,"Laleff, Mr. Kristo",male,,0,0,349217,7.8958,,S


In [51]:
s = pd.Series([100, 200, 100, 300])
s.rank()

0    1.5
1    3.0
2    1.5
3    4.0
dtype: float64

In [54]:
# pandas 允许 index 重复
s = pd.Series([1, 2, 3], index=['a', 'a', 'b'])
s

a    1
a    2
b    3
dtype: int64

In [55]:
s.index.is_unique

False

In [56]:
s['a']

a    1
a    2
dtype: int64

In [57]:
s[~s.index.duplicated()]

a    1
b    3
dtype: int64

## 汇总和计算描述统计

In [59]:
df['Age'].count()

np.int64(714)

In [60]:
df.count()

PassengerId    891
Survived       891
Pclass         891
Name           891
Sex            891
Age            714
SibSp          891
Parch          891
Ticket         891
Fare           891
Cabin          204
Embarked       889
dtype: int64

In [61]:
df['Age'].describe()

count    714.000000
mean      29.699118
std       14.526497
min        0.420000
25%       20.125000
50%       28.000000
75%       38.000000
max       80.000000
Name: Age, dtype: float64

In [62]:
df['Age'].min()

np.float64(0.42)

In [63]:
df['Age'].max()

np.float64(80.0)

In [64]:
df['Fare'].argmax()

np.int64(258)

In [65]:
df.iloc[df['Fare'].argmax()]

PassengerId                 259
Survived                      1
Pclass                        1
Name           Ward, Miss. Anna
Sex                      female
Age                        35.0
SibSp                         0
Parch                         0
Ticket                 PC 17755
Fare                   512.3292
Cabin                       NaN
Embarked                      C
Name: 258, dtype: object

In [66]:
df['Age'].idxmax()

630

In [69]:
df.iloc[630]

PassengerId                                     631
Survived                                          1
Pclass                                            1
Name           Barkworth, Mr. Algernon Henry Wilson
Sex                                            male
Age                                            80.0
SibSp                                             0
Parch                                             0
Ticket                                        27042
Fare                                           30.0
Cabin                                           A23
Embarked                                          S
Name: 630, dtype: object

In [70]:
df['Age'].quantile(0.5)

np.float64(28.0)

In [71]:
df['Age'].quantile([0.25,0.5,0.75])

0.25    20.125
0.50    28.000
0.75    38.000
Name: Age, dtype: float64

In [72]:
df["Survived"].sum()

np.int64(342)

In [73]:
df["Age"].mean()
df["Fare"].mean()

np.float64(32.204207968574636)

In [74]:
df["Age"].median()

np.float64(28.0)

In [76]:
df["Age"].var()

np.float64(211.01912474630802)

In [77]:
df["Age"].std()

np.float64(14.526497332334042)

In [78]:
df["Survived"].cumsum()

0        0
1        1
2        2
3        3
4        3
      ... 
886    340
887    341
888    341
889    342
890    342
Name: Survived, Length: 891, dtype: int64

In [79]:
df.groupby("Pclass")["Survived"].mean()

Pclass
1    0.629630
2    0.472826
3    0.242363
Name: Survived, dtype: float64