In [34]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt

In [2]:
# Load the passenger data (path is relative to the working directory)
# and preview the first three rows.
titanic = pd.read_csv('./titanic.csv')
titanic.head(n=3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


## Summary statistics

### Summarizing numerical data
- .mean()
- .median()
- .min()
- .max()
- .var()
- .std()
- .sum()
- .quantile()

In [3]:
# Arithmetic mean of the Age column.
titanic.Age.mean()

29.69911764705882

In [4]:
# Most frequent Age value(s); the result is a Series, not a scalar.
titanic.Age.mode()

0    24.0
dtype: float64

In [5]:
# Smallest recorded age.
titanic['Age'].min()

0.42

In [6]:
# Largest recorded age.
titanic['Age'].max()

80.0

In [7]:
# Unbiased (sample) variance; ddof=1 is the pandas default, spelled out here.
titanic.Age.var(ddof=1)

211.0191247463081

In [8]:
# Value at the requested quantile; q defaults to 0.5 (the median), made explicit here.
titanic.Age.quantile(q=0.5)

28.0

In [9]:
# Standard deviation of the Age column.
titanic.Age.std()

14.526497332334044

In [10]:
# Total of all recorded ages.
titanic.Age.sum()

21205.17

### summarizing dates

### .agg() method

##### on Single column

In [11]:
def pct30(column):
    """Return the 30th percentile of ``column`` using its ``quantile`` method."""
    cutoff = 0.3
    return column.quantile(cutoff)

In [12]:
# Summarize a single column by handing a plain named function to .agg().
titanic.Age.agg(pct30)

22.0

In [13]:
# Same 30th-percentile summary, this time with an inline lambda.
titanic.Age.agg(lambda col: col.quantile(0.3))

22.0

##### on multiple column

In [14]:
# Apply the same summary to several columns at once; one result per column.
titanic.loc[:, ['Age', 'Fare']].agg(lambda col: col.quantile(0.3))

Age     22.00
Fare     8.05
dtype: float64

##### multiple summaries

In [15]:
def pct30(column):
    """Return the 30th percentile of ``column`` using its ``quantile`` method."""
    cutoff = 0.3
    return column.quantile(cutoff)


def pct40(column):
    """Return the 40th percentile of ``column`` using its ``quantile`` method."""
    cutoff = 0.4
    return column.quantile(cutoff)

In [16]:
# A list of functions yields one labelled result per function.
titanic.Age.agg([pct30, pct40])

pct30    22.0
pct40    25.0
Name: Age, dtype: float64

### cumulative statistics
- .cumsum()
- .cummax()
- .cummin()
- .cumprod()

In [17]:
# Running total of Age, shown as a one-column frame (first four rows).
titanic['Age'].cumsum().to_frame().head(4)

Unnamed: 0,Age
0,22.0
1,60.0
2,86.0
3,121.0


## Counting

#### Dropping duplicate rows

In [18]:
# Keep only the first row seen for each distinct Pclass value.
titanic.drop_duplicates(subset=["Pclass"])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


In [19]:
# Keep only the first row seen for each distinct (Pclass, SibSp) pair.
titanic.drop_duplicates(subset=['Pclass', 'SibSp'])

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.075,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C
15,16,1,2,"Hewlett, Mrs. (Mary D Kingcome)",female,55.0,0,0,248706,16.0,,S
16,17,0,3,"Rice, Master. Eugene",male,2.0,4,1,382652,29.125,,Q
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,C23 C25 C27,S
38,39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,,S


#### .value_counts()

In [20]:
# How many passengers share each age, as a one-column frame.
titanic['Age'].value_counts().to_frame()

Unnamed: 0,Age
24.00,30
22.00,27
18.00,26
19.00,25
30.00,25
...,...
55.50,1
70.50,1
66.00,1
23.50,1


In [21]:
# sort=True orders by frequency; it is the pandas default, written out here.
titanic['Age'].value_counts(sort=True).to_frame()

Unnamed: 0,Age
24.00,30
22.00,27
18.00,26
19.00,25
30.00,25
...,...
55.50,1
70.50,1
66.00,1
23.50,1


In [22]:
# normalize=True reports proportions instead of raw counts.
titanic['Age'].value_counts(normalize=True).to_frame()

Unnamed: 0,Age
24.00,0.042017
22.00,0.037815
18.00,0.036415
19.00,0.035014
30.00,0.035014
...,...
55.50,0.001401
70.50,0.001401
66.00,0.001401
23.50,0.001401


## Group summary statistics

In [23]:
# Mean age of male passengers: row mask and column picked in a single .loc step.
titanic.loc[titanic['Sex'] == 'male', 'Age'].mean()

30.72664459161148

In [24]:
# Mean age of female passengers: row mask and column picked in a single .loc step.
titanic.loc[titanic['Sex'] == 'female', 'Age'].mean()

27.915708812260537

In [25]:
# One groupby replaces the two manual filters: mean age per sex.
titanic.groupby(by='Sex')['Age'].mean()

Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [26]:
# Grouping by several keys: Age count for every (Survived, Sex) pair.
titanic.groupby(by=['Survived', 'Sex'])['Age'].count()

Survived  Sex   
0         female     64
          male      360
1         female    197
          male       93
Name: Age, dtype: int64

In [27]:
# Several statistics at once: one output column per name in the list.
titanic.groupby(by='Sex')['Age'].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,count,min,max
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,261,0.75,63.0
male,453,0.42,80.0


In [28]:
# Mean of two columns for every (Survived, Sex) pair.
titanic.groupby(by=['Survived', 'Sex'])[['Age', 'SibSp']].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,SibSp
Survived,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
0,female,25.046875,1.209877
0,male,31.618056,0.440171
1,female,28.847716,0.515021
1,male,27.276022,0.385321


In [29]:
# Several statistics for several columns: the result has two-level column labels.
titanic.groupby(by=['Survived', 'Sex'])[['Age', 'SibSp']].agg(['count', 'min', 'max'])

Unnamed: 0_level_0,Unnamed: 1_level_0,Age,Age,Age,SibSp,SibSp,SibSp
Unnamed: 0_level_1,Unnamed: 1_level_1,count,min,max,count,min,max
Survived,Sex,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
0,female,64,2.0,57.0,81,0,8
0,male,360,1.0,74.0,468,0,8
1,female,197,0.75,63.0,233,0,4
1,male,93,0.42,80.0,109,0,4


## Pivot tables
**Signature**:
titanic.pivot_table(
    values=None,
    index=None,
    columns=None,
    aggfunc='mean',
    fill_value=None,
    margins=False,
    dropna=True,
    margins_name='All',
    observed=False,
)

In [31]:
# Groupby baseline to compare with the pivot-table cells that follow.
titanic.groupby(by='Sex')['Age'].mean()

Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [32]:
# Pivot without passing aggfunc: the default aggfunc='mean' is used.
titanic.pivot_table(index='Sex', values='Age')

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.915709
male,30.726645


In [40]:
# Explicitly choose the statistic, here the median.
# The string alias is used instead of np.median: pandas >= 2.1 emits a
# FutureWarning when a NumPy callable is passed as aggfunc, and the alias
# keeps the current behaviour.
titanic.pivot_table(values='Age', index='Sex', aggfunc='median')

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.0
male,29.0


In [41]:
# Multiple statistics: one top-level column group per aggregation name.
# String aliases replace np.std / np.median. pandas currently maps np.std to
# its own sample std (ddof=1), but a future version will call the NumPy
# function directly (ddof=0), which would change the numbers shown here.
titanic.pivot_table(values='Age', index='Sex', aggfunc=['std', 'median'])

Unnamed: 0_level_0,std,median
Unnamed: 0_level_1,Age,Age
Sex,Unnamed: 1_level_2,Unnamed: 2_level_2
female,14.110146,27.0
male,14.678201,29.0


#### pivot on two variables

In [49]:
# The groupby equivalent, kept for comparison:
#
# titanic.groupby(['Survived','Sex'])['Age'].mean().unstack()

# Pivot on two variables: Sex down the rows, Survived across the columns.
titanic.pivot_table(index='Sex', columns='Survived', values='Age')

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,25.046875,28.847716
male,31.618056,27.276022


#### filling missing values in pivot table

In [50]:
# fill_value replaces any empty cell of the pivot with 0.
titanic.pivot_table(
    index='Sex',
    columns='Survived',
    values='Age',
    fill_value=0,
)

Survived,0,1
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1
female,25.046875,28.847716
male,31.618056,27.276022


#### adding margins (overall summaries) to a pivot table

In [51]:
# margins=True appends an extra row and column (labelled 'All' by default)
# that hold the aggregate over each full column/row.
titanic.pivot_table(
    index='Sex',
    columns='Survived',
    values='Age',
    margins=True,
    fill_value=0,
)

Survived,0,1,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,25.046875,28.847716,27.915709
male,31.618056,27.276022,30.726645
All,30.626179,28.34369,29.699118


In [52]:
# Same table with margins, but margins_name relabels the extra row/column.
titanic.pivot_table(
    index='Sex',
    columns='Survived',
    values='Age',
    margins=True,
    margins_name='mean',
    fill_value=0,
)

Survived,0,1,mean
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,25.046875,28.847716,27.915709
male,31.618056,27.276022,30.726645
mean,30.626179,28.34369,29.699118
