# How to calculate summary statistics?

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Titanic passenger data shipped with the pandas documentation.
# The URL targets the repository's `main` branch: the former `master` branch
# name has been retired, and raw file URLs are not guaranteed to redirect
# after a branch rename.
TITANIC_CSV_URL = (
    "https://raw.githubusercontent.com/pandas-dev/pandas/main/doc/data/titanic.csv"
)
titanic = pd.read_csv(TITANIC_CSV_URL)
# Preview the first rows to confirm the load and inspect the columns.
titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [6]:
# What is the average age of the Titanic passengers?
# Rows with a missing age are left out of the average.
titanic["Age"].mean()

29.69911764705882

In [9]:
# What is the median age and ticket fare price of the Titanic passengers?
# Selecting several columns with a list yields a DataFrame, so the median is
# computed per column.
(
    titanic[["Age", "Fare"]]
    .median()
)

Age     28.0000
Fare    14.4542
dtype: float64

In [10]:
# Summary table (count, mean, std, min, quartiles, max) for age and fare.
(
    titanic[["Age", "Fare"]]
    .describe()
)

Unnamed: 0,Age,Fare
count,714.0,891.0
mean,29.699118,32.204208
std,14.526497,49.693429
min,0.42,0.0
25%,20.125,7.9104
50%,28.0,14.4542
75%,38.0,31.0
max,80.0,512.3292


In [14]:
# (number of rows, number of columns) of the full data set.
titanic.shape

(891, 12)

In [15]:
# Per-column aggregations: each key names a column and each list holds the
# statistics to compute for it. A statistic that was not requested for a
# column shows up as NaN in the combined table.
titanic.agg(
    {
        "Age": ["min", "max", "median", "skew"],
        "Fare": ["min", "max", "median", "mean"],
    }
)

Unnamed: 0,Age,Fare
max,80.0,512.3292
mean,,32.204208
median,28.0,14.4542
min,0.42,0.0
skew,0.389108,


In [17]:
# Skewness of the age distribution, computed directly on the column.
titanic["Age"].skew()

0.38910778230082704

### Aggregating statistics grouped by category

In [18]:
# Number of distinct age values recorded in the data set.
titanic["Age"].nunique()

88

In [19]:
# Inspect the raw age column; missing ages show up as NaN.
titanic["Age"]

0      22.0
1      38.0
2      26.0
3      35.0
4      35.0
       ... 
886    27.0
887    19.0
888     NaN
889    26.0
890    32.0
Name: Age, Length: 891, dtype: float64

In [20]:
# What is the average age for male versus female Titanic passengers?
# Keep only the two relevant columns, split the rows by sex, then average.
(
    titanic[["Sex", "Age"]]
    .groupby("Sex")
    .mean()
)

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,27.915709
male,30.726645


In [23]:
# Oldest recorded age for each sex.
(
    titanic[["Sex", "Age"]]
    .groupby("Sex")
    .max()
)

Unnamed: 0_level_0,Age
Sex,Unnamed: 1_level_1
female,63.0
male,80.0


In [24]:
# Average of every numeric column for each sex.
# numeric_only=True restricts the mean to numeric columns. Without it,
# pandas >= 2.0 also tries to average text columns such as Name and Ticket
# and raises a TypeError.
titanic.groupby("Sex").mean(numeric_only=True)

Unnamed: 0_level_0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,431.028662,0.742038,2.159236,27.915709,0.694268,0.649682,44.479818
male,454.147314,0.188908,2.389948,30.726645,0.429809,0.235702,25.523893


In [25]:
# It does not make much sense to average Pclass. When only the average age
# for each gender is of interest, the usual column selection with square
# brackets [] is supported on the grouped data as well:
(
    titanic
    .groupby("Sex")["Age"]
    .mean()
)

Sex
female    27.915709
male      30.726645
Name: Age, dtype: float64

In [26]:
# What is the mean ticket fare price for each of the sex and cabin class
# combinations? Grouping by two keys gives a result indexed by both.
(
    titanic
    .groupby(["Sex", "Pclass"])["Fare"]
    .mean()
)

Sex     Pclass
female  1         106.125798
        2          21.970121
        3          16.118810
male    1          67.226127
        2          19.741782
        3          12.661633
Name: Fare, dtype: float64

In [29]:
# Mean fare and age for every sex / survival combination.
# Several columns must be selected from a grouped frame with a list (double
# brackets). The bare tuple form ["Fare", "Age"] is deprecated and raises an
# error on pandas >= 2.0.
titanic.groupby(["Sex", "Survived"])[["Fare", "Age"]].mean()

  titanic.groupby(["Sex", "Survived"])["Fare","Age"].mean()


Unnamed: 0_level_0,Unnamed: 1_level_0,Fare,Age
Sex,Survived,Unnamed: 2_level_1,Unnamed: 3_level_1
female,0,23.024385,25.046875
female,1,51.938573,28.847716
male,0,21.960993,31.618056
male,1,40.821484,27.276022


### Count number of records by category

In [30]:
# Number of passengers in each cabin class, most frequent class first.
titanic["Pclass"].value_counts()

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [33]:
# Number of passengers of each sex, most frequent value first.
titanic["Sex"].value_counts()

male      577
female    314
Name: Sex, dtype: int64

#### `value_counts()` is a shortcut: it is equivalent to a groupby operation combined with counting the number of records within each group:

In [34]:
# Same counts as value_counts, spelled out as an explicit groupby + count.
# The result is ordered by the group key (class 1, 2, 3).
(
    titanic
    .groupby("Pclass")["Pclass"]
    .count()
)

Pclass
1    216
2    184
3    491
Name: Pclass, dtype: int64

In [35]:
# Number of records for every cabin class / sex combination.
# Several columns must be selected from a grouped frame with a list (double
# brackets). The bare tuple form ["Pclass", "Sex"] is deprecated and raises
# an error on pandas >= 2.0.
titanic.groupby(["Pclass", "Sex"])[["Pclass", "Sex"]].count()

  titanic.groupby(["Pclass","Sex"])["Pclass","Sex"].count()


Unnamed: 0_level_0,Unnamed: 1_level_0,Pclass,Sex
Pclass,Sex,Unnamed: 2_level_1,Unnamed: 3_level_1
1,female,94,94
1,male,122,122
2,female,76,76
2,male,108,108
3,female,144,144
3,male,347,347


# Remember

### Aggregation statistics can be calculated on entire columns or rows

### groupby provides the power of the split-apply-combine pattern

### value_counts is a convenient shortcut to count the number of entries in each category of a variable