# Chapter 5 - Getting Started with `pandas`

## 5.3 Summarizing and Computing Descriptive Statistics

In [1]:
import pandas as pd
import numpy as np

`Series` and `DataFrame` come with a set of built-in mathematical and statistical methods. Most of these fall into the category of reductions or summary statistics: methods that extract a single value (like the sum or mean) from a Series, or a Series of values from the rows or columns of a DataFrame.

Some common methods include:

- `df.sum()`

- `df.idxmax()`

- `df.idxmin()`

- `df.cumsum()`

- `df.describe()`

In [2]:
# Pin the column dtypes with Python's built-in types. The NumPy aliases
# np.str, np.int and np.float were deprecated in NumPy 1.20 and removed in
# 1.24, so referencing them raises AttributeError on current NumPy. They were
# plain aliases of str / int / float, so the parsed frame is unchanged.
df = pd.read_csv('dataset-D-wines.csv', dtype={'id': str, 'points': int, 'price': float})
# include='all' also summarises the non-numeric columns (count / unique / top / freq)
display(df.describe(include='all'))

Unnamed: 0,id,variety,points,price
count,20.0,20,20.0,18.0
unique,20.0,15,,
top,28236.0,Pinot Noir,,
freq,1.0,4,,
mean,,,88.05,30.5
std,,,2.910507,18.712217
min,,,82.0,10.0
25%,,,85.0,17.75
50%,,,88.5,22.0
75%,,,90.0,36.0


In [3]:
# sum() reduces each selected column to its total and returns the totals
# as a Series indexed by column name
df.loc[:, ['points', 'price']].sum()

points    1761.0
price      549.0
dtype: float64

In [4]:
# NaN values are skipped by default; with skipna=False a single NaN in a
# column is enough to make that column's sum NaN
df.loc[:, ['points', 'price']].sum(skipna=False)

points    1761.0
price        NaN
dtype: float64

In [5]:
df2 = pd.read_csv('dataset-C-enrolment.csv')
# Keep only the rows where sex == 'MF' and the three columns of interest,
# then make the year the row index.
# A single .loc[mask, columns] call replaces the chained df[mask][cols]
# selection, and set_index() moves 'year' out of the columns and into the
# index in one step, so no manual index assignment or inplace drop is needed.
df2 = df2.loc[df2['sex'] == 'MF', ['year', 'intake', 'graduates']].set_index('year')
display(df2.T)

year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
intake,220,238,332,356,383,350,364,374,403,397,405,399,391
graduates,187,204,207,209,207,227,329,347,368,356,355,351,375


In [6]:
# After transposing, each row holds one measure; summing along the columns
# (axis=1, spelled 'columns' here) yields one total per row
print(df2.T.sum(axis='columns'))

intake       4612
graduates    3722
dtype: int64


In [7]:
# With axis=1, sum() adds across the columns and returns one total per row
# (here points + price for each displayed row), not column sums.
# .loc[:2] slices by label, so the end label 2 is included.
display(df.loc[:2,['points', 'price']])
df.loc[:2,['points', 'price']].sum(axis=1)

Unnamed: 0,points,price
0,88,36.0
1,85,50.0
2,85,10.0


0    124.0
1    135.0
2     95.0
dtype: float64

In [8]:
df3 = pd.read_csv('dataset-D1-wines.csv')
display(df3)

# idxmax() and idxmin() return, for each column, the index label of the row
# holding the maximum and minimum value, respectively.
# A single print with a blank-line separator emits the same text as printing
# the two results with an empty print() in between.
print(
    df3[['points', 'price']].idxmax(),
    df3[['points', 'price']].idxmin(),
    sep='\n\n',
)

Unnamed: 0,id,variety,points,price
0,146568,Chardonnay,88,12.0
1,99586,Cabernet Sauvignon,92,65.0
2,74081,Aglianico,90,
3,49142,Marzemino,90,75.0
4,86968,Nebbiolo,91,


points    1
price     3
dtype: int64

points    0
price     0
dtype: int64


In [9]:
# cumsum() keeps a running total down the rows of each column: every row
# shows the sum of that column from the first row up to and including it.
# display() renders each argument in turn: the original frame, then the totals.
display(
    df2,
    df2.cumsum(),
)

Unnamed: 0_level_0,intake,graduates
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2005,220,187
2006,238,204
2007,332,207
2008,356,209
2009,383,207
2010,350,227
2011,364,329
2012,374,347
2013,403,368
2014,397,356


Unnamed: 0_level_0,intake,graduates
year,Unnamed: 1_level_1,Unnamed: 2_level_1
2005,220,187
2006,458,391
2007,790,598
2008,1146,807
2009,1529,1014
2010,1879,1241
2011,2243,1570
2012,2617,1917
2013,3020,2285
2014,3417,2641


In [10]:
# With axis=1 (spelled 'columns' here) the running total moves left to right
# across the columns of each row instead of down the rows
display(
    df2.T,
    df2.T.cumsum(axis='columns'),
)

year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
intake,220,238,332,356,383,350,364,374,403,397,405,399,391
graduates,187,204,207,209,207,227,329,347,368,356,355,351,375


year,2005,2006,2007,2008,2009,2010,2011,2012,2013,2014,2015,2016,2017
intake,220,458,790,1146,1529,1879,2243,2617,3020,3417,3822,4221,4612
graduates,187,391,598,807,1014,1241,1570,1917,2285,2641,2996,3347,3722


In [11]:
# describe() produces summary statistics per column. By default only numeric
# columns are summarised; include='all' adds the non-numeric columns as well
# (reported with count / unique / top / freq).
display(
    df2.describe(),
    df.describe(include='all'),
)

Unnamed: 0,intake,graduates
count,13.0,13.0
mean,354.769231,286.307692
std,60.204033,77.737148
min,220.0,187.0
25%,350.0,207.0
50%,374.0,329.0
75%,397.0,355.0
max,405.0,375.0


Unnamed: 0,id,variety,points,price
count,20.0,20,20.0,18.0
unique,20.0,15,,
top,28236.0,Pinot Noir,,
freq,1.0,4,,
mean,,,88.05,30.5
std,,,2.910507,18.712217
min,,,82.0,10.0
25%,,,85.0,17.75
50%,,,88.5,22.0
75%,,,90.0,36.0


### Correlation and Covariance

Correlation and covariance are summary statistics computed from pairs of arguments, for example two columns of a DataFrame or two Series.

In [12]:
wines_df = pd.read_csv('dataset-D-wines.csv')
# DataFrame.corr() gives the pairwise correlation matrix of the selected columns
display(wines_df[['points', 'price']].corr())

loans_df = pd.read_csv('dataset-A-loans.csv', index_col=0)
# Series.corr(other) returns a single number: the correlation coefficient
# between the two Series (Pearson by default), not one value per pair of entries
display(loans_df['loan_amnt'].corr(loans_df['int_rate']))

Unnamed: 0,points,price
points,1.0,0.53859
price,0.53859,1.0


-0.5115868676889116

In [13]:
# DataFrame.cov() gives the pairwise covariance matrix of the selected columns
display(wines_df[['points', 'price']].cov())

# Series.cov(other) returns a single number: the covariance of the two Series.
# loans_df is already loaded by the correlation cell above, so it is reused
# here instead of being read from disk a second time.
display(loans_df['loan_amnt'].cov(loans_df['int_rate']))

Unnamed: 0,points,price
points,8.471053,29.647059
price,29.647059,350.147059


-20645.0

In [14]:
# DataFrame.corrwith(other) correlates every column of the frame with `other`
# (a Series or another DataFrame) and returns one coefficient per column
loans_df.loc[:, ['loan_amnt', 'int_rate']].corrwith(loans_df.loc[:, 'int_rate'])

loan_amnt   -0.511587
int_rate     1.000000
dtype: float64

### Unique Values, Value Counts, and Membership

These methods deal with the distinct values a `Series` contains (its cardinality). The relevant methods are:

- `Series.unique()`

- `Series.value_counts()`

- `Series.isin()`

In [15]:
# Load the loans sample, using the first CSV column as the row index
df4 = pd.read_csv(
    'dataset-A-loans.csv',
    index_col=0,
)
display(df4)

Unnamed: 0,loan_amnt,int_rate,term,grade
48304290,30000.0,8.18,36 months,B
49904421,14225.0,13.33,60 months,C
32038416,12000.0,20.2,60 months,E
11456303,18000.0,8.39,36 months,A
23613274,4000.0,12.49,36 months,B
55949701,15000.0,16.99,60 months,D


In [16]:
# Series.unique() returns the distinct values of the Series as an array,
# in order of first appearance
df4.loc[:, 'grade'].unique()

array(['B', 'C', 'E', 'A', 'D'], dtype=object)

In [17]:
# Series.value_counts() tallies how many times each distinct value occurs
df4.loc[:, 'term'].value_counts()

 36 months    3
 60 months    3
Name: term, dtype: int64

In [18]:
# Series.isin(values) checks, element by element, whether each entry is one
# of the values passed in, and returns a Boolean Series.
# NOTE: This is very useful for filtering
# display() renders each argument in turn: the raw column, then the mask.
display(
    df4['grade'],
    df4['grade'].isin(['A', 'B']),
)

48304290    B
49904421    C
32038416    E
11456303    A
23613274    B
55949701    D
Name: grade, dtype: object

48304290     True
49904421    False
32038416    False
11456303     True
23613274     True
55949701    False
Name: grade, dtype: bool

<hr>

**References:**

Python for Data Analysis, 2nd Edition, McKinney (2017)