In [1]:
# import libraries
import pandas as pd
import numpy as np

### Combine summary statistics and subsetting

In [2]:
# Read the cleaned population data: the first CSV column is used as the row
# index and the first row supplies the column names
pop = pd.read_csv(
    '../datasets/clean/pop.csv',
    index_col=0,
    header=0,
)
display(pop.head())

Unnamed: 0,Area,Element,Year,Unit,Value
0,Afghanistan,Total Population - Both sexes,1950,1000 persons,7752.118
1,Afghanistan,Total Population - Male,1950,1000 persons,4099.243
2,Afghanistan,Total Population - Female,1950,1000 persons,3652.874
3,Afghanistan,Rural population,1950,1000 persons,7286.991
4,Afghanistan,Urban population,1950,1000 persons,465.127


In [3]:
# For each population element, aggregate 'Value': min, max, mean and median.
# The aggregations are given by name rather than as callables (min, max,
# np.mean, np.median): passing the callables to agg() is deprecated in
# pandas >= 2.1 and raises a FutureWarning, while the string names select the
# same built-in group aggregations and yield the same column labels.
pop_stats = pop.groupby('Element')['Value'].agg(['min', 'max', 'mean', 'median'])
print(pop_stats)

                                 min          max          mean     median
Element                                                                   
Rural population               0.000   893272.090  16017.317498  1344.1570
Total Population - Both sexes  0.642  1459377.612  27495.710560  3344.2680
Total Population - Female      6.889   711188.365  15698.798808  2401.3380
Total Population - Male        6.812   748189.245  15999.003693  2394.9885
Urban population               0.000   863601.691  11490.376450  1146.8050


#### Pivot table

In [4]:
# Pivot table with the mean and median population 'Value' for each 'Element'.
# Aggregators are passed by name: handing np.mean / np.median to aggfunc is
# deprecated in pandas >= 2.1 (FutureWarning); the strings select the same
# aggregations and produce the same 'mean' / 'median' column labels.
# (When aggfunc is omitted, pivot_table defaults to the mean.)
mean_pop_by_element = pop.pivot_table(
    values='Value',
    index='Element',
    aggfunc=['mean', 'median'],
)
print(mean_pop_by_element)

                                       mean     median
                                      Value      Value
Element                                               
Rural population               16017.317498  1344.1570
Total Population - Both sexes  27495.710560  3344.2680
Total Population - Female      15698.798808  2401.3380
Total Population - Male        15999.003693  2394.9885
Urban population               11490.376450  1146.8050


In [5]:
# Mean population value with one row per element and one column per country.
# No aggfunc is given, so pivot_table uses its default (the mean);
# element/country combinations without data are filled with 0.
print(
    pop.pivot_table(
        values='Value',
        index='Element',
        columns='Area',
        fill_value=0,
    )
)

Area                            Afghanistan      Albania       Algeria  \
Element                                                                  
Rural population               13159.032203  1536.157594  10419.227783   
Total Population - Both sexes  16610.597812  2533.624246  22791.334000   
Total Population - Female       8077.201261  1244.939217  11283.972957   
Total Population - Male         8533.397188  1288.684348  11507.360754   
Urban population                3154.671072   997.890826  12475.643377   

Area                           American Samoa    Andorra        Angola  \
Element                                                                  
Rural population                     7.600986   5.214652   7102.500290   
Total Population - Both sexes       39.528420  45.019333  12279.778522   
Total Population - Female            0.000000   0.000000   6156.048696   
Total Population - Male              0.000000   0.000000   6123.732319   
Urban population                    3

In [6]:
# Grand total of the element-by-country table of means: first sum down each
# country column (axis 0), then add the per-country totals together.
print(
    '\nTotal from mean:\n',
    pop.pivot_table(values='Value', index='Element', columns='Area', fill_value=0)
    .sum(axis=0)
    .sum(),
)


Total from mean:
 19141171.950865466


### Index

In [7]:
# Show the first rows of the original frame again as a baseline for comparison
display(pop.head())

# Move the 'Element' column into the row index
pop_e = pop.set_index(keys='Element')
display(pop_e.head())

# drop=False: the index is restored as a regular 'Element' column
display(pop_e.reset_index(drop=False).head())

# drop=True: the index values are discarded and replaced by a default integer index
display(pop_e.reset_index(drop=True).head())

Unnamed: 0,Area,Element,Year,Unit,Value
0,Afghanistan,Total Population - Both sexes,1950,1000 persons,7752.118
1,Afghanistan,Total Population - Male,1950,1000 persons,4099.243
2,Afghanistan,Total Population - Female,1950,1000 persons,3652.874
3,Afghanistan,Rural population,1950,1000 persons,7286.991
4,Afghanistan,Urban population,1950,1000 persons,465.127


Unnamed: 0_level_0,Area,Year,Unit,Value
Element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total Population - Both sexes,Afghanistan,1950,1000 persons,7752.118
Total Population - Male,Afghanistan,1950,1000 persons,4099.243
Total Population - Female,Afghanistan,1950,1000 persons,3652.874
Rural population,Afghanistan,1950,1000 persons,7286.991
Urban population,Afghanistan,1950,1000 persons,465.127


Unnamed: 0,Element,Area,Year,Unit,Value
0,Total Population - Both sexes,Afghanistan,1950,1000 persons,7752.118
1,Total Population - Male,Afghanistan,1950,1000 persons,4099.243
2,Total Population - Female,Afghanistan,1950,1000 persons,3652.874
3,Rural population,Afghanistan,1950,1000 persons,7286.991
4,Urban population,Afghanistan,1950,1000 persons,465.127


Unnamed: 0,Area,Year,Unit,Value
0,Afghanistan,1950,1000 persons,7752.118
1,Afghanistan,1950,1000 persons,4099.243
2,Afghanistan,1950,1000 persons,3652.874
3,Afghanistan,1950,1000 persons,7286.991
4,Afghanistan,1950,1000 persons,465.127


In [8]:
# Subsetting on a column: isin() builds a boolean mask from the list of wanted values
pop_ru = pop.loc[pop['Element'].isin(['Rural population', 'Urban population'])]
display(pop_ru.head())

# When the wanted values are index labels instead, the list goes straight into .loc
display(pop_e.loc[['Rural population', 'Urban population']])

Unnamed: 0,Area,Element,Year,Unit,Value
3,Afghanistan,Rural population,1950,1000 persons,7286.991
4,Afghanistan,Urban population,1950,1000 persons,465.127
8,Afghanistan,Rural population,1951,1000 persons,7352.856
9,Afghanistan,Urban population,1951,1000 persons,486.654
13,Afghanistan,Rural population,1952,1000 persons,7425.363


Unnamed: 0_level_0,Area,Year,Unit,Value
Element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rural population,Afghanistan,1950,1000 persons,7286.991
Rural population,Afghanistan,1951,1000 persons,7352.856
Rural population,Afghanistan,1952,1000 persons,7425.363
Rural population,Afghanistan,1953,1000 persons,7504.561
Rural population,Afghanistan,1954,1000 persons,7590.370
...,...,...,...,...
Urban population,Zimbabwe,2014,1000 persons,5009.401
Urban population,Zimbabwe,2015,1000 persons,5109.485
Urban population,Zimbabwe,2016,1000 persons,5215.894
Urban population,Zimbabwe,2017,1000 persons,5328.766


In [9]:
# Multi-level index: 'Element' is the outer level, 'Area' the inner level
pop_ea = pop.set_index(['Element', 'Area'])
display(pop_ea.head())

# A list of plain labels selects on the outer level only
display(pop_ea.loc[['Rural population', 'Urban population']])

# A list of tuples selects specific (outer, inner) label combinations
display(
    pop_ea.loc[
        [
            ('Rural population', 'Afghanistan'),
            ('Rural population', 'Zimbabwe'),
        ]
    ]
)

# IndexSlice with ':' at level 0 keeps every element and filters on the inner level
display(pop_ea.loc[pd.IndexSlice[:, 'Zimbabwe'], :])

Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Unit,Value
Element,Area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total Population - Both sexes,Afghanistan,1950,1000 persons,7752.118
Total Population - Male,Afghanistan,1950,1000 persons,4099.243
Total Population - Female,Afghanistan,1950,1000 persons,3652.874
Rural population,Afghanistan,1950,1000 persons,7286.991
Urban population,Afghanistan,1950,1000 persons,465.127


Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Unit,Value
Element,Area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rural population,Afghanistan,1950,1000 persons,7286.991
Rural population,Afghanistan,1951,1000 persons,7352.856
Rural population,Afghanistan,1952,1000 persons,7425.363
Rural population,Afghanistan,1953,1000 persons,7504.561
Rural population,Afghanistan,1954,1000 persons,7590.370
...,...,...,...,...
Urban population,Zimbabwe,2014,1000 persons,5009.401
Urban population,Zimbabwe,2015,1000 persons,5109.485
Urban population,Zimbabwe,2016,1000 persons,5215.894
Urban population,Zimbabwe,2017,1000 persons,5328.766


Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Unit,Value
Element,Area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Rural population,Afghanistan,1950,1000 persons,7286.991
Rural population,Afghanistan,1951,1000 persons,7352.856
Rural population,Afghanistan,1952,1000 persons,7425.363
Rural population,Afghanistan,1953,1000 persons,7504.561
Rural population,Afghanistan,1954,1000 persons,7590.370
Rural population,...,...,...,...
Rural population,Zimbabwe,2014,1000 persons,10402.274
Rural population,Zimbabwe,2015,1000 persons,10667.966
Rural population,Zimbabwe,2016,1000 persons,10934.468
Rural population,Zimbabwe,2017,1000 persons,11201.138


Unnamed: 0_level_0,Unnamed: 1_level_0,Year,Unit,Value
Element,Area,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total Population - Both sexes,Zimbabwe,1950,1000 persons,2746.852
Total Population - Male,Zimbabwe,1950,1000 persons,1369.575
Total Population - Female,Zimbabwe,1950,1000 persons,1377.277
Rural population,Zimbabwe,1950,1000 persons,2454.484
Urban population,Zimbabwe,1950,1000 persons,292.368
...,...,...,...,...
Total Population - Both sexes,Zimbabwe,2018,1000 persons,14438.802
Total Population - Male,Zimbabwe,2018,1000 persons,6879.119
Total Population - Female,Zimbabwe,2018,1000 persons,7559.693
Rural population,Zimbabwe,2018,1000 persons,11465.748


In [10]:
# Inspect the dtype of each column of the population frame
pop.dtypes

Area        object
Element     object
Year         int64
Unit        object
Value      float64
dtype: object

In [11]:
# Column range: a label slice in .loc includes both endpoints ('Year' through 'Value')
display(pop.loc[:, 'Year':'Value'].head())

# Row range on the index: narrow down to one country and year first, then slice
# the 'Element' labels from 'Total Population - Both sexes' through 'Rural population'
display(
    pop_e[(pop_e['Area'] == 'Zimbabwe') & (pop_e['Year'] == 2018)]
    .loc['Total Population - Both sexes':'Rural population']
)

Unnamed: 0,Year,Unit,Value
0,1950,1000 persons,7752.118
1,1950,1000 persons,4099.243
2,1950,1000 persons,3652.874
3,1950,1000 persons,7286.991
4,1950,1000 persons,465.127


Unnamed: 0_level_0,Area,Year,Unit,Value
Element,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Total Population - Both sexes,Zimbabwe,2018,1000 persons,14438.802
Total Population - Male,Zimbabwe,2018,1000 persons,6879.119
Total Population - Female,Zimbabwe,2018,1000 persons,7559.693
Rural population,Zimbabwe,2018,1000 persons,11465.748
