In [106]:
import polars as pl
import polars.selectors as cs

# Aggregating rates across rows and columns

## Example - The mean of a rate isn't the combined rate

In [6]:
# Toy data: eight values split across three groups of unequal size (A: 3, B: 2, C: 3).
(df :=
 pl.DataFrame(dict(Group=list('AAABBCCC'),
                   Value=[1, 2, 3, 5, 2, 3, 3, 2]))
)

Group,Value
str,i64
"""A""",1
"""A""",2
"""A""",3
"""B""",5
"""B""",2
"""C""",3
"""C""",3
"""C""",2


#### Group means

In [8]:
# Per-group mean of Value: one output row per distinct Group.
(df
 .group_by('Group')
 .agg(pl.col('Value').mean().alias('mean_value'))
)

Group,mean_value
str,f64
"""A""",2.0
"""C""",2.666667
"""B""",3.5


#### Mean of group means

In [20]:
# Unweighted average of the per-group means: each group contributes one value,
# regardless of how many rows it holds.
(mean_of_group_means :=
 df
 .group_by('Group')
 .agg(pl.col('Value').mean().alias('mean_value'))
 .select(mean_of_means = pl.col('mean_value').mean())
)

mean_of_means
f64
2.722222


#### Grand mean

In [21]:
# Mean of Value over every row at once, ignoring the grouping entirely.
(grand_mean :=
 df.select(grand_mean = pl.col('Value').mean())
)

grand_mean
f64
2.625


#### Mean of the group means `!=` grand mean

In [23]:
# Subtract the two single-value frames; a non-zero result shows that averaging
# the group means does not reproduce the grand mean.
mean_of_group_means - grand_mean

mean_of_means
f64
0.097222


### Examples of common rates

1. Mean,
2. Percent of total and other ratios,
3. Population rates (e.g., X per 1K people),
4. Anything you describe using "per"


## Review - Aggregating rates

When aggregating a rate across groups, we need to

1. Aggregate the numerator and denominator separately, then
2. Compute the rate from the aggregated values.

In [22]:
# Rebuild the grand mean from group-level pieces: keep each group's numerator
# (sum) and denominator (count), total both across groups, then divide once.
(grand_mean_via_groups :=
 df
 .group_by('Group')
 .agg(pl.col('Value').sum().alias('sum_value'),
      pl.col('Value').count().alias('cnt_value'))
 .select(grand_mean = pl.col('sum_value').sum() / pl.col('cnt_value').sum())
)

grand_mean
f64
2.625


In [27]:
# Check: the mean rebuilt from per-group sums and counts should match the
# directly computed grand mean, so this difference is expected to be 0.
grand_mean_via_groups - grand_mean

grand_mean
f64
0.0


### Why do we care?

Because sometimes the data is already an aggregated value!

In [26]:
# Read every file matching the glob into a single frame and give the unnamed
# leading column the name 'ID'.
( auto_sales :=
  pl.read_csv(source = './data/auto_sales_*.csv')
  .rename(mapping = {'': 'ID'})
)

ID,Salesperson,Compact,Sedan,SUV,Truck
i64,str,i64,i64,i64,i64
0,"""Ann""",22,18,15,12
1,"""Bob""",19,12,17,20
2,"""Yolanda""",19,8,32,15
3,"""Xerxes""",12,23,18,9
0,"""Ann""",22,18,15,12
1,"""Bob""",20,14,6,24
2,"""Yolanda""",19,10,28,17
3,"""Xerxes""",11,27,17,9


## Aggregating a rate across rows and columns

1. Compute the numerator and denominator row aggregations using `group_by` and `agg`.
2. Compute the numerator and denominator column aggregations using `reduce` or `fold`.

In [29]:
# Names of the vehicle-type columns: everything except the two identifier columns.
(type_columns :=
 list(filter(lambda name: name not in {'ID', 'Salesperson'}, auto_sales.columns))
)

['Compact', 'Sedan', 'SUV', 'Truck']

In [79]:
# Mean sales per salesperson across all rows and all vehicle-type columns.
(auto_sales
 .group_by('Salesperson')
 # Row aggregation: one sum_* and one cnt_* column per vehicle type.
 .agg(pl.col(type_columns).sum().name.prefix('sum_'),
      pl.col(type_columns).count().name.prefix('cnt_'))
 # Column aggregation: fold the per-type sums, and the per-type counts, into single totals.
 .with_columns(sum_types = pl.reduce(lambda total, nxt: total + nxt, cs.starts_with('sum')),
               cnt_types = pl.reduce(lambda total, nxt: total + nxt, cs.starts_with('cnt')))
 # Only now form the rate, from the fully aggregated numerator and denominator.
 .with_columns(grand_mean = pl.col('sum_types') / pl.col('cnt_types'))
 # Drop every helper column (including sum_types / cnt_types), leaving Salesperson and grand_mean.
 .drop(cs.starts_with('sum') | cs.starts_with('cnt'))
)

Salesperson,grand_mean
str,f64
"""Ann""",16.75
"""Yolanda""",18.5
"""Xerxes""",15.75
"""Bob""",16.5


## <font color="red"> Exercise 3.9.2 </font> - World Bank Population Ratio (urban/total) for each region over the 1990's

Use the approach illustrated in the last example to compute the overall ratio of Urban to Total population for each region across all years in the 1990's. Do this by

1. Loading the raw WB data,
2. Select the columns of interest using column selectors,
3. Filter to the two measures of interest,
4. Unstack the two measures,
5. Group and aggregate the numerator and denominator of the ratio across rows,
6. Use `reduce` to aggregate the numerator and denominator of the ratio across columns, and
7. Compute the ratio.

In [None]:
# Load the raw World Bank download, reading '..' as null, and discard rows
# that have no Series Name.
( wb :=
  pl.read_csv('./data/world_bank_raw_download_F23.csv',
              null_values = '..',
              infer_schema_length = 10000)
  .drop_nulls(subset = 'Series Name')
)

In [110]:
# Overall urban share of total population per region across the 1990s.
# The ratio is aggregated safely: sum the numerator (urban) and the denominator
# (total) separately, then divide once at the end.
(wb_ratio_5 :=
 wb
 # Keep the string columns (minus the *Code columns) plus the 199x year columns.
 # Parentheses make the selector precedence explicit.
 .select((cs.string() - cs.contains('Code')) | cs.starts_with('199'))
 # Keep exactly the two measures used below; a prefix regex would also let
 # through any other series whose name starts with "Urban" or "Population".
 .filter(pl.col('Series Name').is_in(['Urban population', 'Population, total']))
 # Years: columns -> rows.
 .unpivot(on = cs.starts_with('199'),
          index = cs.string(),
          variable_name = "Year",
          value_name = "Number of People"
         )
 # Measures: rows -> columns, one column per Series Name.
 .pivot(on = 'Series Name',
        index = cs.string() - cs.by_name('Series Name'),
        aggregate_function = 'sum'
       )
 # NOTE(review): rows with a null Region form their own group (the unnamed row
 # in the output); presumably aggregate entries - confirm whether to filter them.
 .group_by("Region")
 .agg([
         pl.col("Urban population").sum().alias("sum_urban_population"),
         pl.col("Population, total").sum().alias("sum_total_population")
      ])
 # Years were already moved into rows, so the group sums above cover every
 # year. No column-wise fold remains (a reduce over a single column is an
 # identity), and the totals are plain aliases that keep the output schema.
 .with_columns(
         total_sum_urban = pl.col("sum_urban_population"),
         total_sum_total = pl.col("sum_total_population")
     )
 .with_columns(
         overall_ratio = pl.col("total_sum_urban") / pl.col("total_sum_total")
     )
)

Region,sum_urban_population,sum_total_population,total_sum_urban,total_sum_total,overall_ratio
str,f64,f64,f64,f64,f64
"""Africa""",14241000000.0,42861000000.0,14241000000.0,42861000000.0,0.332257
"""Asia""",61075000000.0,186640000000.0,61075000000.0,186640000000.0,0.327232
"""Europe""",30163000000.0,42882000000.0,30163000000.0,42882000000.0,0.703395
,227520000000.0,544660000000.0,227520000000.0,544660000000.0,0.417733
"""The Americas""",34530000000.0,46308000000.0,34530000000.0,46308000000.0,0.745658
"""Oceania""",1200500000.0,1715900000.0,1200500000.0,1715900000.0,0.699632
"""Middle East""",5607700000.0,9036600000.0,5607700000.0,9036600000.0,0.62056
