# Preliminaries

In [None]:
! pip install gdown

In [None]:
! wget https://raw.githubusercontent.com/bsheese/225exercises/refs/heads/main/hospital_helper.py

In [None]:
import gdown
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import hospital_helper
import os
import numpy as np

# Google Drive file id of the dataset and the local filename it is saved under.
file_id = '1BEs2Fa1qJEYWE-HyE3t63PplMi2IbXc9'
output = 'hospital_waste.csv'

# Download only when the file is not already present. The check uses the same
# relative path that is downloaded to and read from, so the cache works in any
# working directory (not only Colab's /content).
if not os.path.exists(output):
  gdown.download(id=file_id, output=output, quiet=False)

# Load the raw CSV, then run the course-provided cleanup steps in order.
df = pd.read_csv(output)
df = hospital_helper.clean_variable_names(df)
df = hospital_helper.quick_cleanup1(df)
df = hospital_helper.quick_cleanup2(df)

**Question 1: Grouping a Single Column**

Select the `rmw` column from the `df` DataFrame. Group this selection by `hospital` and calculate the mean `rmw` for each hospital.


In [None]:
# Select the rmw column first, then group that Series by the hospital column
# and average within each hospital.
df['rmw'].groupby(df['hospital']).mean()

**Question 2: Grouping an Entire DataFrame**

Group the entire `df` DataFrame by `hospital`. Then, calculate the median for all numeric columns in each group.

In [None]:
# Group the whole frame by hospital; numeric_only=True restricts the median
# to the numeric columns.
by_hospital = df.groupby('hospital')
by_hospital.median(numeric_only=True)

**Question 3: Grouping by Multiple Keys**

Select the `rmw` column from the `df` DataFrame. Group this selection by both `city` and `state`, and then calculate the median `rmw` for each city/state combination.

In [None]:
# Two grouping keys -> the result is indexed by (city, state) pairs.
location_keys = ['city', 'state']
df.groupby(location_keys)['rmw'].median()

**Question 4: Iterating Over Groups**

Write a `for` loop that iterates through the DataFrame grouped by `state`. For each state, print the state's name and the total (sum) of regulated medical waste (`rmw`) generated in that state.

In [None]:
# Iterating a column-selected GroupBy yields (state, rmw Series) pairs,
# so each state's total is just the sum of its Series.
rmw_by_state = df.groupby('state')['rmw']
for state_name, state_rmw in rmw_by_state:
    total_rmw = state_rmw.sum()
    print(f"{state_name}: Total RMW = {total_rmw}")

**Question 5: Selecting a Column After Grouping**

Group the DataFrame by `city`. Calculate the mean regulated medical waste (`rmw`) for each city. Write the line of code that would produce a pandas **Series** as the result.

In [None]:
# Single brackets pick one column from the grouped frame, so the
# aggregated result is a Series indexed by city.
(
    df
    .groupby('city')['rmw']
    .mean()
)

**Question 6: Selecting a Subset of Columns After Grouping**

Group the DataFrame by `city`. Calculate the mean regulated medical waste (`rmw`) for each city. Write the line of code that would produce a pandas **DataFrame** as the result.

In [None]:
# Double brackets pass a list of columns, so the aggregated result
# stays a (one-column) DataFrame indexed by city.
(
    df
    .groupby('city')[['rmw']]
    .mean()
)

**Question 7: Multiple Aggregations**

Group the DataFrame by `state` and use the `.agg()` method to calculate the mean and the median for both the `rmw` and `rmw/apd` columns.


In [None]:
# Apply every statistic to every selected column; the result has
# two-level columns of (column, statistic).
waste_columns = ['rmw', 'rmw/apd']
summary_stats = ['mean', 'median']
df.groupby('state')[waste_columns].agg(summary_stats)

**Question 8: Customizing Aggregation Output Column Names**

Group the DataFrame by `hospital`. Calculate the minimum and maximum regulated medical waste (`rmw`) using `agg()`. The resulting columns should be named `min_rmw` and `max_rmw`, respectively.

In [None]:
# Named aggregation: each keyword becomes an output column name and its
# value is the reducer applied to rmw within each hospital.
rmw_by_hospital = df.groupby('hospital')['rmw']
rmw_by_hospital.agg(min_rmw='min', max_rmw='max')

**Question 9: Filling Missing Values**

Some values in the `rmw/apd` column may be missing. Write code that fills any missing values in the `rmw/apd` column with the **median** `rmw/apd` value calculated for the specific `hospital` the row belongs to.

In [None]:
# Broadcast each hospital's median rmw/apd back onto that hospital's rows
# (transform returns a Series aligned to df's index), using the built-in
# 'median' reducer instead of a Python lambda run once per group.
hospital_median = df.groupby('hospital')['rmw/apd'].transform('median')

# Fill only the gaps. Because we fill the original column rather than replacing
# it with the transform output, existing values are never overwritten -- this
# matters for rows whose `hospital` key is itself missing, for which groupby
# (dropna=True by default) produces NaN in the transform result.
df['rmw/apd'] = df['rmw/apd'].fillna(hospital_median)

**Question 10: Quantile Analysis with `pd.qcut()`**

Divide the `rmw/apd` column into 4 groups with an equal number of observations (quartiles) using `pd.qcut()`. Then, group by these quartiles and find the size (count of members) of each group.

In [None]:
# Bin rmw/apd into 4 equal-frequency intervals (quartiles); each row is
# labelled with the interval its value falls in.
quantiles = pd.qcut(x=df['rmw/apd'], q=4)

# Count the rows per quartile bin; observed=True reports only the
# categories that actually occur.
df.groupby(by=quantiles, observed=True).size()