In [1]:
import numpy as np
import pandas as pd

In [2]:
np.random.seed(0)

In [3]:
pd.options.mode.copy_on_write = True

# Data Aggregation

Group by operations are used to apply some logic based on specific categories. For example, you might want to calculate the average sales per region or the last transaction per customer.

In pandas, group by operations follow a three-stage process called "split-apply-combine":

- **Split:** Divide the data into groups based on some criteria.
- **Apply:** Apply a function to each group independently. It can be an aggregation, group-specific transformation, or even group filtering.
- **Combine:** Combine the results into a data structure (Series or DataFrame).

To perform a groupby operation, you must provide some kind of mapping of labels to group names; usually, it's the column names you want to group by. The result of groupby is a `SeriesGroupBy` or `DataFrameGroupBy` object. This object acts as a container for the grouped data and allows you to perform various operations. The highlights are:

- **Iteration:** Iterating over the groups and applying the desired operations within each group.
- **Indexing:** Access specific groups using their group labels.

In [4]:
df = pd.DataFrame({
    'Category': ['Starter', 'Main', 'Main', 'Dessert', 'Starter', 'Main', 'Dessert', 'Dessert', 'Starter', 'Main'],
    'Dish': ['Salad', 'Steak', 'Pasta', 'Ice Cream', 'Salad', 'Pasta', 'Cake', 'Ice Cream', 'Soup', 'Steak'],
    'Price': [15.50, 115.00, 112.00, 16.00, 14.50, 114.00, 17.00, 15.00, 14.00, 113.00],
    'Quantity': [3, 2, 1, 4, 5, 2, 3, 4, 2, 1],
    'Rating': [np.nan, 4.7, 4.3, np.nan, 4.2, 4.6, 4.9, np.nan, 4.0, 4.4]
})
df

Unnamed: 0,Category,Dish,Price,Quantity,Rating
0,Starter,Salad,15.5,3,
1,Main,Steak,115.0,2,4.7
2,Main,Pasta,112.0,1,4.3
3,Dessert,Ice Cream,16.0,4,
4,Starter,Salad,14.5,5,4.2
5,Main,Pasta,114.0,2,4.6
6,Dessert,Cake,17.0,3,4.9
7,Dessert,Ice Cream,15.0,4,
8,Starter,Soup,14.0,2,4.0
9,Main,Steak,113.0,1,4.4


In [5]:
# DataFrameGroupBy Metadata

groups = df.groupby(['Category', 'Dish'])

print(f'Número de grupos: {groups.ngroups}')
print(f'Grupos: {groups.groups}')

Número de grupos: 6
Grupos: {('Dessert', 'Cake'): [6], ('Dessert', 'Ice Cream'): [3, 7], ('Main', 'Pasta'): [2, 5], ('Main', 'Steak'): [1, 9], ('Starter', 'Salad'): [0, 4], ('Starter', 'Soup'): [8]}


In [6]:
# Iterating over groups
# Note that 'group' is a DataFrame!

for name, group in groups:
    print(name, '\t', type(name))
    print(group, '\t', type(group))

('Dessert', 'Cake') 	 <class 'tuple'>
  Category  Dish  Price  Quantity  Rating
6  Dessert  Cake   17.0         3     4.9 	 <class 'pandas.core.frame.DataFrame'>
('Dessert', 'Ice Cream') 	 <class 'tuple'>
  Category       Dish  Price  Quantity  Rating
3  Dessert  Ice Cream   16.0         4     NaN
7  Dessert  Ice Cream   15.0         4     NaN 	 <class 'pandas.core.frame.DataFrame'>
('Main', 'Pasta') 	 <class 'tuple'>
  Category   Dish  Price  Quantity  Rating
2     Main  Pasta  112.0         1     4.3
5     Main  Pasta  114.0         2     4.6 	 <class 'pandas.core.frame.DataFrame'>
('Main', 'Steak') 	 <class 'tuple'>
  Category   Dish  Price  Quantity  Rating
1     Main  Steak  115.0         2     4.7
9     Main  Steak  113.0         1     4.4 	 <class 'pandas.core.frame.DataFrame'>
('Starter', 'Salad') 	 <class 'tuple'>
  Category   Dish  Price  Quantity  Rating
0  Starter  Salad   15.5         3     NaN
4  Starter  Salad   14.5         5     4.2 	 <class 'pandas.core.frame.DataFram

In [7]:
# Selecting a group

groups.get_group(('Dessert', 'Ice Cream'))

Unnamed: 0,Category,Dish,Price,Quantity,Rating
3,Dessert,Ice Cream,16.0,4,
7,Dessert,Ice Cream,15.0,4,


# Group By Usages

The most common way to use `groupby` is to run aggregation operations such as `min()`, `max()`, `mean()`, `sum()`, `count()`, etc. To run multiple aggregations at once, we can combine `groupby` with the `agg` method.

By default, the column subset chosen becomes the DataFrame index. If this is not the desired behavior, you can use `as_index=False`.

Besides aggregations, pandas has built-in methods to support other kinds of operations over groups. They are:

- **Transformation:** Perform group-restricted boundary operations and return an object indexed the same as the original.
- **Filtration:** Discard partial or whole groups based on a group-wise computation.

In addition, note that the group keys are sorted during the `groupby` operation. You can pass `sort=False` so the sorting is not done. It can increase the speed of the `groupby` in some cases.

In [8]:
df = pd.DataFrame({
    'Category': ['Starter', 'Main', 'Main', 'Dessert', 'Starter', 'Main', 'Dessert', 'Dessert', 'Starter', 'Main'],
    'Dish': ['Salad', 'Steak', 'Pasta', 'Ice Cream', 'Salad', 'Pasta', 'Cake', 'Ice Cream', 'Soup', 'Steak'],
    'Price': [15.50, 115.00, 112.00, 16.00, 14.50, 114.00, 17.00, 15.00, 14.00, 113.00],
    'Quantity': [3, 2, 1, 4, 5, 2, 3, 4, 2, 1],
    'Rating': [np.nan, 4.7, 4.3, np.nan, 4.2, 4.6, 4.9, np.nan, 4.0, 4.4]
})
df

Unnamed: 0,Category,Dish,Price,Quantity,Rating
0,Starter,Salad,15.5,3,
1,Main,Steak,115.0,2,4.7
2,Main,Pasta,112.0,1,4.3
3,Dessert,Ice Cream,16.0,4,
4,Starter,Salad,14.5,5,4.2
5,Main,Pasta,114.0,2,4.6
6,Dessert,Cake,17.0,3,4.9
7,Dessert,Ice Cream,15.0,4,
8,Starter,Soup,14.0,2,4.0
9,Main,Steak,113.0,1,4.4


In [9]:
# Group by column names

df.groupby(['Category', 'Dish']).sum()

Unnamed: 0_level_0,Unnamed: 1_level_0,Price,Quantity,Rating
Category,Dish,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Dessert,Cake,17.0,3,4.9
Dessert,Ice Cream,31.0,8,0.0
Main,Pasta,226.0,3,8.9
Main,Steak,228.0,3,9.1
Starter,Salad,30.0,8,4.2
Starter,Soup,14.0,2,4.0


In [10]:
# Group by using a single column

df.groupby(['Category', 'Dish'])['Price'].sum()

Category  Dish     
Dessert   Cake          17.0
          Ice Cream     31.0
Main      Pasta        226.0
          Steak        228.0
Starter   Salad         30.0
          Soup          14.0
Name: Price, dtype: float64

In [11]:
# Group by using columns subset

df.groupby(['Category'])[['Price', 'Quantity']].sum()

Unnamed: 0_level_0,Price,Quantity
Category,Unnamed: 1_level_1,Unnamed: 2_level_1
Dessert,48.0,11
Main,454.0,6
Starter,44.0,10


In [12]:
# By default, the column subset chosen becomes the DataFrame index.
# Use as_index=False, if this is not the desired behavior.

df.groupby(['Category'], as_index=False)[['Price']].sum()

Unnamed: 0,Category,Price
0,Dessert,48.0
1,Main,454.0
2,Starter,44.0


In [13]:
# Multiples aggregations at once

df.groupby(['Category'], as_index=False)[['Price']].agg(['max', 'min', 'mean'])

Unnamed: 0_level_0,Category,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean
0,Dessert,17.0,15.0,16.0
1,Main,115.0,112.0,113.5
2,Starter,15.5,14.0,14.666667


In [14]:
# Multiple aggregations at once for multiple columns
#
# Note that when multiple functions are applied,
# the resulting DataFrame has hierarchical columns

grouped = df.groupby(['Category'], as_index=False)[['Price', 'Rating']].agg({
    'Price': ['max', 'min', 'mean'],
    'Rating': ['mean'],
})
grouped

Unnamed: 0_level_0,Category,Price,Price,Price,Rating
Unnamed: 0_level_1,Unnamed: 1_level_1,max,min,mean,mean
0,Dessert,17.0,15.0,16.0,4.9
1,Main,115.0,112.0,113.5,4.5
2,Starter,15.5,14.0,14.666667,4.1


In [15]:
# Tip: You can use renaming approaches 
# to make columns flat again

grouped.columns = [' '.join(x).title().replace(' ', '') for x in grouped.columns.values if len(x)>1]
grouped

Unnamed: 0,Category,PriceMax,PriceMin,PriceMean,RatingMean
0,Dessert,17.0,15.0,16.0,4.9
1,Main,115.0,112.0,113.5,4.5
2,Starter,15.5,14.0,14.666667,4.1


In [16]:
df

Unnamed: 0,Category,Dish,Price,Quantity,Rating
0,Starter,Salad,15.5,3,
1,Main,Steak,115.0,2,4.7
2,Main,Pasta,112.0,1,4.3
3,Dessert,Ice Cream,16.0,4,
4,Starter,Salad,14.5,5,4.2
5,Main,Pasta,114.0,2,4.6
6,Dessert,Cake,17.0,3,4.9
7,Dessert,Ice Cream,15.0,4,
8,Starter,Soup,14.0,2,4.0
9,Main,Steak,113.0,1,4.4


In [17]:
# Applying Transformations

df.groupby(['Category'])['Price'].cumsum()

0     15.5
1    115.0
2    227.0
3     16.0
4     30.0
5    341.0
6     33.0
7     48.0
8     44.0
9    454.0
Name: Price, dtype: float64

In [18]:
# Note that the groups are not included in the result.
# If you want them in the DataFrame
# you just need to add them since their index is the same.

df['cumsum'] = df.groupby(['Category'])['Price'].cumsum()
df

Unnamed: 0,Category,Dish,Price,Quantity,Rating,cumsum
0,Starter,Salad,15.5,3,,15.5
1,Main,Steak,115.0,2,4.7,115.0
2,Main,Pasta,112.0,1,4.3,227.0
3,Dessert,Ice Cream,16.0,4,,16.0
4,Starter,Salad,14.5,5,4.2,30.0
5,Main,Pasta,114.0,2,4.6,341.0
6,Dessert,Cake,17.0,3,4.9,33.0
7,Dessert,Ice Cream,15.0,4,,48.0
8,Starter,Soup,14.0,2,4.0,44.0
9,Main,Steak,113.0,1,4.4,454.0


In [19]:
# Applying filtering
# using built-in methods
#
# Get the 0-nth value of each group

df.groupby(['Category'])['Rating'].nth(0, dropna='any')

1    4.7
4    4.2
6    4.9
Name: Rating, dtype: float64

In [20]:
# Applying Filtering Using the Filter Method
#
# The filter method takes a User-Defined Function (UDF) that,
# when applied to an entire group, return a boolean Series (or DataFrame)
# Then, the subset of groups for which UDF was True is returned

df.groupby(['Category'])['Price'].filter(lambda x: x.max() > 100)

1    115.0
2    112.0
5    114.0
9    113.0
Name: Price, dtype: float64

## Generic Applying 

As mentioned before, group by follows a split-apply-combine method to deal with group by operations. Consequently, it provides the necessary flexibility to apply any UDF on groups. For example, you can use it to calculate complex statistics, transform data, or even create new columns based on group-specific logic.

Note, however, that the groups are not included in the result. To keep them, you must set the parameter `group_keys=True`


In [21]:
df = pd.DataFrame({
    'Category': ['Starter', 'Main', 'Main', 'Dessert', 'Starter', 'Main', 'Dessert', 'Dessert', 'Starter', 'Main'],
    'Dish': ['Salad', 'Steak', 'Pasta', 'Ice Cream', 'Salad', 'Pasta', 'Cake', 'Ice Cream', 'Soup', 'Steak'],
    'Price': [15.50, 115.00, 112.00, 16.00, 14.50, 114.00, 17.00, 15.00, 14.00, 113.00],
    'Quantity': [3, 2, 1, 4, 5, 2, 3, 4, 2, 1],
    'Rating': [np.nan, 4.7, 4.3, np.nan, 4.2, 4.6, 4.9, np.nan, 4.0, 4.4]
})
df

Unnamed: 0,Category,Dish,Price,Quantity,Rating
0,Starter,Salad,15.5,3,
1,Main,Steak,115.0,2,4.7
2,Main,Pasta,112.0,1,4.3
3,Dessert,Ice Cream,16.0,4,
4,Starter,Salad,14.5,5,4.2
5,Main,Pasta,114.0,2,4.6
6,Dessert,Cake,17.0,3,4.9
7,Dessert,Ice Cream,15.0,4,
8,Starter,Soup,14.0,2,4.0
9,Main,Steak,113.0,1,4.4


In [22]:
def minmax(g):
    return (g - g.min()) / (g.max() - g.min())

In [23]:
df.groupby(['Category'], group_keys=True)['Price'].apply(minmax)

Category   
Dessert   3    0.500000
          6    1.000000
          7    0.000000
Main      1    1.000000
          2    0.000000
          5    0.666667
          9    0.333333
Starter   0    1.000000
          4    0.333333
          8    0.000000
Name: Price, dtype: float64

In [24]:
df.groupby(['Category'], group_keys=False)['Price'].apply(minmax)

0    1.000000
1    1.000000
2    0.000000
3    0.500000
4    0.333333
5    0.666667
6    1.000000
7    0.000000
8    0.000000
9    0.333333
Name: Price, dtype: float64

In [25]:
df['NormalizedPrice'] = df.groupby(['Category'], group_keys=False)['Price'].apply(minmax)
df

Unnamed: 0,Category,Dish,Price,Quantity,Rating,NormalizedPrice
0,Starter,Salad,15.5,3,,1.0
1,Main,Steak,115.0,2,4.7,1.0
2,Main,Pasta,112.0,1,4.3,0.0
3,Dessert,Ice Cream,16.0,4,,0.5
4,Starter,Salad,14.5,5,4.2,0.333333
5,Main,Pasta,114.0,2,4.6,0.666667
6,Dessert,Cake,17.0,3,4.9,1.0
7,Dessert,Ice Cream,15.0,4,,0.0
8,Starter,Soup,14.0,2,4.0,0.0
9,Main,Steak,113.0,1,4.4,0.333333


# References

- [Python for Data Analysis by Wes McKinney (3e)](https://wesmckinney.com/book/)
- [Pandas Official Documentation](https://pandas.pydata.org/docs/user_guide/10min.html)
- [Frequently Asked Questions (FAQ) on Pandas](https://pandas.pydata.org/docs/user_guide/gotchas.html)


# Exercises

To help you understand the concepts covered in this notebook, here are some practice problems.

These questions refer to a dataset containing information on the type, cast, director, description and name of Netflix titles over the years. The dataset is available on [Kaggle by Rabie El Kharoua](https://www.kaggle.com/datasets/rabieelkharoua/predict-pet-adoption-status-dataset).


## Dataset Description
The Pet Adoption Dataset provides a comprehensive look into various factors that can influence the likelihood of a pet being adopted from a shelter. This dataset includes detailed information about pets available for adoption, covering various characteristics and attributes.

## Features

- PetID: Unique identifier for each pet.
- PetType: Type of pet (e.g., Dog, Cat, Bird, Rabbit).
- Breed: Specific breed of the pet.
- AgeMonths: Age of the pet in months.
- Color: Color of the pet.
- Size: Size category of the pet (Small, Medium, Large).
- WeightKg: Weight of the pet in kilograms.
- Vaccinated: Vaccination status of the pet (0 - Not vaccinated, 1 - Vaccinated).
- HealthCondition: Health condition of the pet (0 - Healthy, 1 - Medical condition).
- TimeInShelterDays: Duration the pet has been in the shelter (days).
- AdoptionFee: Adoption fee charged for the pet (in dollars).
- PreviousOwner: Whether the pet had a previous owner (0 - No, 1 - Yes).
- AdoptionLikelihood: Likelihood of the pet being adopted (0 - Unlikely, 1 - Likely).


# Note
You may need to specify the dataset path explictly if using Windows

In [28]:
df = pd.read_csv('datasets/pet-adoption/pet_adoption_data.csv')
df

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0
4,504,Rabbit,Rabbit,123,Gray,Large,20.498100,0,0,28,14,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,2502,Dog,Poodle,72,Orange,Small,27.039045,1,0,66,26,1,1
2003,2503,Rabbit,Rabbit,124,Brown,Small,4.726954,1,1,59,150,0,0
2004,2504,Rabbit,Rabbit,113,Orange,Small,1.758592,1,0,68,302,0,0
2005,2505,Dog,Labrador,12,Gray,Large,20.961592,1,0,59,478,0,0


1. For each PetType, calculate the average AdoptionFee.

In [29]:
df.groupby('PetType')['AdoptionFee'].mean()

PetType
Bird      261.796715
Cat       247.263366
Dog       247.524904
Rabbit    240.277890
Name: AdoptionFee, dtype: float64

2. For each PetType, Breed, and Color, calculate the quantity of vaccinated, health condition, and the minimum, maximum, and average of AdoptionFee.

In [31]:
df.groupby(['PetType', 'Breed', 'Color'], as_index=False).agg({
    'Vaccinated': 'sum',
    'HealthCondition': 'sum',
    'AdoptionFee': ['min', 'max', 'mean']
})

Unnamed: 0_level_0,PetType,Breed,Color,Vaccinated,HealthCondition,AdoptionFee,AdoptionFee,AdoptionFee
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,sum,sum,min,max,mean
0,Bird,Parakeet,Black,63,7,16,499,273.761364
1,Bird,Parakeet,Brown,66,14,3,496,281.23913
2,Bird,Parakeet,Gray,73,23,0,498,253.877551
3,Bird,Parakeet,Orange,79,24,1,498,256.633929
4,Bird,Parakeet,White,62,19,0,499,246.463918
5,Cat,Persian,Black,34,14,14,487,251.471698
6,Cat,Persian,Brown,34,14,12,492,250.183673
7,Cat,Persian,Gray,36,11,8,485,233.326531
8,Cat,Persian,Orange,37,9,5,496,246.811321
9,Cat,Persian,White,31,8,17,499,254.583333


3. For each PetType and Size, get the minimum, maximum, and median AgeMonths.

In [32]:
df.groupby(['PetType', 'Size'], as_index=False)['AgeMonths'].agg(['min', 'max', 'median'])

Unnamed: 0,PetType,Size,min,max,median
0,Bird,Large,5,179,98.0
1,Bird,Medium,1,179,88.0
2,Bird,Small,1,179,92.0
3,Cat,Large,3,179,96.5
4,Cat,Medium,2,179,85.0
5,Cat,Small,2,179,89.0
6,Dog,Large,1,179,91.0
7,Dog,Medium,1,179,90.0
8,Dog,Small,2,177,103.5
9,Rabbit,Large,2,179,99.0


4. For each Size, get the minimum, maximum, and average of WeightKg and AgeMonths.

In [34]:
df.groupby(by=['Size'], as_index=False)[['WeightKg', 'AgeMonths']].agg(['min', 'max', 'mean'])

Unnamed: 0_level_0,Size,WeightKg,WeightKg,WeightKg,AgeMonths,AgeMonths,AgeMonths
Unnamed: 0_level_1,Unnamed: 1_level_1,min,max,mean,min,max,mean
0,Large,1.036774,29.948787,15.75137,1,179,93.452381
1,Medium,1.046434,29.995628,15.790286,1,179,88.787115
2,Small,1.018198,29.992795,15.571442,1,179,94.926094


5. For each Breed, get the minimum, maximum, and median TimeInShelterDays.

In [36]:
df.groupby(by=['Breed'], as_index=False)['TimeInShelterDays'].agg(['min', 'max', 'median'])

Unnamed: 0,Breed,min,max,median
0,Golden Retriever,1,88,48.0
1,Labrador,1,89,42.0
2,Parakeet,1,89,43.0
3,Persian,1,89,45.5
4,Poodle,1,89,43.0
5,Rabbit,1,89,48.0
6,Siamese,1,88,43.0


6. For each PetType and Size, calculate the 90th percentile of AdoptionFee.

> **Tip:**
> You can use `.quantile(0.9)`

In [38]:
df.groupby(by=['PetType', 'Size'], as_index=False)['AdoptionFee'].quantile(0.9)

Unnamed: 0,PetType,Size,AdoptionFee
0,Bird,Large,470.1
1,Bird,Medium,446.2
2,Bird,Small,437.2
3,Cat,Large,462.0
4,Cat,Medium,452.7
5,Cat,Small,445.0
6,Dog,Large,456.8
7,Dog,Medium,437.4
8,Dog,Small,432.3
9,Rabbit,Large,465.6


7. Rank pets by TimeInShelterDays for each Breed. Show the top 10 highest for each breed.

In [44]:
df['rank'] = df.groupby(by=['Breed'])['TimeInShelterDays'].rank(method='first')
df.nlargest(10, 'rank')

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood,rank
1867,2367,Rabbit,Rabbit,85,Black,Large,12.379081,0,0,89,494,1,1,493.0
1710,2210,Rabbit,Rabbit,29,Brown,Small,10.345551,1,0,89,373,1,0,492.0
947,1447,Rabbit,Rabbit,76,Orange,Large,11.938185,0,0,89,466,1,0,491.0
614,1114,Rabbit,Rabbit,33,Gray,Medium,24.942499,0,0,89,193,1,0,490.0
359,859,Rabbit,Rabbit,135,White,Large,21.524843,1,0,89,143,0,0,489.0
357,857,Rabbit,Rabbit,46,Brown,Large,3.446756,1,0,89,168,0,0,488.0
235,735,Rabbit,Rabbit,113,Orange,Medium,27.698216,1,1,89,384,1,0,487.0
1485,1985,Bird,Parakeet,24,Gray,Medium,23.057591,1,0,89,477,0,1,487.0
1428,1928,Bird,Parakeet,34,Orange,Medium,21.932212,1,0,89,72,0,1,486.0
1798,2298,Rabbit,Rabbit,145,White,Large,1.42149,0,0,88,163,0,0,486.0


8. For each Breed, normalize (min-max) the AdoptionFee, then multiply the result by 200.

In [49]:
def minmax(g):
    return (g - g.min()) / (g.max() - g.min())


df['NormalizedAdoptionFee'] = df.groupby(by='Breed', group_keys=False)['AdoptionFee'].apply(minmax) * 200
df

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood,rank,NormalizedAdoptionFee
0,500,Bird,Parakeet,131,Orange,Large,5.039768,1,0,27,140,0,0,155.0,56.112224
1,501,Rabbit,Rabbit,73,White,Large,16.086727,0,0,8,235,0,0,43.0,94.188377
2,502,Dog,Golden Retriever,136,Orange,Medium,2.076286,0,0,85,385,0,0,158.0,154.158215
3,503,Bird,Parakeet,97,White,Small,3.339423,0,0,61,217,1,0,334.0,86.973948
4,504,Rabbit,Rabbit,123,Gray,Large,20.498100,0,0,28,14,1,0,147.0,5.611222
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,2502,Dog,Poodle,72,Orange,Small,27.039045,1,0,66,26,1,1,131.0,9.696970
2003,2503,Rabbit,Rabbit,124,Brown,Small,4.726954,1,1,59,150,0,0,331.0,60.120240
2004,2504,Rabbit,Rabbit,113,Orange,Small,1.758592,1,0,68,302,0,0,379.0,121.042084
2005,2505,Dog,Labrador,12,Gray,Large,20.961592,1,0,59,478,0,0,132.0,191.967871


9. For each PetType, calculate the rate of AdoptionLikelihood.

In [52]:
df.groupby('PetType', as_index=False)['AdoptionLikelihood'].mean()

Unnamed: 0,PetType,AdoptionLikelihood
0,Bird,0.301848
1,Cat,0.287129
2,Dog,0.463602
3,Rabbit,0.25355


10. Filter out breeds with AdoptionLikelihood greater than 50%.

In [53]:
df.groupby('Breed').filter(lambda x: x['AdoptionLikelihood'].mean() > 50)

Unnamed: 0,PetID,PetType,Breed,AgeMonths,Color,Size,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood,rank,NormalizedAdoptionFee


11. For each PetType, flag pets with urgency set to true that have been in the shelter longer than the 95th percentile of TimeInShelterDays.

In [57]:
percentile95 = df.groupby(['PetType', 'Size'], as_index=False)['TimeInShelterDays'].quantile(0.95)
percentile95.columns = ['PetType', 'Size', '95thPercentileTimeInShelterDays']
percentile95

Unnamed: 0,PetType,Size,95thPercentileTimeInShelterDays
0,Bird,Large,82.65
1,Bird,Medium,83.0
2,Bird,Small,84.0
3,Cat,Large,81.0
4,Cat,Medium,85.0
5,Cat,Small,87.0
6,Dog,Large,86.3
7,Dog,Medium,83.1
8,Dog,Small,84.0
9,Rabbit,Large,86.0


In [61]:
def flag_urgent_pets(group):
    threshold = group['TimeInShelterDays'].quantile(0.95)
    group['UrgencyFlag'] = (group['TimeInShelterDays'] > threshold)
    return group

df.groupby(['PetType', 'Size'], group_keys=False).apply(flag_urgent_pets, include_groups=False)

Unnamed: 0,PetID,Breed,AgeMonths,Color,WeightKg,Vaccinated,HealthCondition,TimeInShelterDays,AdoptionFee,PreviousOwner,AdoptionLikelihood,rank,NormalizedAdoptionFee,UrgencyFlag
0,500,Parakeet,131,Orange,5.039768,1,0,27,140,0,0,155.0,56.112224,False
1,501,Rabbit,73,White,16.086727,0,0,8,235,0,0,43.0,94.188377,False
2,502,Golden Retriever,136,Orange,2.076286,0,0,85,385,0,0,158.0,154.158215,True
3,503,Parakeet,97,White,3.339423,0,0,61,217,1,0,334.0,86.973948,False
4,504,Rabbit,123,Gray,20.498100,0,0,28,14,1,0,147.0,5.611222,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2002,2502,Poodle,72,Orange,27.039045,1,0,66,26,1,1,131.0,9.696970,False
2003,2503,Rabbit,124,Brown,4.726954,1,1,59,150,0,0,331.0,60.120240,False
2004,2504,Rabbit,113,Orange,1.758592,1,0,68,302,0,0,379.0,121.042084,False
2005,2505,Labrador,12,Gray,20.961592,1,0,59,478,0,0,132.0,191.967871,False
