# Group By Practice

## Initialization

In [1]:
# import libraries
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# List the names of the sample datasets that Seaborn can load
sns.get_dataset_names()

['anagrams',
 'anscombe',
 'attention',
 'brain_networks',
 'car_crashes',
 'diamonds',
 'dots',
 'dowjones',
 'exercise',
 'flights',
 'fmri',
 'geyser',
 'glue',
 'healthexp',
 'iris',
 'mpg',
 'penguins',
 'planets',
 'seaice',
 'taxis',
 'tips',
 'titanic']

In [3]:
# Load Seaborn's "tips" sample dataset into the working dataframe `df`
df = sns.load_dataset("tips")

## Overview of Data

In [5]:
# Preview the first 10 rows of the data
# (a hypothetical dataset of tip amounts following service)
df.head(10)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4
5,25.29,4.71,Male,No,Sun,Dinner,4
6,8.77,2.0,Male,No,Sun,Dinner,2
7,26.88,3.12,Male,No,Sun,Dinner,4
8,15.04,1.96,Male,No,Sun,Dinner,2
9,14.78,3.23,Male,No,Sun,Dinner,2


In [6]:
# Print a summary of the frame: column names, non-null counts, and dtypes

df.info()

# Every column reports the full 244 non-null entries, and the text-like columns are
# already typed as `category`, so we do not have to perform any transformations.
# This is to be expected with sample data from Seaborn, but real datasets will often
# require cleaning and replacement of null values.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 244 entries, 0 to 243
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   total_bill  244 non-null    float64 
 1   tip         244 non-null    float64 
 2   sex         244 non-null    category
 3   smoker      244 non-null    category
 4   day         244 non-null    category
 5   time        244 non-null    category
 6   size        244 non-null    int64   
dtypes: category(4), float64(2), int64(1)
memory usage: 7.4 KB


In [18]:
# Tally how many records fall on each day and at each service time
days = df['day'].value_counts()
times = df['time'].value_counts()

# A single newline-separated print shows both tallies back to back
print(days, times, sep="\n")

day
Sat     87
Sun     76
Thur    62
Fri     19
Name: count, dtype: int64
time
Dinner    176
Lunch      68
Name: count, dtype: int64


In [7]:
# Number of rows recorded for each value of `sex` (Male / Female)
df['sex'].value_counts()

sex
Male      157
Female     87
Name: count, dtype: int64

In [14]:
# Number of rows recorded for each value of `smoker` (Yes / No)
df['smoker'].value_counts()

smoker
No     151
Yes     93
Name: count, dtype: int64

In [13]:
# Create variables to store the averages of the numeric columns

avg_bill = df['total_bill'].mean()

avg_tip = df['tip'].mean()

# Group size comes from the 'size' column. Bracket access is required here:
# `df.size` is the DataFrame attribute (total element count), not the column.
avg_size = df['size'].mean()

# Return the values of each

print(f"Average Total Bill: {avg_bill.round(2)} \nAverage Tip: {avg_tip.round(2)} \nAverage Size of Group: {avg_size.round()}")

Average Total Bill: 19.79 
Average Tip: 3.0 
Average Size of Group: 3.0


## GroupBy Practice

### Normal GroupBy

In [23]:
# Mean total bill for each time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .mean()
)

time
Lunch     17.168676
Dinner    20.797159
Name: total_bill, dtype: float64

In [24]:
# to_frame wraps the rounded per-time means in a one-column DataFrame
# whose column is named 'average_bill'
(
    df
    .groupby('time', observed=True)['total_bill']
    .mean()
    .round(2)
    .to_frame('average_bill')
)

Unnamed: 0_level_0,average_bill
time,Unnamed: 1_level_1
Lunch,17.17
Dinner,20.8


In [25]:
# reset_index moves `time` out of the index and back into a regular column,
# which makes future calculations on the result much easier.
# Splitting the chain over several lines keeps the long expression readable.
(
    df
    .groupby('time', observed=True)['total_bill']
    .mean()
    .round(2)
    .to_frame('average_bill')
    .reset_index()
)

Unnamed: 0,time,average_bill
0,Lunch,17.17
1,Dinner,20.8


In [28]:
# sort_values orders the summary by average bill, largest first
(
    df
    .groupby('time', observed=True)['total_bill']
    .mean()
    .round(2)
    .to_frame('average_bill')
    .reset_index()
    .sort_values('average_bill', ascending=False)
)

Unnamed: 0,time,average_bill
1,Dinner,20.8
0,Lunch,17.17


In [29]:
# Keep the sorted per-time average bill summary in its own DataFrame
avgbill = (
    df
    .groupby('time', observed=True)['total_bill']
    .mean()
    .round(2)
    .to_frame('average_bill')
    .reset_index()
    .sort_values('average_bill', ascending=False)
)

In [30]:
# Display the new dataframe holding the average bill per time of service
avgbill

Unnamed: 0,time,average_bill
1,Dinner,20.8
0,Lunch,17.17


### Common Calculations

In [31]:
# Count: number of non-null total_bill values per time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .count()
)

time
Lunch      68
Dinner    176
Name: total_bill, dtype: int64

In [32]:
# Size: number of rows per group; unlike count, nulls are included
(
    df
    .groupby('time', observed=True)['total_bill']
    .size()
)

time
Lunch      68
Dinner    176
Name: total_bill, dtype: int64

In [34]:
# Unique: number of distinct total_bill values per time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .nunique()
)

time
Lunch      64
Dinner    168
Name: total_bill, dtype: int64

In [35]:
# Max: largest total bill per time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .max()
)

time
Lunch     43.11
Dinner    50.81
Name: total_bill, dtype: float64

In [36]:
# Min: smallest total bill per time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .min()
)

time
Lunch     7.51
Dinner    3.07
Name: total_bill, dtype: float64

In [37]:
# Sum: total of all bills per time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .sum()
)

time
Lunch     1167.47
Dinner    3660.30
Name: total_bill, dtype: float64

In [38]:
# Median total bill per time of service.
# Note: this calls the groupby object's own .median() method, not a NumPy function.
df.groupby('time',observed=True)['total_bill'].median()

time
Lunch     15.965
Dinner    18.390
Name: total_bill, dtype: float64

### Challenges

In [40]:
# Challenge: find the average tip for each day in the dataset
(
    df
    .groupby('day', observed=True)['tip']
    .mean()
)

day
Thur    2.771452
Fri     2.734737
Sat     2.993103
Sun     3.255132
Name: tip, dtype: float64

In [41]:
# Challenge: find the largest tip for each sex
(
    df
    .groupby('sex', observed=True)['tip']
    .max()
)

sex
Male      10.0
Female     6.5
Name: tip, dtype: float64

In [42]:
# Challenge: find the "top tip" threshold for each day,
# taken here as the 90th percentile of tips on that day
(
    df
    .groupby('day', observed=True)['tip']
    .quantile(.9)
)

day
Thur    4.920
Fri     4.060
Sat     4.802
Sun     5.035
Name: tip, dtype: float64

### Intermediate GroupBy

In [49]:
# Multiple calculations at once: agg takes a list of aggregation names
# and returns one column per aggregation
(
    df
    .groupby('time', observed=True)['total_bill']
    .agg(['min', 'max'])
)

Unnamed: 0_level_0,min,max
time,Unnamed: 1_level_1,Unnamed: 2_level_1
Lunch,7.51,43.11
Dinner,3.07,50.81


In [50]:
# describe gives a quick statistical overview (count, mean, std, quartiles, min/max)
# of total_bill for each time of service
(
    df
    .groupby('time', observed=True)['total_bill']
    .describe()
)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Lunch,68.0,17.168676,7.713882,7.51,12.235,15.965,19.5325,43.11
Dinner,176.0,20.797159,9.142029,3.07,14.4375,18.39,25.2825,50.81


In [52]:
# Two calculation columns: select a list of columns to aggregate both at once
(
    df
    .groupby('time', observed=True)[['tip', 'total_bill']]
    .mean()
    .round(2)
)

Unnamed: 0_level_0,tip,total_bill
time,Unnamed: 1_level_1,Unnamed: 2_level_1
Lunch,2.73,17.17
Dinner,3.1,20.8


In [53]:
# Two groupby columns: a list of keys yields one group per (day, time) pair
(
    df
    .groupby(['day', 'time'], observed=True)['total_bill']
    .mean()
    .round(2)
)

day   time  
Thur  Lunch     17.66
      Dinner    18.78
Fri   Lunch     12.85
      Dinner    19.66
Sat   Dinner    20.44
Sun   Dinner    21.41
Name: total_bill, dtype: float64

In [54]:
# Combine both ideas: group by (day, time) and average two columns at once
(
    df
    .groupby(['day', 'time'], observed=True)[['tip', 'total_bill']]
    .mean()
    .round(2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,tip,total_bill
day,time,Unnamed: 2_level_1,Unnamed: 3_level_1
Thur,Lunch,2.77,17.66
Thur,Dinner,3.0,18.78
Fri,Lunch,2.38,12.85
Fri,Dinner,2.94,19.66
Sat,Dinner,2.99,20.44
Sun,Dinner,3.26,21.41


In [55]:
# reset_index flattens the (day, time) index into ordinary columns for a cleaner table
(
    df
    .groupby(['day', 'time'], observed=True)[['tip', 'total_bill']]
    .mean()
    .round(2)
    .reset_index()
)

Unnamed: 0,day,time,tip,total_bill
0,Thur,Lunch,2.77,17.66
1,Thur,Dinner,3.0,18.78
2,Fri,Lunch,2.38,12.85
3,Fri,Dinner,2.94,19.66
4,Sat,Dinner,2.99,20.44
5,Sun,Dinner,3.26,21.41


### Transform

In [56]:
# Load the tips dataset again into a separate variable, `dtransform`,
# which will receive the new columns produced by the transformations below
dtransform = sns.load_dataset('tips')

In [62]:
# Display the first 5 rows of the dataframe
# NOTE(review): the saved output of this cell already shows a `medianbill` column,
# which is only created by the cell that follows it. The cell appears to have been
# re-run out of order; on a fresh Restart & Run All that column would not exist yet.
dtransform.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,medianbill
0,16.99,1.01,Female,No,Sun,Dinner,2,18.39
1,10.34,1.66,Male,No,Sun,Dinner,3,18.39
2,21.01,3.5,Male,No,Sun,Dinner,3,18.39
3,23.68,3.31,Male,No,Sun,Dinner,2,18.39
4,24.59,3.61,Female,No,Sun,Dinner,4,18.39


In [58]:
# Perform transformation, creating a new column with each row's per-time median bill.
# transform returns one value per input row, indexed like the input, so the result
# is computed from `dtransform` itself and assigned straight back onto it. This avoids
# relying on a separately loaded frame happening to share the same index.
dtransform['medianbill'] = dtransform.groupby('time', observed=True)['total_bill'].transform('median')

In [63]:
# Show the first 5 rows again to confirm the new `medianbill` column
dtransform.head(5)

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,medianbill
0,16.99,1.01,Female,No,Sun,Dinner,2,18.39
1,10.34,1.66,Male,No,Sun,Dinner,3,18.39
2,21.01,3.5,Male,No,Sun,Dinner,3,18.39
3,23.68,3.31,Male,No,Sun,Dinner,2,18.39
4,24.59,3.61,Female,No,Sun,Dinner,4,18.39


In [61]:
# Query only the Lunch rows to inspect the `medianbill` value they carry
dtransform.query("time == 'Lunch'")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,medianbill
77,27.20,4.00,Male,No,Thur,Lunch,4,15.965
78,22.76,3.00,Male,No,Thur,Lunch,2,15.965
79,17.29,2.71,Male,No,Thur,Lunch,2,15.965
80,19.44,3.00,Male,Yes,Thur,Lunch,2,15.965
81,16.66,3.40,Male,No,Thur,Lunch,2,15.965
...,...,...,...,...,...,...,...,...
222,8.58,1.92,Male,Yes,Fri,Lunch,1,15.965
223,15.98,3.00,Female,No,Fri,Lunch,3,15.965
224,13.42,1.58,Male,Yes,Fri,Lunch,2,15.965
225,16.27,2.50,Female,Yes,Fri,Lunch,2,15.965


In [64]:
# Use the transformed column inside a query
# Returns rows whose total bill is greater than that row's `medianbill` value
dtransform.query("total_bill>medianbill")

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,medianbill
2,21.01,3.50,Male,No,Sun,Dinner,3,18.39
3,23.68,3.31,Male,No,Sun,Dinner,2,18.39
4,24.59,3.61,Female,No,Sun,Dinner,4,18.39
5,25.29,4.71,Male,No,Sun,Dinner,4,18.39
7,26.88,3.12,Male,No,Sun,Dinner,4,18.39
...,...,...,...,...,...,...,...,...
238,35.83,4.67,Female,No,Sat,Dinner,3,18.39
239,29.03,5.92,Male,No,Sat,Dinner,3,18.39
240,27.18,2.00,Female,Yes,Sat,Dinner,2,18.39
241,22.67,2.00,Male,Yes,Sat,Dinner,2,18.39


### Advanced GroupBy

In [65]:
# Z-score of each bill within its time group: how many standard deviations the bill
# lies above (positive) or below (negative) the mean bill of that group.
# Computed from `dtransform` itself so the new column is derived from the same frame
# it is stored on, rather than from a separately loaded frame.
dtransform['Z_Score'] = (
    dtransform
    .groupby('time', observed=True)['total_bill']
    .transform(lambda x: (x - x.mean()) / x.std())
)

In [66]:
# Display dtransform with the new columns (the notebook renders a truncated view)
dtransform

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size,medianbill,Z_Score
0,16.99,1.01,Female,No,Sun,Dinner,2,18.39,-0.416446
1,10.34,1.66,Male,No,Sun,Dinner,3,18.39,-1.143855
2,21.01,3.50,Male,No,Sun,Dinner,3,18.39,0.023282
3,23.68,3.31,Male,No,Sun,Dinner,2,18.39,0.315339
4,24.59,3.61,Female,No,Sun,Dinner,4,18.39,0.414880
...,...,...,...,...,...,...,...,...,...
239,29.03,5.92,Male,No,Sat,Dinner,3,18.39,0.900549
240,27.18,2.00,Female,Yes,Sat,Dinner,2,18.39,0.698186
241,22.67,2.00,Male,Yes,Sat,Dinner,2,18.39,0.204861
242,17.82,1.75,Male,No,Sat,Dinner,2,18.39,-0.325656
