# **GroupBy Operations**

## **Understanding the GroupBy**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.info()

In [None]:
titanic.head()

In [None]:
titanic.tail()

In [None]:
titanic.info()

In [None]:
# Small teaching sample: the first ten passengers, restricted to the two
# columns the grouping examples below rely on (`sex` as key, `age` as value).
# Columns are selected by label rather than by position so the cell does not
# silently pick the wrong columns if the CSV layout changes.
titanic_slice = titanic[['sex', 'age']].head(10)

In [None]:
titanic_slice

In [None]:
titanic_slice.groupby(by = 'sex')

In [None]:
gbo = titanic_slice.groupby(by = 'sex')

In [None]:
type(gbo)

In [None]:
# Show which row labels of titanic_slice were assigned to each 'sex' group.
gbo.groups

In [None]:
# Materialize the GroupBy object so the individual groups can be inspected by
# position; the following cells look at each element's [0] and [1] entries.
gbo_l = list(gbo)

In [None]:
gbo_l[0]

In [None]:
type(gbo_l[0])

In [None]:
gbo_l[0][0]

In [None]:
gbo_l[0][1]

In [None]:
type(gbo_l[0][1])

In [None]:
gbo_l[1][1]

#### **Another approach for achieving the same result**

In [None]:
# Manual equivalent of the 'female' group: keep rows whose sex is 'female'.
is_female = titanic_slice['sex'] == 'female'
titanic_slice_f = titanic_slice[is_female]

In [None]:
titanic_slice_f

In [None]:
# Manual equivalent of the 'male' group: keep rows whose sex is 'male'.
is_male = titanic_slice['sex'] == 'male'
titanic_slice_m = titanic_slice[is_male]

In [None]:
titanic_slice_m

In [None]:
titanic_slice_m.equals(gbo_l[1][1])

In [None]:
# Iterating a GroupBy object yields one (key, sub-frame) pair per group;
# print each pair as a tuple.
for sex_key, passengers in gbo:
    print((sex_key, passengers))

## **Splitting with many keys**

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv('summer.csv')

In [None]:
summer.info()

In [None]:
summer.head()

In [None]:
summer.Country.nunique()

In [None]:
split1 = summer.groupby('Country')

In [None]:
split1_l = list(split1)

In [None]:
len(split1_l)

In [None]:
# Collect the group keys (country codes) in the same order as split1_l.
countries = [country for country, _ in split1_l]

In [None]:
countries.index('BUL')

In [None]:
# Look up the position of the 'BUL' group instead of hard-coding it, so this
# cell keeps returning the right sub-frame if the set of countries changes.
split1_l[countries.index('BUL')][1]

In [None]:
split2 = summer.groupby(by = ['Country', 'Gender'])

In [None]:
split2_l = list(split2)

In [None]:
split2_l[30:32]

## **Split-Apply-Combine explained**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
# First ten passengers with only the grouping key (`sex`) and the value to
# aggregate (`age`). Label-based selection avoids depending on the positional
# order of the CSV columns.
titanic_slice = titanic[['sex', 'age']].head(10)

In [None]:
titanic_slice

In [None]:
list(titanic_slice.groupby('sex'))[0][1]

In [None]:
list(titanic_slice.groupby('sex'))[0][1].age.mean()

In [None]:
list(titanic_slice.groupby('sex'))[1][1]

In [None]:
list(titanic_slice.groupby('sex'))[1][1].age.mean()

In [None]:
titanic_slice.groupby('sex').mean()

In [None]:
titanic.info()

In [None]:
titanic.groupby('sex').sum(numeric_only = True)

In [None]:
titanic.groupby('sex').survived.sum()

In [None]:
titanic.groupby('sex')[['fare', 'age']].max()

In [None]:
l = list(titanic.groupby('sex'))

In [None]:
l[0][1].age.max()

In [None]:
l[1][1].age.max()

In [None]:
titanic_mean = titanic.groupby('sex').mean(numeric_only = True)

In [None]:
titanic_mean

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
plt.style.available

In [None]:
plt.style.use("seaborn-v0_8")

In [None]:
# One bar subplot per column of the per-sex means.
titanic_mean.plot.bar(subplots = True, figsize = (7, 15))
plt.show()

## **Split-Apply-Combine applied**

In [None]:
import pandas as pd

In [None]:
summer = pd.read_csv('summer.csv')

In [None]:
summer.info()

In [None]:
summer.head()

In [None]:
summer.groupby('Country')

In [None]:
# Number of non-null Medal entries per country.
by_country = summer.groupby('Country')
medals_by_country = by_country['Medal'].count()

In [None]:
medals_by_country

In [None]:
top20 = medals_by_country.nlargest(20)

In [None]:
top20

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use('seaborn-v0_8')

In [None]:
# Text styling shared by every label on the chart.
font_dict_common = {
    'weight': 'bold',
    'family': 'sans',
    'color': 'darkred'
}
# Extra settings that only apply to the axis labels.
font_dict_specific = {
    'labelpad': 20,
    'fontsize': 12
}
top20.plot(kind = 'bar', figsize = (10, 7))
# top20 is indexed by country, so a vertical bar chart puts countries on the
# x-axis and medal counts on the y-axis; the labels follow that orientation.
plt.xlabel('Country', **font_dict_common, **font_dict_specific)
plt.xticks(**font_dict_common, fontsize = 10, rotation = 0)
plt.ylabel('Medals', **font_dict_common, **font_dict_specific)
plt.yticks(**font_dict_common, fontsize = 10)
# The title reuses the shared styling (the previously referenced `font_dict`
# was never defined and raised a NameError).
plt.title('Summer Olympic Games (Medals per Country)', **font_dict_common, fontsize = 14)
plt.show()

In [None]:
titanic = pd.read_csv('titanic.csv')

In [None]:
titanic.info()

In [None]:
titanic.describe()

In [None]:
titanic.groupby('pclass').fare.mean()

In [None]:
titanic.groupby('sex').survived.mean()

In [None]:
titanic.groupby('pclass').survived.mean()

In [None]:
titanic['ad_chi'] = 'adult'

In [None]:
titanic.head(10)

In [None]:
# Relabel passengers younger than 18 as 'child'.
# NOTE: a missing age compares False against 18, so passengers without a
# recorded age keep whatever label ad_chi already holds ('adult' by default).
titanic.loc[titanic.age < 18, 'ad_chi'] = 'child'

In [None]:
titanic.head(10)

In [None]:
titanic.ad_chi.value_counts()

In [None]:
titanic.groupby('ad_chi').survived.mean()

In [None]:
titanic.groupby(['sex', 'ad_chi']).survived.count()

In [None]:
titanic.groupby(['sex', 'ad_chi']).survived.mean()

In [None]:
# Average of `survived` for every (sex, ad_chi) combination, highest first.
survival_rate = (
    titanic
    .groupby(['sex', 'ad_chi'])['survived']
    .mean()
    .sort_values(ascending = False)
)

In [None]:
# Bar chart of the survival rate per group, with horizontal tick labels.
ax = survival_rate.plot.bar(figsize = (10, 5), rot = 0)
ax.set_title('Titanic Survival Rate by Sex/Age')
ax.set_xlabel('Groups')
ax.set_ylabel('Survival Rate')
plt.show()

## **Advanced aggregation with agg()**

In [None]:
import pandas as pd

In [None]:
titanic = pd.read_csv('titanic.csv', usecols = ['survived', 'pclass', 'sex', 'age', 'fare'])

In [None]:
titanic.info()

In [None]:
titanic.head()

In [None]:
titanic.groupby('sex').mean()

In [None]:
titanic.groupby('sex').sum()

In [None]:
titanic.groupby('sex').agg(['mean', 'sum'])

In [None]:
titanic.groupby('sex').agg(['mean', 'sum', 'min', 'max'])

In [None]:
# Column-specific aggregations: each key is a column, each value the list of
# functions applied to it.
titanic.groupby('sex').agg({
    'survived': ['mean', 'sum'],
    'pclass': ['mean'],
})

In [None]:
# Different sets of aggregation functions for each column, grouped by sex.
titanic.groupby('sex').agg({
    'survived': ['mean', 'sum'],
    'pclass': ['mean'],
    'age': ['min', 'max'],
    'fare': ['mean', 'min', 'max'],
})

## **GroupBy aggregation with relabeling**

In [1]:
import pandas as pd

In [2]:
titanic = pd.read_csv('titanic.csv',usecols = ['survived', 'pclass', 'sex', 'age', 'fare'])

In [3]:
titanic.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   survived  891 non-null    int64  
 1   pclass    891 non-null    int64  
 2   sex       891 non-null    object 
 3   age       714 non-null    float64
 4   fare      891 non-null    float64
dtypes: float64(2), int64(2), object(1)
memory usage: 34.9+ KB


In [4]:
titanic.head()

Unnamed: 0,survived,pclass,sex,age,fare
0,0,3,male,22.0,7.25
1,1,1,female,38.0,71.2833
2,1,3,female,26.0,7.925
3,1,1,female,35.0,53.1
4,0,3,male,35.0,8.05


In [5]:
titanic.groupby('sex').mean()

Unnamed: 0_level_0,survived,pclass,age,fare
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
female,0.742038,2.159236,27.915709,44.479818
male,0.188908,2.389948,30.726645,25.523893


In [6]:
titanic.groupby('sex').survived.mean()

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64

In [7]:
titanic.groupby('sex').agg(survival_rate = ('survived', 'mean'))

Unnamed: 0_level_0,survival_rate
sex,Unnamed: 1_level_1
female,0.742038
male,0.188908


In [12]:
titanic.groupby('sex').agg({'survived': ['mean', 'sum'], 'age': ['mean']})

Unnamed: 0_level_0,survived,survived,age
Unnamed: 0_level_1,mean,sum,mean
sex,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
female,0.742038,233,27.915709
male,0.188908,109,30.726645


In [13]:
# Named aggregation: each keyword becomes an output column, built from the
# given source column and aggregation function.
titanic.groupby('sex').agg(
    survived_total = pd.NamedAgg(column = 'survived', aggfunc = 'sum'),
    survival_rate = pd.NamedAgg(column = 'survived', aggfunc = 'mean'),
    mean_age = pd.NamedAgg(column = 'age', aggfunc = 'mean'),
)

Unnamed: 0_level_0,survived_total,survival_rate,mean_age
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,233,0.742038,27.915709
male,109,0.188908,30.726645
