# Pandas Tutorial - Part 7

# Grouping and Aggregating - Analyzing and Exploring Your Data

### Implementation: Ali Moghanni

*Resources:*

This Jupyter notebook can be obtained at [https://github.com/alimoghanni/Pandas](https://github.com/alimoghanni/Pandas).

updated: **2020-04-01**

In [1]:
# Preamble: useful toolboxes, libraries, functions, etc.

import pandas as pd
import numpy as np

In [2]:
# Sample data as a plain Python dictionary: each key is a column name and
# each list holds that column's nine values, in row order. Missing entries
# are written as np.nan (age) or as an empty string (email, born).

people = {
    "first": [
        "Agatha", "Hercule", "Jane",
        "David", "Nicholle", "Bruce",
        "Jacob", "Maggie", "Cristiano",
    ],
    "last": [
        "Christie", "Poirot", "Marple",
        "Tom", "Tom", "Lee",
        "Gyllenhaal", "Gyllenhaal", "Ronaldo",
    ],
    "email": [
        "AgathaChristie@mail.com",
        "HerculePoirot@mail.com",
        "",
        "DavidTom@gmail.com",
        "NicholleTom@gmail.com",
        "BruceLee@yahoo.com",
        "JacobGyllenhaal@mail.com",
        "MaggieGyllenhaal@mail.com",
        "CristianoRonaldo@mail.com",
    ],
    "age": [85, 54, np.nan, np.nan, 42, 32, 39, 42, 35],
    "occupation": [
        "Novelist", "Private investigator", "Amateur detective",
        "Actor", "Actress", "Martial artist",
        "Actor", "producer", "footballer",
    ],
    "nationality": [
        "English", "Belgian", "British",
        "American", "American", "Chinese",
        "American", "American", "Portuguese",
    ],
    "born": [
        "15 September 1890",
        "",
        "December 1927",
        "March 23, 1978",
        "March 23, 1978",
        "November 27, 1940",
        "December 19, 1980",
        "November 16, 1977",
        "February 5, 1985",
    ],
    "male": ["No", "Yes", "No", "Yes", "No", "Yes", "Yes", "No", "Yes"],
}

In [3]:
# Create a pandas DataFrame from the dictionary: each key becomes a column
# and each list supplies that column's values, one row per person.

df = pd.DataFrame(people)

df

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,Agatha,Christie,AgathaChristie@mail.com,85.0,Novelist,English,15 September 1890,No
1,Hercule,Poirot,HerculePoirot@mail.com,54.0,Private investigator,Belgian,,Yes
2,Jane,Marple,,,Amateur detective,British,December 1927,No
3,David,Tom,DavidTom@gmail.com,,Actor,American,"March 23, 1978",Yes
4,Nicholle,Tom,NicholleTom@gmail.com,42.0,Actress,American,"March 23, 1978",No
5,Bruce,Lee,BruceLee@yahoo.com,32.0,Martial artist,Chinese,"November 27, 1940",Yes
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39.0,Actor,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42.0,producer,American,"November 16, 1977",No
8,Cristiano,Ronaldo,CristianoRonaldo@mail.com,35.0,footballer,Portuguese,"February 5, 1985",Yes


In [4]:
# Select the 'age' column as a Series; the two missing ages show up as NaN.
df['age']

0    85.0
1    54.0
2     NaN
3     NaN
4    42.0
5    32.0
6    39.0
7    42.0
8    35.0
Name: age, dtype: float64

In [5]:
# Median age; NaN entries are skipped, so it is computed over the 7 known ages.
df['age'].median()

42.0

In [6]:
# Median of every numeric column (only 'age' here). numeric_only=True is
# needed on pandas >= 2.0, where the default no longer silently skips the
# string columns and the call would raise instead.
df.median(numeric_only=True)

age    42.0
dtype: float64

In [7]:
# Summary statistics (count, mean, std, min, quartiles, max); only the
# numeric 'age' column is included in the result.
df.describe()

Unnamed: 0,age
count,7.0
mean,47.0
std,18.147543
min,32.0
25%,37.0
50%,42.0
75%,48.0
max,85.0


In [8]:
# count() reports the number of non-missing values: 7 of the 9 rows have an age.
df['age'].count()

7

In [9]:
# The 'male' column holds 'Yes'/'No' strings (object dtype), so numeric
# aggregates do not apply to it; counting its values is the useful summary.
df['male']

0     No
1    Yes
2     No
3    Yes
4     No
5    Yes
6    Yes
7     No
8    Yes
Name: male, dtype: object

In [10]:
# value_counts() tallies how often each distinct value occurs, most frequent first.
df['male'].value_counts()

Yes    5
No     4
Name: male, dtype: int64

In [11]:
# The 'nationality' column is the key used for grouping in the cells below.
df['nationality']

0       English
1       Belgian
2       British
3      American
4      American
5       Chinese
6      American
7      American
8    Portuguese
Name: nationality, dtype: object

In [12]:
# Number of people per nationality.
df['nationality'].value_counts()

American      4
British       1
Belgian       1
Portuguese    1
Chinese       1
English       1
Name: nationality, dtype: int64

In [13]:
# normalize=True reports each nationality's share of the rows (fractions
# that sum to 1) instead of raw counts.
df['nationality'].value_counts(normalize=True)

American      0.444444
British       0.111111
Belgian       0.111111
Portuguese    0.111111
Chinese       0.111111
English       0.111111
Name: nationality, dtype: float64

In [14]:
# Split the rows into one group per distinct nationality. The key is passed
# as a plain column name rather than a one-element list: with a list, recent
# pandas expects group names as 1-tuples, so get_group('American') would
# trigger a deprecation warning.
nationality_grp = df.groupby('nationality')

In [15]:
# get_group() returns the rows of the original DataFrame that belong to one
# group - the same rows as df[df['nationality'] == 'American'].
nationality_grp.get_group('American')

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
3,David,Tom,DavidTom@gmail.com,,Actor,American,"March 23, 1978",Yes
4,Nicholle,Tom,NicholleTom@gmail.com,42.0,Actress,American,"March 23, 1978",No
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39.0,Actor,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42.0,producer,American,"November 16, 1977",No


In [16]:
# The same lookup for a nationality that has a single member.
nationality_grp.get_group('Belgian')

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
1,Hercule,Poirot,HerculePoirot@mail.com,54.0,Private investigator,Belgian,,Yes


In [17]:
# Selecting a column on the GroupBy object and calling value_counts() counts
# the 'male' values within each nationality; the result is indexed by
# (nationality, male).
nationality_grp['male'].value_counts()

nationality  male
American     No      2
             Yes     2
Belgian      Yes     1
British      No      1
Chinese      Yes     1
English      No      1
Portuguese   Yes     1
Name: male, dtype: int64

In [18]:
# .loc on the outer (nationality) index level pulls out the counts for one group.
nationality_grp['male'].value_counts().loc['American']

male
No     2
Yes    2
Name: male, dtype: int64

In [19]:
# Same lookup for a group with one member: only the value that occurs is listed.
nationality_grp['male'].value_counts().loc['Portuguese']

male
Yes    1
Name: male, dtype: int64

In [20]:
# Median age per nationality. British is NaN because its only member
# (Jane Marple) has no recorded age.
nationality_grp['age'].median()

nationality
American      42.0
Belgian       54.0
British        NaN
Chinese       32.0
English       85.0
Portuguese    35.0
Name: age, dtype: float64

In [21]:
# agg() takes a list of aggregation names and returns one column per
# aggregation, with one row per nationality.
nationality_grp['age'].agg(['median', 'mean'])

Unnamed: 0_level_0,median,mean
nationality,Unnamed: 1_level_1,Unnamed: 2_level_1
American,42.0,41.0
Belgian,54.0,54.0
British,,
Chinese,32.0,32.0
English,85.0,85.0
Portuguese,35.0,35.0


In [22]:
# .loc selects the row of aggregates for a single nationality.
nationality_grp['age'].agg(['median', 'mean']).loc['American']

median    42.0
mean      41.0
Name: American, dtype: float64