# Pandas Tutorial - Part 8

# Cleaning Data - Casting Datatypes and Handling Missing Values

### Implementation: Ali Moghanni

*Resources:*

This Jupyter notebook can be obtained at [https://github.com/alimoghanni/Pandas](https://github.com/alimoghanni/Pandas).

updated: **2020-04-02**

In [1]:
# Preamble: useful toolboxes, libraries, functions, etc.

import pandas as pd
import numpy as np

In [2]:
# Raw records as a dict of equal-length lists (one list per future column).
# The data is deliberately messy: real missing values (np.nan / None),
# placeholder strings ("NA", "Missing"), and ages stored as a mix of
# strings and ints.

people = {
    "first": [
        "Agatha", "Hercule", "Jane",
        "David", "Nicholle", "Bruce",
        "Jacob", "Maggie", "Cristiano",
    ],
    "last": [
        "Christie", "Poirot", "Marple",
        "Tom", "Tom", "Lee",
        "Gyllenhaal", "Gyllenhaal", "Ronaldo",
    ],
    "email": [
        "AgathaChristie@mail.com",
        "HerculePoirot@mail.com",
        np.nan,
        "DavidTom@gmail.com",
        "NicholleTom@gmail.com",
        "BruceLee@yahoo.com",
        "JacobGyllenhaal@mail.com",
        "MaggieGyllenhaal@mail.com",
        "CristianoRonaldo@mail.com",
    ],
    # strings, NaN, None and ints side by side
    "age": [
        '85', '54', np.nan,
        None, '42', 32,
        39, 42, 35,
    ],
    "occupation": [
        "Novelist", "Private investigator", "Amateur detective",
        "Actor", "Actress", "Martial artist",
        "NA", "producer", "footballer",
    ],
    "nationality": [
        "English", "Belgian", "British",
        "American", "Missing", "Chinese",
        "American", "American", "Portuguese",
    ],
    "born": [
        "15 September 1890", "NA", "December 1927",
        "March 23, 1978", "March 23, 1978", "November 27, 1940",
        "December 19, 1980", "November 16, 1977", "February 5, 1985",
    ],
    "male": [
        "No", "Yes", "No",
        "Yes", np.nan, "Yes",
        "Yes", "No", "Yes",
    ],
}

In [3]:
# Build the DataFrame from the dictionary: each key becomes a column label
# and each list becomes that column's values.

df = pd.DataFrame(data=people)

# Optional clean-up, left disabled here: turn the placeholder strings into
# real NaN so that dropna()/isna() treat them as missing.
# df = df.replace(['NA', 'Missing'], np.nan)

df

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,Agatha,Christie,AgathaChristie@mail.com,85.0,Novelist,English,15 September 1890,No
1,Hercule,Poirot,HerculePoirot@mail.com,54.0,Private investigator,Belgian,,Yes
2,Jane,Marple,,,Amateur detective,British,December 1927,No
3,David,Tom,DavidTom@gmail.com,,Actor,American,"March 23, 1978",Yes
4,Nicholle,Tom,NicholleTom@gmail.com,42.0,Actress,Missing,"March 23, 1978",
5,Bruce,Lee,BruceLee@yahoo.com,32.0,Martial artist,Chinese,"November 27, 1940",Yes
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39.0,,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42.0,producer,American,"November 16, 1977",No
8,Cristiano,Ronaldo,CristianoRonaldo@mail.com,35.0,footballer,Portuguese,"February 5, 1985",Yes


In [4]:
# Drop every row that contains at least one missing value (NaN/None).
# dropna() returns a new DataFrame; df itself is left unchanged.
# Note: the placeholder strings 'NA' and 'Missing' are ordinary strings,
# so rows whose only "gaps" are those placeholders are kept.

df.dropna()

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,Agatha,Christie,AgathaChristie@mail.com,85,Novelist,English,15 September 1890,No
1,Hercule,Poirot,HerculePoirot@mail.com,54,Private investigator,Belgian,,Yes
5,Bruce,Lee,BruceLee@yahoo.com,32,Martial artist,Chinese,"November 27, 1940",Yes
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39,,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42,producer,American,"November 16, 1977",No
8,Cristiano,Ronaldo,CristianoRonaldo@mail.com,35,footballer,Portuguese,"February 5, 1985",Yes


In [5]:
# Same call with the defaults spelled out: axis='index' drops rows (rather
# than columns) and how='any' drops a row as soon as one value is missing.
df.dropna(axis='index', how='any')
# how='all' would instead drop a row only when every value in it is missing:
# df.dropna(axis='index', how='all')

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,Agatha,Christie,AgathaChristie@mail.com,85,Novelist,English,15 September 1890,No
1,Hercule,Poirot,HerculePoirot@mail.com,54,Private investigator,Belgian,,Yes
5,Bruce,Lee,BruceLee@yahoo.com,32,Martial artist,Chinese,"November 27, 1940",Yes
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39,,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42,producer,American,"November 16, 1977",No
8,Cristiano,Ronaldo,CristianoRonaldo@mail.com,35,footballer,Portuguese,"February 5, 1985",Yes


In [6]:
# Restrict the check to the listed columns: a row is dropped only if 'email'
# or 'occupation' is missing, so rows with NaN elsewhere (e.g. 'age') stay.
df.dropna(axis='index', how='any', subset=['email', 'occupation'])

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,Agatha,Christie,AgathaChristie@mail.com,85.0,Novelist,English,15 September 1890,No
1,Hercule,Poirot,HerculePoirot@mail.com,54.0,Private investigator,Belgian,,Yes
3,David,Tom,DavidTom@gmail.com,,Actor,American,"March 23, 1978",Yes
4,Nicholle,Tom,NicholleTom@gmail.com,42.0,Actress,Missing,"March 23, 1978",
5,Bruce,Lee,BruceLee@yahoo.com,32.0,Martial artist,Chinese,"November 27, 1940",Yes
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39.0,,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42.0,producer,American,"November 16, 1977",No
8,Cristiano,Ronaldo,CristianoRonaldo@mail.com,35.0,footballer,Portuguese,"February 5, 1985",Yes


In [7]:
# Boolean mask with the same shape as df: True where a value is missing.
# The strings 'NA' and 'Missing' show up as False because they are regular
# strings, not NaN/None.
df.isna()

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False
2,False,False,True,True,False,False,False,False
3,False,False,False,True,False,False,False,False
4,False,False,False,False,False,False,False,True
5,False,False,False,False,False,False,False,False
6,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False


In [8]:
# Replace every missing value with the string 'MISSING'.
# Returns a new DataFrame; df is not modified.
df.fillna('MISSING')

Unnamed: 0,first,last,email,age,occupation,nationality,born,male
0,Agatha,Christie,AgathaChristie@mail.com,85,Novelist,English,15 September 1890,No
1,Hercule,Poirot,HerculePoirot@mail.com,54,Private investigator,Belgian,,Yes
2,Jane,Marple,MISSING,MISSING,Amateur detective,British,December 1927,No
3,David,Tom,DavidTom@gmail.com,MISSING,Actor,American,"March 23, 1978",Yes
4,Nicholle,Tom,NicholleTom@gmail.com,42,Actress,Missing,"March 23, 1978",MISSING
5,Bruce,Lee,BruceLee@yahoo.com,32,Martial artist,Chinese,"November 27, 1940",Yes
6,Jacob,Gyllenhaal,JacobGyllenhaal@mail.com,39,,American,"December 19, 1980",Yes
7,Maggie,Gyllenhaal,MaggieGyllenhaal@mail.com,42,producer,American,"November 16, 1977",No
8,Cristiano,Ronaldo,CristianoRonaldo@mail.com,35,footballer,Portuguese,"February 5, 1985",Yes


In [9]:
# Inspect the dtype of each column. Every column, including 'age', is stored
# as generic `object` at this point, i.e. 'age' is not numeric yet.
df.dtypes

first          object
last           object
email          object
age            object
occupation     object
nationality    object
born           object
male           object
dtype: object

In [10]:
# np.nan is a float, which is why the 'age' column (it contains missing
# values) is cast to float in the next step.
type(np.nan)

float

In [11]:
# Cast 'age' to float: the numeric strings and the ints become floats, and
# both np.nan and None end up as NaN. This overwrites the column in df.
df['age'] = df['age'].astype(float)

# Display the converted column.
df['age']

0    85.0
1    54.0
2     NaN
3     NaN
4    42.0
5    32.0
6    39.0
7    42.0
8    35.0
Name: age, dtype: float64

In [12]:
# Mean age; NaN entries are skipped, so it is computed over the 7 valid values.
df['age'].mean()

47.0

In [13]:
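# When the data is loaded from a CSV file, the placeholder strings can be
# mapped to NaN directly at read time:
# na_vals = ['NA', 'Missing']
# df = pd.read_csv('data.csv', na_values=na_vals)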

In [14]:
# Distinct values of 'age' in order of first appearance; NaN is listed once.
df['age'].unique()

array([85., 54., nan, 42., 32., 39., 35.])