In [1]:
import numpy as np
import pandas as pd

In [2]:
# Float Series containing a single missing entry (np.nan) at position 2.
float_data = pd.Series(data=[1.2, -3.5, np.nan, 0])

In [3]:
# Display the Series; the integer 0 is shown as 0.0 because the dtype is float64.
float_data

0    1.2
1   -3.5
2    NaN
3    0.0
dtype: float64

In [4]:
# Element-wise missing-value mask: True only where the value is NaN.
float_data.isna()

0    False
1    False
2     True
3    False
dtype: bool

In [5]:
# Strings mixed with both kinds of missing marker: np.nan and None.
string_data = pd.Series(data=["aardvark", np.nan, None, "avocado"])

In [6]:
# Display the Series; np.nan and None are both stored as given in the object column.
string_data

0    aardvark
1         NaN
2        None
3     avocado
dtype: object

In [7]:
# isna() reports None as missing too, not only np.nan.
string_data.isna()

0    False
1     True
2     True
3    False
dtype: bool

## Filtering Out Missing Data

In [8]:
# Numeric Series with NaN at positions 1 and 3.
data = pd.Series(data=[1, np.nan, 3.5, np.nan, 7])

In [9]:
# Drop the NaN entries; surviving rows keep their original index labels (0, 2, 4).
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [10]:
# Boolean-mask form that yields the same result as data.dropna().
data[data.notna()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [11]:
 # 4x3 frame: row 0 is complete, rows 1 and 3 are partly missing, row 2 is all NaN.
 data = pd.DataFrame(
     [
         [1.0, 6.5, 3.0],
         [1.0, np.nan, np.nan],
         [np.nan, np.nan, np.nan],
         [np.nan, 6.5, 3.0],
     ]
 )

In [12]:
# With no arguments, dropna() removes every row that contains at least one NaN.
data.dropna()

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [14]:
# how='all' removes only the rows in which every value is NaN (row 2 here).
data.dropna(how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [16]:
# axis=0 (rows) is already the default, so this is equivalent to data.dropna().
data.dropna(axis=0)

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


## Filling In Missing Data

In [20]:
# Fixed seed so the frame, and every output derived from it, is identical on
# each Restart & Run All.
df = pd.DataFrame(np.random.default_rng(12345).standard_normal((7, 3)))
# Blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[:4, 1] = np.nan
df.iloc[:2, 2] = np.nan

In [21]:
# Display the frame: column 1 is NaN in rows 0-3, column 2 in rows 0-1.
df

Unnamed: 0,0,1,2
0,-2.611361,,
1,0.171015,,
2,0.475921,,0.989589
3,0.554072,,0.537221
4,-0.08303,-1.674509,-1.722404
5,-0.628079,0.668576,1.557162
6,-0.86682,1.75999,-1.408696


In [22]:
# Replace every NaN with the constant 0; a new frame is returned and df is unchanged.
df.fillna(0)

Unnamed: 0,0,1,2
0,-2.611361,0.0,0.0
1,0.171015,0.0,0.0
2,0.475921,0.0,0.989589
3,0.554072,0.0,0.537221
4,-0.08303,-1.674509,-1.722404
5,-0.628079,0.668576,1.557162
6,-0.86682,1.75999,-1.408696


By calling `fillna` with a dictionary, you can use a different fill value for each column:

In [23]:
# Per-column fill values: NaNs in column 1 become 1.2, NaNs in column 2 become 0.2.
df.fillna(value={1: 1.2, 2: 0.2})

Unnamed: 0,0,1,2
0,-2.611361,1.2,0.2
1,0.171015,1.2,0.2
2,0.475921,1.2,0.989589
3,0.554072,1.2,0.537221
4,-0.08303,-1.674509,-1.722404
5,-0.628079,0.668576,1.557162
6,-0.86682,1.75999,-1.408696


In [24]:
# Fixed seed so the frame, and every output derived from it, is identical on
# each Restart & Run All.
df = pd.DataFrame(np.random.default_rng(54321).standard_normal((6, 3)))
# Blank out column 1 from row 2 onward and column 2 from row 4 onward.
df.iloc[2:, 1] = np.nan
df.iloc[4:, 2] = np.nan

In [25]:
# Display the frame: column 1 is NaN from row 2 on, column 2 from row 4 on.
df

Unnamed: 0,0,1,2
0,0.250239,-0.827191,1.684347
1,1.041742,-1.344207,0.81624
2,-1.661333,,0.882972
3,-0.432008,,2.405352
4,1.190314,,
5,0.297031,,


In [30]:
# Forward-fill: each NaN takes the last valid value above it in the same column.
df.ffill()

Unnamed: 0,0,1,2
0,0.250239,-0.827191,1.684347
1,1.041742,-1.344207,0.81624
2,-1.661333,-1.344207,0.882972
3,-0.432008,-1.344207,2.405352
4,1.190314,-1.344207,2.405352
5,0.297031,-1.344207,2.405352


In [31]:
# limit=3 caps each forward fill at three consecutive NaNs, so row 5 of column 1 stays NaN.
df.ffill(limit=3)

Unnamed: 0,0,1,2
0,0.250239,-0.827191,1.684347
1,1.041742,-1.344207,0.81624
2,-1.661333,-1.344207,0.882972
3,-0.432008,-1.344207,2.405352
4,1.190314,-1.344207,2.405352
5,0.297031,,2.405352


In [32]:
# Fill each column's NaNs with that column's mean, computed over its non-missing values.
df.fillna(df.mean())

Unnamed: 0,0,1,2
0,0.250239,-0.827191,1.684347
1,1.041742,-1.344207,0.81624
2,-1.661333,-1.085699,0.882972
3,-0.432008,-1.085699,2.405352
4,1.190314,-1.085699,1.447228
5,0.297031,-1.085699,1.447228


# Data Transformation

## Removing Duplicates

In [33]:
# Seven rows; the final row ("two", 4) repeats the row just before it.
data = pd.DataFrame(
    {
        "k1": ["one", "two"] * 3 + ["two"],
        "k2": [1, 1, 2, 3, 3, 4, 4],
    }
)

In [34]:
# Display the frame; row 6 holds the same (k1, k2) pair as row 5.
data

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4
6,two,4


In [35]:
# True marks a row whose values across all columns already appeared in an earlier row.
data.duplicated()

0    False
1    False
2    False
3    False
4    False
5    False
6     True
dtype: bool

In [36]:
# Keep the first occurrence of each distinct row; the repeated row 6 is removed.
data.drop_duplicates()

Unnamed: 0,k1,k2
0,one,1
1,two,1
2,one,2
3,two,3
4,one,3
5,two,4


In [37]:
# Add a running counter column; its length is taken from the frame rather than
# hard-coded, so the assignment cannot fail with a length mismatch if the
# frame's row count changes.
data["v1"] = range(len(data))

In [38]:
# Display the frame with the new v1 column (0 through 6).
data

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1
2,one,2,2
3,two,3,3
4,one,3,4
5,two,4,5
6,two,4,6


In [39]:
# Judge duplicates on k1 only: the first row for each distinct k1 value is kept.
data.drop_duplicates(subset=['k1'])

Unnamed: 0,k1,k2,v1
0,one,1,0
1,two,1,1


## Transforming Data Using a Function or Mapping

In [40]:
 # One row per item: the food name and its weight in ounces.
 data = pd.DataFrame(
     {
         "food": [
             "bacon", "pulled pork", "bacon",
             "pastrami", "corned beef", "bacon",
             "pastrami", "honey ham", "nova lox",
         ],
         "ounces": [4, 3, 12, 6, 7.5, 8, 3, 5, 6],
     }
 )

In [41]:
# Display the frame; ounces is shown as float because the list contains 7.5.
data

Unnamed: 0,food,ounces
0,bacon,4.0
1,pulled pork,3.0
2,bacon,12.0
3,pastrami,6.0
4,corned beef,7.5
5,bacon,8.0
6,pastrami,3.0
7,honey ham,5.0
8,nova lox,6.0


In [42]:
 # Lookup table: food name -> animal it comes from.
 meat_to_animal = {
     "bacon": "pig",
     "pulled pork": "pig",
     "pastrami": "cow",
     "corned beef": "cow",
     "honey ham": "pig",
     "nova lox": "salmon",
 }

In [43]:
# Translate each food name through the meat_to_animal dict and store the result
# as a new column.
data["animal"] = data["food"].map(meat_to_animal)

In [44]:
# Display the frame with the mapped animal column added.
data

Unnamed: 0,food,ounces,animal
0,bacon,4.0,pig
1,pulled pork,3.0,pig
2,bacon,12.0,pig
3,pastrami,6.0,cow
4,corned beef,7.5,cow
5,bacon,8.0,pig
6,pastrami,3.0,cow
7,honey ham,5.0,pig
8,nova lox,6.0,salmon


In [45]:
def get_animal(x):
    """Return the animal that ``meat_to_animal`` records for the food name ``x``.

    Raises:
        KeyError: if ``x`` is not a key of ``meat_to_animal``.
    """
    animal = meat_to_animal[x]
    return animal


# Passing a function to map() gives the same result here as passing the dict itself.
data["food"].map(get_animal)

0       pig
1       pig
2       pig
3       cow
4       cow
5       pig
6       cow
7       pig
8    salmon
Name: food, dtype: object

## Replacing Values

In [46]:
# Series in which -999 and -1000 sit among ordinary values; presumably these are
# the values replaced later in this section (not visible here -- confirm).
data = pd.Series(data=[1.0, -999.0, 2.0, -999.0, -1000.0, 3.0])