# Coding Practice Session 7
## Handling Missing Values in Pandas

In [2]:
import pandas as pd
import numpy as np

In [3]:
pool = [np.nan, 10, 56, 25.5, 4, 12, None, -45.6, "a", "d", "J", "R"]

In [4]:
df = pd.DataFrame(
    {
        "A": np.random.choice(pool, size=5),
        "B": np.random.choice(pool, size=5, replace=True),
        "C": np.random.choice(pool, size=5, replace=False),
        "D": np.random.choice(pool, size=5),
    }
)

In [5]:
df

Unnamed: 0,A,B,C,D
0,25.5,56,10,
1,10,4,J,d
2,R,56,a,4
3,56,d,,a
4,,,4,R


### Values Considered "Missing"

In [6]:
# NaN
s = pd.Series([1, 2, np.nan, 4, 5, pd.NA])
s

0       1
1       2
2     NaN
3       4
4       5
5    <NA>
dtype: object

In [7]:
s.isna()

0    False
1    False
2     True
3    False
4    False
5     True
dtype: bool

In [8]:
~s.isna()

0     True
1     True
2    False
3     True
4     True
5    False
dtype: bool

In [9]:
# None
pd.Series([1, None, 5, 6, None, 12])

0     1.0
1     NaN
2     5.0
3     6.0
4     NaN
5    12.0
dtype: float64

In [10]:
pd.Series(["p", None, "u", "z", None, "a"])

0       p
1    None
2       u
3       z
4    None
5       a
dtype: object

In [11]:
np.nan is None

False

In [12]:
np.nan == None

False

In [13]:
np.nan == np.nan

False

In [14]:
for item in pd.Series([1, 2, 3, np.nan, None, 56]):
    if item == np.nan:
        print("This will never be printed")
    elif item == None:
        print("This also will never be printed")

In [15]:
df = pd.DataFrame(
    {"A": [1, 2, 3, -999, 6, 21], "B": ["a", "N/A", "c", "Unknown", "", "v"]}
)

df

Unnamed: 0,A,B
0,1,a
1,2,
2,3,c
3,-999,Unknown
4,6,
5,21,v


In [16]:
df.replace([-999, "N/A", "Unknown", ""], np.nan, inplace=True)

In [17]:
df

Unnamed: 0,A,B
0,1.0,a
1,2.0,
2,3.0,c
3,,
4,6.0,
5,21.0,v


In [18]:
df["B"].dtype

dtype('O')

In [19]:
type(df.at[1, "B"])

float

In [20]:
pd.isna(np.nan)

True

In [21]:
pd.isna(None) == pd.isna(np.nan)

True

In [22]:
pd.isna(pd.NA)

True

### Detecting Missing Values

In [23]:
df = pd.DataFrame(
    {
        "A": [1, 2, np.nan, 4, 5],
        "B": [5, np.nan, np.nan, 3, 2],
        "C": [1, 2, 3, np.nan, 5],
        "D": ["a", "b", "c", None, "e"],
    }
)

df

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,a
1,2.0,,2.0,b
2,,,3.0,c
3,4.0,3.0,,
4,5.0,2.0,5.0,e


In [24]:
# returns the length of column (number of rows)
len(df)

5

In [25]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,True,False,False
2,True,True,False,False
3,False,False,True,True
4,False,False,False,False


In [26]:
df.notnull()

Unnamed: 0,A,B,C,D
0,True,True,True,True
1,True,False,True,True
2,False,False,True,True
3,True,True,False,False
4,True,True,True,True


In [27]:
df["B"]

0    5.0
1    NaN
2    NaN
3    3.0
4    2.0
Name: B, dtype: float64

In [28]:
df["B"].isnull()

0    False
1     True
2     True
3    False
4    False
Name: B, dtype: bool

In [29]:
# checks whether each column has a Null value
df.isnull().any()

A    True
B    True
C    True
D    True
dtype: bool

In [30]:
# checks whether each row has a Null value
df.isnull().any(axis=1)

0    False
1     True
2     True
3     True
4    False
dtype: bool

In [31]:
# if there are missing values in the entire DataFrame
df.isnull().any().any()

np.True_

In [32]:
# number of missing values in each column
df.isnull().sum()

A    1
B    2
C    1
D    1
dtype: int64

In [33]:
# number of missing values in each row
df.isnull().sum(axis=1)

0    0
1    1
2    2
3    2
4    0
dtype: int64

In [34]:
# total number of missing values
df.isnull().sum().sum()

np.int64(5)

In [35]:
(df.isnull().sum() / len(df)) * 100

A    20.0
B    40.0
C    20.0
D    20.0
dtype: float64

In [36]:
df.loc[df.isnull().any(axis=1)]

Unnamed: 0,A,B,C,D
1,2.0,,2.0,b
2,,,3.0,c
3,4.0,3.0,,


In [37]:
df.isnull().any(axis=1)

0    False
1     True
2     True
3     True
4    False
dtype: bool

In [38]:
df[df.notnull().all(axis=1)]

Unnamed: 0,A,B,C,D
0,1.0,5.0,1.0,a
4,5.0,2.0,5.0,e


In [39]:
df.notnull().all(axis=1)

0     True
1    False
2    False
3    False
4     True
dtype: bool

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5 entries, 0 to 4
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   A       4 non-null      float64
 1   B       3 non-null      float64
 2   C       4 non-null      float64
 3   D       4 non-null      object 
dtypes: float64(3), object(1)
memory usage: 292.0+ bytes


In [41]:
np.round(df.describe(), decimals=2)

Unnamed: 0,A,B,C
count,4.0,3.0,4.0
mean,3.0,3.33,2.75
std,1.83,1.53,1.71
min,1.0,2.0,1.0
25%,1.75,2.5,1.75
50%,3.0,3.0,2.5
75%,4.25,4.0,3.5
max,5.0,5.0,5.0


In [42]:
df = pd.DataFrame(
    {
        "A": np.random.choice(pool, size=5),
        "B": np.random.choice(pool, size=5, replace=True),
        "C": np.random.choice(pool, size=5, replace=False),
        "D": np.random.choice(pool, size=5),
    }
)

df

Unnamed: 0,A,B,C,D
0,J,,d,-45.6
1,56,56,25.5,12.0
2,25.5,R,10,12.0
3,,a,56,25.5
4,d,,R,10.0


In [43]:
df.isnull()

Unnamed: 0,A,B,C,D
0,False,True,False,False
1,False,False,False,False
2,False,False,False,False
3,True,False,False,False
4,False,True,False,False


In [44]:
df.isnull().any()

A     True
B     True
C    False
D    False
dtype: bool

In [45]:
df.isnull().any().any()

np.True_

In [46]:
df.isnull().any(axis=1)

0     True
1    False
2    False
3     True
4     True
dtype: bool

In [47]:
df.isnull().sum()

A    1
B    2
C    0
D    0
dtype: int64

In [48]:
df.isnull().sum(axis=1)

0    1
1    0
2    0
3    1
4    1
dtype: int64

In [49]:
df.isnull().sum().sum()

np.int64(3)

In [50]:
# columns with no null values
df.loc[:, df.notnull().all()]

Unnamed: 0,C,D
0,d,-45.6
1,25.5,12.0
2,10,12.0
3,56,25.5
4,R,10.0


In [51]:
# rows with no missing value
df[df.notnull().all(axis=1)]

Unnamed: 0,A,B,C,D
1,56.0,56,25.5,12
2,25.5,R,10.0,12


In [121]:
pd.notna([np.nan, 12, 14, None])

array([False,  True,  True, False])

### NA Semantics and Behavior

In [52]:
s = pd.Series([1, 2, 3, pd.NA, 4, 5])
s

0       1
1       2
2       3
3    <NA>
4       4
5       5
dtype: object

In [None]:
# missing values are ignored
pd.Series([np.nan, 12, 14, None]).value_counts()

12.0    1
14.0    1
Name: count, dtype: int64

In [53]:
# arithmetic operations
s + 10

0      11
1      12
2      13
3    <NA>
4      14
5      15
dtype: object

In [54]:
s * 2

0       2
1       4
2       6
3    <NA>
4       8
5      10
dtype: object

In [55]:
s // 2

0       0
1       1
2       1
3    <NA>
4       2
5       2
dtype: object

In [56]:
# comparison and logical operations
s < 3

0     True
1     True
2    False
3    False
4    False
5    False
dtype: bool

In [57]:
s > 3

0    False
1    False
2    False
3    False
4     True
5     True
dtype: bool

In [58]:
pd.isna(s)

0    False
1    False
2    False
3     True
4    False
5    False
dtype: bool

In [59]:
a = pd.Series([True, False, np.nan])
b = pd.Series([False, True, np.nan])

In [60]:
a & b

0    False
1    False
2    False
dtype: bool

In [61]:
a | b

0     True
1     True
2    False
dtype: bool

In [62]:
df = pd.DataFrame({"A": [1, 2, np.nan], "B": [4, np.nan, 6]})

df

Unnamed: 0,A,B
0,1.0,4.0
1,2.0,
2,,6.0


In [63]:
df > 2

Unnamed: 0,A,B
0,False,True
1,False,False
2,False,True


In [64]:
df[(df > 2).all(axis=1)]

Unnamed: 0,A,B


### Inserting Missing Values  

In [65]:
df = pd.DataFrame({"A": [1, 2, 3], "B": [4, 5, 6]}, index=list("abc"))

df

Unnamed: 0,A,B
a,1,4
b,2,5
c,3,6


In [66]:
df.at["b", "A"] = np.nan

In [67]:
df

Unnamed: 0,A,B
a,1.0,4
b,,5
c,3.0,6


In [68]:
df.at["c", "B"] = None
df

Unnamed: 0,A,B
a,1.0,4.0
b,,5.0
c,3.0,


In [69]:
df["C"] = [np.nan, 7, 9]
df

Unnamed: 0,A,B,C
a,1.0,4.0,
b,,5.0,7.0
c,3.0,,9.0


In [70]:
df["D"] = np.nan
df

Unnamed: 0,A,B,C,D
a,1.0,4.0,,
b,,5.0,7.0,
c,3.0,,9.0,


In [71]:
df = pd.DataFrame(
    {
        "A": [1, 2, 3, None],
        "B": [np.nan, np.nan, 20, 36],
        "C": np.nan,
        "D": [10, 20, 30, None],
    }
)

df

Unnamed: 0,A,B,C,D
0,1.0,,,10.0
1,2.0,,,20.0
2,3.0,20.0,,30.0
3,,36.0,,


In [72]:
df.isnull().any()

A    True
B    True
C    True
D    True
dtype: bool

In [73]:
s = pd.Series([1, 2, 3])
s

0    1
1    2
2    3
dtype: int64

In [74]:
s.dtype

dtype('int64')

In [75]:
s[2] = np.nan

In [76]:
s

0    1.0
1    2.0
2    NaN
dtype: float64

In [77]:
s.dtype

dtype('float64')

In [78]:
s.mean()

np.float64(1.5)

In [79]:
s.sum()

np.float64(3.0)

In [80]:
len(s)

3

In [81]:
df

Unnamed: 0,A,B,C,D
0,1.0,,,10.0
1,2.0,,,20.0
2,3.0,20.0,,30.0
3,,36.0,,


In [82]:
# NaN rows are excluded
df[df["A"] > 2]

Unnamed: 0,A,B,C,D
2,3.0,20.0,,30.0


In [83]:
df[df["D"] > 15]

Unnamed: 0,A,B,C,D
1,2.0,,,20.0
2,3.0,20.0,,30.0


### Handling Missing Values

In [84]:
s = pd.Series([1, 2, np.nan, 3, 4, np.nan])
s

0    1.0
1    2.0
2    NaN
3    3.0
4    4.0
5    NaN
dtype: float64

In [85]:
s.dropna()

0    1.0
1    2.0
3    3.0
4    4.0
dtype: float64

In [86]:
df = pd.DataFrame(
    {
        "A": np.random.choice(pool, size=5),
        "B": np.random.choice(pool, size=5, replace=True),
        "C": np.random.choice(pool, size=5, replace=False),
        "D": np.random.choice(pool, size=5),
    }
)

df

Unnamed: 0,A,B,C,D
0,-45.6,4,-45.6,4
1,d,a,a,a
2,-45.6,56,56,25.5
3,10,56,12,
4,56,R,J,a


In [87]:
df.dropna()  # removes records with missing values

Unnamed: 0,A,B,C,D
0,-45.6,4,-45.6,4
1,d,a,a,a
2,-45.6,56,56,25.5
4,56,R,J,a


In [88]:
df.dropna(axis="columns")

Unnamed: 0,A,B,C
0,-45.6,4,-45.6
1,d,a,a
2,-45.6,56,56
3,10,56,12
4,56,R,J


In [89]:
df["X"] = np.nan
df

Unnamed: 0,A,B,C,D,X
0,-45.6,4,-45.6,4,
1,d,a,a,a,
2,-45.6,56,56,25.5,
3,10,56,12,,
4,56,R,J,a,


In [90]:
df.dropna(axis=1, how="all")

Unnamed: 0,A,B,C,D
0,-45.6,4,-45.6,4
1,d,a,a,a
2,-45.6,56,56,25.5
3,10,56,12,
4,56,R,J,a


In [91]:
df.dropna(thresh=4)  # at least 4 non-NA values

Unnamed: 0,A,B,C,D,X
0,-45.6,4,-45.6,4,
1,d,a,a,a,
2,-45.6,56,56,25.5,
4,56,R,J,a,


In [92]:
df = pd.DataFrame(
    {"A": [1, 2, np.nan, 4], "B": [5, np.nan, np.nan, 3], "C": [1, 2, 3, np.nan]}
)

df

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,,2.0
2,,,3.0
3,4.0,3.0,


In [93]:
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,0.0,2.0
2,0.0,0.0,3.0
3,4.0,3.0,0.0


In [94]:
df.fillna(100)

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,100.0,2.0
2,100.0,100.0,3.0
3,4.0,3.0,100.0


In [None]:
df.fillna({"A": df["A"].mean(), "B": df["B"].mode(), "C": df["C"].median()})

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,5.0,2.0
2,2.333333,,3.0
3,4.0,3.0,2.0


In [96]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,,2.0
2,,,3.0
3,4.0,3.0,


In [97]:
# forward fill: putting the last valid observed value
df.ffill()

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,5.0,2.0
2,2.0,5.0,3.0
3,4.0,3.0,3.0


In [98]:
# like ffill, but starting from bottom
df.bfill()

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,3.0,2.0
2,4.0,3.0,3.0
3,4.0,3.0,


In [99]:
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,4.0,2.0
2,2.333333,4.0,3.0
3,4.0,3.0,2.0


In [100]:
df.fillna(df.median())

Unnamed: 0,A,B,C
0,1.0,5.0,1.0
1,2.0,4.0,2.0
2,2.0,4.0,3.0
3,4.0,3.0,2.0


In [101]:
df.median()

A    2.0
B    4.0
C    2.0
dtype: float64

In [None]:
df = pd.DataFrame({"A": [5, 8, None, 4, np.nan], "B": [None, np.nan, 7, 1, np.nan]})

df

Unnamed: 0,A,B
0,5.0,
1,8.0,
2,,7.0
3,4.0,1.0
4,,


In [132]:
col_A_nan_index = df[df["A"].isnull()].index

In [140]:
col_B_nan_index = df[df["B"].isnull()].index

In [139]:
df["A"].fillna(
    pd.Series(np.random.randint(1, 10, size=2), index=col_A_nan_index)
)

0    5.0
1    8.0
2    4.0
3    4.0
4    4.0
Name: A, dtype: float64

In [141]:
df["B"].fillna(
    pd.Series(np.random.randint(1, 10, size=3), index=col_B_nan_index)
)

0    9.0
1    4.0
2    7.0
3    1.0
4    1.0
Name: B, dtype: float64

In [107]:
# interpolation
s = pd.Series([1, 2, np.nan, 3, 4, np.nan, 6])
s

0    1.0
1    2.0
2    NaN
3    3.0
4    4.0
5    NaN
6    6.0
dtype: float64

In [108]:
s.interpolate()

0    1.0
1    2.0
2    2.5
3    3.0
4    4.0
5    5.0
6    6.0
dtype: float64

In [109]:
s.interpolate(method="linear")

0    1.0
1    2.0
2    2.5
3    3.0
4    4.0
5    5.0
6    6.0
dtype: float64

### Advanced Techniques

In [None]:
df = pd.DataFrame({"A": [1, 2, -999, 4, 5], "B": ["a", "N/A", "c", "Missing", "e"]})

df

Unnamed: 0,A,B
0,1,a
1,2,
2,-999,c
3,4,Missing
4,5,e


In [117]:
df.replace([-999, "Missing", "N/A"], np.nan, inplace=False)

Unnamed: 0,A,B
0,1.0,a
1,2.0,
2,,c
3,4.0,
4,5.0,e


In [118]:
df

Unnamed: 0,A,B
0,1,a
1,2,
2,-999,c
3,4,Missing
4,5,e


In [None]:
df.replace({"A": {-999: np.nan}, "B": {"N/A": pd.NA, "Missing": "Unknown"}})

Unnamed: 0,A,B
0,1.0,a
1,2.0,
2,,c
3,4.0,Unknown
4,5.0,e
