<a href="https://colab.research.google.com/github/aciofo/AI-Engineering/blob/main/machine-learning-fundamentals/data-preprocessing/missing_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Base URL of the raw GitHub datasets folder; dataset file names are appended to it.
BASE_URL = "https://raw.githubusercontent.com/ProfAI/machine-learning-fondamenti/refs/heads/main/datasets/"

In [None]:
# Download the Iris variant with missing entries and preview its first rows.
iris = pd.read_csv(filepath_or_buffer=BASE_URL + "iris_missing.csv")
iris.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# (rows, columns) of the loaded frame
iris.shape

(150, 5)

In [None]:
iris.count() # number of non-missing values in each column

Unnamed: 0,0
sepal_length,147
sepal_width,140
petal_length,148
petal_width,150
species,150


In [None]:
iris.isna().sum() # number of missing (NaN) values in each column

Unnamed: 0,0
sepal_length,3
sepal_width,10
petal_length,2
petal_width,0
species,0


### Remove rows/columns with missing values

In [None]:
# Work on an independent copy so the loaded `iris` frame stays untouched.
iris_drop = iris.copy(deep=True)

In [None]:
# Drop every row that contains at least one missing value (defaults spelled out).
iris_drop = iris_drop.dropna(axis=0, how='any')
iris_drop.head(n=5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
6,4.6,3.4,1.4,0.3,setosa
7,5.0,3.4,1.5,0.2,setosa


In [None]:
# Shape after removing the rows that contained missing values
iris_drop.shape

(135, 5)

In [None]:
# Minimum number of non-missing values (90% of the rows) passed as `thresh`
# to the column-wise dropna in the next cell.
thresh = len(iris) * 0.9
thresh

135.0

In [None]:
# Drop the columns that have fewer than `thresh` non-missing values.
# Start from the original `iris`: `iris_drop` has already lost every row with a
# NaN in the row-wise dropna above, so running this on it would do nothing and
# would not reproduce the preview below on a fresh Restart & Run All.
iris_drop = iris.dropna(axis=1, thresh=thresh)
iris_drop.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Drop only the rows in which every value is missing.
# Applied to the original `iris` so the cell is idempotent and its result does
# not depend on what earlier cells left in `iris_drop`.
iris_drop = iris.dropna(how='all')
iris_drop.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Drop the rows whose 'sepal_width' is missing; NaN in other columns is kept.
# Applied to the original `iris`: a frame that already went through a full
# row-wise dropna has nothing left to remove here and could never show the
# 140 remaining rows reported below.
iris_drop = iris.dropna(subset=['sepal_width'])
iris_drop.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
5,,3.9,1.7,0.4,setosa
6,4.6,3.4,1.4,0.3,setosa


In [None]:
# Shape after the last dropna applied to `iris_drop`
iris_drop.shape

(140, 5)

### Imputation of missing data

#### PANDAS

In [None]:
# Separate copy for the imputation examples; `iris` itself is not modified.
iris_imp = iris.copy(deep=True)

In [None]:
# Column used in the single-column imputation examples that follow
col = 'sepal_width'

In [None]:
# Column mean (NaN is skipped by pandas), rounded to one decimal place.
replace_with = round(iris_imp[col].mean(), ndigits=1)

In [None]:
# Replace the missing entries of the selected column with `replace_with`;
# the result is assigned back, so `iris_imp[col]` has no NaN afterwards.
iris_imp[col] = iris_imp[col].fillna(replace_with)

In [None]:
# Preview the selected column after the fill
iris_imp[col].head()

Unnamed: 0,sepal_width
0,3.1
1,3.0
2,3.2
3,3.1
4,3.6


In [None]:
# Missing values per column after filling only `col`
iris_imp.isna().sum()

Unnamed: 0,0
sepal_length,3
sepal_width,0
petal_length,2
petal_width,0
species,0


In [None]:
# Take the mode from the untouched `iris` column: `iris_imp[col]` was already
# filled with the rounded mean above, so its mode would be computed on imputed
# values rather than on observed ones.
# `mode()` returns a Series because ties are possible; take its first entry.
replace_with = round(iris[col].mode().iloc[0], 1)

In [None]:
# Fill from the original column: `iris_imp[col]` has no NaN left after the mean
# imputation above, so calling fillna on it would silently change nothing.
iris_imp[col] = iris[col].fillna(replace_with)
iris_imp.head()

In [None]:
# Impute every numeric column at once with its own mean.
# Both the means and the fill start from the original `iris`, so the earlier
# single-column imputations stored in `iris_imp` do not leak into the result
# and the cell gives the same output however many times it is run.
replace_with = iris.mean(numeric_only=True)  # Series: column name -> column mean
iris_imp = iris.fillna(replace_with)
iris_imp.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.056429,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.056429,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [None]:
# Missing values per column after filling all numeric columns
iris_imp.isna().sum()

Unnamed: 0,0
sepal_length,0
sepal_width,0
petal_length,0
petal_width,0
species,0


#### SCIKIT-LEARN

In [None]:
from sklearn.impute import SimpleImputer

In [None]:
# Feature matrix as a NumPy array: every column except the "species" label.
X = iris.drop(columns="species").to_numpy()
X.shape

(150, 4)

In [None]:
# Learn each column's mean from X, then replace the NaN entries with it.
imp = SimpleImputer(strategy='mean')
imp.fit(X)
X_imp = imp.transform(X)
X_imp[:5]

array([[5.1       , 3.05642857, 1.4       , 0.2       ],
       [4.9       , 3.        , 1.4       , 0.2       ],
       [4.7       , 3.2       , 1.3       , 0.2       ],
       [4.6       , 3.05642857, 1.5       , 0.2       ],
       [5.        , 3.6       , 1.4       , 0.2       ]])

In [None]:
# NaN count per column in the array before imputation
np.isnan(X).sum(axis=0)

array([ 3, 10,  2,  0])

In [None]:
# NaN count per column in the imputed array
np.isnan(X_imp).sum(axis=0)

array([0, 0, 0, 0])