In [1]:
# Import
import pandas as pd
import numpy as np

from numpy import nan as NA

## 7.1 Handling Missing Data

In [2]:
# Series of strings whose third entry (position 2) is missing (np.nan).
string_data = pd.Series(
    ['aardvark', 'artichoke', np.nan, 'avocado'],
)
string_data

0     aardvark
1    artichoke
2          NaN
3      avocado
dtype: object

In [3]:
# Boolean mask: True wherever an entry of the Series is missing.
pd.isnull(string_data)

0    False
1    False
2     True
3    False
dtype: bool

In [4]:
# None is reported as missing by isnull(), just like np.nan.
string_data.loc[0] = None
string_data.isnull()

0     True
1    False
2     True
3    False
dtype: bool

### Filter Out Missing Data

In [5]:
# dropna() returns the Series without its missing entries (np.nan or None).
data = pd.Series(
    [1, NA, 3.5, NA, 7],
)
data.dropna()

0    1.0
2    3.5
4    7.0
dtype: float64

In [6]:
# Equivalent to data.dropna(): notnull() returns a boolean array that is True
# where an element is present and False where it is missing, and that array
# is then used as a selection mask.
data.loc[data.notnull()]

0    1.0
2    3.5
4    7.0
dtype: float64

In [7]:
# With a DataFrame you can choose whether to drop the rows or the columns
# that contain a missing value. Rows are dropped by default.
data = pd.DataFrame([
    [1., 6.5, 3.],
    [1., NA, NA],
    [NA, NA, NA],
    [NA, 6.5, 3.],
])
data

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [8]:
# Default dropna(): keep only the rows that contain no missing value.
# NOTE(review): 'clearned' looks like a typo for 'cleaned'; the name is kept
# because the following cell displays it under this name.
clearned = data.dropna()

In [9]:
# Show the frame produced by data.dropna().
clearned

Unnamed: 0,0,1,2
0,1.0,6.5,3.0


In [10]:
# With how='all', only the rows (or columns) whose values are ALL NaN are dropped.
data.dropna(axis=0, how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
3,,6.5,3.0


In [11]:
# Columns are dropped instead of rows by passing axis=1.
# Add a column labelled 4 that holds only NaN. This modifies `data` in place,
# so earlier cells re-run after this one will see the extra column.
data[4] = NA
data

Unnamed: 0,0,1,2,4
0,1.0,6.5,3.0,
1,1.0,,,
2,,,,
3,,6.5,3.0,


In [12]:
# Drop only the columns that are entirely NaN.
data.dropna(axis='columns', how='all')

Unnamed: 0,0,1,2
0,1.0,6.5,3.0
1,1.0,,
2,,,
3,,6.5,3.0


In [13]:
# 7x3 frame of standard-normal draws. A seeded generator is used so that
# Restart & Run All reproduces the same values on every run.
df = pd.DataFrame(np.random.default_rng(12345).standard_normal((7, 3)))
# Blank out the first four rows of column 1 and the first two rows of column 2.
df.iloc[:4, 1] = NA
df.iloc[:2, 2] = NA
df

Unnamed: 0,0,1,2
0,0.128821,,
1,0.19222,,
2,-0.603287,,0.06111
3,1.578957,,-0.392234
4,-0.727938,-0.933523,0.43729
5,-1.78067,0.364188,-0.824733
6,-0.278476,0.446783,0.905119


In [14]:
# Default behaviour spelled out: drop every row that has at least one NaN.
df.dropna(how='any')

Unnamed: 0,0,1,2
4,-0.727938,-0.933523,0.43729
5,-1.78067,0.364188,-0.824733
6,-0.278476,0.446783,0.905119


In [15]:
# 'thresh' is a threshold: the minimum number of non-NaN values a row (or
# column) must contain in order to be kept. With thresh=2, rows holding at
# least two non-NaN values are not dropped.
df.dropna(axis=0, thresh=2)

Unnamed: 0,0,1,2
2,-0.603287,,0.06111
3,1.578957,,-0.392234
4,-0.727938,-0.933523,0.43729
5,-1.78067,0.364188,-0.824733
6,-0.278476,0.446783,0.905119


### Filling In Missing Data

In [16]:
# fillna() replaces missing values with a constant.
df.fillna(value=0)

Unnamed: 0,0,1,2
0,0.128821,0.0,0.0
1,0.19222,0.0,0.0
2,-0.603287,0.0,0.06111
3,1.578957,0.0,-0.392234
4,-0.727938,-0.933523,0.43729
5,-1.78067,0.364188,-0.824733
6,-0.278476,0.446783,0.905119


In [17]:
# A dict can be passed instead, giving the fill value to use for each column label.
df.fillna(value={1: 0.5, 2: 0})

Unnamed: 0,0,1,2
0,0.128821,0.5,0.0
1,0.19222,0.5,0.0
2,-0.603287,0.5,0.06111
3,1.578957,0.5,-0.392234
4,-0.727938,-0.933523,0.43729
5,-1.78067,0.364188,-0.824733
6,-0.278476,0.446783,0.905119


In [18]:
# fillna() returns a new object by default; with inplace=True it modifies the
# existing object instead.
# The result is deliberately not bound to `_`: that name is IPython's
# "last output" variable, and the call is not the cell's last expression,
# so nothing would be displayed anyway.
df.fillna(0, inplace=True)
df

Unnamed: 0,0,1,2
0,0.128821,0.0,0.0
1,0.19222,0.0,0.0
2,-0.603287,0.0,0.06111
3,1.578957,0.0,-0.392234
4,-0.727938,-0.933523,0.43729
5,-1.78067,0.364188,-0.824733
6,-0.278476,0.446783,0.905119


In [19]:
# 6x3 frame of standard-normal draws. A seeded generator is used so that
# Restart & Run All reproduces the same values on every run.
df = pd.DataFrame(np.random.default_rng(6789).standard_normal((6, 3)))
# Blank out column 1 from row 2 onward and column 2 from row 4 onward.
df.iloc[2:, 1] = NA
df.iloc[4:, 2] = NA
df

Unnamed: 0,0,1,2
0,-0.808602,2.131511,-1.499446
1,2.081682,-0.938222,-0.220118
2,-1.14919,,-0.415007
3,0.246931,,1.666456
4,0.908896,,
5,-1.830974,,


In [20]:
# Forward fill: each NaN is replaced by the value that precedes it in the
# same column. DataFrame.ffill() is used because the `method` argument of
# fillna() is deprecated in recent pandas releases.
df.ffill()

Unnamed: 0,0,1,2
0,-0.808602,2.131511,-1.499446
1,2.081682,-0.938222,-0.220118
2,-1.14919,-0.938222,-0.415007
3,0.246931,-0.938222,1.666456
4,0.908896,-0.938222,1.666456
5,-1.830974,-0.938222,1.666456


In [21]:
# 'limit' caps how many consecutive NaN values are forward-filled (here 2).
# ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(limit=2)

Unnamed: 0,0,1,2
0,-0.808602,2.131511,-1.499446
1,2.081682,-0.938222,-0.220118
2,-1.14919,-0.938222,-0.415007
3,0.246931,-0.938222,1.666456
4,0.908896,,1.666456
5,-1.830974,,1.666456


In [22]:
# With axis=1 the fill value is the one that precedes the NaN along the row.
# ffill() replaces the deprecated fillna(method='ffill') spelling.
df.ffill(axis=1)

Unnamed: 0,0,1,2
0,-0.808602,2.131511,-1.499446
1,2.081682,-0.938222,-0.220118
2,-1.14919,-1.14919,-0.415007
3,0.246931,0.246931,1.666456
4,0.908896,0.908896,0.908896
5,-1.830974,-1.830974,-1.830974


In [23]:
# Backward fill uses the value that FOLLOWS the NaN instead (here along the
# row, axis=1). NaN values with nothing after them in the row stay unchanged.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill(axis=1)

Unnamed: 0,0,1,2
0,-0.808602,2.131511,-1.499446
1,2.081682,-0.938222,-0.220118
2,-1.14919,-0.415007,-0.415007
3,0.246931,1.666456,1.666456
4,0.908896,,
5,-1.830974,,


In [24]:
# Column-wise backward fill changes nothing here: the NaN values run down to
# the last row, so no later value exists to fill them with.
# bfill() replaces the deprecated fillna(method='bfill') spelling.
df.bfill()

Unnamed: 0,0,1,2
0,-0.808602,2.131511,-1.499446
1,2.081682,-0.938222,-0.220118
2,-1.14919,,-0.415007
3,0.246931,,1.666456
4,0.908896,,
5,-1.830974,,


In [25]:
# The fill value can also be computed from the data itself, for example the
# mean or the median of the Series.
data = pd.Series(
    [1., NA, 3.5, NA, 7],
)
data.fillna(value=data.mean())

0    1.000000
1    3.833333
2    3.500000
3    3.833333
4    7.000000
dtype: float64

In [26]:
# Inoltre ad esempio 