<a href="https://colab.research.google.com/github/ArifAygun/Magnimind-ML/blob/main/AA_2_Lab1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Dealing with missing data

## A.  Identifying missing values in tabular data

In [2]:
import pandas as pd
from io import StringIO
import sys

# Tiny in-memory CSV used throughout the notebook: four numeric columns,
# with one empty field in row 2 (column C) and one in row 3 (column D).
csv_rows = [
    "A,B,C,D",
    "1.0,2.0,3.0,4.0",
    "5.0,6.0,,8.0",
    "10.0,11.0,12.0,",
]
csv_data = "\n".join(csv_rows)

# Python 2 only: convert the text to unicode before wrapping it in StringIO.
# This branch is never taken on Python 3.
if sys.version_info[0] < 3:
    csv_data = unicode(csv_data)

### Step 1: Read the csv file as a pandas dataframe

In [4]:
# Wrap the CSV text in a file-like buffer so pandas can parse it like a file.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)

### Step 2: Check the number of missing values for the columns

In [6]:
# Boolean mask with the same shape as df: True marks a missing cell.
df.isna()

Unnamed: 0,A,B,C,D
0,False,False,False,False
1,False,False,True,False
2,False,False,False,True


In [7]:
# Summing the boolean mask column by column counts the missing cells per column.
df.isna().sum(axis=0)

A    0
B    0
C    1
D    1
dtype: int64

### Step 3: Access the underlying NumPy array via the `values` attribute

In [9]:
# Expose the frame's data as a plain NumPy array; missing cells show up as nan.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

### Step 4: Remove rows from df that contain missing values

In [10]:
# Keep only the rows that have no missing value in any column.
# The result is a new frame; df itself is left untouched.
df.dropna(axis="index", how="any")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


### Step 5: Remove columns from df that contain missing values

In [11]:
# Keep only the columns that have no missing value in any row.
# The result is a new frame; df itself is left untouched.
df.dropna(axis="columns", how="any")

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


### Step 6: Only drop rows where all columns are NaN

In [14]:
# Drop a row only when every one of its values is NaN.
# No row in this frame is entirely empty, so all three rows are returned.
df.dropna(axis="index", how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


### Step 7: Drop rows that have fewer than 4 real (non-NaN) values

In [16]:
# thresh is the minimum number of non-NaN values a row needs in order to be kept:
# with thresh=4, only rows with all four columns present survive.
df.dropna(axis="index", thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


### Step 8: Only drop rows where NaN appears in specific columns (here: 'C')

In [18]:
# Only column "C" is inspected for NaN; missing values in other columns
# (such as D in the last row) do not cause a row to be dropped.
df.dropna(axis="index", subset=["C"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


## B. Imputing missing values

In [19]:
# again: our original array
# (none of the dropna calls above were assigned back to df, so the NaNs are still there)
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

### Step 1: Impute missing values via the column mean
`from sklearn.impute import SimpleImputer`

`import numpy as np`

In [23]:
from sklearn.impute import SimpleImputer

import numpy as np

<br>
<br>

In [24]:
# Nothing in this cell calls `mean`; the name is kept importable in case a
# later cell relies on it, but it now comes from NumPy's public namespace.
# The original `numpy.core.fromnumeric` path is private and deprecated in
# NumPy 2.x. `numpy.mean` is the same function.
from numpy import mean

# Imputer that replaces each NaN with the mean of the column it sits in.
imp_miss = SimpleImputer(missing_values=np.nan, strategy='mean')

# fit_transform learns the per-column means and fills the gaps in one call.
# imp_miss remains a fitted imputer that can be reused on other data.
imputed_data = imp_miss.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])