# Data Cleaning
Data cleaning means fixing bad data in your data set.

Bad data could be:

Empty cells
Data in wrong format
Wrong data
Duplicates

In [24]:
import pandas as pd

data = [
    (60, '2020/12/01', 110, 130, 409.1),
    (60, '2020/12/02', 117, 145, 479.0),
    (60, '2020/12/03', 103, 135, 340.0),
    (45, '2020/12/04', 109, 175, 282.4),
    (45, '2020/12/05', 117, 148, 406.0),
    (60, '2020/12/06', 102, 127, 300.0),
    (60, '2020/12/07', 110, 136, 374.0),
    (450, '2020/12/08', 104, 134, 253.3),
    (30, '2020/12/09', 109, 133, 195.1),
    (60, '2020/12/10', 98, 124, 269.0),
    (60, '2020/12/11', 103, 147, 329.3),
    (60, '2020/12/12', 100, 120, 250.7),
    (60, '2020/12/12', 100, 120, 250.7),
    (60, '2020/12/13', 106, 128, 345.3),
    (60, '2020/12/14', 104, 132, 379.3),
    (60, '2020/12/15', 98, 123, 275.0),
    (60, '2020/12/16', 98, 120, 215.2),
    (60, '2020/12/17', 100, 120, 300.0),
    (45, '2020/12/18', 90, 112, None),
    (60, '2020/12/19', 103, 123, 323.0),
    (45, '2020/12/20', 97, 125, 243.0),
    (60, '2020/12/21', 108, 131, 364.2),
    (45, None, 100, 119, 282.0),
    (60, '2020/12/23', 130, 101, 300.0),
    (45, '2020/12/24', 105, 132, 246.0),
    (60, '2020/12/25', 102, 126, 334.5),
    (60, 20201226, 100, 120, 250.0),
    (60, '2020/12/27', 92, 118, 241.0),
    (60, '2020/12/28', 103, 132, None),
    (60, '2020/12/29', 100, 132, 280.0),
    (60, '2020/12/30', 102, 129, 380.3),
    (60, '2020/12/31', 92, 115, 243.0)
]

columns = ['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories']

df = pd.DataFrame(data, columns=columns)

# Display the DataFrame
print(df)


    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
18        45

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


In [25]:
new_df = df.dropna()

print(new_df.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
19        60

# Note: By default, the dropna() method returns a new DataFrame, and will not change the original.

### If you want to change the original DataFrame, use the inplace = True argument:

In [16]:
df.dropna(inplace = True)

print(df.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
19        60

# Replace Empty Values
Another way of dealing with empty cells is to insert a new value instead.

This way you do not have to delete entire rows just because of some empty cells.

The fillna() method allows us to replace empty cells with a value:

In [26]:
df.fillna(inplace = True)

print(df.to_string())

ValueError: Must specify a fill 'value' or 'method'.

In [18]:
df["Calories"].fillna(130, inplace = True)

print(df.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
19        60

In [30]:
import pandas as pd

data = [
    (60, '2020/12/01', 110, 130, 409.1),
    (60, '2020/12/02', 117, 145, 479.0),
    (60, '2020/12/03', 103, 135, 340.0),
    (45, '2020/12/04', 109, 175, 282.4),
    (45, '2020/12/05', 117, 148, 406.0),
    (60, '2020/12/06', 102, 127, 300.0),
    (60, '2020/12/07', 110, 136, 374.0),
    (450, '2020/12/08', 104, 134, 253.3),
    (30, '2020/12/09', 109, 133, 195.1),
    (60, '2020/12/10', 98, 124, 269.0),
    (60, '2020/12/11', 103, 147, 329.3),
    (60, '2020/12/12', 100, 120, 250.7),
    (60, '2020/12/12', 100, 120, 250.7),
    (60, '2020/12/13', 106, 128, 345.3),
    (60, '2020/12/14', 104, 132, 379.3),
    (60, '2020/12/15', 98, 123, 275.0),
    (60, '2020/12/16', 98, 120, 215.2),
    (60, '2020/12/17', 100, 120, 300.0),
    (45, '2020/12/18', 90, 112, None),
    (60, '2020/12/19', 103, 123, 323.0),
    (45, '2020/12/20', 97, 125, 243.0),
    (60, '2020/12/21', 108, 131, 364.2),
    (45, None, 100, 119, 282.0),
    (60, '2020/12/23', 130, 101, 300.0),
    (45, '2020/12/24', 105, 132, 246.0),
    (60, '2020/12/25', 102, 126, 334.5),
    (60, 20201226, 100, 120, 250.0),
    (60, '2020/12/27', 92, 118, 241.0),
    (60, '2020/12/28', 103, 132, None),
    (60, '2020/12/29', 100, 132, 280.0),
    (60, '2020/12/30', 102, 129, 380.3),
    (60, '2020/12/31', 92, 115, 243.0)
]

columns = ['Duration', 'Date', 'Pulse', 'Maxpulse', 'Calories']

df1 = pd.DataFrame(data, columns=columns)
df2 = pd.DataFrame(data, columns=columns)
df3 = pd.DataFrame(data, columns=columns)

# Display the DataFrame
print(df1)


    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
18        45

In [31]:
df1["Date"].fillna(130, inplace = True)

print(df1.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
18        45

In [32]:
# x = 

df3["Calories"].fillna(df3["Calories"].mode()[0], inplace = True)

print(df3.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
3         45  2020/12/04    109       175     282.4
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
18        45

In [41]:
x = df3[(df3['Maxpulse'] >100) & (df3['Maxpulse'] <150)]
print(x.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
4         45  2020/12/05    117       148     406.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
7        450  2020/12/08    104       134     253.3
8         30  2020/12/09    109       133     195.1
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
18        45  2020/12/18     90       112     291.2
19        60

In [42]:
x = df3[df3['Duration'] == 60 ]
print(x.to_string())

    Duration        Date  Pulse  Maxpulse  Calories
0         60  2020/12/01    110       130     409.1
1         60  2020/12/02    117       145     479.0
2         60  2020/12/03    103       135     340.0
5         60  2020/12/06    102       127     300.0
6         60  2020/12/07    110       136     374.0
9         60  2020/12/10     98       124     269.0
10        60  2020/12/11    103       147     329.3
11        60  2020/12/12    100       120     250.7
12        60  2020/12/12    100       120     250.7
13        60  2020/12/13    106       128     345.3
14        60  2020/12/14    104       132     379.3
15        60  2020/12/15     98       123     275.0
16        60  2020/12/16     98       120     215.2
17        60  2020/12/17    100       120     300.0
19        60  2020/12/19    103       123     323.0
21        60  2020/12/21    108       131     364.2
23        60  2020/12/23    130       101     300.0
25        60  2020/12/25    102       126     334.5
26        60