### Replace empty values

In [1]:
import pandas as pd
# Load data.csv into a DataFrame and display it as the cell output.
data = pd.read_csv("data.csv")
data

Unnamed: 0,Duration,Pulse,Maxpulse,Calories
0,60,110,130,409.1
1,60,117,145,479.0
2,60,103,135,340.0
3,45,109,175,282.4
4,45,117,148,406.0
...,...,...,...,...
164,60,105,140,290.8
165,60,110,145,300.0
166,60,115,145,310.2
167,75,120,150,320.4


In [2]:
# Count the missing (NaN) values in each column.
data.isna().sum()

Duration    0
Pulse       0
Maxpulse    0
Calories    5
dtype: int64

In [3]:
# Mean of the Calories column; this is the value used to fill the gaps below.
data["Calories"].mean()

np.float64(375.79024390243904)

#### `fillna` replaces null values with the specified values. To get the original data back, we have to load the CSV again.

In [4]:
# Fill the missing Calories values with the column mean.
# The dict form of fillna targets only the Calories column; under pandas 3 the
# chained form data["Calories"].fillna(..., inplace=True) no longer updates the
# frame. The result is assigned back instead of using inplace=True so the
# update is explicit.
data = data.fillna({"Calories": data["Calories"].mean()})

In [5]:
# Re-count the missing values per column to confirm the fill worked.
data.isna().sum()

Duration    0
Pulse       0
Maxpulse    0
Calories    0
dtype: int64

In [6]:
# Reload the original file so the missing values are present again, then
# replace the missing Calories values with the column median.
data = pd.read_csv("data.csv")
data = data.fillna({"Calories": data["Calories"].median()})

In [7]:
# Reload the original file (the previous cell filled the gaps) and show the
# most frequent Calories value(s). mode() returns a Series, not a scalar.
data = pd.read_csv("data.csv")
data["Calories"].mode()

0    300.0
Name: Calories, dtype: float64

In [8]:
# mode() always returns a Series indexed 0, 1, 2, ... (there can be several
# modes), so take the first entry to get a single value for filling the gaps.
data = pd.read_csv("data.csv")
data = data.fillna({"Calories": data["Calories"].mode().iloc[0]})

In [9]:
# Load dirtydata.csv into a second DataFrame and display it.
df = pd.read_csv("dirtydata.csv")
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,'2020/12/01',110,130,409.1
1,60,'2020/12/02',117,145,479.0
2,60,'2020/12/03',103,135,340.0
3,45,'2020/12/04',109,175,282.4
4,45,'2020/12/05',117,148,406.0
5,60,'2020/12/06',102,127,300.0
6,60,'2020/12/07',110,136,374.0
7,450,'2020/12/08',104,134,253.3
8,30,'2020/12/09',109,133,195.1
9,60,'2020/12/10',98,124,269.0


In [10]:
# Show dtypes and non-null counts per column to spot wrong types and missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Duration  32 non-null     int64  
 1   Date      31 non-null     object 
 2   Pulse     32 non-null     int64  
 3   Maxpulse  32 non-null     int64  
 4   Calories  30 non-null     float64
dtypes: float64(1), int64(3), object(1)
memory usage: 1.4+ KB


In [11]:
# The Date column was read as plain text (dtype object); parse it into real
# datetimes. format="mixed" lets pandas infer the format of each entry separately.
df = df.assign(Date=pd.to_datetime(df["Date"], format="mixed"))

In [12]:
# Check the dtypes again to confirm that Date is now a datetime column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32 entries, 0 to 31
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype         
---  ------    --------------  -----         
 0   Duration  32 non-null     int64         
 1   Date      31 non-null     datetime64[ns]
 2   Pulse     32 non-null     int64         
 3   Maxpulse  32 non-null     int64         
 4   Calories  30 non-null     float64       
dtypes: datetime64[ns](1), float64(1), int64(3)
memory usage: 1.4 KB


In [13]:
# After the conversion, the missing date is shown as NaT (Not a Time) rather than NaN.
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [14]:
# here the dataset is small so we can see where the date is missing
df["Date"] = df["Date"].fillna("2020-12-22")
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0


In [15]:
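# To fill multiple columns in one line, pass a dict to DataFrame.fillna, e.g. df = df.fillna({"Date": pd.Timestamp("2020-12-22"), "Calories": df["Calories"].mean()}). For a single column: df["Date"] = df["Date"].fillna("2020-12-22")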

In [16]:
# As in a big dataset it will not be possible to see all the missing dates so we should drop
df.dropna(subset=["Date"],inplace=True)
df

Unnamed: 0,Duration,Date,Pulse,Maxpulse,Calories
0,60,2020-12-01,110,130,409.1
1,60,2020-12-02,117,145,479.0
2,60,2020-12-03,103,135,340.0
3,45,2020-12-04,109,175,282.4
4,45,2020-12-05,117,148,406.0
5,60,2020-12-06,102,127,300.0
6,60,2020-12-07,110,136,374.0
7,450,2020-12-08,104,134,253.3
8,30,2020-12-09,109,133,195.1
9,60,2020-12-10,98,124,269.0
