In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
df_weather = pd.read_csv("weatherdata.csv")

In [4]:
df_weather

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,7.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,31.0,2.0,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [76]:
df_weather.isna()

Unnamed: 0,day,temperature,windspeed,event
0,False,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,True,False,True
4,False,False,True,False
5,False,False,False,False
6,False,True,True,True
7,False,False,False,False
8,False,False,False,False


In [78]:
df_weather.isnull()

Unnamed: 0,day,temperature,windspeed,event
0,False,False,False,False
1,False,True,False,False
2,False,False,True,False
3,False,True,False,True
4,False,False,True,False
5,False,False,False,False
6,False,True,True,True
7,False,False,False,False
8,False,False,False,False


**Creating a dataframe with NaN values**

In [13]:
# Build a small frame in which every column is missing at least one value
data = dict(
    A=[1, 2, np.nan, 4],
    B=[5, np.nan, np.nan, 8],
    C=[10, 11, 12, np.nan],
)

df = pd.DataFrame(data)


In [15]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,10.0
1,2.0,,11.0
2,,,12.0
3,4.0,8.0,


In [17]:
# Using isna() to detect missing values
missing_data = df.isna()

print(missing_data)

       A      B      C
0  False  False  False
1  False   True  False
2   True   True  False
3  False  False   True


In [19]:
df.isnull()

Unnamed: 0,A,B,C
0,False,False,False
1,False,True,False
2,True,True,False
3,False,False,True


**Counting Missing Values**

In [22]:
df.isnull().sum()

A    1
B    2
C    1
dtype: int64

**Filtering Rows with Missing Values**

In [29]:
df

Unnamed: 0,A,B,C
0,1.0,5.0,10.0
1,2.0,,11.0
2,,,12.0
3,4.0,8.0,


In [31]:
df[df.isna().any(axis=1)]

Unnamed: 0,A,B,C
1,2.0,,11.0
2,,,12.0
3,4.0,8.0,


In [37]:
df.isna().any(axis = 1)

0    False
1     True
2     True
3     True
dtype: bool

**Drop Rows With Missing Values**

In [64]:
# Sample columns where None marks the missing entries (pandas shows them as NaN)
data = dict(
    A=[1, 2, 3, None, 5],
    B=[None, 2, 3, 4, 5],
    C=[1, 2, None, None, 5],
)
df = pd.DataFrame(data)

In [42]:
df

Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,2.0
2,3.0,3.0,
3,,4.0,
4,5.0,5.0,5.0


In [44]:
df.dropna()

Unnamed: 0,A,B,C
1,2.0,2.0,2.0
4,5.0,5.0,5.0


In [66]:
# use dropna() to remove rows with any missing values
df_cleaned = df.dropna()
df_cleaned

Unnamed: 0,A,B,C
1,2.0,2.0,2.0
4,5.0,5.0,5.0


**Fill Missing Values**

In [68]:
df

Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,2.0
2,3.0,3.0,
3,,4.0,
4,5.0,5.0,5.0


In [51]:
# filling NaN values with 0
df.fillna(0)

Unnamed: 0,A,B,C
0,1.0,0.0,1.0
1,2.0,2.0,2.0
2,3.0,3.0,0.0
3,0.0,4.0,0.0
4,5.0,5.0,5.0


In [62]:
df

Unnamed: 0,A,B,C
0,1.0,0.0,1.0
1,2.0,2.0,2.0
2,3.0,3.0,0.0
3,0.0,4.0,0.0
4,5.0,5.0,5.0


In [55]:
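# filling NaN values with 0 in place (this changes df itself instead of returning a copy)
# NOTE: the df outputs recorded just before and after this cell show 0.0 where the NaNs were,
# so they were presumably captured after this line had been run uncommented.
# df.fillna(0, inplace=True)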

In [57]:
df

Unnamed: 0,A,B,C
0,1.0,0.0,1.0
1,2.0,2.0,2.0
2,3.0,3.0,0.0
3,0.0,4.0,0.0
4,5.0,5.0,5.0


Note: 
> The **inplace=True** argument here means that the operation will modify the DataFrame directly, rather than returning a new DataFrame with the modifications.

**Use Aggregate Functions to Fill Missing Values**

In [70]:
# filling NaN values with the mean of each column
df.fillna(df.mean())

Unnamed: 0,A,B,C
0,1.0,3.5,1.0
1,2.0,2.0,2.0
2,3.0,3.0,2.666667
3,2.75,4.0,2.666667
4,5.0,5.0,5.0


In [121]:
# Example frame with gaps at the start, middle and in consecutive rows
data = dict(
    A=[1, 2, np.nan, 4],
    B=[np.nan, 2, 3, 4],
    C=[1, np.nan, np.nan, 4],
)

df = pd.DataFrame(data)
df


Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,
2,,3.0,
3,4.0,4.0,4.0


In [123]:
# Fill NaN values with 0
df_filled = df.fillna(0)

print(df_filled)

     A    B    C
0  1.0  0.0  1.0
1  2.0  2.0  0.0
2  0.0  3.0  0.0
3  4.0  4.0  4.0


**Forward Fill (ffill)**
> Forward fill uses the last valid value to fill NaN.<br>
> ffill: propagate last valid observation forward to next valid.<br> 
backfill / bfill: use next valid observation to fill gap.

In [127]:
df


Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,
2,,3.0,
3,4.0,4.0,4.0


In [133]:
# Forward fill NaN values
df_ffill = df.ffill()
print(df_ffill)


     A    B    C
0  1.0  NaN  1.0
1  2.0  2.0  1.0
2  2.0  3.0  1.0
3  4.0  4.0  4.0


> Notice that NaN in column B of the first row wasn't filled because there's no previous value to fill it with.

In [135]:
df

Unnamed: 0,A,B,C
0,1.0,,1.0
1,2.0,2.0,
2,,3.0,
3,4.0,4.0,4.0


In [137]:
df.bfill()

Unnamed: 0,A,B,C
0,1.0,2.0,1.0
1,2.0,2.0,4.0
2,4.0,3.0,4.0
3,4.0,4.0,4.0


**Filling NaN with a Dictionary**
> You can fill NaN values with different values for different columns using a dictionary.

In [146]:
# Fill NaN with different values for each column
df_fill_dict = df.fillna(
        {
            'A': 100, 
            'B': 200, 
            'C': 300
        }
)

print(df_fill_dict)


       A      B      C
0    1.0  200.0    1.0
1    2.0    2.0  300.0
2  100.0    3.0  300.0
3    4.0    4.0    4.0


In [89]:
df_weather

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,7.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,31.0,2.0,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [93]:
print(df_weather['day'].dtype) # it should be date time data type

object


In [99]:
# Attribute access and bracket access both reach the same 'day' column. The first two
# lookups are evaluated but not displayed, because a cell shows only its last expression.
df_weather.day[0]
df_weather['day'][0]
# The stored value is a plain Python string, not a datetime (see the output below).
type(df_weather['day'][0])

str

In [103]:
# Convert the 'day' column from strings (object dtype) to datetime values
df_weather['day'] = pd.to_datetime(df_weather['day'])
df_weather


Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,7.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,31.0,2.0,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [105]:
type(df_weather['day'][0])

pandas._libs.tslibs.timestamps.Timestamp

**Make the day as index**

In [83]:
# Passing the Series itself (rather than the column name 'day') uses its values as the
# index but also keeps 'day' as a regular column, which is why it appears twice in the
# output. The result is not assigned, so df_weather itself is left unchanged.
df_weather.set_index(df_weather['day'])

Unnamed: 0_level_0,day,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-01-01,2017-01-01,32.0,6.0,Rain
2017-01-04,2017-01-04,,7.0,Sunny
2017-01-05,2017-01-05,28.0,,Snow
2017-01-06,2017-01-06,,7.0,
2017-01-07,2017-01-07,32.0,,Rain
2017-01-08,2017-01-08,31.0,2.0,Sunny
2017-01-09,2017-01-09,,,
2017-01-10,2017-01-10,34.0,8.0,Cloudy
2017-01-11,2017-01-11,40.0,12.0,Sunny


In [8]:
# NOTE(review): the NameError recorded below has execution count 8, i.e. this cell was run
# before the first cell that defines `df` (execution count 13), presumably on a fresh kernel.
# When the notebook is run top to bottom `df` already exists here; confirm which frame
# was meant to be described.
df.describe()

NameError: name 'df' is not defined

In [107]:
# Promote 'day' from a column to the index, modifying df_weather in place.
# NOTE(review): once this has run, 'day' is no longer a column, so re-running the cell
# without reloading the data is expected to fail with a KeyError.
df_weather.set_index('day', inplace=True)

In [109]:
df_weather

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,7.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [111]:
new_df = df_weather.fillna(0)

In [113]:
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,7.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,0
2017-01-07,32.0,0.0,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,0.0,0.0,0
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [117]:
# Fill each column with its own replacement value instead of one value for the whole frame.
# The event label deliberately has no leading space, so that later comparisons such as
# new_df['event'] == 'no event' match the filled rows.
new_df = df_weather.fillna({
    'temperature': 0,
    'windspeed': 0,
    'event': 'no event'
})

In [119]:
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,0.0,7.0,Sunny
2017-01-05,28.0,0.0,Snow
2017-01-06,0.0,7.0,no event
2017-01-07,32.0,0.0,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,0.0,0.0,no event
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [150]:
new_df = df_weather.ffill()

In [152]:
new_df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,32.0,7.0,Sunny
2017-01-05,28.0,7.0,Snow
2017-01-06,28.0,7.0,Snow
2017-01-07,32.0,7.0,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,31.0,2.0,Sunny
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [158]:
df_weather

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,7.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


**axis parameter**
> The ffill (forward fill) function in pandas is used to propagate the last valid observation forward to the next valid observation. When using ffill with the axis='columns' argument, the filling is done across columns for each row, which means it fills missing values (NaNs) from left to right.

In [169]:
# Four-column frame whose gaps differ from row to row, for filling across columns
data = dict(
    A=[1, np.nan, 3],
    B=[np.nan, 2, np.nan],
    C=[np.nan, np.nan, 5],
    D=[4, np.nan, np.nan],
)

df = pd.DataFrame(data)
df


Unnamed: 0,A,B,C,D
0,1.0,,,4.0
1,,2.0,,
2,3.0,,5.0,


In [173]:
# Applying forward fill along columns
df_filled = df.ffill(axis='columns')

df_filled

Unnamed: 0,A,B,C,D
0,1.0,1.0,1.0,4.0
1,,2.0,2.0,2.0
2,3.0,3.0,5.0,5.0


In [176]:
df

Unnamed: 0,A,B,C,D
0,1.0,,,4.0
1,,2.0,,
2,3.0,,5.0,


In [178]:
df.ffill(axis = 'index') #default - vertical

Unnamed: 0,A,B,C,D
0,1.0,,,4.0
1,1.0,2.0,,4.0
2,3.0,2.0,5.0,4.0


**limit parameter**
> The limit parameter in the ffill method of pandas controls how many consecutive NaN values are forward filled. It allows you to specify the maximum number of consecutive NaN values to be filled. If you have more NaNs than the specified limit, only the number of NaNs up to the limit will be filled, and the remaining NaNs will remain as they are.

In [181]:
# Frame with runs of one, two and three consecutive NaNs to show the effect of `limit`
data = dict(
    A=[1, np.nan, np.nan, 4, 5],
    B=[np.nan, np.nan, 3, np.nan, 6],
    C=[7, np.nan, np.nan, np.nan, 10],
)

df = pd.DataFrame(data)
df

Unnamed: 0,A,B,C
0,1.0,,7.0
1,,,
2,,3.0,
3,4.0,,
4,5.0,6.0,10.0


In [183]:
# Applying forward fill with a limit of 1: after each valid value, at most one
# consecutive NaN below it is filled; any further NaNs in the same run stay NaN.
# NaNs with no valid value above them (the top of column 'B') are never filled.
df_filled_limit = df.ffill(limit=1)

df_filled_limit

Unnamed: 0,A,B,C
0,1.0,,7.0
1,1.0,,7.0
2,,3.0,
3,4.0,3.0,
4,5.0,6.0,10.0


**In this example:**
> For column 'A', only the NaN value in row 1 is filled (with the value 1 from row 0), but the NaN in row 2 is not filled because the limit is 1.<br>
> For column 'B', only the NaN value in row 3 is filled (with the value 3 from row 2); the NaNs in rows 0 and 1 are not filled because there is no earlier valid value to carry forward.<br>
> For column 'C', only the NaN in row 1 is filled (with the value 7 from row 0); the NaNs in rows 2 and 3 are not filled because the limit of 1 has already been used.

In [187]:
df_weather

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,,7.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,,7.0,
2017-01-07,32.0,,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


**Interpolation**

In [205]:
# Interpolate the missing temperatures. With no `method` argument pandas interpolates
# linearly and treats the rows as equally spaced, so the three-day gap between 2017-01-01
# and 2017-01-04 counts the same as a one-day step: 2017-01-04 becomes 30.0, the midpoint
# of 32.0 and 28.0. This overwrites the 'temperature' column of df_weather.
df_weather['temperature'] = df_weather['temperature'].interpolate()

print(df_weather)

            temperature  windspeed   event
day                                       
2017-01-01         32.0        6.0    Rain
2017-01-04         30.0        7.0   Sunny
2017-01-05         28.0        NaN    Snow
2017-01-06         30.0        7.0     NaN
2017-01-07         32.0        NaN    Rain
2017-01-08         31.0        2.0   Sunny
2017-01-09         32.5        NaN     NaN
2017-01-10         34.0        8.0  Cloudy
2017-01-11         40.0       12.0   Sunny


In [207]:
df_weather

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,30.0,7.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,32.5,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


In [221]:
df = pd.read_csv("weatherdata.csv", parse_dates=['day'])

In [223]:
df

Unnamed: 0,day,temperature,windspeed,event
0,2017-01-01,32.0,6.0,Rain
1,2017-01-04,,7.0,Sunny
2,2017-01-05,28.0,,Snow
3,2017-01-06,,7.0,
4,2017-01-07,32.0,,Rain
5,2017-01-08,31.0,2.0,Sunny
6,2017-01-09,,,
7,2017-01-10,34.0,8.0,Cloudy
8,2017-01-11,40.0,12.0,Sunny


In [229]:
df.set_index('day', inplace=True)

In [231]:
# method='time' weights the neighbours by the actual spacing of the datetime index, so
# 2017-01-04 (three days after 32.0, one day before 28.0) is filled with 29.0 rather than
# the 30.0 produced by the default linear method.
df['temperature'] = df['temperature'].interpolate(method = 'time')

In [233]:
df

Unnamed: 0_level_0,temperature,windspeed,event
day,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2017-01-01,32.0,6.0,Rain
2017-01-04,29.0,7.0,Sunny
2017-01-05,28.0,,Snow
2017-01-06,30.0,7.0,
2017-01-07,32.0,,Rain
2017-01-08,31.0,2.0,Sunny
2017-01-09,32.5,,
2017-01-10,34.0,8.0,Cloudy
2017-01-11,40.0,12.0,Sunny


**The interpolate(method='time')** function calculates the missing values by considering the time difference between the surrounding known values.

**Let's focus on the missing value on 2017-01-04:**
> The known values before and after 2017-01-04 are 32.0 (2017-01-01) and 28.0 (2017-01-05).
The time difference between 2017-01-01 and 2017-01-05 is 4 days.
The time difference between 2017-01-01 and 2017-01-04 is 3 days.
The time-weighted interpolation calculates the missing value on 2017-01-04 as:

* Interpolated value = (32.0 * (4 - 3) + 28.0 * 3) / 4 = 29.0

> This calculation takes into account the time difference between the known values and the missing value, giving more weight to the value that is closer in time.

> Similarly, the missing value on 2017-01-06 is calculated using the known values on 2017-01-05 (28.0) and 2017-01-07 (32.0), with a time difference of 1 day between 2017-01-06 and each of the known values, which gives 30.0.

> The missing value on 2017-01-09 is calculated using the known values on 2017-01-08 (31.0) and 2017-01-10 (34.0), with the time difference of 1 day between 2017-01-09 and each of the known values.

In [192]:
# Two columns with isolated gaps between evenly increasing values
data = dict(
    A=[1, np.nan, 3, np.nan, 5],
    B=[1, 2, np.nan, 4, 5],
)

df = pd.DataFrame(data)

df

Unnamed: 0,A,B
0,1.0,1.0
1,,2.0
2,3.0,
3,,4.0
4,5.0,5.0


In [194]:
# Interpolating missing values
df_interpolated = df.interpolate(method='linear')

df_interpolated


Unnamed: 0,A,B
0,1.0,1.0
1,2.0,2.0
2,3.0,3.0
3,4.0,4.0
4,5.0,5.0


In [198]:
# X has a single gap; Y has two consecutive gaps between known values
data = dict(
    X=[1, 2, np.nan, 4, 5],
    Y=[2, np.nan, np.nan, 8, 10],
)

df = pd.DataFrame(data)

df

Unnamed: 0,X,Y
0,1.0,2.0
1,2.0,
2,,
3,4.0,8.0
4,5.0,10.0


In [200]:

# Applying polynomial interpolation of order 2 (quadratic); `order` must be given
# when method='polynomial'.
# The known Y values (2, 8, 10 in rows 0, 3, 4) lie on a straight line over the row
# index, so the quadratic fit yields the same 4.0 and 6.0 that linear interpolation would.
# NOTE(review): pandas documents this method as a wrapper around SciPy; confirm that
# scipy is installed in the environment.
df_interpolated_poly = df.interpolate(method='polynomial', order=2)

df_interpolated_poly

Unnamed: 0,X,Y
0,1.0,2.0
1,2.0,4.0
2,3.0,6.0
3,4.0,8.0
4,5.0,10.0


> Quadratic Polynomial Formula:
$P(x) = ax^2 + bx + c$
where $a$, $b$, and $c$ are coefficients determined by fitting the polynomial to the non-missing data.