## How to handle null values

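1. Deleting rows
2. Replacing with the mean/median/mode
3. Predicting the missing values
4. Using algorithms that support missing values

This notebook demonstrates option 3: the missing values are estimated from the most similar rows with scikit-learn's `KNNImputer`.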

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

In [2]:
# Read the turbine CSV directly from its GitHub location.
TURBINE_CSV_URL = r'https://raw.githubusercontent.com/armangh67/LSTM/main/Turbine_Data.csv'
series = pd.read_csv(TURBINE_CSV_URL)

In [3]:
# Keep the timestamp column (stored as 'Unnamed: 0' in the CSV) together with
# the measurement columns used in this notebook. 'AmbientTemperatue' is
# spelled exactly as it appears in the source file.
df_selected = series[['Unnamed: 0','ActivePower','AmbientTemperatue','WindDirection','WindSpeed','BearingShaftTemperature','RotorRPM']]
df = df_selected.rename(columns={'Unnamed: 0': 'Date'})  # rename the date column
df['Date'] = pd.to_datetime(df['Date'])
dataset = df.set_index('Date')
# Keep rows from 2020-01-01 onwards. The explicit .copy() makes the subset an
# independent frame, so assigning columns to it later does not raise
# SettingWithCopyWarning or depend on view-versus-copy behaviour.
dataset = dataset.loc['2020-01-01':].copy()
dataset.head(10)

Unnamed: 0_level_0,ActivePower,AmbientTemperatue,WindDirection,WindSpeed,BearingShaftTemperature,RotorRPM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 00:00:00+00:00,360.361251,22.485462,191.0,5.547,41.554237,9.516714
2020-01-01 00:10:00+00:00,550.814483,22.636076,198.5,6.375833,41.51749,10.967926
2020-01-01 00:20:00+00:00,658.713983,22.612143,,7.099773,41.731771,11.557699
2020-01-01 00:30:00+00:00,958.625087,22.565534,195.2,7.366228,42.267183,13.010639
2020-01-01 00:40:00+00:00,743.454246,22.489755,,6.956225,42.565918,11.956652
2020-01-01 00:50:00+00:00,556.442579,22.424343,188.0,6.476373,42.219699,10.957252
2020-01-01 01:00:00+00:00,496.701493,22.347917,194.0,6.158284,41.79387,10.504177
2020-01-01 01:10:00+00:00,525.987062,22.517698,194.0,6.265139,41.650928,10.683153
2020-01-01 01:20:00+00:00,371.789686,22.420327,194.0,5.659305,41.423102,9.569801
2020-01-01 01:30:00+00:00,381.242069,22.464487,195.8,5.895858,41.220459,9.705768


In [4]:
# Number of missing values in each column before imputation.
dataset.isna().sum()

ActivePower                178
AmbientTemperatue          832
WindDirection              730
WindSpeed                  178
BearingShaftTemperature    835
RotorRPM                   842
dtype: int64

In [5]:
def fill_null(data, neighbors):
    """Impute missing values of a numeric DataFrame with k-nearest neighbours.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame that may contain NaN. It is left untouched.
    neighbors : int
        Number of neighbouring rows ``KNNImputer`` uses for each missing value.

    Returns
    -------
    pandas.DataFrame
        A new float frame with the same index and columns as ``data`` in which
        missing values have been imputed. A column that has no observed value
        at all stays NaN, because there is nothing to impute it from.
    """
    # KNNImputer drops columns that are entirely NaN while fitting, which would
    # shift every later column if results were matched back by position.
    # Impute only the columns that have at least one observed value.
    has_values = data.notna().any(axis=0).to_numpy()

    # Start from an all-NaN array so untouched columns keep their NaN.
    filled = np.full(data.shape, np.nan, dtype=float)
    if has_values.any():
        # Missing-indicator columns are not requested: only the imputed
        # values are written back.
        knn = KNNImputer(n_neighbors=neighbors)
        filled[:, has_values] = knn.fit_transform(data.loc[:, has_values])

    # Build a fresh frame instead of assigning into the caller's frame, so the
    # input is not mutated and re-running the cell gives the same result.
    return pd.DataFrame(filled, index=data.index, columns=data.columns)

In [6]:
# Impute the gaps using 500 neighbours, then preview the first rows.
df1 = fill_null(data=dataset, neighbors=500)
df1.head(10)

Unnamed: 0_level_0,ActivePower,AmbientTemperatue,WindDirection,WindSpeed,BearingShaftTemperature,RotorRPM
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-01 00:00:00+00:00,360.361251,22.485462,191.0,5.547,41.554237,9.516714
2020-01-01 00:10:00+00:00,550.814483,22.636076,198.5,6.375833,41.51749,10.967926
2020-01-01 00:20:00+00:00,658.713983,22.612143,175.930932,7.099773,41.731771,11.557699
2020-01-01 00:30:00+00:00,958.625087,22.565534,195.2,7.366228,42.267183,13.010639
2020-01-01 00:40:00+00:00,743.454246,22.489755,175.338572,6.956225,42.565918,11.956652
2020-01-01 00:50:00+00:00,556.442579,22.424343,188.0,6.476373,42.219699,10.957252
2020-01-01 01:00:00+00:00,496.701493,22.347917,194.0,6.158284,41.79387,10.504177
2020-01-01 01:10:00+00:00,525.987062,22.517698,194.0,6.265139,41.650928,10.683153
2020-01-01 01:20:00+00:00,371.789686,22.420327,194.0,5.659305,41.423102,9.569801
2020-01-01 01:30:00+00:00,381.242069,22.464487,195.8,5.895858,41.220459,9.705768


In [7]:
# Per-column count of missing values that remain after imputation.
df1.isna().sum(axis=0)

ActivePower                0
AmbientTemperatue          0
WindDirection              0
WindSpeed                  0
BearingShaftTemperature    0
RotorRPM                   0
dtype: int64