# Data preprocessing

The data used here was exported from the exploration file after some obsolete rows had been removed. In this notebook we load that export and preprocess it further.

### Load and further preprocess data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the previously exported CSV into a DataFrame, then print its
# column dtypes and memory footprint as a baseline for the steps below.
processed_df = pd.read_csv(filepath_or_buffer='../data/processed_data.csv')
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657128 entries, 0 to 4657127
Data columns (total 15 columns):
 #   Column    Dtype  
---  ------    -----  
 0   TurbID    int64  
 1   Day       int64  
 2   Tmstamp   object 
 3   Wspd      float64
 4   Wdir      float64
 5   Etmp      float64
 6   Itmp      float64
 7   Ndir      float64
 8   Pab1      float64
 9   Pab2      float64
 10  Pab3      float64
 11  Prtv      float64
 12  Patv      float64
 13  datetime  object 
 14  P_norm    float64
dtypes: float64(11), int64(2), object(2)
memory usage: 533.0+ MB


In [3]:
# Collect the names of all int64 columns and downcast each of them with
# pd.to_numeric(downcast='integer') to reduce the frame's memory usage.
int_columns = list(processed_df.select_dtypes(include='int64').columns)
processed_df[int_columns] = processed_df[int_columns].apply(
    pd.to_numeric, downcast='integer'
)
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657128 entries, 0 to 4657127
Data columns (total 15 columns):
 #   Column    Dtype  
---  ------    -----  
 0   TurbID    int16  
 1   Day       int16  
 2   Tmstamp   object 
 3   Wspd      float64
 4   Wdir      float64
 5   Etmp      float64
 6   Itmp      float64
 7   Ndir      float64
 8   Pab1      float64
 9   Pab2      float64
 10  Pab3      float64
 11  Prtv      float64
 12  Patv      float64
 13  datetime  object 
 14  P_norm    float64
dtypes: float64(11), int16(2), object(2)
memory usage: 479.7+ MB


In [4]:
# Collect the names of all float64 columns and downcast each of them with
# pd.to_numeric(downcast='float') to reduce the frame's memory usage.
float_columns = list(processed_df.select_dtypes(include='float64').columns)
processed_df[float_columns] = processed_df[float_columns].apply(
    pd.to_numeric, downcast='float'
)
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657128 entries, 0 to 4657127
Data columns (total 15 columns):
 #   Column    Dtype  
---  ------    -----  
 0   TurbID    int16  
 1   Day       int16  
 2   Tmstamp   object 
 3   Wspd      float32
 4   Wdir      float32
 5   Etmp      float32
 6   Itmp      float32
 7   Ndir      float32
 8   Pab1      float32
 9   Pab2      float32
 10  Pab3      float32
 11  Prtv      float32
 12  Patv      float32
 13  datetime  object 
 14  P_norm    float32
dtypes: float32(11), int16(2), object(2)
memory usage: 284.2+ MB


In [5]:
# Parse the string-valued `datetime` column into real timestamps and
# store them as datetime64 with one-second resolution.
processed_df['datetime'] = (
    processed_df['datetime']
    .pipe(pd.to_datetime)
    .astype('datetime64[s]')
)
processed_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4657128 entries, 0 to 4657127
Data columns (total 15 columns):
 #   Column    Dtype        
---  ------    -----        
 0   TurbID    int16        
 1   Day       int16        
 2   Tmstamp   object       
 3   Wspd      float32      
 4   Wdir      float32      
 5   Etmp      float32      
 6   Itmp      float32      
 7   Ndir      float32      
 8   Pab1      float32      
 9   Pab2      float32      
 10  Pab3      float32      
 11  Prtv      float32      
 12  Patv      float32      
 13  datetime  datetime64[s]
 14  P_norm    float32      
dtypes: datetime64[s](1), float32(11), int16(2), object(1)
memory usage: 284.2+ MB


In [6]:
# Show the preprocessed frame as the cell result; the rendered output
# below contains only the leading and trailing rows.
processed_df

Unnamed: 0,TurbID,Day,Tmstamp,Wspd,Wdir,Etmp,Itmp,Ndir,Pab1,Pab2,Pab3,Prtv,Patv,datetime,P_norm
0,1,1,00:10,6.17,-3.99,30.730000,41.799999,25.920000,1.00,1.00,1.00,-0.250000,494.660004,2020-01-01 00:10:00,0.319720
1,1,1,00:20,6.27,-2.18,30.600000,41.630001,20.910000,1.00,1.00,1.00,-0.240000,509.760010,2020-01-01 00:20:00,0.329299
2,1,1,00:30,6.42,-0.73,30.520000,41.520000,20.910000,1.00,1.00,1.00,-0.260000,542.530029,2020-01-01 00:30:00,0.350087
3,1,1,00:40,6.25,0.89,30.490000,41.380001,20.910000,1.00,1.00,1.00,-0.230000,509.359985,2020-01-01 00:40:00,0.329045
4,1,1,00:50,6.10,-1.03,30.469999,41.220001,20.910000,1.00,1.00,1.00,-0.270000,482.209991,2020-01-01 00:50:00,0.311822
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4657123,134,245,23:10,7.79,2.80,-0.070000,3.950000,216.509995,6.03,6.03,6.03,-111.690002,601.440002,2020-09-01 23:10:00,0.387458
4657124,134,245,23:20,8.06,4.39,0.230000,3.940000,216.509995,5.81,5.81,5.81,-72.669998,673.789978,2020-09-01 23:20:00,0.433356
4657125,134,245,23:30,8.08,2.28,-0.160000,4.150000,216.509995,0.68,0.68,0.68,-118.699997,999.359985,2020-09-01 23:30:00,0.639890
4657126,134,245,23:40,8.46,0.80,-0.140000,4.320000,216.509995,0.02,0.02,0.02,-58.119999,1100.890015,2020-09-01 23:40:00,0.704298
