<a href="https://colab.research.google.com/github/bonchevap/aigraz_project/blob/master/Notebooks/Temperature_DataCleaning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import pickle
import numpy as np
from sklearn.impute import SimpleImputer 

**Read the temperature data and combine it into one dataframe.**

In [2]:
# Load the raw weather exports (semicolon-separated CSV files), one per period.
df_t2015 = pd.read_csv("2015dec.csv", sep=';')
df_t2016 = pd.read_csv("2016n.csv", sep=';')
df_t2017 = pd.read_csv("2017n.csv", sep=';')
df_t2018 = pd.read_csv("2018n.csv", sep=';')
df_t2019 = pd.read_csv("2019n.csv", sep=';')

# From the December 2015 file keep only its last TWO rows, last row first
# (presumably the 12:30AM and 12:00AM readings of 2015-12-31, per the recorded
# output of this cell).
df_t2015 = df_t2015.iloc[[-1, -2]]
df_total = [df_t2015, df_t2016, df_t2017, df_t2018, df_t2019]

# Stack the frames row-wise. ignore_index=True already gives the result a fresh
# 0..n-1 index, so no separate reset_index call is needed afterwards.
df_temperature = pd.concat(df_total, ignore_index=True, axis=0)

# Displaying the frame itself renders pandas' truncated head/tail preview.
df_temperature


Unnamed: 0,Date,Time,Temperature,DewPoint,Humidity,Wind,WindSpeed,WindGust,Pressure,Precip.,Condition
0,2015-12-31,12:30AM,14F,9F,79%,SE,2mph,0mph,29.59in,0.0in,Fair
1,2015-12-31,12:00AM,16F,10F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair
2,2016-1-1,1:00AM,14F,9F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair
3,2016-1-1,1:30AM,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair
4,2016-1-1,2:00AM,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair
...,...,...,...,...,...,...,...,...,...,...,...
68621,2019-12-31,10:30PM,32F,25F,75%,CALM,0mph,0mph,29.48in,0.0in,LightRain
68622,2019-12-31,11:00PM,32F,27F,80%,CALM,0mph,0mph,29.48in,0.0in,Cloudy
68623,2019-12-31,11:30PM,32F,27F,80%,CALM,0mph,0mph,29.48in,0.0in,Cloudy
68624,2019-12-31,12:00AM,32F,27F,80%,CALM,0mph,0mph,29.45in,0.0in,PartlyCloudy


**Add year, month and day as separate columns from the Date column.**

In [0]:
# Break each "Date" string into its three dash-separated parts and name them.
columns = ['Year', 'Month', 'Day']
date_parts = df_temperature['Date'].str.split('-', expand=True)
date_parts.columns = columns

# Replace the Date column with the three new columns, appended on the right.
# The parts are still strings at this point.
df_temperature = pd.concat([df_temperature.drop(columns='Date'), date_parts], axis=1)

In [4]:
# Preview the first five rows of the frame.
df_temperature.head(5)

Unnamed: 0,Time,Temperature,DewPoint,Humidity,Wind,WindSpeed,WindGust,Pressure,Precip.,Condition,Year,Month,Day
0,12:30AM,14F,9F,79%,SE,2mph,0mph,29.59in,0.0in,Fair,2015,12,31
1,12:00AM,16F,10F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2015,12,31
2,1:00AM,14F,9F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1
3,1:30AM,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1
4,2:00AM,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1


**Formatting the time: extract the hour and the minute from the "Time" column.**

In [5]:
# Parse the 12-hour clock strings (format "%I:%M%p", e.g. "12:30AM") once and
# reuse the result for both derived columns instead of parsing the column twice.
parsed_time = pd.to_datetime(df_temperature['Time'], format="%I:%M%p")
df_temperature['Hour'] = parsed_time.dt.hour
df_temperature['Minute'] = parsed_time.dt.minute

# The raw text column is no longer needed once Hour and Minute exist.
df_temperature = df_temperature.drop(columns="Time")

# Displaying the frame itself renders pandas' truncated head/tail preview.
df_temperature

Unnamed: 0,Temperature,DewPoint,Humidity,Wind,WindSpeed,WindGust,Pressure,Precip.,Condition,Year,Month,Day,Hour,Minute
0,14F,9F,79%,SE,2mph,0mph,29.59in,0.0in,Fair,2015,12,31,0,30
1,16F,10F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2015,12,31,0,0
2,14F,9F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,1,0
3,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,1,30
4,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
68621,32F,25F,75%,CALM,0mph,0mph,29.48in,0.0in,LightRain,2019,12,31,22,30
68622,32F,27F,80%,CALM,0mph,0mph,29.48in,0.0in,Cloudy,2019,12,31,23,0
68623,32F,27F,80%,CALM,0mph,0mph,29.48in,0.0in,Cloudy,2019,12,31,23,30
68624,32F,27F,80%,CALM,0mph,0mph,29.45in,0.0in,PartlyCloudy,2019,12,31,0,0


**The temperature is recorded every half hour. We want hourly values, so we keep only the rows where "Minute" is 0 and then drop the "Minute" column.**

In [6]:

# Keep only the on-the-hour readings, then discard the helper column.
on_the_hour = df_temperature['Minute'] == 0
df_temperature = df_temperature.loc[on_the_hour].drop(columns='Minute')

# Show the filtered frame (pandas truncates long frames to a head/tail preview).
df_temperature

Unnamed: 0,Temperature,DewPoint,Humidity,Wind,WindSpeed,WindGust,Pressure,Precip.,Condition,Year,Month,Day,Hour
1,16F,10F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2015,12,31,0
2,14F,9F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,1
4,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,2
6,12F,7F,79%,SE,2mph,0mph,29.59in,0.0in,Fair,2016,1,1,3
8,10F,3F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68616,32F,25F,75%,CALM,0mph,0mph,29.51in,0.0in,MostlyCloudy,2019,12,31,20
68618,34F,25F,70%,CALM,0mph,0mph,29.51in,0.0in,MostlyCloudy,2019,12,31,21
68620,32F,25F,75%,SSE,2mph,0mph,29.48in,0.0in,MostlyCloudy,2019,12,31,22
68622,32F,27F,80%,CALM,0mph,0mph,29.48in,0.0in,Cloudy,2019,12,31,23


**In the electricity data that will be merged with the temperature data, each day starts with hour=0 and ends with hour=23. Here, each day starts with hour=1 and ends with hour=0, so we shift the hour and temperature values by one row.**

In [7]:
# Positional slice: every row of the hourly frame except the very first one.
df_temperaturedropfirst = df_temperature.iloc[1:, :]
df_temperaturedropfirst.head(5)

Unnamed: 0,Temperature,DewPoint,Humidity,Wind,WindSpeed,WindGust,Pressure,Precip.,Condition,Year,Month,Day,Hour
2,14F,9F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,1
4,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,2
6,12F,7F,79%,SE,2mph,0mph,29.59in,0.0in,Fair,2016,1,1,3
8,10F,3F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,4
9,9F,5F,85%,SE,2mph,0mph,29.59in,0.0in,Fair,2016,1,1,5


In [8]:
# Positional slice: every row of the hourly frame except the very last one.
df_temperaturedroplast = df_temperature.iloc[:-1, :]

# Show the sliced frame (pandas truncates long frames to a head/tail preview).
df_temperaturedroplast

Unnamed: 0,Temperature,DewPoint,Humidity,Wind,WindSpeed,WindGust,Pressure,Precip.,Condition,Year,Month,Day,Hour
1,16F,10F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2015,12,31,0
2,14F,9F,79%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,1
4,14F,7F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,2
6,12F,7F,79%,SE,2mph,0mph,29.59in,0.0in,Fair,2016,1,1,3
8,10F,3F,73%,CALM,0mph,0mph,29.59in,0.0in,Fair,2016,1,1,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...
68614,32F,25F,75%,CALM,0mph,0mph,29.51in,0.0in,PartlyCloudy,2019,12,31,19
68616,32F,25F,75%,CALM,0mph,0mph,29.51in,0.0in,MostlyCloudy,2019,12,31,20
68618,34F,25F,70%,CALM,0mph,0mph,29.51in,0.0in,MostlyCloudy,2019,12,31,21
68620,32F,25F,75%,SSE,2mph,0mph,29.48in,0.0in,MostlyCloudy,2019,12,31,22


In [0]:
# Renumber the rows 0..n-1 so the series below can later be matched by index label.
df_temperaturedroplast = df_temperaturedroplast.reset_index(drop=True)

# Hour and temperature readings taken from the frame that omits the last row.
hours = df_temperaturedroplast['Hour']
temperature = df_temperaturedroplast['Temperature']

In [0]:
# Weather measurements that are removed from both shifted frames;
# the columns not listed here are kept.
unused_columns = ["DewPoint", "Humidity", "Wind", "WindSpeed", "WindGust", "Pressure", "Precip.", "Condition"]

df_temperaturedropfirst = df_temperaturedropfirst.drop(columns=unused_columns)
df_temperaturedroplast = df_temperaturedroplast.drop(columns=unused_columns)

In [0]:
# Renumber the rows 0..n-1, then overwrite Temperature and Hour with the series
# taken from the frame that omits the last row. pandas matches the values by
# index label, so each row keeps its own Year/Month/Day but receives the Hour
# and Temperature of the row that preceded it in the hourly frame.
# NOTE(review): this pairing is purely positional; if an hourly reading is
# missing in the raw data the date of the neighbouring row may be wrong for the
# shifted value -- confirm the data has no gaps around midnight.
df_temperaturedropfirst = df_temperaturedropfirst.reset_index(drop=True).assign(
    Temperature=temperature,
    Hour=hours,
)

# Show the shifted frame (pandas truncates long frames to a head/tail preview).
df_temperaturedropfirst

Unnamed: 0,Temperature,Year,Month,Day,Hour
0,16F,2016,1,1,0
1,14F,2016,1,1,1
2,14F,2016,1,1,2
3,12F,2016,1,1,3
4,10F,2016,1,1,4
...,...,...,...,...,...
34402,32F,2019,12,31,19
34403,32F,2019,12,31,20
34404,34F,2019,12,31,21
34405,32F,2019,12,31,22


In [11]:
# Take an independent copy rather than a second name for the same object:
# the unit-conversion cells below overwrite df_temperaturefinal['Temperature'],
# and with a plain alias those writes would also rewrite df_temperaturedropfirst.
# With a copy, re-running this cell restores the unconverted values.
df_temperaturefinal = df_temperaturedropfirst.copy()

# Show the final frame (pandas truncates long frames to a head/tail preview).
df_temperaturefinal

Unnamed: 0,Temperature,Year,Month,Day,Hour
2,14F,2016,1,1,1
4,14F,2016,1,1,2
6,12F,2016,1,1,3
8,10F,2016,1,1,4
9,9F,2016,1,1,5
...,...,...,...,...,...
68616,32F,2019,12,31,20
68618,34F,2019,12,31,21
68620,32F,2019,12,31,22
68622,32F,2019,12,31,23


**We want the temperature to be in Celsius.**

In [0]:

# Remove the unit letter from readings such as "14F" and make the column numeric.
fahrenheit_text = df_temperaturefinal['Temperature'].str.replace('F', '')
df_temperaturefinal['Temperature'] = fahrenheit_text.astype(float)


In [0]:
# Fahrenheit -> Celsius: subtract 32, then scale by 5/9.
# NOTE(review): the column is read and overwritten in place, so running this
# cell a second time converts the already-converted values again.
fahrenheit = df_temperaturefinal['Temperature']
df_temperaturefinal['Temperature'] = fahrenheit.sub(32).mul(5 / 9)


In [0]:
# Write the final frame to disk as CSV, leaving out the row index.
output_path = "/content/MK_Temperature2016-2019C.csv"
df_temperaturefinal.to_csv(output_path, index=False)