In [1]:
import numpy as np
import pandas as pd
import datetime as dt
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
import holidays
import math

In [2]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.backend import square, mean
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, TensorBoard, ReduceLROnPlateau
from tensorflow.keras.layers import Input, Dense, GRU, Embedding

In [3]:
# Every input CSV lives under one data directory. Point DATA_DIR at your own
# checkout here instead of editing five separate absolute paths.
DATA_DIR = "/home/andrehoejmark/Desktop/GitHub/AVG-Speed-Prediction-of-cars-based-on-road-events/data"
WEATHER_DIR = DATA_DIR + "/weather data"

# Speeds for one road segment in 60-minute intervals (semicolon-separated file).
segment_speeds = pd.read_csv(DATA_DIR + "/segment-data/60-min-intervals/226255131.csv", sep=";")

# SMHI weather-station exports (comma-separated files).
wind_speed = pd.read_csv(WEATHER_DIR + "/smhi-weather-station-data-wind-speed.csv", sep=",")
rain = pd.read_csv(WEATHER_DIR + "/smhi-weather-station-data-rain-amount.csv", sep=",")
snow_depth = pd.read_csv(WEATHER_DIR + "/smhi-weather-station-data-snow-depth.csv", sep=",")
temperature = pd.read_csv(WEATHER_DIR + "/smhi-weather-station-data-temperature.csv", sep=",")

In [4]:
# Discard weather observations dated before WEATHER_START_DATE.
# 'Datum' is read as "YYYY-MM-DD" text, so a plain string comparison orders the
# dates chronologically.
WEATHER_START_DATE = "2019-06-04"

# .copy() makes each filtered frame independent of the frame read from disk, so
# adding columns to it later cannot trigger pandas' SettingWithCopyWarning.
wind_speed = wind_speed.loc[wind_speed['Datum'] >= WEATHER_START_DATE].copy()
rain = rain.loc[rain['Datum'] >= WEATHER_START_DATE].copy()
snow_depth = snow_depth.loc[snow_depth['Datum'] >= WEATHER_START_DATE].copy()
temperature = temperature.loc[temperature['Datum'] >= WEATHER_START_DATE].copy()

In [5]:
# Preview the first five traffic records (five is head()'s default).
segment_speeds.head()

Unnamed: 0,SegmentId,StartTime,EndTime,Speed
0,226255131,2019-06-03 00:00:00,2019-06-03 01:00:00,
1,226255131,2019-06-03 01:00:00,2019-06-03 02:00:00,
2,226255131,2019-06-03 02:00:00,2019-06-03 03:00:00,
3,226255131,2019-06-03 03:00:00,2019-06-03 04:00:00,
4,226255131,2019-06-03 04:00:00,2019-06-03 05:00:00,


In [6]:
# Preview the first five wind observations (five is head()'s default).
wind_speed.head()

Unnamed: 0,Datum,Tid (UTC),Vindriktning,Kvalitet,Vindhastighet,Kvalitet.1
297120,2019-06-04,00:00:00,192.0,G,4.7,G
297121,2019-06-04,01:00:00,204.0,G,5.0,G
297122,2019-06-04,02:00:00,211.0,G,4.8,G
297123,2019-06-04,03:00:00,214.0,G,4.9,G
297124,2019-06-04,04:00:00,231.0,G,6.7,G


In [7]:
# Preview the first five rain observations (five is head()'s default).
rain.head()

Unnamed: 0,Datum,Tid (UTC),Nederbördsmängd,Kvalitet
206305,2019-06-04,00:00:00,0.0,G
206306,2019-06-04,01:00:00,0.0,G
206307,2019-06-04,02:00:00,0.0,G
206308,2019-06-04,03:00:00,0.0,G
206309,2019-06-04,04:00:00,0.0,G


In [8]:
# Preview the first five snow-depth observations (five is head()'s default).
snow_depth.head()

Unnamed: 0,Datum,Tid (UTC),Snödjup,Kvalitet,Markens tillstånd,Kvalitet.1
4731,2019-06-04,06:00:00,0.0,G,0.0,G
4732,2019-06-05,06:00:00,0.0,G,0.0,G
4733,2019-06-06,06:00:00,0.0,G,0.0,G
4734,2019-06-07,06:00:00,0.0,G,0.0,G
4735,2019-06-08,06:00:00,0.0,G,0.0,G


In [9]:
# Preview the first five temperature observations (five is head()'s default).
temperature.head()

Unnamed: 0,Datum,Tid (UTC),Lufttemperatur,Kvalitet
284343,2019-06-04,00:00:00,13.7,G
284344,2019-06-04,01:00:00,14.1,G
284345,2019-06-04,02:00:00,14.2,G
284346,2019-06-04,03:00:00,14.3,G
284347,2019-06-04,04:00:00,13.6,G


Remove the leading records whose traffic speed is null (the first 24 records of this file).

In [10]:
# Drop the leading rows that have no speed measurement.  The cut-off is taken
# from the data (label of the first non-null 'Speed') instead of a fixed row
# count, so re-running this cell does not remove further rows.
first_valid_label = segment_speeds['Speed'].first_valid_index()
if first_valid_label is None:
    raise ValueError("segment_speeds contains no non-null 'Speed' values")

# .copy() detaches the result from the original frame so later column
# assignments operate on an independent DataFrame.
segment_speeds = segment_speeds.loc[first_valid_label:].copy()

Add hour, day-of-year, year and weekday-name columns to the dataset: explicit time features should be easier for the model to use than a raw date column, which is more complex.

In [11]:
# Parse StartTime once and derive every calendar feature from that parsed
# series.  The StartTime column itself is left as text because the holiday
# lookup further down splits it as a string.
start_times = pd.to_datetime(segment_speeds['StartTime'], format="%Y-%m-%d %H:%M:%S")

segment_speeds['hour'] = start_times.dt.hour
segment_speeds['day'] = start_times.dt.dayofyear  # day of the year, not day of the month
segment_speeds['year'] = start_times.dt.year
segment_speeds['day_name'] = start_times.dt.day_name()

In [12]:
# One-hot encode the weekday name.
# reindex() guarantees that all seven weekday columns exist even when the data
# does not contain every weekday (the merge cell selects each of them by name),
# and dtype=int yields 0/1 values regardless of the pandas version's default.
# The list is alphabetical, which is the column order get_dummies produces.
WEEKDAY_NAMES = ['Friday', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday']
dummy1 = (
    pd.get_dummies(segment_speeds['day_name'], dtype=int)
    .reindex(columns=WEEKDAY_NAMES, fill_value=0)
)

In [13]:
# Append the weekday indicator columns, then discard the text column they
# were derived from.
speeds_with_weekdays = pd.concat([segment_speeds, dummy1], axis=1)
segment_speeds = speeds_with_weekdays.drop('day_name', axis=1)

#### Adding holidays

In [14]:
# Holiday calendar for Sweden (country code 'SE'), queried by date below.
sweden_holidays = holidays.country_holidays(country='SE')

In [15]:
# Name the holiday calendar reports for each record's date; None when the
# calendar has no entry.  Only the "YYYY-MM-DD" part of StartTime is looked up.
holiday_names = [
    sweden_holidays.get(start_time.split(' ')[0])
    for start_time in segment_speeds['StartTime']
]

# 1 when the date has a calendar entry other than exactly "Söndag"
# (Swedish for Sunday), otherwise 0.
holiday = [
    1 if (name is not None and name != "Söndag") else 0
    for name in holiday_names
]

segment_speeds['holiday'] = holiday

#### Interpolation is performed for null values

In [16]:
# Fill gaps in the speed series by linear interpolation, filling forward only.
# Only 'Speed' is interpolated: the frame also holds text columns
# (StartTime/EndTime), for which DataFrame.interpolate is not meaningful, and
# identifier/calendar columns that must never be interpolated.
segment_speeds = segment_speeds.assign(
    Speed=segment_speeds['Speed'].interpolate(method='linear', limit_direction='forward')
)

##### Now we merge the traffic speeds with the weather data. First, the weather data needs the same hour/day/year key columns as the segment speeds

In [17]:
def add_time_keys(weather):
    """Return a copy of ``weather`` with ``hour``, ``day`` and ``year`` columns added.

    Parameters
    ----------
    weather : pandas.DataFrame
        Weather frame with a 'Datum' column ("YYYY-MM-DD" text) and a
        'Tid (UTC)' column ("HH:MM:SS" text).

    Returns
    -------
    pandas.DataFrame
        New frame; ``hour`` comes from 'Tid (UTC)', while ``day`` (day of the
        year) and ``year`` come from 'Datum'.  The input frame is not modified.

    Raises
    ------
    ValueError
        If a value does not match the expected date or time format.
    """
    observed_date = pd.to_datetime(weather['Datum'], format="%Y-%m-%d")
    observed_time = pd.to_datetime(weather['Tid (UTC)'], format="%H:%M:%S")
    return weather.assign(
        hour=observed_time.dt.hour,
        day=observed_date.dt.dayofyear,
        year=observed_date.dt.year,
    )


# Give every weather frame the same integer keys as the traffic data so the
# frames can be joined on them.
temperature = add_time_keys(temperature)
rain = add_time_keys(rain)
wind_speed = add_time_keys(wind_speed)
snow_depth = add_time_keys(snow_depth)

##### A left join is used so that every traffic record is kept. For example, snow_depth only has records for certain days; the NaN values this leaves after the join can then be filled with some approximation

In [18]:
# Columns carried over from the traffic data: identifier, time keys, speed,
# weekday indicators and the holiday flag.
segment_columns = ['SegmentId', 'hour', 'day', 'year', 'Speed', 'Monday', 'Saturday', 'Sunday', 'Thursday', 'Tuesday', 'Wednesday', 'Friday', 'holiday']
hourly_keys = ['day', 'year', 'hour']
daily_keys = ['day', 'year']

segment_speeds_weather = segment_speeds[segment_columns]

# Left joins keep every traffic record; hours without a weather observation
# get NaN in the joined column.
# NOTE(review): the weather time column is labelled UTC; if StartTime is local
# time the hourly join is offset by the UTC difference - confirm.
for hourly_frame, value_column in (
    (temperature, 'Lufttemperatur'),
    (rain, 'Nederbördsmängd'),
    (wind_speed, 'Vindhastighet'),
):
    segment_speeds_weather = pd.merge(
        segment_speeds_weather,
        hourly_frame[[value_column] + hourly_keys],
        on=hourly_keys,
        how='left',
    )

# Snow depth is joined per day rather than per hour, so its 'hour' column is
# left out of the selection; that way the merge never creates hour_x/hour_y.
segment_speeds_weather = pd.merge(
    segment_speeds_weather,
    snow_depth[['Snödjup'] + daily_keys],
    on=daily_keys,
    how='left',
)

#### Adding temperature based on last known temperature when missing data

In [19]:
# Forward fill: each missing temperature takes the most recent known reading.
# ffill() keeps the result the same length as the frame, and no loop variable
# is needed, so the `temperature` DataFrame is not overwritten by this cell.
last_known_temperature = segment_speeds_weather['Lufttemperatur'].ffill()

# Anything still missing sits before the first reading and has nothing to copy
# from; those rows stay NaN.
if last_known_temperature.isna().any():
    print("The first NaN values has no previous value to put there")

segment_speeds_weather['Temperature-with-last-known-value-on-nan'] = last_known_temperature

#### Adding AVG wind speed for NaN Values

In [20]:
# Mean of the observed wind speeds (mean() skips NaN).
avg_wind_speed = segment_speeds_weather['Vindhastighet'].mean()

# Replace each missing wind speed with that mean.  fillna() needs no loop
# variable, so the `wind_speed` DataFrame is not overwritten by this cell.
segment_speeds_weather['Wind-speed-avg-when-nan'] = segment_speeds_weather['Vindhastighet'].fillna(avg_wind_speed)

#### Handle NAN values

In [21]:
# Hourly weather columns: close gaps by forward linear interpolation.
for weather_column in ('Lufttemperatur', 'Nederbördsmängd', 'Vindhastighet'):
    segment_speeds_weather[weather_column] = segment_speeds_weather[weather_column].interpolate(
        method='linear',
        limit_direction='forward',
        axis=0,
    )

# Snow depth: a missing value is replaced with 0.
segment_speeds_weather['Snödjup'] = segment_speeds_weather['Snödjup'].fillna(0)

In [22]:
# Count the remaining missing values in each column.
segment_speeds_weather.isna().sum(axis=0)

SegmentId                                   0
hour                                        0
day                                         0
year                                        0
Speed                                       0
Monday                                      0
Saturday                                    0
Sunday                                      0
Thursday                                    0
Tuesday                                     0
Wednesday                                   0
Friday                                      0
holiday                                     0
Lufttemperatur                              0
Nederbördsmängd                             0
Vindhastighet                               0
Snödjup                                     0
Temperature-with-last-known-value-on-nan    0
wind-speed-avg-when-nan                     0
dtype: int64

#### Make it into CSV for us to use

In [23]:
# Write the merged frame (index included) to a comma-separated file in the
# current working directory.
segment_speeds_weather.to_csv(path_or_buf="segment_speeds_weather.csv", sep=",")

In [24]:
# Preview the first five merged records (five is head()'s default).
segment_speeds_weather.head()

Unnamed: 0,SegmentId,hour,day,year,Speed,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,Friday,holiday,Lufttemperatur,Nederbördsmängd,Vindhastighet,Snödjup,Temperature-with-last-known-value-on-nan,wind-speed-avg-when-nan
0,226255131,0,155,2019,91.383333,0,0,0,0,1,0,0,0,13.7,0.0,4.7,0.0,13.7,4.7
1,226255131,1,155,2019,92.0,0,0,0,0,1,0,0,0,14.1,0.0,5.0,0.0,14.1,5.0
2,226255131,2,155,2019,93.666667,0,0,0,0,1,0,0,0,14.2,0.0,4.8,0.0,14.2,4.8
3,226255131,3,155,2019,86.216667,0,0,0,0,1,0,0,0,14.3,0.0,4.9,0.0,14.3,4.9
4,226255131,4,155,2019,90.116667,0,0,0,0,1,0,0,0,13.6,0.0,6.7,0.0,13.6,6.7
