In [104]:
import pandas as pd
from pandas.io.json import json_normalize
import numpy as np
import matplotlib.pyplot as plt
import requests
import json


In [105]:
def fetch_data(start, end):
    response = requests.get('https://tie.digitraffic.fi/api/v1/data/tms-data')
    return response.json()
def fetch_stations():
    response = requests.get('https://tie.digitraffic.fi/api/v3/metadata/tms-stations')
    stations = json_normalize(response.json()["features"])
    return stations

def fetch_sensors():
    response = requests.get('https://tie.digitraffic.fi/api/v3/metadata/tms-sensors')
    sensors = json_normalize(response.json()["roadStationSensors"])
    return sensors

In [106]:
stations = fetch_stations()

In [107]:
helsinki = stations[stations['properties.municipality']=='Helsinki']

In [108]:
a = fetch_data(1,2)

In [109]:
#a['tmsStations']


In [110]:
import plotly.express as px

In [111]:
coords = pd.DataFrame(helsinki['geometry.coordinates'].to_list(), columns=['lon','lat', 'z'])
fig = px.scatter_mapbox(coords, lat="lat", lon="lon")
fig.update_layout(mapbox_style="open-street-map")
fig.show()

In [112]:
def load_data(filepath):
    data = pd.read_csv(filepath, sep= ';', encoding='utf-16')
    data.rename(columns={' Mittauspiste Sijainti Päivä Suuntakoodi Ajoneuvoluokka': 'TMS', 'Unnamed: 1': 'Location', "Unnamed: 2": 'Date', "Unnamed: 3":'Direction', "Unnamed: 4":'Type'}, inplace=True)
    data.columns = data.columns.str.replace(' KLO_', '')
    data.Date = pd.to_datetime(data.Date)
    for column in data.columns[5:]:
        data[column] = data[column].replace(' ', 0)
        data[column] = pd.to_numeric(data[column])
    return data

In [113]:
#groups = data.groupby(['Date','Type'])

In [114]:
import requests
import re
from bs4 import BeautifulSoup

def get_file_paths(url, params={}, years = r'(.*2020[0-9]{4}.*|.*2021[0-9]{4}.*)'):
    response = requests.get(url, params=params)
    if response.ok:
        response_text = response.text
    else:
        return response.raise_for_status()
    soup = BeautifulSoup(response_text, 'html.parser')
    parent = [url + node.get('href') for node in soup.find_all('a') if node.get('href')]
    r = re.compile(years)
    filenames = []
    
    folders =  list(filter(r.search, parent))
    for folder in folders:
        response = requests.get(folder, params=params)
        if response.ok:
            response_text = response.text
        else:
            return response.raise_for_status()
        soup = BeautifulSoup(response_text, 'html.parser')
        parent = [url + node.get('href') for node in soup.find_all('a')]
        station = ".*117.*Munkkiniemi.*"
        r = re.compile(station)
        file =  list(filter(r.match, parent))
        # Each folder has two instances of the same file. We only need to have one of them to reduce number of duplicates 
        if file:
            filenames.append(str(folder + file[0].split("/")[-1]))
    return filenames
        
url = 'https://aineistot.vayla.fi/lam/reports/LAM/'
result = get_file_paths(url)
result

['https://aineistot.vayla.fi/lam/reports/LAM/20200111/117_vt1_Munkkiniemi_20191201_20191231.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200111/117_vt1_Munkkiniemi_20191201_20191231.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200211/117_vt1_Munkkiniemi_20200101_20200131.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200211/117_vt1_Munkkiniemi_20200101_20200131.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200311/117_vt1_Munkkiniemi_20200201_20200229.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200311/117_vt1_Munkkiniemi_20200201_20200229.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200411/117_vt1_Munkkiniemi_20200301_20200331.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200411/117_vt1_Munkkiniemi_20200301_20200331.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200511/117_vt1_Munkkiniemi_20200401_20200430.xls',
 'https://aineistot.vayla.fi/lam/reports/LAM/20200511/117_vt1_Munkkiniemi_20200401_20200430.xls',
 'https://aineistot.

In [115]:
#stats = groups.describe()

In [116]:
dfs = []
for url in result:
    d = load_data(url)
    dfs.append(d)
data_set = pd.concat(dfs, ignore_index=True)

In [117]:
data_set

Unnamed: 0,TMS,Location,Date,Direction,Type,00-01,01-02,02-03,03-04,04-05,...,14-15,15-16,16-17,17-18,18-19,19-20,20-21,21-22,22-23,23-00
0,117,vt1_Munkkiniemi,2019-01-12,1,11 HA-PA,216,186,158,147,117,...,976,974,978,899,829,675,525,362,189,108
1,117,vt1_Munkkiniemi,2019-01-12,1,12 KAIP,0,2,0,1,0,...,5,2,2,4,2,4,1,0,0,0
2,117,vt1_Munkkiniemi,2019-01-12,1,13 Linja-autot,1,2,1,0,2,...,15,8,12,8,11,9,8,5,6,4
3,117,vt1_Munkkiniemi,2019-01-12,1,14 KAPP,1,0,0,0,0,...,0,0,0,0,1,1,1,0,1,1
4,117,vt1_Munkkiniemi,2019-01-12,1,15 KATP,0,0,0,0,0,...,0,1,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18669,117,vt1_Munkkiniemi,2021-09-30,2,13 Linja-autot,2,1,4,1,1,...,16,16,17,14,13,6,8,6,7,0
18670,117,vt1_Munkkiniemi,2021-09-30,2,14 KAPP,0,1,0,0,1,...,1,2,6,5,4,3,1,0,1,1
18671,117,vt1_Munkkiniemi,2021-09-30,2,15 KATP,1,0,0,0,0,...,3,0,1,1,0,1,0,0,0,0
18672,117,vt1_Munkkiniemi,2021-09-30,2,16 HA + PK,0,0,0,1,0,...,1,7,4,5,11,6,1,1,0,1


In [118]:
from workalendar.europe import Finland

In [119]:
cal = Finland()
data_set['holiday'] = data_set.Date.apply(lambda x: not cal.is_working_day(x))

In [120]:
id_vars=list(data_set.columns[:5])
id_vars.append(data_set.columns[-1])
value_vars=list(data_set.columns[5:][:-1])
data_set = pd.melt(data_set, id_vars=id_vars, value_vars=value_vars)

In [121]:
data_set.drop_duplicates(['Date', 'variable', 'Direction', 'Type'], inplace = True)
data_set

Unnamed: 0,TMS,Location,Date,Direction,Type,holiday,variable,value
0,117,vt1_Munkkiniemi,2019-01-12,1,11 HA-PA,True,00-01,216
1,117,vt1_Munkkiniemi,2019-01-12,1,12 KAIP,True,00-01,0
2,117,vt1_Munkkiniemi,2019-01-12,1,13 Linja-autot,True,00-01,1
3,117,vt1_Munkkiniemi,2019-01-12,1,14 KAPP,True,00-01,1
4,117,vt1_Munkkiniemi,2019-01-12,1,15 KATP,True,00-01,0
...,...,...,...,...,...,...,...,...
447752,117,vt1_Munkkiniemi,2021-09-30,2,13 Linja-autot,False,23-00,0
447753,117,vt1_Munkkiniemi,2021-09-30,2,14 KAPP,False,23-00,1
447754,117,vt1_Munkkiniemi,2021-09-30,2,15 KATP,False,23-00,0
447755,117,vt1_Munkkiniemi,2021-09-30,2,16 HA + PK,False,23-00,1


In [144]:
car_total = data_set.groupby(['Date', 'Direction', 'variable'])
data = car_total[['value', 'holiday']].sum().reset_index()

In [145]:
conve_dict = dict(zip(data.variable.unique(), list(range(24))))
data.head(48)

Unnamed: 0,Date,Direction,variable,value,holiday
0,2019-01-12,1,00-01,218,7.0
1,2019-01-12,1,01-02,190,7.0
2,2019-01-12,1,02-03,159,7.0
3,2019-01-12,1,03-04,148,7.0
4,2019-01-12,1,04-05,119,7.0
5,2019-01-12,1,05-06,93,7.0
6,2019-01-12,1,06-07,84,7.0
7,2019-01-12,1,07-08,102,7.0
8,2019-01-12,1,08-09,190,7.0
9,2019-01-12,1,09-10,316,7.0


In [146]:
data.variable = data.variable.transform(lambda x: conve_dict[x])
data.holiday = data.holiday.astype(bool)*1

In [147]:
data['Date'] = pd.to_datetime(data['Date'])
#data['Date']= data['Date'].map(dt.datetime.toordinal)
data['timestamp'] = data['Date'] + data.variable.apply(lambda x: np.timedelta64(x, 'h'))


In [152]:
data['month'] = pd.DatetimeIndex(data['Date']).month
# take a snapshot of the dataframe for further inspection
data_copy = data.copy()

## Linear regression

In [159]:
import numpy as np
from sklearn.model_selection import train_test_split
column_names = ['month', 'variable', 'Direction', 'holiday']
X = data[column_names]
y = data['value']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1337, shuffle=True)

In [156]:
# Date is irrelevant, what we are interested in is the time of day and whether the day is a holiday
X_train

Unnamed: 0,month,variable,Direction,holiday
9988,6,4,1,0
18878,12,14,1,0
12287,8,23,2,0
1892,1,20,1,0
4426,3,10,1,0
...,...,...,...,...
19623,1,15,2,0
9448,6,16,2,1
860,12,20,2,0
24765,5,21,2,0


In [160]:
from sklearn.linear_model import LinearRegression

model = LinearRegression(fit_intercept=True)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
display(y_pred)

array([274.29037416, 595.1679225 , 512.94034417, ..., 657.70450989,
       573.21529391, 685.80420368])

In [161]:
from sklearn.metrics import mean_absolute_error

mean_absolute_error(y_pred, y_test)

329.10537991803864

In [131]:
import joblib

filename = 'model.sav'
joblib.dump(model, filename)

['model.sav']

## LSTM

In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
from keras.models import Sequential
from keras.preprocessing.sequence import TimeseriesGenerator
from matplotlib.pylab import rcParams
rcParams['figure.figsize']=8,6
from keras.layers import Dense
from keras.layers import LSTM
import math
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error

In [133]:
from keras.layers import Dropout
from keras.layers import Bidirectional
look_back = 24
model=Sequential()
model.add(LSTM(50,activation='relu',return_sequences=True,input_shape=(look_back, 2)))
model.add(LSTM(50, activation='relu', return_sequences=True))
model.add(LSTM(50, activation='sigmoid', return_sequences=False))
model.add(Dense(20))
model.add(Dropout(0.2))
model.add(Dense(1))
model.compile(optimizer='adam',loss='mean_squared_error')

In [134]:
data = data.set_index('timestamp')

In [135]:
data = data.drop(columns=['Date', 'variable'])

In [136]:
y_scaler = MinMaxScaler(feature_range=(0, 1))
x_scaler = MinMaxScaler(feature_range=(0, 1))
#dataX = x_scaler.fit_transform(data[['Direction', 'holiday']])
data.value = y_scaler.fit_transform(data[['value']])


In [137]:
train_size = int(len(data) * 0.7)
display(train_size)
test_size = len(data) - train_size
train = data[0:train_size]
test = data[train_size:len(data)]


22512

In [138]:
data.Direction = data.Direction-1

In [139]:
def create_dataset(dataset, look_back):
    dataX, dataY = [], []
    for i in range(len(dataset)-look_back-1):
        a = dataset[i:(i+look_back)].values
        dataX.append(a[:, [0,2]])
        dataY.append(dataset.iloc[i + look_back, 1])
    return np.array(dataX), np.array(dataY)
  # reshape into X=t and Y=t+1
# One day of data
look_back = 24
trainX, trainY = create_dataset(train, look_back)
testX, testY = create_dataset(test, look_back)

In [140]:
trainX = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 2))
testX = np.reshape(testX, (testX.shape[0], testX.shape[1], 2))

In [141]:
trainY.reshape(-1,1).reshape(trainY.shape).shape

(22487,)

In [142]:
Xdata_train = np.reshape(trainX, (trainX.shape[0], trainX.shape[1], 2))

#training for all data
history = model.fit(Xdata_train, trainY, batch_size=10,epochs=3,shuffle=False)

Epoch 1/3


KeyboardInterrupt: 

In [None]:
trainPredict = model.predict(trainX)
testPredict = model.predict(testX)


# invert predictions
trainPredict = y_scaler.inverse_transform(trainPredict)
trainY_val = y_scaler.inverse_transform(trainY.reshape(-1,1)).reshape(trainY.shape)
testPredict = y_scaler.inverse_transform(testPredict)
testY_val = y_scaler.inverse_transform(testY.reshape(-1,1)).reshape(testY.shape)
# calculate root mean squared error
trainScore = math.sqrt(mean_squared_error(trainY_val, trainPredict))
print('Train Score: %.2f RMSE' % (trainScore))
testScore = math.sqrt(mean_squared_error(testY_val, testPredict))
print('Test Score: %.2f RMSE' % (testScore))

#RMSE of test > RMSE of train => OVER FITTING of the data.
#RMSE of test < RMSE of train => UNDER FITTING of the data.

In [None]:
data

In [None]:

#TODO:clean this up
from pandas.tseries.offsets import DateOffset
add_dates = [data.index[-1] + DateOffset(hours=x) for x in range(0,look_back+1) ]



future_dates = pd.DataFrame(index=add_dates[1:],columns=data.columns)
display(train)
pred_list = []
batch = train[-look_back:].reshape((1, look_back, 1))
for i in range(look_back):
    pred_list.append(model.predict(batch)[0])
    batch = np.append(batch[:,1:,:],[[pred_list[i]]],axis=1)
df_predict = pd.DataFrame(scaler.inverse_transform(pred_list),
                          index=future_dates[-look_back:].index, columns=['Prediction'])
display(df_predict)

In [None]:
plt.plot(data.head(24).timestamp, data.head(24).value)

In [None]:
data.iloc[data['timestamp'].idxmin()]

In [None]:
data.timestamp = data.timestamp-data.timestamp.min()

In [None]:
snapshot = data_set[(data_set.Date=='2021-01-03')]

In [None]:
snapshot.drop_duplicates(['Date', 'variable', 'Direction', 'Type'], inplace = True)

In [None]:
snapshot[(snapshot.Type=='11 HA-PA') & (snapshot.Direction==1)].shape

In [None]:
fig, (ax1,ax2) = plt.subplots(2,1, figsize=(8,15))
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 15})
ax1.set_xlabel('hour')
ax1.set_ylabel('num cars')
ax2.set_xlabel('hour')
ax2.set_ylabel('num cars')
for car_type in snapshot.Type.unique():
    d1 = snapshot[(snapshot.Type==car_type) & (snapshot.Direction==2)]
    d2 = snapshot[(snapshot.Type==car_type) & (snapshot.Direction==1)]
    values1 = d1['value']
    line1, = ax1.plot(range(0,24), values1)
    line1.set_label(car_type)
    values2 = d2['value']
    line2, = ax2.plot(range(0,24), values2)
    line2.set_label(car_type)
ax1.legend()
ax2.legend()
ax1.set_title('To Helsinki')
ax2.set_title('From Helsinki')