In [2]:
import tensorflow
import keras.losses
import pandas
import requests


def fetch(**kwargs):
    auth = kwargs["auth"]
    link = kwargs["url"]

    return requests.get(link, headers={
        'Authorization': f'Token {auth}'
    })


def save(**kwargs):
    with open(kwargs["file"], "wb") as file:
        res = fetch(**kwargs)
        res.raise_for_status()
        file.write(res.content)


2022-05-23 12:31:20.902403: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-05-23 12:31:20.902431: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Data Fetching

If the data.csv file doesn't exists, we fetch it from [data.gov.gr](https://data.gov.gr/). This way, we don't have to rehit the endpoint each time we run the code.

To enforce a retrieval from the endpoint, delete or empty out the `data.csv` file.

In [3]:
from time import sleep
import pandas as pd
from os.path import exists, getsize

if not exists('data.csv') or getsize('data.csv') == 0:
    save(
        auth = '96db63ce26cec6c95e7ff8e1b882528001830450',
        url = 'https://data.gov.gr/api/v1/query/oasa_ridership?date_from=2022-01-01&date_to=2022-05-10',
        file = '2022.json'
    )
    json = pd.read_json('2022.json')
    json.to_csv('data.csv')

data = pd.read_csv('data.csv')
data

Unnamed: 0.1,Unnamed: 0,dv_validations,dv_agency,dv_platenum_station,dv_route,routes_per_hour,load_dt,date_hour
0,0,3319,1,UKN,,,2022-01-06T06:15:36Z,2022-01-05T23:00:00Z
1,1,96,2,KΑT,,,2022-01-06T06:15:51Z,2022-01-05T23:00:00Z
2,2,213,2,KΑTΕΧΑKΗ,,,2022-01-06T06:15:51Z,2022-01-05T23:00:00Z
3,3,404,2,KΑTΩ ΠΑTΗΣΙΑ,,,2022-01-06T06:15:51Z,2022-01-05T23:00:00Z
4,4,326,2,KΑΛΛΙΘΕΑ,,,2022-01-06T06:15:51Z,2022-01-05T23:00:00Z
...,...,...,...,...,...,...,...,...
181424,181424,860,4,TRAM,,,2022-05-10T05:48:25Z,2022-05-09T20:00:00Z
181425,181425,729,4,TRAM,,,2022-05-10T05:48:25Z,2022-05-09T21:00:00Z
181426,181426,554,4,TRAM,,,2022-05-10T05:48:25Z,2022-05-09T22:00:00Z
181427,181427,444,4,TRAM,,,2022-05-10T05:48:25Z,2022-05-09T23:00:00Z


## Data cleanup

We remove unneeded columns, like the first column which is unnamed and is just the index, or columns filled with null values. We also split the date/time columns as needed.

In [4]:
data[['date', 'hour']] = data['date_hour'].str.split('T', expand=True)
data['hour'] = data['hour'].map(lambda x: x[:-1] if x[-1] == "Z" else x)
data = data[['dv_validations', 'dv_platenum_station', 'date', 'hour']]
data = data.rename(columns = {
    'dv_validations': 'validations',
    'dv_platenum_station': 'station'
})
data

Unnamed: 0,validations,station,date,hour
0,3319,UKN,2022-01-05,23:00:00
1,96,KΑT,2022-01-05,23:00:00
2,213,KΑTΕΧΑKΗ,2022-01-05,23:00:00
3,404,KΑTΩ ΠΑTΗΣΙΑ,2022-01-05,23:00:00
4,326,KΑΛΛΙΘΕΑ,2022-01-05,23:00:00
...,...,...,...,...
181424,860,TRAM,2022-05-09,20:00:00
181425,729,TRAM,2022-05-09,21:00:00
181426,554,TRAM,2022-05-09,22:00:00
181427,444,TRAM,2022-05-09,23:00:00


## Preprocessing

We turn string data into numbered ones

In [5]:
class Indexer(dict):
    items = {}
    __next_id = 0

    def __getitem__(self, item):
        if item in self.items:
            return self.items[item]
        else:
            self.items[item] = self.__next_id
            self.__next_id += 1
            return self.__next_id - 1

    def __invert__(self):
        return {i: item for item, i in self.items}

stations = Indexer()

data['station_id'] = data['station'].map(stations.__getitem__)
data

Unnamed: 0,validations,station,date,hour,station_id
0,3319,UKN,2022-01-05,23:00:00,0
1,96,KΑT,2022-01-05,23:00:00,1
2,213,KΑTΕΧΑKΗ,2022-01-05,23:00:00,2
3,404,KΑTΩ ΠΑTΗΣΙΑ,2022-01-05,23:00:00,3
4,326,KΑΛΛΙΘΕΑ,2022-01-05,23:00:00,4
...,...,...,...,...,...
181424,860,TRAM,2022-05-09,20:00:00,73
181425,729,TRAM,2022-05-09,21:00:00,73
181426,554,TRAM,2022-05-09,22:00:00,73
181427,444,TRAM,2022-05-09,23:00:00,73


we split dates and times

In [6]:
data[['year', 'day', 'month']] = data['date'].str.split('-', expand=True)
data['year'] = data['year'].map(int)
data['month'] = data['month'].map(int)
data['day'] = data['day'].map(int)
data['hour'] = data['hour'].map(lambda time: int(str(time).split(':')[0]))
data

Unnamed: 0,validations,station,date,hour,station_id,year,day,month
0,3319,UKN,2022-01-05,23,0,2022,1,5
1,96,KΑT,2022-01-05,23,1,2022,1,5
2,213,KΑTΕΧΑKΗ,2022-01-05,23,2,2022,1,5
3,404,KΑTΩ ΠΑTΗΣΙΑ,2022-01-05,23,3,2022,1,5
4,326,KΑΛΛΙΘΕΑ,2022-01-05,23,4,2022,1,5
...,...,...,...,...,...,...,...,...
181424,860,TRAM,2022-05-09,20,73,2022,5,9
181425,729,TRAM,2022-05-09,21,73,2022,5,9
181426,554,TRAM,2022-05-09,22,73,2022,5,9
181427,444,TRAM,2022-05-09,23,73,2022,5,9


Replace numeric data in `validations` with new `congestion` column, holding ordered nominal classes for classification

In [7]:
from math import floor

N_CLASSES = 10

max_validations = data['validations'].max()
data['congestion'] = data['validations'].map(lambda x: floor(N_CLASSES * x / max_validations))
data = data.drop(columns='validations')

data

Unnamed: 0,station,date,hour,station_id,year,day,month,congestion
0,UKN,2022-01-05,23,0,2022,1,5,1
1,KΑT,2022-01-05,23,1,2022,1,5,0
2,KΑTΕΧΑKΗ,2022-01-05,23,2,2022,1,5,0
3,KΑTΩ ΠΑTΗΣΙΑ,2022-01-05,23,3,2022,1,5,0
4,KΑΛΛΙΘΕΑ,2022-01-05,23,4,2022,1,5,0
...,...,...,...,...,...,...,...,...
181424,TRAM,2022-05-09,20,73,2022,5,9,0
181425,TRAM,2022-05-09,21,73,2022,5,9,0
181426,TRAM,2022-05-09,22,73,2022,5,9,0
181427,TRAM,2022-05-09,23,73,2022,5,9,0


In [8]:
data.loc[data['congestion'] > 0]

Unnamed: 0,station,date,hour,station_id,year,day,month,congestion
0,UKN,2022-01-05,23,0,2022,1,5,1
77,UKN,2022-01-05,22,0,2022,1,5,2
154,UKN,2022-01-05,21,0,2022,1,5,2
164,ΑTTΙKΗ,2022-01-05,21,10,2022,1,5,1
214,ΣΥΝTΑΓΜΑ,2022-01-05,21,60,2022,1,5,1
...,...,...,...,...,...,...,...,...
181123,ΣΥΝTΑΓΜΑ,2022-05-09,20,60,2022,5,9,2
181124,ΣΥΝTΑΓΜΑ,2022-05-09,21,60,2022,5,9,2
181125,ΣΥΝTΑΓΜΑ,2022-05-09,22,60,2022,5,9,1
181126,ΣΥΝTΑΓΜΑ,2022-05-09,23,60,2022,5,9,1


Shuffle the data around

In [9]:
data = data.sample(frac=1, random_state=42).reset_index(drop=True)

## The training model

We prepare a decision tree model for training based on our data. In the following codecell all the parameters are defined declaratively using the sklearn API for Python

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder

In [11]:
x = data[['station', 'hour', 'day']]
y = data[['congestion']]
x = pd.get_dummies(x, columns=['station'], prefix="station", prefix_sep=" ")
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)

x

Unnamed: 0,hour,day,station KΑT,station KΑTΕΧΑKΗ,station KΑTΩ ΑΧΑΡΝΑΙ,station KΑTΩ ΠΑTΗΣΙΑ,station KΑΛΛΙΘΕΑ,station KΕΡΑΜΕΙKOΣ,station KΗΦΙΣΙΑ,station KΗΦΙΣΙΑΣ,...,station ΠΕΥKΑKΙΑ,station ΡΕΝΤΗΣ,station ΣKΑ - ΑΧΑΡΝΑΙ,station ΣTΑΘΜOΣ ΛΑΡΙΣΗΣ,station ΣΕΠOΛΙΑ,station ΣΥΓΓΡΟΥ ΦΙΞ,station ΣΥΝTΑΓΜΑ,station ΦΑΛΗΡΟ,station ΧΑΛΑΝΔΡΙ,station ΧΟΛΑΡΓOΣ
0,12,2,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,3,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
2,22,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,13,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,3,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181424,14,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181425,14,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181426,9,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
181427,16,4,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
cls = DecisionTreeClassifier()

estimator = cls.fit(x_train, y_train)