In [7]:
import os
import pandas as pd
import numpy as np
import pickle

from sklearn.preprocessing import LabelEncoder 

In [2]:
# Columns read from every monthly ride-history CSV.
cols = ['bike_number', 'start_time', 'end_time', 'rental_place', 'return_place']

# Station names used to filter rides (see load_and_clear_data).
# NOTE(review): pickle.load can execute arbitrary code; only load a
# stations.pkl that comes from a trusted source.
with open('stations.pkl', 'rb') as stations_file:
    wroclaw_stations = pickle.load(stations_file)

# One extra station appended by hand (presumably absent from the pickle).
wroclaw_stations += ['Fabryczna (WSB)']

In [3]:
def trim_and_remove_slash(s):
    """Normalise a raw station name.

    Surrounding whitespace is stripped, then, in this order: '/' becomes '-',
    double quotes are removed, ',' becomes ' -' and non-breaking spaces are
    removed.
    """
    substitutions = (
        ('/', '-'),
        ('"', ''),
        (',', ' -'),
        ('\xa0', ''),
    )
    cleaned = s.strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [4]:
def load_and_clear_data(filename):
    """Load one ride-history CSV and keep only complete rides between known stations.

    Parameters
    ----------
    filename : str
        Path of the CSV file; only the columns listed in ``cols`` are read.

    Returns
    -------
    pandas.DataFrame
        Rows without missing values whose normalised ``rental_place`` and
        ``return_place`` are both present in ``wroclaw_stations``.
    """
    print(filename)
    data = pd.read_csv(filename, usecols=cols)
    # DataFrame.dropna returns a new frame, so the result must be assigned.
    # Doing it before the clean-up also keeps NaN station names (floats) away
    # from trim_and_remove_slash, which calls str methods on every value.
    data = data.dropna()
    data['return_place'] = data['return_place'].apply(trim_and_remove_slash)
    data['rental_place'] = data['rental_place'].apply(trim_and_remove_slash)
    known_stations = data.return_place.isin(wroclaw_stations) & data.rental_place.isin(wroclaw_stations)
    return data[known_stations]

In [5]:
def remove_longer_than_24(data):
    """Keep only rides that lasted more than 1 minute and less than 24 hours.

    The duration is ``end_time - start_time``; both bounds are exclusive.
    The input frame is not modified: the duration is computed on the side
    instead of being stored in a temporary column of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``start_time`` and ``end_time`` columns that
        ``pandas.DatetimeIndex`` can parse.

    Returns
    -------
    pandas.DataFrame
        The matching rows of ``data``, with the same columns.
    """
    rental_time = pd.DatetimeIndex(data['end_time']) - pd.DatetimeIndex(data['start_time'])
    # Comparing a TimedeltaIndex yields a positional boolean array, so the
    # mask lines up with the rows of `data` whatever its index looks like.
    keep = (rental_time < pd.to_timedelta(24, unit='h')) & (rental_time > pd.to_timedelta(1, unit='m'))
    return data[keep]

In [6]:
# Try the pipeline on a single month: load and filter by station, then
# discard rides with implausible durations.
data = remove_longer_than_24(
    load_and_clear_data('data/historia_przejazdow_2019-03.csv')
)


data/historia_przejazdow_2019-03.csv


In [9]:
# Exploratory fit: learn integer codes from the rental_place column of the
# frame currently held in `data` only. The encoding loop further down fits
# an encoder on the full `stations` array instead.
le = LabelEncoder()
# Last expression of the cell, so the fitted encoder is displayed.
le.fit(data['rental_place'])

LabelEncoder()

In [37]:
data_directory = 'data'
preprocessed_directory = 'data_preprocessed'
os.makedirs(preprocessed_directory, exist_ok=True)

# Running, de-duplicated collection of every station name seen in any month
# (np.unique also keeps it sorted).
stations = np.array([])

for filename in os.listdir(data_directory):
    if not filename.endswith(".csv"):
        continue
    # Clean one month and write it under the same file name in the output directory.
    data = remove_longer_than_24(load_and_clear_data(f'{data_directory}/{filename}'))
    month_stations = np.concatenate(
        (data['rental_place'].unique(), data['return_place'].unique())
    )
    stations = np.unique(np.concatenate((stations, month_stations)))
    data.to_csv(f'{preprocessed_directory}/{filename}', index=False)

data/historia_przejazdow_2019-11.csv
data/historia_przejazdow_2019-12.csv
data/historia_przejazdow_2019-03.csv
data/historia_przejazdow_2019-07.csv
data/historia_przejazdow_2019-08.csv
data/historia_przejazdow_2019-06.csv
data/historia_przejazdow_2019-09.csv
data/historia_przejazdow_2019-04.csv
data/historia_przejazdow_2019-10.csv
data/historia_przejazdow_2019-05.csv


In [32]:
# Fit the encoder once: the label set (`stations`) is the same for every
# file, so re-creating and re-fitting it per file was loop-invariant work.
le = LabelEncoder()
le.fit(stations)

# Replace station names by their integer codes in every preprocessed file.
# NOTE: files are overwritten in place, so this cell is meant to run once per
# preprocessing run; already-encoded files no longer contain the station
# names the encoder was fitted on.
for filename in os.listdir(preprocessed_directory):
    # nodes.csv (the station lookup table written into this same directory)
    # has none of the ride columns, so read_csv(usecols=cols) would fail on it.
    if not filename.endswith(".csv") or filename == 'nodes.csv':
        continue
    data = pd.read_csv(f'{preprocessed_directory}/{filename}', usecols=cols)
    data['rental_place'] = le.transform(data['rental_place'])
    data['return_place'] = le.transform(data['return_place'])
    data.to_csv(f'{preprocessed_directory}/{filename}', index=False)

       bike_number           start_time             end_time  rental_place  \
0           650748  2019-10-31 23:50:52  2019-11-01 00:02:32            56   
1           650994  2019-11-01 00:00:37  2019-11-01 00:03:17           151   
2           650210  2019-10-31 23:52:18  2019-11-01 00:04:04            50   
3           650924  2019-10-31 23:54:13  2019-11-01 00:04:05            46   
4           650840  2019-10-31 23:52:41  2019-11-01 00:04:13            50   
...            ...                  ...                  ...           ...   
90446       650350  2019-12-01 11:42:37  2019-12-01 11:55:40           159   
90447        57984  2019-12-01 11:43:04  2019-12-01 11:55:40           159   
90448       650700  2019-12-01 16:57:26  2019-12-01 17:10:07            48   
90449        57976  2019-12-01 19:04:48  2019-12-01 19:15:46            43   
90450        57713  2019-12-01 22:56:00  2019-12-01 23:04:14            79   

       return_place  
0               194  
1                29

In [40]:
# Station name -> integer code, read straight from the fitted encoder so the
# mapping always matches the codes written by le.transform (position i of
# le.classes_ is encoded as i).
mapping_dict = {name: code for code, name in enumerate(le.classes_)}

# Lookup table of station names; the row index is the station's integer code.
stations_pd = pd.DataFrame(stations, columns=['name'])
# Index.rename returns a new index and leaves the frame untouched, so set the
# name on the frame's own index instead of discarding the result.
stations_pd.index.name = 'value'
stations_pd.to_csv(f'{preprocessed_directory}/nodes.csv', index=True, index_label='value')

In [47]:
# Sanity check: encoding the encoder's own classes should give 0..n-1.
# (The previous text had a stray `range(` with an unbalanced parenthesis and
# could not be parsed.)
le.transform(le.classes_)

array([  0,   1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,
        13,  14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,
        26,  27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,
        39,  40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,
        52,  53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
        65,  66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,
        78,  79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,  90,
        91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102, 103,
       104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
       117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129,
       130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142,
       143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155,
       156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168,
       169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 18

In [45]:
# Display the station lookup table (one row per station name).
stations_pd

Unnamed: 0,name
0,Aleja Bielany
1,Aleja Hallera - Mielecka
2,Aleja Kromera
3,Bacciarellego
4,Bacciarellego - pętla autobusowa
...,...
199,Żernicka
200,Żeromskiego - Daszyńskiego
201,Żeromskiego - Kluczborska
202,Żmigrodzka - Broniewskiego


In [7]:
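# NOTE: disabled snippet kept for reference. It downloads the station-map
# page and collects the text of the second cell of every table row (the first
# row is skipped) into `col` — presumably how the station list was obtained.
# import requests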
# import lxml.html as lh
# import pandas as pd

# url='https://wroclawskirower.pl/mapa-stacji/'
# page = requests.get(url)
# doc = lh.fromstring(page.content)
# tr_elements = doc.xpath('//tr')
# col=[]
# for t in tr_elements[1:]:
#     col.append(t[1].text_content())