In [1]:
import os
import pandas as pd
import numpy as np
import pickle

In [2]:
# Known station names are stored as a pickled object next to the notebook.
# NOTE(review): pickle.load can execute arbitrary code, so only load a
# stations.pkl that comes from a trusted source.
with open('stations.pkl', 'rb') as stations_file:
    wroclaw_stations = pickle.load(stations_file)
# One extra station name that is appended to the pickled collection.
wroclaw_stations += ['Fabryczna (WSB)']

# Columns read from each rental-history CSV; every other column is ignored.
cols = [
    'bike_number',
    'start_time',
    'end_time',
    'rental_place',
    'return_place',
]

In [3]:
def trim_and_remove_slash(s):
    """Normalise a station name string.

    Leading/trailing whitespace is stripped first, then these substitutions
    are applied one after another, in this order:

    * ``/``  becomes ``-``
    * ``"``  is removed
    * ``,``  becomes `` -`` (space followed by a dash)
    * non-breaking space (``\\xa0``) is removed

    Parameters
    ----------
    s : str
        Raw station name.

    Returns
    -------
    str
        The normalised name.
    """
    substitutions = (
        ('/', '-'),
        ('"', ''),
        (',', ' -'),
        ('\xa0', ''),
    )
    cleaned = s.strip()
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned

In [4]:
def load_and_clear_data(filename):
    """Load one rental-history CSV and keep only complete rows between known stations.

    The file name is printed as a progress indicator. Only the columns listed
    in the module-level ``cols`` are read. Rows with any missing value are
    dropped, both station-name columns are normalised with
    ``trim_and_remove_slash``, and only rows whose rental AND return station
    both appear in the module-level ``wroclaw_stations`` are kept.

    Parameters
    ----------
    filename : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame
        The filtered rows, with the columns listed in ``cols``.
    """
    print(filename)
    data = pd.read_csv(filename, usecols=cols)

    # dropna() returns a new frame rather than modifying in place, so the
    # result has to be assigned. Dropping before normalisation also keeps
    # NaN station names away from the string helper, which would raise on them.
    data = data.dropna()

    for column in ('return_place', 'rental_place'):
        data[column] = data[column].apply(trim_and_remove_slash)

    both_known = (
        data['return_place'].isin(wroclaw_stations)
        & data['rental_place'].isin(wroclaw_stations)
    )
    return data[both_known]

In [5]:
def remove_longer_than_24(data):
    """Return only the rentals that lasted at most 24 hours.

    The duration is computed as ``end_time - start_time`` after parsing both
    columns as datetimes. Rows whose duration is longer than 24 hours are
    removed; a rental of exactly 24 hours is kept. Rows whose duration cannot
    be computed (a missing start or end time) are removed as well, because the
    comparison is false for them.

    The input frame is not modified and no helper column is added to it.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with ``start_time`` and ``end_time`` columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``data`` that pass the filter, with the same columns.
    """
    rental_time = pd.to_datetime(data['end_time']) - pd.to_datetime(data['start_time'])
    within_limit = rental_time <= pd.Timedelta(hours=24)
    # Index with a plain boolean array so the mask is applied by position,
    # whatever the frame's index looks like.
    return data[within_limit.to_numpy()]

In [6]:
data_directory = 'data'
preprocessed_directory = 'data_preprocessed'
os.makedirs(preprocessed_directory, exist_ok=True)

# Clean every CSV found in the input directory and write the result to the
# output directory. Files are visited in whatever order os.listdir returns.
for filename in os.listdir(data_directory):
    if not filename.endswith(".csv"):
        continue
    source_path = f'{data_directory}/{filename}'
    # The original name, including its ".csv" part, is kept inside the output
    # name, e.g. "x.csv" is written as "x.csv_preprocessed.csv".
    target_path = f'{preprocessed_directory}/{filename}_preprocessed.csv'
    data = load_and_clear_data(source_path)
    data = remove_longer_than_24(data)
    data.to_csv(target_path, index=False)

data/historia_przejazdow_2019-11.csv
data/historia_przejazdow_2019-12.csv
data/historia_przejazdow_2019-03.csv
data/historia_przejazdow_2019-07.csv
data/historia_przejazdow_2019-08.csv
data/historia_przejazdow_2019-06.csv
data/historia_przejazdow_2019-09.csv
data/historia_przejazdow_2019-04.csv
data/historia_przejazdow_2019-10.csv
data/historia_przejazdow_2019-05.csv


In [108]:
# import requests
# import lxml.html as lh
# import pandas as pd

# url='https://wroclawskirower.pl/mapa-stacji/'
# page = requests.get(url)
# doc = lh.fromstring(page.content)
# tr_elements = doc.xpath('//tr')
# col=[]
# for t in tr_elements[1:]:
#     col.append(t[1].text_content())