# From beginning

Starting from the beginning: the goal is to build a dynamic map that shows every station and its number of available bicycles, with selectors for the day of the week and the time of day.

In [2]:
%reload_ext autoreload
%autoreload 2

In [3]:
from gira import load_data
from gira import preprocess

In [4]:
# loading a sample of the data (the records shown below are from July 2022)
df = load_data.load_sample()

In [5]:
df.head()

Unnamed: 0,desigcomercial,numbicicletas,numdocas,position,entity_ts,estado
0,135 - Avenida Cidade de Lourenço Marques / Val...,7,20,"{""coordinates"":[-9.118689,38.764067],""type"":""P...",2022-07-27 15:53:45.206000+00:00,active
1,456 - Entrecampos / Av. das Forças Armadas,18,41,"{""coordinates"":[-9.14872,38.74877],""type"":""Poi...",2022-07-27 15:53:46.819000+00:00,active
2,132 - Avenida de Berlim / Rua Cidade de Cabinda,2,25,"{""coordinates"":[-9.11255,38.76829],""type"":""Poi...",2022-07-27 15:53:44.936000+00:00,active
3,206 - Av. Brasília / Doca de Alcântara,10,22,"{""coordinates"":[-9.17502,38.70141],""type"":""Poi...",2022-07-27 15:53:40.553000+00:00,active
4,555 - Alameda das Linhas de Torres,1,33,"{""coordinates"":[-9.160248,38.770657],""type"":""P...",2022-07-27 15:53:46.255000+00:00,active


In [6]:
# preprocessing the data
df_clean = preprocess.cleaning(df)

In [7]:
df_clean.shape

(158429, 6)

In [8]:
df_transform = preprocess.processing_columns(df_clean)

Extracting the station name

-------------------------

Setting the station ID as an integer

-------------------------

Extracting geographical position

-------------------------

Dropping the columns position and designcomercial

-------------------------

Preprocess completed! 


In [9]:
df_transform.head()

Unnamed: 0,numbicicletas,numdocas,entity_ts,estado,station_name,stationID,lon,lat
0,7,20,2022-07-27 15:53:45.206000+00:00,active,Avenida Cidade de Lourenço Marques / Vale do,135,-9.118689,38.764067
1,18,41,2022-07-27 15:53:46.819000+00:00,active,Entrecampos / Av. das Forças Armadas,456,-9.14872,38.74877
2,2,25,2022-07-27 15:53:44.936000+00:00,active,Avenida de Berlim / Rua Cidade de Cabinda,132,-9.11255,38.76829
3,10,22,2022-07-27 15:53:40.553000+00:00,active,Av. Brasília / Doca de Alcântara,206,-9.17502,38.70141
4,1,33,2022-07-27 15:53:46.255000+00:00,active,Alameda das Linhas de Torres,555,-9.160248,38.770657


In [10]:
df_transform.shape

(158429, 8)

In [11]:
df_transform.isna().sum()

numbicicletas    0
numdocas         0
entity_ts        0
estado           0
station_name     0
stationID        0
lon              0
lat              0
dtype: int64

In [15]:
import pandas as pd
def time_step(df, station_col='stationID'):
    """
    Add the elapsed time between consecutive records.

    Only useful during EDA; it is no longer needed once the data has been
    resampled to a regular time step.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a datetime column named ``entity_ts``.
    station_col : str or None, default 'stationID'
        Column identifying the station. When it is a column of ``df`` the
        step is computed separately for every station, so the first record
        of a station is never compared with a record of another station.
        Pass ``None`` (or a name that is not a column) to diff the frame
        as a whole.

    Returns
    -------
    pd.DataFrame
        A copy of ``df`` with two extra columns: ``diff_time`` (Timedelta
        since the previous record, zero when there is no previous record)
        and ``total_seconds`` (the same value truncated to whole seconds).
        Rows are diffed in their current order; they are not sorted here.
    """
    df_copy = df.copy()

    print('Calculating the time step')
    timestamps = df_copy['entity_ts']
    if station_col is not None and station_col in df_copy.columns:
        # Diff within each station so that records belonging to different
        # stations are never subtracted from each other.
        step = timestamps.groupby(df_copy[station_col]).diff()
    else:
        step = timestamps.diff()

    # A record without a predecessor yields NaT: store a zero step instead.
    df_copy['diff_time'] = step.fillna(pd.Timedelta(0))
    df_copy['total_seconds'] = df_copy['diff_time'].dt.total_seconds().astype(int)
    print(f'\n{"-" * 25}\n')

    return df_copy

In [16]:
df_transform = time_step(df_transform)

Calculating the time step

-------------------------



In [12]:
df_transform.shape

(158429, 8)

In [13]:
df_transform.isna().sum()

numbicicletas    0
numdocas         0
entity_ts        0
estado           0
station_name     0
stationID        0
lon              0
lat              0
dtype: int64

In [23]:
def process_station(df: pd.DataFrame, station: int) -> pd.DataFrame:
    """
    Build the hourly time series of a single station.

    Steps:
        - Keep only the rows whose ``stationID`` equals ``station``
        - Sort by time and set ``entity_ts`` as the index
        - Resample to one row per hour
        - Fill the hours without records (interpolation for the counts,
          forward fill for the descriptive columns)
        - Create a column with the day of the week
        - Compute the change in the number of bicycles from one hour to
          the next

    The work is done per station so the time step is correct; otherwise
    records of different stations would be mixed together.

    ---
    params
        df: frame with the columns ``stationID``, ``entity_ts``,
            ``numbicicletas``, ``numdocas``, ``station_name``, ``lat``,
            ``lon`` and ``estado``. It is not modified.
        station: value of ``stationID`` to select.

    return
        DataFrame indexed by hour with the columns ``numbicicletas``,
        ``numdocas``, ``station_name``, ``lat``, ``lon``, ``stationID``,
        ``estado``, ``day_of_week`` and ``bike_taken``. ``bike_taken`` is
        the hourly difference of ``numbicicletas`` (positive when the
        count went up, 0 for the first hour).
    """

    # Filter before doing anything else so only this station's rows are
    # copied, instead of duplicating the whole frame for every station.
    station_rows = df.loc[df['stationID'] == station]
    # Remembered so the ID can be restored after resampling (see below).
    station_id_dtype = station_rows['stationID'].dtype

    print('Sorting values by datetime')
    df_sorted = station_rows.sort_values(by=['entity_ts'])
    print(f'\n{"-" * 25}\n')

    print('Setting the time column as index')
    # A datetime index is required by resample(). This must happen on the
    # station subset, otherwise the hourly bins would mix several stations.
    df_sorted = df_sorted.set_index('entity_ts')
    print(f'\n{"-" * 25}\n')

    print('Resampling to 1 hour timestep')
    # All records falling inside the same hour are collapsed into one row:
    # the counts are averaged, the descriptive columns keep their last value.
    # An offset object is used instead of the deprecated 'H' alias.
    df_hour = df_sorted.resample(pd.offsets.Hour()).agg({'numbicicletas': 'mean',
                                                         'numdocas': 'mean',
                                                         'station_name': 'last',
                                                         'lat': 'last',
                                                         'lon': 'last',
                                                         'stationID': 'last',
                                                         'estado': 'last'})

    # Hours without any record come out as NaN: interpolate the counts...
    count_cols = ['numbicicletas', 'numdocas']
    df_hour[count_cols] = df_hour[count_cols].interpolate()
    # ...and carry the last known value forward for the descriptive columns.
    label_cols = ['station_name', 'lat', 'lon', 'stationID', 'estado']
    df_hour[label_cols] = df_hour[label_cols].ffill()
    # The NaN gaps upcast an integer stationID to float (135 -> 135.0);
    # restore the dtype it had in the input.
    df_hour['stationID'] = df_hour['stationID'].astype(station_id_dtype)

    # create a column with the day of the week to be used later as a
    # selector
    print('Creating day of the week')
    df_hour['day_of_week'] = df_hour.index.day_name()
    print(f'\n{"-" * 25}\n')

    print('Calculating the diff in the n. bike column')
    df_hour['bike_taken'] = df_hour['numbicicletas'].diff().fillna(0)
    print(f'\n{"-" * 25}\n')

    return df_hour

In [24]:
def process_all_station(df: pd.DataFrame) -> pd.DataFrame:
    """
    Process the entire dataframe one station at a time.

    Calls ``process_station`` once for every distinct value of the
    ``stationID`` column (in order of first appearance) and stacks the
    results vertically.

    ---
    return: new dataframe with the processed rows of every station; an
    empty dataframe when ``df`` contains no station.
    """
    # Collect the per-station frames and concatenate once at the end:
    # concatenating inside the loop re-copies the accumulated rows on every
    # iteration, and starting from an empty DataFrame relies on pandas
    # ignoring empty entries when it works out the resulting dtypes.
    per_station = [process_station(df, station)
                   for station in df['stationID'].unique()]

    if not per_station:
        # pd.concat raises on an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(per_station, axis=0)

In [14]:
df_station = preprocess.process_all_station(df_transform)

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

-------------------------

Calculating the diff in the n. bike column

-------------------------

Sorting values by datetime

-------------------------

Setting the time column as index

-------------------------

Resampling to 1 hour timestep
Creating day of the week

------------------

In [15]:
df_station.shape

(100146, 9)

In [16]:
df_transform.shape

(158429, 8)

In [29]:
df_station.shape[0] - df_transform.shape[0]

-58283

In [17]:
df_station.isna().sum()

numbicicletas    0
numdocas         0
station_name     0
lat              0
lon              0
stationID        0
estado           0
day_of_week      0
bike_taken       0
dtype: int64

In [24]:
df_station

Unnamed: 0_level_0,numbicicletas,numdocas,station_name,lat,lon,stationID,estado,day_of_week,bike_taken
entity_ts,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2022-07-01 00:00:00+00:00,17,20,Avenida Cidade de Lourenço Marques / Vale do,38.764067,-9.118689,135.0,active,Friday,0.0
2022-07-01 01:00:00+00:00,19,20,Avenida Cidade de Lourenço Marques / Vale do,38.764067,-9.118689,135.0,active,Friday,2.0
2022-07-01 02:00:00+00:00,19,20,Avenida Cidade de Lourenço Marques / Vale do,38.764067,-9.118689,135.0,active,Friday,0.0
2022-07-01 03:00:00+00:00,19,20,Avenida Cidade de Lourenço Marques / Vale do,38.764067,-9.118689,135.0,active,Friday,0.0
2022-07-01 04:00:00+00:00,19,20,Avenida Cidade de Lourenço Marques / Vale do,38.764067,-9.118689,135.0,active,Friday,0.0
...,...,...,...,...,...,...,...,...,...
2022-07-11 09:00:00+00:00,0,17,Rua Tomás da Anunciação / Rua Almeida e S.,38.717313,-9.165653,351.0,repair,Monday,0.0
2022-07-11 10:00:00+00:00,0,17,Rua Tomás da Anunciação / Rua Almeida e S.,38.717313,-9.165653,351.0,repair,Monday,0.0
2022-07-11 11:00:00+00:00,0,17,Rua Tomás da Anunciação / Rua Almeida e S.,38.717313,-9.165653,351.0,repair,Monday,0.0
2022-07-11 12:00:00+00:00,0,17,Rua Tomás da Anunciação / Rua Almeida e S.,38.717313,-9.165653,351.0,repair,Monday,0.0


In [25]:
# Average number of available bicycles per station, day of the week and hour
# of the day. The hour level is derived from the datetime index, so it keeps
# the index name ``entity_ts`` even though it holds the hour of the day.
df_grouped = (
    df_station
    .groupby(by=['stationID', 'day_of_week', df_station.index.hour])
    .agg({'numbicicletas': 'mean',
          'station_name': 'last',
          'lat': 'last',
          'lon': 'last'})
)
# Round to the nearest whole bicycle before casting: a bare ``astype(int)``
# truncates, so a mean of 8.9 would be stored as 8.
df_grouped['numbicicletas'] = df_grouped['numbicicletas'].round().astype(int)

In [26]:
df_grouped

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,numbicicletas,station_name,lat,lon
stationID,day_of_week,entity_ts,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
101.0,Friday,0,8,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
101.0,Friday,1,9,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
101.0,Friday,2,10,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
101.0,Friday,3,10,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
101.0,Friday,4,9,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
...,...,...,...,...,...,...
556.0,Wednesday,19,9,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462
556.0,Wednesday,20,8,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462
556.0,Wednesday,21,7,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462
556.0,Wednesday,22,8,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462


In [38]:
# Turn the grouping keys held in the MultiIndex into regular columns.
# Guarded and rebound (instead of ``inplace=True``) so that re-running this
# cell is a no-op rather than adding a spurious ``index`` column.
if df_grouped.index.nlevels > 1:
    df_grouped = df_grouped.reset_index()

In [39]:
df_grouped

Unnamed: 0,stationID,day_of_week,entity_ts,numbicicletas,station_name,lat,lon
0,101.0,Friday,0,8,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
1,101.0,Friday,1,9,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
2,101.0,Friday,2,10,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
3,101.0,Friday,3,10,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
4,101.0,Friday,4,9,Alameda dos Oceanos / Rua dos Argonautas,38.756161,-9.096804
...,...,...,...,...,...,...,...
23179,556.0,Wednesday,19,9,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462
23180,556.0,Wednesday,20,8,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462
23181,556.0,Wednesday,21,7,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462
23182,556.0,Wednesday,22,8,Alamedas das Linhas de Torres / Estrada da T,38.773489,-9.161462


In [42]:
df_grouped.columns

Index(['stationID', 'day_of_week', 'entity_ts', 'numbicicletas',
       'station_name', 'lat', 'lon'],
      dtype='object')

In [44]:
import plotly.express as px

# NOTE(review): df_grouped holds one row per station, day of the week and hour,
# so each animation frame below overlays every hour of that day on the same
# station marker. Filter on the hour column (named ``entity_ts``) or add an
# hour selector to match the stated goal. TODO confirm the intended design.
fig = px.scatter_geo(
    df_grouped,
    lon="lon",
    lat="lat",
    animation_frame='day_of_week',
    # Object constancy across frames must follow the station, not the value
    # being displayed.
    animation_group="stationID",
    size="numbicicletas",
    color="numbicicletas",
    hover_name="station_name",
    projection="natural earth",
    # The grouped frame lists the day names alphabetically; show the frames in
    # calendar order instead.
    category_orders={'day_of_week': ['Monday', 'Tuesday', 'Wednesday', 'Thursday',
                                     'Friday', 'Saturday', 'Sunday']},
)
fig.update_geos(fitbounds="locations")
fig["layout"].pop("updatemenus") # optional, drop animation buttons
fig.show()