In [1]:
import pandas as pd
import os

subfolder = 'data/'
files = os.listdir(subfolder)
files = [f for f in files if f.endswith('.csv')]
files

['2020_01_Gener_BicingNou_ESTACIONS.csv',
 '2020_02_Febrer_BicingNou_ESTACIONS.csv',
 '2020_03_Marc_BicingNou_ESTACIONS.csv',
 '2020_04_Abril_BicingNou_ESTACIONS.csv',
 '2020_05_Maig_BicingNou_ESTACIONS.csv',
 '2020_06_Juny_BicingNou_ESTACIONS.csv',
 '2020_07_Juliol_BicingNou_ESTACIONS.csv',
 '2020_08_Agost_BicingNou_ESTACIONS.csv',
 '2020_09_Setembre_BicingNou_ESTACIONS.csv',
 '2020_10_Octubre_BicingNou_ESTACIONS.csv',
 '2020_11_Novembre_BicingNou_ESTACIONS.csv',
 '2020_12_Desembre_BicingNou_ESTACIONS.csv',
 '2021_01_Gener_BicingNou_ESTACIONS.csv',
 '2021_02_Febrer_BicingNou_ESTACIONS.csv',
 '2021_03_Març_BicingNou_ESTACIONS.csv',
 '2021_04_Abril_BicingNou_ESTACIONS.csv',
 '2021_05_Maig_BicingNou_ESTACIONS.csv',
 '2021_06_Juny_BicingNou_ESTACIONS.csv',
 '2021_07_Juliol_BicingNou_ESTACIONS.csv',
 '2021_08_Agost_BicingNou_ESTACIONS.csv',
 '2021_09_Setembre_BicingNou_ESTACIONS.csv',
 '2021_10_Octubre_BicingNou_ESTACIONS.csv',
 '2021_11_Novembre_BicingNou_ESTACIONS.csv',
 '2021_12_Desembr

In [5]:
df = pd.read_csv(subfolder + files[-2])
df

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl
0,1,36,20,16,7,1701384950,True,IN_SERVICE,1,1,1,,1701385139,1
1,2,3,2,1,26,1701385060,True,IN_SERVICE,1,1,1,,1701385139,1
2,3,7,7,0,19,1701385029,True,IN_SERVICE,1,1,1,,1701385139,1
3,4,4,4,0,14,1701385110,True,IN_SERVICE,1,1,1,,1701385139,1
4,5,13,9,4,26,1701384962,True,IN_SERVICE,1,1,1,,1701385139,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4525541,515,5,5,0,19,1704064377,True,IN_SERVICE,1,1,1,,1704064502,1
4525542,516,10,2,8,10,1704064278,True,IN_SERVICE,1,1,1,,1704064502,1
4525543,517,8,1,7,11,1704064498,True,IN_SERVICE,1,1,1,,1704064502,1
4525544,518,3,1,2,24,1704064445,True,IN_SERVICE,1,1,1,,1704064502,1


In [6]:
# Transform last_reported from unix timestamp to datetime
df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s')
df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s')
df

Unnamed: 0,station_id,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available,last_reported,is_charging_station,status,is_installed,is_renting,is_returning,traffic,last_updated,ttl
0,1,36,20,16,7,2023-11-30 22:55:50,True,IN_SERVICE,1,1,1,,2023-11-30 22:58:59,1
1,2,3,2,1,26,2023-11-30 22:57:40,True,IN_SERVICE,1,1,1,,2023-11-30 22:58:59,1
2,3,7,7,0,19,2023-11-30 22:57:09,True,IN_SERVICE,1,1,1,,2023-11-30 22:58:59,1
3,4,4,4,0,14,2023-11-30 22:58:30,True,IN_SERVICE,1,1,1,,2023-11-30 22:58:59,1
4,5,13,9,4,26,2023-11-30 22:56:02,True,IN_SERVICE,1,1,1,,2023-11-30 22:58:59,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4525541,515,5,5,0,19,2023-12-31 23:12:57,True,IN_SERVICE,1,1,1,,2023-12-31 23:15:02,1
4525542,516,10,2,8,10,2023-12-31 23:11:18,True,IN_SERVICE,1,1,1,,2023-12-31 23:15:02,1
4525543,517,8,1,7,11,2023-12-31 23:14:58,True,IN_SERVICE,1,1,1,,2023-12-31 23:15:02,1
4525544,518,3,1,2,24,2023-12-31 23:14:05,True,IN_SERVICE,1,1,1,,2023-12-31 23:15:02,1


## Data cleaning process
### Filter
Remove rows with:
* status!=IN_SERVICE (to decide)
* is_renting=0
* is_returning=0
* is_installed=0

Keep columns:
- station_id
- last_reported
- num_bikes_available
- num_bikes_available_types.mechanical
- num_bikes_available_types.ebike
- num_docks_available

## Transform

timestamp -> hour, day, month, year

Aggregate (mean) by station_id, hour, day, month, year

In [24]:
def clean_df(df):    
    df = df[df["is_renting"] != 0]
    df = df[df["is_returning"] != 0]
    df = df[df["is_installed"] != 0]
    df = df[df["status"] == "IN_SERVICE"]

    df['last_reported'] = pd.to_datetime(df['last_reported'], unit='s')
    df['last_updated'] = pd.to_datetime(df['last_updated'], unit='s')

    cols_to_keep = ['station_id', 'last_reported', 'num_bikes_available', 'num_bikes_available_types.mechanical', 'num_bikes_available_types.ebike','num_docks_available']
    df = df[cols_to_keep]

    # Timestamp to hour, day, month, year
    df['hour'] = df['last_reported'].dt.hour
    df['day'] = df['last_reported'].dt.day
    df['month'] = df['last_reported'].dt.month
    df['year'] = df['last_reported'].dt.year
    # Drop last_reported
    df = df.drop('last_reported', axis=1)
    # station id to int
    df['station_id'] = df['station_id'].astype(int)
    # Merge by taking the mean of the values
    df = df.groupby(['station_id', 'hour', 'day', 'month', 'year']).mean().reset_index()
    
    return df

In [26]:
from tqdm import tqdm
big_df = pd.DataFrame()
for file in tqdm(files[:-1]):
    df = pd.read_csv(subfolder + file)
    df = clean_df(df)
    big_df = pd.concat([big_df, df])
# big_df.to_csv('data/cleaned_data.csv', index=False)
big_df.to_parquet('data/cleaned_data.parquet', index=False)

  df = pd.read_csv(subfolder + file)
  df = pd.read_csv(subfolder + file)
100%|██████████| 48/48 [02:58<00:00,  3.71s/it]


In [28]:
big_df

Unnamed: 0,station_id,hour,day,month,year,num_bikes_available,num_bikes_available_types.mechanical,num_bikes_available_types.ebike,num_docks_available
0,1,0,1,1,2020,23.333333,23.166667,0.166667,20.666667
1,1,0,2,1,2020,25.583333,22.583333,3.000000,17.416667
2,1,0,3,1,2020,40.416667,36.500000,3.916667,3.583333
3,1,0,4,1,2020,10.769231,10.769231,0.000000,33.230769
4,1,0,5,1,2020,15.750000,15.500000,0.250000,27.250000
...,...,...,...,...,...,...,...,...,...
377480,519,23,28,12,2023,3.333333,0.000000,3.333333,19.666667
377481,519,23,29,12,2023,8.166667,0.000000,8.166667,13.833333
377482,519,23,30,11,2023,13.500000,0.000000,13.500000,10.500000
377483,519,23,30,12,2023,6.583333,0.000000,6.583333,16.416667
