In [41]:
import pandas as pd
import holidays

In [42]:
all_rides = pd.read_csv('../Project_datasets/Aggregated_all_rides_codes.csv')

### For the target route 

#### 1. Date/Calendar Features
    1.1. day_of_week	
    1.2. is_weekend	
    1.3. day_of_month	
    1.4. month	
    1.5. year 
    1.6. is_holiday
    1.7. season

#### 2. Lag Features
    2.1.lag_1	trips on the previous day
    2.2. lag_2	2 days ago
    2.3.lag_7	same weekday last week
    2.4. lag_14	same weekday 2 weeks ago
    2.5. lag_month same day month ago
    2.6. lag_year same dat 1 year ago

#### 3. Rolling Window Features
    3.1. rolling_mean_3	Avg of last 3 days
    3.2. rolling_std_7	Std dev of last 7 days
    3.3. rolling_max_7	Max of last 7 days
    3.5. rolling_mean_7_lag_7	Rolling mean from 7–14 days ago (baseline)

### 4. Other Routes
    4.1. total_trips_all_routes	Overall system demand previous day
    4.2. trips_from_mont_royal_all	All trips starting from Metro Mont Royal previous day
    4.3. trips_to_berri_all	All trips ending at Berri previous day
    4.4. total_rolling_mean_3	Avg of last 3 days	actoss the system 
    4.5. perc_target_route  Percentage of Mont Royal → Berri route on previous day from total rides
    4.6. total_rolling_std_7	Overall system demand Std dev of last 7 days 
    4.7. total_lag_7	Overall system demandsame weekday weeks ago

#### For the target route 

In [43]:
all_rides[all_rides['name_start'].str.contains('Métro Mont-Royal')].head(3)

Unnamed: 0.1,Unnamed: 0,date,start_station_code,end_station_code,name_start,name_end,duration_sec,trip_count,memb_count
511,511,2014-04-15,56,4,Métro Mont-Royal (Rivard / du Mont-Royal),St-Dominique / Rachel,619,2,2
512,512,2014-04-15,56,5,Métro Mont-Royal (Rivard / du Mont-Royal),Chambord / Laurier,517,1,1
513,513,2014-04-15,56,8,Métro Mont-Royal (Rivard / du Mont-Royal),Chapleau / du Mont-Royal,2477,3,3


In [44]:
all_rides[all_rides['name_end'].str.contains('Berri / de Maisonneuve')].head(3)

Unnamed: 0.1,Unnamed: 0,date,start_station_code,end_station_code,name_start,name_end,duration_sec,trip_count,memb_count
71,71,2014-04-15,5,106,Chambord / Laurier,Berri / de Maisonneuve,723,1,1
283,283,2014-04-15,29,106,Mackay /de Maisonneuve (Sud),Berri / de Maisonneuve,847,1,1
323,323,2014-04-15,33,106,St-André / Duluth,Berri / de Maisonneuve,417,1,1



Target route 56 ---> 106


In [45]:
target_rides = all_rides[
    (all_rides['start_station_code'] == 56) &
    (all_rides['end_station_code'] == 106)
].copy()

In [46]:
target_rides.head(2)

Unnamed: 0.1,Unnamed: 0,date,start_station_code,end_station_code,name_start,name_end,duration_sec,trip_count,memb_count
525,525,2014-04-15,56,106,Métro Mont-Royal (Rivard / du Mont-Royal),Berri / de Maisonneuve,517,1,1
2821,2821,2014-04-16,56,106,Métro Mont-Royal (Rivard / du Mont-Royal),Berri / de Maisonneuve,1705,3,3


In [47]:
target_rides = target_rides[['date', 'trip_count']]

In [48]:
target_rides['date'] = pd.to_datetime(target_rides['date'])

In [49]:
# 1.1 day_of_week
target_rides['day_of_week'] = target_rides['date'].dt.weekday

# 1.2 is_weekend 
target_rides['is_weekend'] = target_rides['date'].dt.weekday.isin([5, 6]).astype(int)

# 1.3. day_of_month 
target_rides['day_of_month'] = target_rides['date'].dt.day

# 1.4. month 
target_rides['month'] = target_rides['date'].dt.month

# 1.5. year 
target_rides['year'] = target_rides['date'].dt.year

# 1.6. is_holiday
canada_holidays = holidays.CA(years=target_rides['year'].unique())
target_rides['is_holiday'] = target_rides['date'].dt.date.isin(canada_holidays).astype(int)

# 1.7. season
def get_season(month):
    if month in [12, 1, 2]:
        return 1
    elif month in [3, 4, 5]:
        return 2
    elif month in [6, 7, 8]:
        return 3
    else:
        return 4

target_rides['season'] = target_rides['month'].apply(get_season)

In [50]:
target_rides.head(2)

Unnamed: 0,date,trip_count,day_of_week,is_weekend,day_of_month,month,year,is_holiday,season
525,2014-04-15,1,1,0,15,4,2014,0,2
2821,2014-04-16,3,2,0,16,4,2014,0,2


In [51]:
# 2.1.lag_1    trips on the previous day
target_rides = target_rides.sort_values(by='date')
target_rides['lag_1'] = target_rides['trip_count'].shift(1)

# 2.2. lag_2    2 days ago
target_rides['lag_2'] = target_rides['trip_count'].shift(2)

# 2.3.lag_7    same weekday last week
target_rides['lag_7'] = target_rides['trip_count'].shift(7)

# 2.4. lag_14    same weekday 2 weeks ago
target_rides['lag_14'] = target_rides['trip_count'].shift(14)

# 2.5. lag_month same day month ago
# 2.6. lag_year same dat 1 year ago
target_rides['date_lag_month'] = target_rides['date'] - pd.DateOffset(months=1)
target_rides['date_lag_year'] = target_rides['date'] - pd.DateOffset(years=1)

lag_month = target_rides[['date', 'trip_count']].rename(columns={
    'date': 'date_lag_month',
    'trip_count': 'lag_month'
})

lag_year = target_rides[['date', 'trip_count']].rename(columns={
    'date': 'date_lag_year',
    'trip_count': 'lag_year'
})
target_rides = target_rides.merge(lag_month, on='date_lag_month', how='left')
target_rides = target_rides.merge(lag_year, on='date_lag_year', how='left')


In [52]:
target_rides[target_rides.year == 2015].head(4)

Unnamed: 0,date,trip_count,day_of_week,is_weekend,day_of_month,month,year,is_holiday,season,lag_1,lag_2,lag_7,lag_14,date_lag_month,date_lag_year,lag_month,lag_year
188,2015-04-15,3,2,0,15,4,2015,0,2,1.0,2.0,1.0,2.0,2015-03-15,2014-04-15,,1.0
189,2015-04-16,1,3,0,16,4,2015,0,2,3.0,1.0,3.0,2.0,2015-03-16,2014-04-16,,3.0
190,2015-04-17,4,4,0,17,4,2015,0,2,1.0,3.0,2.0,5.0,2015-03-17,2014-04-17,,3.0
191,2015-04-19,1,6,1,19,4,2015,0,2,4.0,1.0,2.0,9.0,2015-03-19,2014-04-19,,1.0


In [53]:
#3.1. rolling_mean_3    Avg of last 3 days
target_rides['rolling_mean_3'] = target_rides['trip_count'].shift(1).rolling(window=3).mean()

#3.2. rolling_std_7    Std dev of last 7 days
target_rides['rolling_std_7'] = target_rides['trip_count'].shift(1).rolling(window=7).std()

#3.3. rolling_max_7    Max of last 7 days
target_rides['rolling_max_7'] = target_rides['trip_count'].shift(1).rolling(window=7).max()

#3.5. rolling_mean_7_lag_7    Rolling mean from 7–14 days ago (baseline)
target_rides['rolling_mean_7_lag_7'] = (
    target_rides['trip_count']
    .shift(7)                         
    .rolling(window=7)
    .mean()
)

In [54]:
target_rides.head(2)

Unnamed: 0,date,trip_count,day_of_week,is_weekend,day_of_month,month,year,is_holiday,season,lag_1,...,lag_7,lag_14,date_lag_month,date_lag_year,lag_month,lag_year,rolling_mean_3,rolling_std_7,rolling_max_7,rolling_mean_7_lag_7
0,2014-04-15,1,1,0,15,4,2014,0,2,,...,,,2014-03-15,2013-04-15,,,,,,
1,2014-04-16,3,2,0,16,4,2014,0,2,1.0,...,,,2014-03-16,2013-04-16,,,,,,


In [55]:
all_rides.head()

Unnamed: 0.1,Unnamed: 0,date,start_station_code,end_station_code,name_start,name_end,duration_sec,trip_count,memb_count
0,0,2014-04-15,1,2,Milton / Clark,Côte St-Antoine / Clarke,1061,1,1
1,1,2014-04-15,1,3,Milton / Clark,Square St-Louis,173,1,1
2,2,2014-04-15,1,33,Milton / Clark,St-André / Duluth,363,1,1
3,3,2014-04-15,1,44,Milton / Clark,Gauthier / Papineau,576,1,1
4,4,2014-04-15,1,100,Milton / Clark,University / Prince-Arthur,616,2,2


#### For the others routes 

In [56]:
#4.1. total_trips_all_routes    Overall system demand on previous day

all_rides['date'] = pd.to_datetime(all_rides['date'])
target_rides['date'] = pd.to_datetime(target_rides['date'])

  # group (calculation), shift one day before, merge to the main dataset
daily_demand = (
    all_rides.groupby('date')
    .size()
    .reset_index(name='total_trips_all_routes')
)

daily_demand['date'] = daily_demand['date'] + pd.Timedelta(days=1)

target_rides = target_rides.merge(daily_demand, on='date', how='left')



In [57]:
#4.2. trips_from_mont_royal_all    All trips starting from Metro Mont Royal

mont_royal_rides = all_rides[all_rides['start_station_code'] == 56]

# Group by date
trips_from_mont_royal = (
    mont_royal_rides
    .groupby('date')
    .size()
    .reset_index(name='trips_from_mont_royal_all')
)

# Shift date 
trips_from_mont_royal['date'] = pd.to_datetime(trips_from_mont_royal['date']) + pd.Timedelta(days=1)

# merge
target_rides = target_rides.merge(trips_from_mont_royal, on='date', how='left')

In [58]:
#4.3. trips_to_berri_all    All trips ending at Berri day before

berri_rides = all_rides[all_rides['end_station_code'] == 106]

# Group by date
trips_to_berri = (
    berri_rides
    .groupby('date')
    .size()
    .reset_index(name='trips_to_berri_all')
)

# Shift date 
trips_to_berri['date'] = pd.to_datetime(trips_to_berri['date']) + pd.Timedelta(days=1)

#  Merge into target_rides
target_rides = target_rides.merge(trips_to_berri, on='date', how='left')

In [59]:
#4.4. total_rolling_mean_3    Avg of last 3 days actoss the system 


daily_trips = (
    all_rides.groupby('date')
    .size()
    .reset_index(name='total_trips')
)

# Shift by 1 to exclude current day
daily_trips['total_rolling_mean_3'] = (
    daily_trips['total_trips']
    .shift(1)                     
    .rolling(window=3)
    .mean()
)

#  Merge 
target_rides = target_rides.merge(
    daily_trips[['date', 'total_rolling_mean_3']],
    on='date',
    how='left'
)

In [60]:
#4.5. perc_target_route  Percentage of Mont Royal → Berri route on previous day from total rides

# Group and count trips per day
daily_trips = (
    all_rides.groupby('date')
    .size()
    .reset_index(name='total_trips')
)
# shift
daily_trips['date'] = pd.to_datetime(daily_trips['date']) + pd.Timedelta(days=1)

#  Merge into target_rides on shifted date
target_rides['date'] = pd.to_datetime(target_rides['date'])
target_rides = target_rides.merge(daily_trips, on='date', how='left')

#  Compute percent of system traffic the target route represents
target_rides['perc_target_route'] = (
    target_rides['trip_count'] / target_rides['total_trips']
) * 100

In [61]:
#4.6 total_rolling_std_7    Overall system demand Std dev of last 7 days

daily_demand['total_rolling_std_7'] = (
    daily_demand['total_trips_all_routes']
    .shift(1)
    .rolling(window=7, min_periods=1)
    .std()
)

#  Merge with target_rides
target_rides = target_rides.merge(
    daily_demand[['date', 'total_rolling_std_7']],
    on='date',
    how='left'
)


In [62]:
#4.7 total_lag_7    Overall system demandsame weekday weeks ago

daily_demand_lag7 = daily_demand[['date', 'total_trips_all_routes']].copy()
daily_demand_lag7['date'] = daily_demand_lag7['date'] + pd.Timedelta(days=7)

#  Merge with target_rides
target_rides = target_rides.merge(
    daily_demand_lag7.rename(columns={'total_trips_all_routes': 'total_lag_7'}),
    on='date',
    how='left'
)

In [63]:
target_rides.columns

Index(['date', 'trip_count', 'day_of_week', 'is_weekend', 'day_of_month',
       'month', 'year', 'is_holiday', 'season', 'lag_1', 'lag_2', 'lag_7',
       'lag_14', 'date_lag_month', 'date_lag_year', 'lag_month', 'lag_year',
       'rolling_mean_3', 'rolling_std_7', 'rolling_max_7',
       'rolling_mean_7_lag_7', 'total_trips_all_routes',
       'trips_from_mont_royal_all', 'trips_to_berri_all',
       'total_rolling_mean_3', 'total_trips', 'perc_target_route',
       'total_rolling_std_7', 'total_lag_7'],
      dtype='object')

In [64]:
features = target_rides[['date',
                       'day_of_week',
                        'is_weekend', 
                        'day_of_month', 
                        'month', 
                        'year', 
                        'is_holiday', 
                        'season', 
                        'lag_1',
                        'lag_2',
                        'lag_7', 
                        'lag_14', 
                        'lag_month', 
                        'lag_year',
                        'rolling_mean_3',
                        'rolling_std_7',
                        'rolling_max_7',
                        'rolling_mean_7_lag_7',
                        'total_trips_all_routes',
                        'trips_from_mont_royal_all',
                        'trips_to_berri_all',
                        'total_rolling_mean_3',
                        'total_rolling_std_7', 
                        'total_lag_7',
                        'trip_count' ]]

In [66]:
features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 718 entries, 0 to 717
Data columns (total 25 columns):
 #   Column                     Non-Null Count  Dtype         
---  ------                     --------------  -----         
 0   date                       718 non-null    datetime64[ns]
 1   day_of_week                718 non-null    int32         
 2   is_weekend                 718 non-null    int64         
 3   day_of_month               718 non-null    int32         
 4   month                      718 non-null    int32         
 5   year                       718 non-null    int32         
 6   is_holiday                 718 non-null    int64         
 7   season                     718 non-null    int64         
 8   lag_1                      717 non-null    float64       
 9   lag_2                      716 non-null    float64       
 10  lag_7                      711 non-null    float64       
 11  lag_14                     704 non-null    float64       
 12  lag_mont

In [67]:
features.to_csv('../Project_datasets/features.csv')