## [Aux] Parquet File Creation

#### 1 Import the necessary modules and libraries and initialize the project

In [1]:
import os
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from IPython.display import display, Markdown
import warnings
# Suppress all warnings for the rest of the session. This also hides pandas
# deprecation/dtype warnings, so comment it out when debugging.
warnings.filterwarnings(action='ignore')

# Directory that holds the cleaned CSV inputs. The trailing slash is required:
# file names are appended to this value with plain string concatenation.
path_to_dir = "../data/cleaned_data/"

#### 2 Load the data

In [2]:
# Read every cleaned CSV from path_to_dir into its own DataFrame.
# The file names are listed in the same order as the target variables, and the
# files are read in that order.
routes, stops, trips, stop_times, shapes, agency, calendar = (
    pd.read_csv(path_to_dir + csv_name)
    for csv_name in (
        'routes.csv',
        'stops.csv',
        'trips.csv',
        'stop_times.csv',
        'shapes.csv',
        'agency.csv',
        'calendar.csv',
    )
)

#### 3 Merge the datasets

##### Step 1: Get unique trip_id for each stop_id from stop_times

In [3]:
# Collapse stop_times to a single row per stop_id:
#   - trip_id             -> list of the distinct trip_ids seen at that stop
#   - stop_sequence       -> first value found for that stop
#   - shape_dist_traveled -> first value found for that stop
# NOTE(review): the two "first" values come from whichever stop_times row
# appears first for the stop, yet they are later attached to every trip of
# that stop once trip_id is exploded -- confirm this is intended.
stop_trip_agg = (
    stop_times
    .groupby('stop_id')
    .agg(
        trip_id=('trip_id', lambda ids: list(ids.unique())),
        stop_sequence=('stop_sequence', 'first'),
        shape_dist_traveled=('shape_dist_traveled', 'first'),
    )
    .reset_index()
)

##### Step 2: Use trip_id to find associated route_id and shape_id in trips

In [4]:
# Unpack each stop's trip_id list so that every (stop_id, trip_id) pair gets
# its own row; the other columns are repeated for each pair.
stop_trip_agg_expanded = stop_trip_agg.explode(column='trip_id')

# Look up route_id and shape_id for each trip. A left join keeps every
# stop/trip row even when trips has no matching trip_id.
stop_trip_agg_routes = stop_trip_agg_expanded.merge(
    trips.loc[:, ['trip_id', 'route_id', 'shape_id']],
    how='left',
    on='trip_id',
)

##### Step 3: Merge with routes to get route_long_name

In [5]:
# Add the human-readable route name for each row's route_id.
# A left join keeps rows whose route_id is missing from routes.
stop_trip_routes_names = stop_trip_agg_routes.merge(
    routes.loc[:, ['route_id', 'route_long_name']],
    how='left',
    on='route_id',
)

##### Step 4: Now merge with shapes to get the shape details

In [6]:
# Attach the shape points that belong to each row's shape_id.
# Every stop/trip row is repeated once per matching shape point, which is why
# the saved output below reports 31,306,068 rows.
# shape_dist_traveled exists on both sides of the join, so the result carries
# shape_dist_traveled_x (from the stop aggregation) and shape_dist_traveled_y
# (from shapes).
stop_trip_routes_shapes = stop_trip_routes_names.merge(
    shapes.loc[:, ['shape_id', 'shape_pt_sequence', 'shape_dist_traveled']],
    how='left',
    on='shape_id',
)

#### 4 Export Data to Parquet

In [7]:
# Persist the merged table as Parquet without writing the DataFrame index.
# The output directory is created first so the export also works on a fresh
# checkout where ../data/processed_data does not exist yet; to_parquet does
# not create missing parent directories itself.
output_dir = '../data/processed_data'
os.makedirs(output_dir, exist_ok=True)
stop_trip_routes_shapes.to_parquet(output_dir + '/metro_network_data.parquet', index=False)

#### 5 Play Along: reload the exported Parquet file and inspect it

In [8]:
# Load the exported file back from disk to confirm that it is readable.
metro_data = pd.read_parquet(path='../data/processed_data/metro_network_data.parquet')

# Report (rows, columns), then show the first rows as the cell's rich output.
metro_data_dims = metro_data.shape
print("Shape of metro_data", metro_data_dims)

metro_data.head()

Shape of metro_data (31306068, 9)


Unnamed: 0,stop_id,trip_id,stop_sequence,shape_dist_traveled_x,route_id,shape_id,route_long_name,shape_pt_sequence,shape_dist_traveled_y
0,1,0,20,24382.402,0,shp_1_30,RED_Rithala to Dilshad Garden,1,0.0
1,1,0,20,24382.402,0,shp_1_30,RED_Rithala to Dilshad Garden,2,1202.405
2,1,0,20,24382.402,0,shp_1_30,RED_Rithala to Dilshad Garden,3,1202.405
3,1,0,20,24382.402,0,shp_1_30,RED_Rithala to Dilshad Garden,4,2480.75
4,1,0,20,24382.402,0,shp_1_30,RED_Rithala to Dilshad Garden,5,2480.75


In [13]:
# Show the row count as rendered Markdown, formatted with thousands separators.
row_count_note = Markdown(f"Number of rows **{metro_data.shape[0]:,d}**")
display(row_count_note)

Number of rows **31,306,068**