In [None]:
# Load (or reload) IPython's autoreload extension and switch it to mode 2 so that
# edits to imported modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import glob
import inspect
import logging
import re
import sys
from pathlib import Path

import pandas as pd

from data import (
    download_data_files,
    validate_and_save_data,
    transform_data,
    add_missing_times,
    transform_save_data_into_ts_data,
    generate_training_set,
    join_parquet_files,
    slice_and_slide,
)

In [None]:
# Configure logging for Jupyter Notebook: INFO and above, written to stdout so
# the messages appear in the cell output.
# force=True replaces any handlers already attached to the root logger. Without
# it basicConfig() is a silent no-op when this cell is re-run or when an imported
# library has configured logging first, and this configuration never takes effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.StreamHandler(sys.stdout)],
    force=True,
)

# Download the 2023 trip-data files from the base URL below; whatever
# download_data_files returns is shown as the cell output.
url = 'https://d37ci6vzurychx.cloudfront.net/trip-data/'
rides = download_data_files(
    url,
    year=2023,
)
rides

In [None]:
# Validate a single month as a quick check before processing the whole year.
# The path is built from the same year/month that are passed to the validator so
# the file and the arguments cannot disagree (the path used to point at the
# 2024-01 file while the call asked for 2023-01).
year, month = 2023, 1
path = f'./data/raw/{year}/yellow_tripdata_{year}-{month:02d}.parquet'
rides = validate_and_save_data(path, year, month)
rides.tail()

In [None]:
# Directory containing the raw monthly data files for 2023
raw_data_dir = './data/raw/2023/'

# List every .parquet file in the directory. The listing is sorted because glob
# returns files in arbitrary order and later cells reuse the `rides` frame left
# over from the final iteration; sorting makes that frame the same on every run.
parquet_files = sorted(glob.glob(f"{raw_data_dir}*.parquet"))

# Regular expression to extract year and month from the filename
pattern = re.compile(r'yellow_tripdata_(\d{4})-(\d{2})\.parquet')

# Validate each monthly file in turn
for path in parquet_files:
    # Path.name copes with both '/' and '\' separators, unlike splitting on '/'
    filename = Path(path).name
    match = pattern.match(filename)
    if match:
        year, month = map(int, match.groups())

        # Validate and save data
        rides = validate_and_save_data(path, year, month)

        # Show the last few rows of the validated frame
        print(f"File: {filename}")
        print(rides.tail())
    else:
        # Files that do not follow the monthly naming scheme are reported, not processed
        print(f"Filename does not match expected pattern: {filename}")

In [None]:
# Call join_parquet_files on the raw 2023 directory; the same directory is passed
# for both Path arguments, together with the target file name.
# NOTE(review): presumably the two paths are the input and output directories —
# confirm against join_parquet_files.
f = g = Path('./data/raw/2023')
join_parquet_files(f, g, 'yellow_tripdata_2023.parquet')

In [None]:

# Run transform_data on the `rides` frame left in memory by the cells above and
# display the result as the cell output.
a = transform_data(rides) 
a

In [None]:
# Pass the transformed frame `a` through add_missing_times and display the result.
# NOTE(review): 'h' is presumably an hourly frequency alias — confirm against
# add_missing_times.
b = add_missing_times(a, 'h')
b

In [None]:
# Run the time-series transform on the `rides` frame currently in memory; the
# return value is displayed as the cell output.
# NOTE(review): per its name the helper presumably also writes the result to disk — confirm.
transform_save_data_into_ts_data(rides)

In [None]:
# Directory containing the validated monthly files (bronze directory) for 2023
raw_data_dir = './data/bronze/2023/'

# List every .parquet file in the directory. The listing is sorted because glob
# returns files in arbitrary order and later cells reuse the `rides` frame left
# over from the final iteration; sorting makes that frame the same on every run.
parquet_files = sorted(glob.glob(f"{raw_data_dir}*.parquet"))

# Regular expression to extract year and month from the filename
pattern = re.compile(r'validated_yellow_tripdata_(\d{4})-(\d{2})\.parquet')

# Convert each validated monthly file into time-series data
for path in parquet_files:
    # Path.name copes with both '/' and '\' separators, unlike splitting on '/'
    filename = Path(path).name
    match = pattern.match(filename)
    if match:
        year, month = map(int, match.groups())

        # Load the validated rides for this month
        rides = pd.read_parquet(path)

        # Transform and save data into time series data
        ts_data = transform_save_data_into_ts_data(rides)

        # Show the last few rows of the transformed frame
        print(f"File: {filename}")
        print(ts_data.tail())
    else:
        # Files that do not follow the monthly naming scheme are reported, not processed
        print(f"Filename does not match expected pattern: {filename}")

In [None]:
# Call join_parquet_files on the bronze 2023 directory; the same directory is
# passed for both Path arguments, together with the target file name.
# NOTE(review): presumably the two paths are the input and output directories —
# confirm against join_parquet_files.
f1 = g1 = Path('./data/bronze/2023')
join_parquet_files(f1, g1, 'validated_yellow_tripdata_2023.parquet')

In [None]:
# Call join_parquet_files on the silver 2023 directory; the same directory is
# passed for both Path arguments, together with the target file name.
# NOTE(review): presumably the two paths are the input and output directories —
# confirm against join_parquet_files.
f = g = Path('./data/silver/2023')
join_parquet_files(f, g, 'ts_data_2023.parquet')

In [None]:
# Build time-series data from the `rides` frame currently in memory, then run
# slice_and_slide over it and display the result.
# slice_and_slide is imported in the notebook's import cell together with the
# other `data` helpers, so this cell no longer carries its own import.
j = transform_save_data_into_ts_data(rides)
slice_and_slide(
    j,
    start_position=0,
    n_features=24 * 7,  # 168 features; NOTE(review): one week if rows are hourly — confirm
    step_size=1,
    target_col='ride_count',
)


In [None]:
# Load the 2023-02 time-series file and build a training set from it for
# pickup location 43.
e = pd.read_parquet("./data/silver/2023/ts_data_2023-02.parquet")
f = generate_training_set(
    e,
    start_position=0,
    n_features=24 * 7 * 3 * 1,
    step_size=24,
    pickup_location_id=43,
    target_col='ride_count',
)


In [None]:
# Show the last rows of the 2023-03 time-series file
(
    pd.read_parquet('./data/silver/2023/ts_data_2023-03.parquet')
    .tail()
)

In [None]:
# Directory containing the monthly time-series files (silver directory) for 2023
raw_data_dir = './data/silver/2023/'

# List every .parquet file in the directory. The listing is sorted because glob
# returns files in arbitrary order; sorting makes the processing order, and the
# `rides` / `training_set` values left after the loop, the same on every run.
parquet_files = sorted(glob.glob(f"{raw_data_dir}*.parquet"))

# Regular expression to extract year and month from the filename
pattern = re.compile(r'ts_data_(\d{4})-(\d{2})\.parquet')

# Generate a training set from each monthly time-series file
for path in parquet_files:
    # Path.name copes with both '/' and '\' separators, unlike splitting on '/'
    filename = Path(path).name
    match = pattern.match(filename)
    if not match:
        # Report unexpected files instead of skipping them silently, matching
        # the loops over the raw and bronze directories
        print(f"Filename does not match expected pattern: {filename}")
        continue

    year, month = map(int, match.groups())

    # Load the data into a DataFrame
    rides = pd.read_parquet(path)

    # Generate the training set
    training_set = generate_training_set(rides, start_position=0, n_features=24*7*4*1, step_size=24, target_col='ride_count')

    # A None result is treated as "not enough data for this file"
    if training_set is None:
        print(f"Skipping file {filename} due to insufficient data.")
        continue

    # Continue processing the training_set
    # (Add your processing code here)

In [None]:
# Call join_parquet_files on the gold 2023 directory; the same directory is
# passed for both Path arguments, together with the target file name.
# NOTE(review): presumably the two paths are the input and output directories —
# confirm against join_parquet_files.
f = g = Path('./data/gold/2023/')
join_parquet_files(f, g, 'model_data_2023.parquet')

In [None]:

# Introspect validate_and_save_data and print one "name / default" entry per
# parameter. Parameters without a default show inspect's `empty` sentinel.
signature = inspect.signature(validate_and_save_data)
parameters = signature.parameters

parameters_list = [
    f"Parameter: {param_name}, Default: {param.default}"
    for param_name, param in parameters.items()
]
print(parameters_list)