# Imports

In [1]:
import datetime
import numpy as np
import pandas as pd

# Config

In [2]:
# Location of the ride parquet data; passed to load_data, which reads it with pd.read_parquet.
# NOTE(review): absolute, machine-specific path — the notebook will not run unchanged on
# another machine; consider deriving it from a configurable project/data directory.
INPUT_PATH = '/Users/maedeh/Desktop/demand_project/demand_project/shoofer-demand-prediction/ride_data'
# Destination parquet file for the generated label table (written by save_labels).
# NOTE(review): same portability concern as INPUT_PATH.
OUTPUT_PATH = '/Users/maedeh/Desktop/demand_project/demand_project/shoofer-demand-prediction/data/labels_phase2.parquet'
# Pickup-date window in ISO format (YYYY-MM-DD); load_data treats both bounds as inclusive.
START_DATE = '2023-01-01'
END_DATE = '2023-04-30'

# Loading Dataset

In [3]:
def load_data(path, start_date: str, end_date: str):
    """Load ride records whose pickup date lies in ``[start_date, end_date]``.

    Only the two columns needed downstream are read from the parquet source,
    so the remaining columns of the ride data are never loaded into memory.

    Parameters
    ----------
    path
        Parquet file or directory, forwarded to ``pd.read_parquet``.
    start_date, end_date : str
        ISO ``YYYY-MM-DD`` calendar dates. Both bounds are inclusive: every
        pickup on ``end_date`` (up to 23:59:59.999...) is kept.

    Returns
    -------
    pandas.DataFrame
        Columns ``tpep_pickup_datetime``, ``PULocationID`` and ``PU_date``
        (the pickup timestamp truncated to midnight). The index labels of the
        source frame are preserved.

    Raises
    ------
    ValueError
        If ``start_date`` or ``end_date`` is not a valid ISO date string.

    Notes
    -----
    Assumes ``tpep_pickup_datetime`` is timezone-naive (as in the displayed
    sample output) — TODO confirm if other sources are ever loaded.
    """
    columns = ['tpep_pickup_datetime', 'PULocationID']
    df = pd.read_parquet(path, columns=columns)

    # Parse with date.fromisoformat to keep the strict ISO validation, then
    # convert to Timestamps so the comparison below is fully vectorised
    # (``.dt.date`` would build a Python-object array for every row).
    first_day = pd.Timestamp(datetime.date.fromisoformat(start_date))
    # Half-open upper bound: "< day after end_date" == "date <= end_date".
    day_after_last = pd.Timestamp(datetime.date.fromisoformat(end_date)) + pd.Timedelta(days=1)

    pickup = df['tpep_pickup_datetime']
    in_range = (pickup >= first_day) & (pickup < day_after_last)

    # .loc returns an independent frame and assign() builds a new one, so no
    # column is ever written into a slice of ``df``.
    rides = df.loc[in_range, columns]
    return rides.assign(PU_date=rides['tpep_pickup_datetime'].dt.normalize())

In [4]:
# Load the rides picked up between START_DATE and END_DATE, then report the
# frame's shape and preview its first rows.
rides_df = load_data(INPUT_PATH, START_DATE, END_DATE)
print(f'rides_df shape : {rides_df.shape}')
rides_df.head()

rides_df shape : (12672629, 3)


Unnamed: 0,tpep_pickup_datetime,PULocationID,PU_date
0,2023-01-01 00:32:10,161,2023-01-01
1,2023-01-01 00:55:08,43,2023-01-01
2,2023-01-01 00:25:04,48,2023-01-01
3,2023-01-01 00:03:48,138,2023-01-01
4,2023-01-01 00:10:29,107,2023-01-01


# Labeling

In [5]:
def labeling(dataset, interval_hours: int = 3):
    """Build the demand label table: ride counts per location, date and hour bin.

    Rides are counted per ``(PULocationID, pickup date, hour bin)``. The result
    is then expanded to the full Cartesian grid of every location, date and
    hour bin that occurs in ``dataset``, with combinations that had no rides
    filled with a demand of 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Must contain ``tpep_pickup_datetime`` (datetime), ``PULocationID`` and
        ``PU_date`` (datetime). The frame is not modified.
    interval_hours : int, default 3
        Width of the hour bins. Each pickup hour is floored to a multiple of
        this value (with the default: 0, 3, 6, ..., 21).

    Returns
    -------
    pandas.DataFrame
        Columns ``Location``, ``Date`` (datetime), ``Hour_interval`` and
        ``Demand`` (float64), sorted by those three keys with a fresh
        RangeIndex.

    Raises
    ------
    ValueError
        If ``interval_hours`` is not positive.
    """
    if interval_hours <= 0:
        raise ValueError('interval_hours must be a positive integer')

    # Derive the grouping keys from the frame that was passed in; the function
    # must not fall back on any notebook-level variable.
    keys = pd.DataFrame({
        'Location': dataset['PULocationID'],
        'Date': dataset['PU_date'].dt.normalize(),
        'Hour_interval': (dataset['tpep_pickup_datetime'].dt.hour // interval_hours) * interval_hours,
    })

    # Number of rides for each combination that actually occurs in the data.
    observed = (
        keys
        .groupby(['Location', 'Date', 'Hour_interval'])
        .size()
        .rename('Demand')
    )

    # Every location x date x hour-bin combination, in sorted order. The index
    # levels of the grouped result are exactly the sorted distinct key values.
    full_grid = pd.MultiIndex.from_product(observed.index.levels, names=observed.index.names)

    labels_df = (
        observed
        .reindex(full_grid, fill_value=0)
        # Keep Demand as float64 regardless of whether any combination was
        # missing, so the saved schema does not depend on the data.
        .astype('float64')
        .reset_index()
    )

    return labels_df

In [6]:
# Aggregate the loaded rides into the demand label table, then report its
# shape and preview its first rows.
labels_df = labeling(rides_df)
print(f'labels_df shape : {labels_df.shape}')
labels_df.head()

labels_df shape : (251520, 4)


Unnamed: 0,Location,Date,Hour_interval,Demand
0,1,2023-01-01,0,0.0
1,1,2023-01-01,3,1.0
2,1,2023-01-01,6,1.0
3,1,2023-01-01,9,1.0
4,1,2023-01-01,12,13.0


# File saving

In [7]:
def save_labels(dataset, path):
    """Write ``dataset`` to ``path`` as a parquet file, without the index.

    Parameters
    ----------
    dataset : pandas.DataFrame
        The label table to persist.
    path
        Destination forwarded to ``DataFrame.to_parquet``.

    Returns
    -------
    None
    """
    # to_parquet returns None when given a path, so there is nothing to keep.
    dataset.to_parquet(path, index=False)

In [8]:
# Persist the label table to OUTPUT_PATH as a parquet file.
save_labels(labels_df, OUTPUT_PATH)