# Creating features for delivery datasets

In [2]:
import pandas as pd

In [5]:
deli_cq = pd.read_csv("../LaDe/delivery/delivery_cq.csv")
deli_cq.head(2)


Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,accept_gps_lng,accept_gps_lat,delivery_time,delivery_gps_time,delivery_gps_lng,delivery_gps_lat,ds
0,2031782,10,Chongqing,73,108.71571,30.90228,50,14,10-22 10:26:00,10-22 10:26:00,108.71826,30.95587,10-22 17:04:00,10-22 17:04:00,108.66361,30.96702,1022
1,4285071,10,Chongqing,3605,108.71639,30.90269,50,14,09-07 10:13:00,09-07 10:13:00,108.71791,30.95635,09-09 15:44:00,09-09 15:44:00,108.71644,30.90266,907


In [None]:
# deli_hz_grouped_by_ds = deli_cq.groupby(['ds','courier_id']).agg({
#     'order_id': 'count'
# }).reset_index()

# print(deli_hz_grouped_by_ds.head(10))

# assumption 

- adding year as 2021 since working with datetime objects needs year . 

Since timestamps in LaDe do not include year information, we reconstruct absolute time using the provided MMDD field to ensure temporal consistency.

In [6]:
df = deli_cq.copy(deep=True)

def parse_lade_time(time_str, ds):
    mm = ds // 100
    dd = ds % 100
    return pd.to_datetime(
        f"2018-{mm:02d}-{dd:02d} {time_str.split()[1]}"
    )

df["accept_time"] = df.apply(
    lambda x: parse_lade_time(x["accept_time"], x["ds"]),
    axis=1
)

df["delivery_time"] = df.apply(
    lambda x: parse_lade_time(x["delivery_time"], x["ds"]),
    axis=1
)

# essentially label_time_min is the target ETA variable 

df["label_time_min"] = (
    df["delivery_time"] - df["accept_time"]
).dt.total_seconds() / 60    # difference is a datetime object so subtracting and converting to seconds 

df["label_time_min"].describe()

count    931351.000000
mean        182.032418
std         160.987852
min       -1438.000000
25%          79.000000
50%         144.000000
75%         240.000000
max        1053.000000
Name: label_time_min, dtype: float64

In [5]:
df.to_csv("../processed/featRecons/chq1.csv")

we need order_id with less unique orders for a particular day .< 10 .  ds = date here starts from 01-05 and goes to 31-10 for hanzhou city 

In [3]:
df = pd.read_csv("../processed/featRecons/chq1.csv")

In [4]:
# calculating haversine distance 
import numpy as np 

def haversine(lon1, lat1, lon2, lat2):
    R = 6371  # km
    lon1, lat1, lon2, lat2 = map(
        np.radians, [lon1, lat1, lon2, lat2]
    )

    dlon = lon2 - lon1
    dlat = lat2 - lat1

    a = np.sin(dlat/2)**2 + np.cos(lat1)*np.cos(lat2)*np.sin(dlon/2)**2
    c = 2 * np.arcsin(np.sqrt(a))

    return R * c


df["haversine_km"] = haversine(
    df["lng"], df["lat"],
    df["delivery_gps_lng"], df["delivery_gps_lat"]
)



# temporal features corresponding to starting 
df["hour"] = df["accept_time"].dt.hour
df["weekday"] = df["accept_time"].dt.weekday
df["is_weekend"] = df["weekday"].isin([5, 6]).astype(int)

df["hour_sin"] = np.sin(2 * np.pi * df["hour"] / 24)
df["hour_cos"] = np.cos(2 * np.pi * df["hour"] / 24)

AttributeError: Can only use .dt accessor with datetimelike values

In [7]:
df.head()


Unnamed: 0,order_id,region_id,city,courier_id,lng,lat,aoi_id,aoi_type,accept_time,accept_gps_time,...,delivery_gps_lng,delivery_gps_lat,ds,label_time_min,haversine_km,hour,weekday,is_weekend,hour_sin,hour_cos
0,2031782,10,Chongqing,73,108.71571,30.90228,50,14,2018-10-22 10:26:00,10-22 10:26:00,...,108.66361,30.96702,1022,398.0,8.747284,10,0,0,0.5,-0.866025
1,4285071,10,Chongqing,3605,108.71639,30.90269,50,14,2018-09-07 10:13:00,09-07 10:13:00,...,108.71644,30.90266,907,331.0,0.005821,10,4,0,0.5,-0.866025
2,4056800,10,Chongqing,3605,108.71645,30.90259,50,14,2018-06-26 09:49:00,06-26 09:49:00,...,108.71647,30.90251,626,374.0,0.009098,9,1,0,0.707107,-0.707107
3,3589481,10,Chongqing,3605,108.7165,30.90347,50,14,2018-09-11 11:01:00,09-11 11:01:00,...,108.7165,30.90341,911,373.0,0.006672,11,1,0,0.258819,-0.965926
4,2752329,10,Chongqing,3605,108.71608,30.90409,50,14,2018-10-01 09:52:00,10-01 09:52:00,...,108.71413,30.90397,1001,518.0,0.186524,9,0,0,0.707107,-0.707107


In [2]:
df["is_short_distance"] = (df["haversine_km"] < 0.02).astype(int)
df["log_dist"] = np.log(df["haversine_km"] + 1e-4)  # log transforming distance to prevent using shorter distance 




NameError: name 'df' is not defined

NameError: name 'df' is not defined