In [1]:
import os

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load dataset
# The CSV location can be overridden with the BOOST_DF_PATH environment variable
# so the notebook is runnable on machines other than the author's. When the
# variable is unset, the original absolute path is used unchanged.
file_path = os.environ.get(
    "BOOST_DF_PATH",
    "/Users/adaguan/Desktop/HopSkipDrive_Take_Home/boost_df.csv",
)
df = pd.read_csv(file_path)

In [3]:
# Parse every timestamp column from its CSV representation into pandas datetimes
datetime_cols = [
    "boost_timestamp",
    "created_at",
    "claimed_at",
    "scheduled_starts_at",
    "scheduled_ends_at",
    "unclaimed_at",
    "trip_completed_at",
]
# assign() overwrites the existing columns in place of the originals, keeping
# their position in the frame.
df = df.assign(**{name: pd.to_datetime(df[name]) for name in datetime_cols})

In [4]:
# Order rows by trip and then by creation time ahead of the per-trip aggregation;
# the "first" aggregations applied afterwards pick values in this row order.
# NOTE(review): if created_at is constant within a trip, this does not order the
# individual boosts chronologically - confirm whether boost_timestamp was intended.
df = df.sort_values(by=['trip_id', 'created_at'])

In [5]:
# Collapse the boost-level rows into a single row per trip_id.
# The mapping is assembled in the column order wanted in the resulting frame:
#   "max"   -> largest value observed for the trip
#   "sum"   -> total across the trip's rows
#   "first" -> first non-null value in the current row order
trip_aggregations = {
    **dict.fromkeys(["cumulative_boost_amount_cents", "seq_boost_count"], "max"),
    "single_boost_amount_cents": "sum",
    **dict.fromkeys(["manual_boost", "boost_ind", "boost_timestamp"], "max"),
    **dict.fromkeys(
        [
            "created_at",
            "claimed_at",
            "scheduled_starts_at",
            "trip_completed_at",
            "unclaimed_at",
        ],
        "first",
    ),
    **dict.fromkeys(
        [
            "total_predicted_duration_mins",
            "total_predicted_distance_miles",
            "total_predicted_distance_miles_for_fare",
            "dollars_paid_to_driver",
            "ever_unclaimed",
        ],
        "max",
    ),
    **dict.fromkeys(
        [
            "origin_metro_area_name",
            "commute_minutes",
            "commute_distance",
            "is_same_day_ride",
            "trip_starts_during_peak_hours",
        ],
        "first",
    ),
}
per_trip = df.groupby("trip_id").agg(trip_aggregations)
# Turn the trip_id index back into an ordinary column
df_grouped = per_trip.reset_index()

In [6]:
# Snapshot the trip-level frame as it stands before any feature engineering.
# The pickle is written relative to the current working directory.
df_grouped.to_pickle("df_grouped_raw.pkl")  # Save before feature engineering

In [7]:
# Split data BEFORE feature engineering
# 80/20 split with a fixed random_state so the partition is reproducible.
train_df, test_df = train_test_split(df_grouped, test_size=0.2, random_state=42)

# Take explicit copies: the following cells add and overwrite columns on both
# splits, and assigning into the un-copied result of train_test_split can raise
# pandas' SettingWithCopyWarning (the frames are derived from df_grouped).
train_df = train_df.copy()
test_df = test_df.copy()

### Feature Engineering

In [8]:
# Derive elapsed-time features (in minutes) with identical row-wise formulas on each split
for split_df in (train_df, test_df):
    created = split_df['created_at']
    # Minutes from trip creation until it was claimed
    split_df['time_to_claim'] = (split_df['claimed_at'] - created).dt.total_seconds() / 60
    # Minutes from trip creation until it was unclaimed (NaN when unclaimed_at is missing)
    split_df['time_to_unclaim'] = (split_df['unclaimed_at'] - created).dt.total_seconds() / 60

In [9]:
# Replace missing time_to_unclaim values with 0 in both splits
for split_df in (train_df, test_df):
    split_df['time_to_unclaim'] = split_df['time_to_unclaim'].fillna(0)

In [10]:
# Calendar features taken from the trip creation timestamp
for split_df in (train_df, test_df):
    created_dt = split_df['created_at'].dt
    split_df['hour_of_day'] = created_dt.hour
    # Day of the week (Monday=0, Sunday=6)
    split_df['day_of_week'] = created_dt.dayofweek
    split_df['month'] = created_dt.month

In [11]:
# Normalize boosts (preventing division errors)
# boost_per_mile = cumulative boost (cents) / predicted distance (miles).
# Zero distances are mapped to NaN before dividing so the quotient is NaN rather
# than +/-inf, and every NaN quotient is then set to 0.
#
# The result is assigned back to the frame instead of calling
# frame['boost_per_mile'].fillna(0, inplace=True): that chained in-place form
# operates on a temporary Series, emits a warning on pandas 2.x and does not
# update the frame at all under Copy-on-Write.
for split_df in (train_df, test_df):
    distance_miles = split_df['total_predicted_distance_miles'].replace(0, np.nan)
    split_df['boost_per_mile'] = (
        split_df['cumulative_boost_amount_cents'] / distance_miles
    ).fillna(0)

In [12]:
# Calculate the means and modes from train_df only, so that no statistic of the
# test split is used during preprocessing.
#
# numeric_only=True is required: train_df still carries non-numeric columns
# (timestamps, and origin_metro_area_name which is presumably text). Since
# pandas 2.0, DataFrame.mean() no longer silently drops columns it cannot
# average and raises TypeError instead; older versions emitted FutureWarnings.
train_means = train_df.mean(numeric_only=True)  # Mean for numerical columns
train_modes = train_df.mode().iloc[0]  # Most frequent value per column (first one if tied)

# Means are applied first, so the modes only fill what is still missing
# afterwards, i.e. gaps in the columns that have no mean.
train_df = train_df.fillna(train_means).fillna(train_modes)

# Fill missing values in test_df using the statistics from train_df
test_df = test_df.fillna(train_means).fillna(train_modes)

# NOTE(review): this also mean-imputes any missing time_to_claim values, which
# is later used as a prediction target - confirm that imputing the target
# (rather than dropping or separately modelling unclaimed trips) is intended.

  train_means = train_df.mean()  # Mean for numerical columns
  train_means = train_df.mean()  # Mean for numerical columns


In [13]:
# Remove the raw outcome timestamps, which carry future information (leakage risk)
drop_cols = ['claimed_at', 'unclaimed_at', 'trip_completed_at']
train_df, test_df = (
    split_df.drop(columns=drop_cols) for split_df in (train_df, test_df)
)

In [14]:
# Model inputs, one per line; the order here is the column order of X_train/X_test
features = [
    'cumulative_boost_amount_cents',
    'total_predicted_duration_mins',
    'total_predicted_distance_miles',
    'is_same_day_ride',
    'trip_starts_during_peak_hours',
    'hour_of_day',
    'day_of_week',
    'month',
    'commute_minutes',
    'commute_distance',
    'seq_boost_count',
    'manual_boost',
    'boost_ind',
    'total_predicted_distance_miles_for_fare',
    'dollars_paid_to_driver',
]

# Names of the two target columns used for optimization
target_claim = 'time_to_claim'   # minutes from creation to claim
target_cost = 'boost_per_mile'   # cumulative boost cents per predicted mile
# NOTE(review): boost_per_mile is computed from cumulative_boost_amount_cents and
# total_predicted_distance_miles, both of which are in `features` - confirm this
# target is not trivially recoverable from the inputs.

In [15]:
# Pull both target vectors out of each split first ...
y_train_claim, y_test_claim = train_df[target_claim], test_df[target_claim]
y_train_cost, y_test_cost = train_df[target_cost], test_df[target_cost]

# ... then narrow each split down to the model input columns
X_train, X_test = train_df[features], test_df[features]

In [16]:
# Write every prepared frame/series to its own CSV in the working directory,
# omitting the index. Files are written in the order listed.
prepared_outputs = {
    "X_train.csv": X_train,
    "X_test.csv": X_test,
    "y_train_claim.csv": y_train_claim,
    "y_test_claim.csv": y_test_claim,
    "y_train_cost.csv": y_train_cost,
    "y_test_cost.csv": y_test_cost,
}
for csv_name, dataset in prepared_outputs.items():
    dataset.to_csv(csv_name, index=False)

print("Prepared data saved to CSV files.")

Prepared data saved to CSV files.
