In [None]:
# Pin scikit-learn to the version this notebook was run with so results stay reproducible;
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q scikit-learn==1.3.2

Collecting scikit-learn
  Downloading scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (10.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.8/10.8 MB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.2.2
    Uninstalling scikit-learn-1.2.2:
      Successfully uninstalled scikit-learn-1.2.2
Successfully installed scikit-learn-1.3.2


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.cluster import DBSCAN

In [None]:
# Load the historical delivery records from the Colab working directory.
csv_path = "/content/historical_data.csv"
df = pd.read_csv(csv_path)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

1.0

In [None]:
# List each column's dtype (the two timestamp columns are loaded as plain strings).
df.dtypes

market_id                                       float64
created_at                                       object
actual_delivery_time                             object
store_id                                          int64
store_primary_category                           object
order_protocol                                  float64
total_items                                       int64
subtotal                                          int64
num_distinct_items                                int64
min_item_price                                    int64
max_item_price                                    int64
total_onshift_dashers                           float64
total_busy_dashers                              float64
total_outstanding_orders                        float64
estimated_order_place_duration                    int64
estimated_store_to_consumer_driving_duration    float64
dtype: object

In [None]:
def _most_common_category(categories):
    """Return the most frequent non-null value in ``categories``.

    Ties resolve to the first entry of ``Series.mode()``; a store whose
    categories are all missing yields NaN.
    """
    modes = categories.mode()
    return modes.iloc[0] if not modes.empty else np.nan


# One groupby pass instead of re-filtering the whole frame once per store.
store_id_unique = df["store_id"].unique().tolist()
store_id_and_category = (
    df.groupby("store_id")["store_primary_category"]
    .agg(_most_common_category)
    .to_dict()
)


def fill(store_id):
    """Return the modal primary category recorded for ``store_id`` (NaN if none)."""
    return store_id_and_category.get(store_id, np.nan)


# NOTE: every row receives its store's modal category, so this both fills missing
# values and overwrites categories that disagree with the store's most common one.
df["store_primary_category"] = df["store_id"].map(store_id_and_category)
df.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,indian,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,indian,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,indian,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,indian,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0


In [None]:
# Delivery duration in seconds: delivered timestamp minus order-creation timestamp.
created = pd.to_datetime(df['created_at'])
delivered = pd.to_datetime(df['actual_delivery_time'])
df['total_order_time'] = (delivered - created).dt.total_seconds()
df.head()

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,store_primary_category,order_protocol,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,total_order_time
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,american,1.0,4,3441,4,557,1239,33.0,14.0,21.0,446,861.0,3779.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,indian,2.0,1,1900,1,1400,1400,1.0,2.0,2.0,446,690.0,4024.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,indian,1.0,1,1900,1,1900,1900,1.0,0.0,0.0,446,690.0,1781.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,indian,1.0,6,6900,5,600,1800,1.0,1.0,2.0,446,289.0,3075.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,indian,1.0,3,3900,3,1100,1600,6.0,6.0,9.0,446,650.0,2390.0


In [None]:
# Keep only deliveries shorter than one day (86,400 s); rows whose duration is NaN
# fail the comparison and are removed as well.
# .copy() makes the result an independent frame, so later edits to `df` do not act
# on a slice of the unfiltered data (avoids SettingWithCopyWarning).
df = df[df['total_order_time'] < 86400].copy()

## One-Hot Encoding

In [None]:
# Drop incomplete rows; rebinding instead of inplace=True keeps the cell re-runnable.
df = df.dropna()

# Columns that must not reach the models:
#   - created_at / actual_delivery_time are raw timestamp strings, and their
#     difference is exactly the target (total_order_time), so they leak the label;
#   - store_id is an identifier, not a numeric quantity.
# They are removed from the modelling copy only, so `df` itself still carries them.
df_features = df.drop(columns=['store_id', 'created_at', 'actual_delivery_time'])

df_encoded = pd.get_dummies(df_features, columns=['market_id', 'store_primary_category', 'order_protocol'])

## Train/Test Split and Scaling

In [None]:

# Hold out 20% of the encoded rows for testing (fixed seed for reproducibility).
train_df, test_df = train_test_split(df_encoded, test_size=0.2, random_state=42)

# Separate the target (delivery duration in seconds) from the feature columns.
X_train = train_df.drop(columns=["total_order_time"])
y_train = train_df["total_order_time"]

X_test = test_df.drop(columns=["total_order_time"])
y_test = test_df["total_order_time"]

In [None]:
# Names of the numerical feature columns that will be scaled.
numerical_columns = [
    'total_items',
    'subtotal',
    'num_distinct_items',
    'min_item_price',
    'max_item_price',
    'total_onshift_dashers',
    'total_busy_dashers',
    'total_outstanding_orders',
    'estimated_order_place_duration',
    'estimated_store_to_consumer_driving_duration',
]
# Number of training rows.
X_train.shape[0]

142825

In [None]:
# Fit on the untouched training split (train_df) rather than on X_train itself, so
# re-running this cell never re-fits the scaler on already-standardised values.
scaler = StandardScaler()
scaler.fit(train_df[numerical_columns])

# Apply the training-derived statistics to BOTH splits: the test features must live
# in the same space as the training features, and must never influence the fit.
X_train[numerical_columns] = scaler.transform(train_df[numerical_columns])
X_test[numerical_columns] = scaler.transform(test_df[numerical_columns])

## K-Means Clustering

In [None]:
# Label/target columns that this notebook attaches to X_train; exclude them so a
# re-run of this cell never clusters on its own previous output.
kmeans_features = X_train.drop(columns=['Cluster', 'Clusters', 'total_order_time'], errors='ignore')

# Number of clusters to look for (tunable).
num_clusters = 5

# random_state makes the clustering reproducible; n_init is set explicitly because
# its default changes between scikit-learn releases.
kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
kmeans.fit(kmeans_features)

# One label per training row, in row order.
cluster_labels = kmeans.labels_

# Attach the labels to the training frame for inspection.
X_train['Cluster'] = cluster_labels

# Cluster sizes.
X_train['Cluster'].value_counts()

## DBSCAN

In [None]:
# Cluster on the feature columns only: the K-Means label ('Cluster') and anything
# added by the prediction cell are labels/targets, not features.
dbscan_features = X_train.drop(columns=['Cluster', 'Clusters', 'total_order_time'], errors='ignore')

dbscan = DBSCAN(eps=1.7, min_samples=5)  # Adjust parameters as needed
dbscan.fit(dbscan_features)

# One label per training row; DBSCAN marks noise points with -1.
cluster_labels = dbscan.labels_

# Count real clusters (noise excluded) and noise points without a Python-level loop.
is_noise = cluster_labels == -1
n_clusters_ = len(np.unique(cluster_labels[~is_noise]))
n_noise_ = int(is_noise.sum())

print(f'Number of clusters: {n_clusters_}')
print(f'Number of noise points: {n_noise_}')

Number of clusters: 28
Number of noise points: 7403


## Prediction

In [None]:
# Work on copies: the original bound X_DB/DB to X_train itself, so adding the cluster
# label and the target here silently wrote both columns into the training features.
X_DB = X_train.copy()
X_DB['Clusters'] = cluster_labels

# y_train is aligned on the row index, exactly as a column assignment would be.
DB = X_DB.assign(total_order_time=y_train)

# Mean delivery time per cluster, used as that cluster's prediction.
cluster_means = (
    DB.groupby('Clusters')['total_order_time']
    .mean()
    .reset_index()
    .rename(columns={'total_order_time': 'Mean_Total_Order_Time'})
)

# Attach each row's cluster mean; note that merge() replaces the index with 0..n-1.
DB = DB.merge(cluster_means, on='Clusters', how='left')
DB.head()


Unnamed: 0,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,...,order_protocol_1.0,order_protocol_2.0,order_protocol_3.0,order_protocol_4.0,order_protocol_5.0,order_protocol_6.0,order_protocol_7.0,Clusters,total_order_time,Mean_Total_Order_Time
0,-0.443743,-1.05791,-0.41496,-0.805581,-1.175297,1.620653,1.525089,1.568642,-0.639346,-0.795838,...,0,0,0,1,0,0,0,0,3045.0,2761.372689
1,0.65895,0.546859,0.814575,-0.35797,0.067074,2.314951,2.581353,1.98584,-0.639346,0.086602,...,0,0,1,0,0,0,0,0,2279.0,2761.372689
2,0.65895,1.15207,1.429342,0.026515,-0.108884,0.463489,0.406692,0.165338,-0.639346,-0.023131,...,0,0,0,0,1,0,0,0,2589.0,2761.372689
3,-0.811308,-1.061196,-1.029727,0.139374,-0.714962,-0.520101,-0.49424,-0.194969,1.524334,-0.077998,...,1,0,0,0,0,0,0,1,4267.0,3010.58205
4,-0.443743,-0.682734,-0.41496,-0.786452,-0.06445,-0.635817,-0.742773,-0.915584,-0.639346,-1.339933,...,0,0,1,0,0,0,0,0,906.0,2761.372689


## Root Mean Squared Error

In [None]:
# RMSE on the training rows when each row is predicted by its cluster's mean.
residuals = DB['total_order_time'] - DB['Mean_Total_Order_Time']
rmse = np.sqrt(residuals.pow(2).mean())
rmse

1145.2330994448823

In [None]:
# Preview the first rows of df_encoded.
df_encoded.head()

Unnamed: 0,total_items,subtotal,num_distinct_items,min_item_price,max_item_price,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_order_place_duration,estimated_store_to_consumer_driving_duration,...,store_primary_category_vietnamese,order_protocol_1.0,order_protocol_2.0,order_protocol_3.0,order_protocol_4.0,order_protocol_5.0,order_protocol_6.0,order_protocol_7.0,Clusters,total_order_time
92430,-0.443743,-1.05791,-0.41496,-0.805581,-1.175297,1.620653,1.525089,1.568642,-0.639346,-0.795838,...,0,0,0,0,1,0,0,0,0,3045.0
100843,0.65895,0.546859,0.814575,-0.35797,0.067074,2.314951,2.581353,1.98584,-0.639346,0.086602,...,0,0,0,1,0,0,0,0,0,2279.0
180890,0.65895,1.15207,1.429342,0.026515,-0.108884,0.463489,0.406692,0.165338,-0.639346,-0.023131,...,0,0,0,0,0,1,0,0,0,2589.0
12800,-0.811308,-1.061196,-1.029727,0.139374,-0.714962,-0.520101,-0.49424,-0.194969,1.524334,-0.077998,...,0,1,0,0,0,0,0,0,1,4267.0
119146,-0.443743,-0.682734,-0.41496,-0.786452,-0.06445,-0.635817,-0.742773,-0.915584,-0.639346,-1.339933,...,0,0,0,1,0,0,0,0,0,906.0


In [None]:

params = {
    'objective': 'regression',
    'metric': 'rmse',  # Root Mean Squared Error for evaluation
    'num_leaves': 31,
    'learning_rate': 0.1,
}

# Training features: X_train minus any label/target columns attached by the
# clustering cells, so the model never sees the value it is asked to predict.
lgb_features = X_train.drop(columns=['Cluster', 'Clusters', 'total_order_time'], errors='ignore')

# Test features are rebuilt from the untouched split and pushed through the fitted
# scaler, so they match the training feature space whatever state X_test is in.
lgb_test_features = test_df.drop(columns=['total_order_time'])
lgb_test_features[numerical_columns] = scaler.transform(lgb_test_features[numerical_columns])
lgb_test_features = lgb_test_features[lgb_features.columns]

# Early stopping is tuned on a validation slice of the training data; the test set is
# kept for the final score only.
X_fit, X_valid, y_fit, y_valid = train_test_split(
    lgb_features, y_train, test_size=0.2, random_state=42
)
train_data = lgb.Dataset(X_fit, label=y_fit)
valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)

# Train the model. Early stopping goes through a callback because lgb.train() no
# longer accepts the early_stopping_rounds keyword.
num_round = 100  # Number of boosting rounds
bst = lgb.train(
    params,
    train_data,
    num_boost_round=num_round,
    valid_sets=[valid_data],
    callbacks=[lgb.early_stopping(stopping_rounds=10)],
)

# Predict on the held-out test set
y_pred = bst.predict(lgb_test_features, num_iteration=bst.best_iteration)

# RMSE = sqrt(MSE). The original took sqrt of a value that was already an RMSE
# (squared=False), i.e. it reported the fourth root of the MSE.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
rmse

## Ensemble

In [None]:
def getEuclideanDistance(row1, row2):
    """Return the Euclidean (L2) distance between two numeric vectors.

    Args:
        row1, row2: operands that support subtraction (e.g. NumPy arrays).

    Returns:
        The L2 norm of ``row1 - row2``.
    """
    # The original computed the norm but never returned it, so callers got None.
    return np.linalg.norm(row1 - row2)

In [None]:
# Distance threshold (hyperparameter) compared against in predictWithEnsemble.
d = 4

# market_id values of the rows with market_id == 1.0 and subtotal == 3441.
x = df.loc[(df['market_id'] == 1.0) & (df['subtotal'] == 3441), 'market_id']
def predictWithEnsemble(X, centroids=None, means=None, model=None, max_distance=None):
    """Predict delivery time for one row: cluster mean if close to a cluster, else the model.

    The row is compared with every cluster centroid. If the nearest centroid is
    closer than ``max_distance``, that cluster's mean order time is returned;
    otherwise the fallback regression model is used.

    Args:
        X: one feature row, either a 1-D array-like ordered like the centroid
            columns, or a pandas Series indexed by feature name.
        centroids: DataFrame indexed by cluster label, one column per feature.
            Defaults to the per-cluster feature means of the notebook-level ``DB``
            frame (recomputed on every call, so pass it explicitly when predicting
            many rows).
        means: Series mapping cluster label to mean order time. Defaults to the
            notebook-level ``cluster_means`` table.
        model: object with ``predict(2-D array)``. Defaults to the notebook-level
            ``regressor``.
            NOTE(review): no ``regressor`` is defined in the visible cells; pass
            ``model=bst`` to use the LightGBM booster - confirm which is intended.
        max_distance: distance threshold. Defaults to the notebook-level ``d``.

    Returns:
        float: the predicted total order time.

    Raises:
        ValueError: if ``X`` does not have one value per centroid feature.
    """
    if centroids is None:
        centroids = (
            DB.drop(columns=['Cluster', 'total_order_time', 'Mean_Total_Order_Time'], errors='ignore')
            .groupby('Clusters')
            .mean(numeric_only=True)
        )
    if means is None:
        means = cluster_means.set_index('Clusters')['Mean_Total_Order_Time']
    if model is None:
        model = regressor
    if max_distance is None:
        max_distance = d

    # DBSCAN labels noise as -1; noise is not a cluster and has no meaningful centroid.
    centroids = centroids.drop(index=-1, errors='ignore')

    # Align a named row to the centroid columns; treat anything else as positional.
    if isinstance(X, pd.Series):
        features = X.reindex(centroids.columns).to_numpy(dtype=float)
    else:
        features = np.asarray(X, dtype=float).ravel()
    if features.shape[0] != centroids.shape[1]:
        raise ValueError(
            f"X has {features.shape[0]} values but centroids have {centroids.shape[1]} features"
        )

    if len(centroids) > 0:
        # Distance from the row to every centroid in one vectorised step.
        distances = np.linalg.norm(centroids.to_numpy(dtype=float) - features, axis=1)
        nearest = int(np.argmin(distances))
        if distances[nearest] < max_distance:
            return float(means.loc[centroids.index[nearest]])

    # No cluster is close enough: fall back to the regression model.
    return float(model.predict(features.reshape(1, -1))[0])