## Target

- run the pcmci+ causal discovery algorithm on the transformed data

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.ml.feature import StringIndexer

spark = SparkSession.builder.appName("pcmci_queue_model").getOrCreate()

26/02/02 00:48:25 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [4]:
!pip install tigramite


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.3.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [7]:
from tigramite.pcmci import PCMCI

import pandas as pd
import numpy as np

from tigramite import data_processing as pp
from tigramite.pcmci import PCMCI
from tigramite.independence_tests.parcorr import ParCorr



## Raw tables in dataset

- order_idx
- order_id
- from_dipan_id
- from_city_name
- delivery_user_id
- poi_lng
- poi_lat
- aoi_id
- typecode
- receipt_time
- receipt_lng
- receipt_lat
- sign_time
- sign_lng
- sign_lat
- ds
- receipt_datetime
- sign_datetime
- i
- j
- finish_time_minute
- accept_time_minute
- time_to_last_package
- dis_to_last_package
- todo_task
- todo_task_num
- dis_avg_day
- time_avg_order
- relative_dis_to_last_package
- grid_x
- grid_y
- grid_id
- days

# final columns for algorithm

- todo_task
- todo_task_num
- dis_to_last_package
- relative_dis_to_last_package
- dis_avg_day
- time_to_last_package
- time_avg_order
- typecode_encoded
- city_encoded
- days

# Target variable

finish_time_minute

In [12]:

schema = """
order_idx STRING,
order_id STRING,
from_dipan_id STRING,
from_city_name STRING,
delivery_user_id STRING,
poi_lng DOUBLE,
poi_lat DOUBLE,
aoi_id STRING,
typecode STRING,
receipt_time STRING,
receipt_lng DOUBLE,
receipt_lat DOUBLE,
sign_time STRING,
sign_lng DOUBLE,
sign_lat DOUBLE,
ds STRING,
receipt_datetime TIMESTAMP,
sign_datetime TIMESTAMP,
i INT,
j INT,
finish_time_minute DOUBLE,
accept_time_minute DOUBLE,
time_to_last_package DOUBLE,
dis_to_last_package DOUBLE,
todo_task DOUBLE,
todo_task_num DOUBLE,
dis_avg_day DOUBLE,
time_avg_order DOUBLE,
relative_dis_to_last_package DOUBLE,
grid_x DOUBLE,
grid_y DOUBLE,
grid_id STRING,
days STRING
"""

df = spark.read.csv(
    "./raw_data/package_feature.csv",
    header=True,
    schema=schema
)


df = df.orderBy("receipt_datetime")                                # ordering metric


# Encode categorical causal variables

cat_cols = ["typecode", "from_city_name", "days"]

for col in cat_cols:
    indexer = StringIndexer(
        inputCol=col,
        outputCol=f"{col}_enc",
        handleInvalid="keep"
    )
    df = indexer.fit(df).transform(df)

# Select ONLY causal variables

df_pcmci = df.select(
    "todo_task",
    "todo_task_num",
    "time_avg_order",
    "dis_avg_day",
    "time_to_last_package",
    "dis_to_last_package",
    "relative_dis_to_last_package",
    "typecode_enc",
    "from_city_name_enc",
    "days_enc",
    "finish_time_minute"
)


# Create timestep index (implicit time)


w = Window.orderBy(F.monotonically_increasing_id())

df_pcmci = df_pcmci.withColumn(
    "timestep",
    F.row_number().over(w)
)

# Final ordering


df_pcmci = df_pcmci.select(
    "todo_task",
    "todo_task_num",
    "time_avg_order",
    "dis_avg_day",
    "time_to_last_package",
    "dis_to_last_package",
    "relative_dis_to_last_package",
    "typecode_enc",
    "from_city_name_enc",
    "days_enc",
    "finish_time_minute"
)




In [13]:
# drop na values

df_pcmci = df_pcmci.dropna()

# Export

# coalesce 1 = creates one single file for algo
df_pcmci.coalesce(1).write.mode("overwrite") \
    .option("header", True) \
    .csv("./pcmci_ready")

In [18]:
df = pd.read_csv(
    "./pcmci_ready/part-00000-ad3e52f7-824a-4915-9043-0ee06b0077bb-c000.csv"
)

var_names = df.columns.tolist()
data = df.values

In [19]:
df.isna().sum()


todo_task                       0
todo_task_num                   0
time_avg_order                  0
dis_avg_day                     0
time_to_last_package            0
dis_to_last_package             0
relative_dis_to_last_package    0
typecode_enc                    0
from_city_name_enc              0
days_enc                        0
finish_time_minute              0
dtype: int64

In [11]:
data

array([[           nan, 1.90000000e+01, 1.54761905e+01, ...,
        0.00000000e+00, 0.00000000e+00, 4.86000000e+02],
       [2.93000000e+03, 1.00000000e+00, 1.26945876e+01, ...,
        0.00000000e+00, 0.00000000e+00, 5.37000000e+02],
       [           nan, 2.00000000e+00, 1.45000000e+01, ...,
        0.00000000e+00, 0.00000000e+00, 4.99000000e+02],
       ...,
       [           nan, 3.00000000e+00, 1.50910931e+01, ...,
        0.00000000e+00, 2.40000000e+01, 1.21700000e+03],
       [7.84870000e+04, 1.00000000e+00, 1.50910931e+01, ...,
        0.00000000e+00, 2.40000000e+01, 1.22800000e+03],
       [           nan, 0.00000000e+00, 1.50910931e+01, ...,
        0.00000000e+00, 2.40000000e+01, 1.23500000e+03]],
      shape=(78633, 11))

In [20]:
dataframe = pp.DataFrame(
    data=data,
    var_names=var_names
)

In [None]:
# Momentary Conditional Independence (MCI)

ci_test = ParCorr(significance="analytic")

pcmci = PCMCI(
    dataframe=dataframe,
    cond_ind_test=ci_test,
    verbosity=1
)

results = pcmci.run_pcmciplus(
    tau_min=1,
    tau_max=10,
    pc_alpha=0.05
)


## Significant links at alpha = 0.05:

    Variable todo_task has 10 link(s):
        (todo_task -1): pval = 0.00000 | val =  0.195
        (todo_task -2): pval = 0.00000 | val =  0.149
        (todo_task -3): pval = 0.00000 | val =  0.128
        (todo_task -4): pval = 0.00000 | val =  0.088
        (todo_task -5): pval = 0.00000 | val =  0.082
        (todo_task -7): pval = 0.00000 | val =  0.063
        (todo_task -6): pval = 0.00001 | val =  0.050
        (todo_task -8): pval = 0.00001 | val =  0.049
        (todo_task -9): pval = 0.01187 | val =  0.028
        (todo_task -10): pval = 0.01282 | val =  0.028

    Variable todo_task_num has 0 link(s):

    Variable time_avg_order has 1 link(s):
        (dis_avg_day -2): pval = 0.01079 | val =  0.029

    Variable dis_avg_day has 5 link(s):
        (dis_avg_day -9): pval = 0.00004 | val = -0.046
        (typecode_enc -9): pval = 0.01287 | val =  0.028
        (relative_dis_to_last_package -3): pval = 0.01311 | val =  0.028
        (t

## output

##
## Step 1: PC1 algorithm for selecting lagged conditions
##

Parameters:
independence test = par_corr
tau_min = 1
tau_max = 10
pc_alpha = [0.05]
max_conds_dim = None
max_combinations = 1

 Resulting lagged parent (super)sets:


    Variable todo_task has 10 link(s):
        (todo_task -1): max_pval = 0.00000, |min_val| =  0.197
        (todo_task -2): max_pval = 0.00000, |min_val| =  0.152
        (todo_task -3): max_pval = 0.00000, |min_val| =  0.133
        (todo_task -4): max_pval = 0.00000, |min_val| =  0.095
        (todo_task -5): max_pval = 0.00000, |min_val| =  0.090
        (todo_task -7): max_pval = 0.00000, |min_val| =  0.076
        (todo_task -8): max_pval = 0.00000, |min_val| =  0.065
        (todo_task -6): max_pval = 0.00000, |min_val| =  0.061
        (todo_task -10): max_pval = 0.00000, |min_val| =  0.054
        (todo_task -9): max_pval = 0.00001, |min_val| =  0.049

    Variable todo_task_num has 0 link(s):

    Variable time_avg_order has 1 link(s):
        (dis_avg_day -2): max_pval = 0.02054, |min_val| =  0.026

    Variable dis_avg_day has 5 link(s):
        (dis_avg_day -9): max_pval = 0.00009, |min_val| =  0.044
        (todo_task -6): max_pval = 0.00269, |min_val| =  0.034
        (typecode_enc -9): max_pval = 0.01077, |min_val| =  0.029
        (relative_dis_to_last_package -3): max_pval = 0.01803, |min_val| =  0.026
        (dis_to_last_package -4): max_pval = 0.04804, |min_val| =  0.022

    Variable time_to_last_package has 6 link(s):
        (time_avg_order -7): max_pval = 0.00304, |min_val| =  0.033
        (relative_dis_to_last_package -9): max_pval = 0.00566, |min_val| =  0.031
        (typecode_enc -2): max_pval = 0.00742, |min_val| =  0.030
        (time_avg_order -8): max_pval = 0.01799, |min_val| =  0.026
        (dis_avg_day -8): max_pval = 0.03197, |min_val| =  0.024
        (time_to_last_package -5): max_pval = 0.03480, |min_val| =  0.024

    Variable dis_to_last_package has 3 link(s):
        (time_avg_order -2): max_pval = 0.00911, |min_val| =  0.029
        (typecode_enc -2): max_pval = 0.01478, |min_val| =  0.027
        (typecode_enc -9): max_pval = 0.02402, |min_val| =  0.025

    Variable relative_dis_to_last_package has 3 link(s):
        (relative_dis_to_last_package -8): max_pval = 0.00000, |min_val| =  0.053
        (relative_dis_to_last_package -9): max_pval = 0.00385, |min_val| =  0.032
        (dis_to_last_package -6): max_pval = 0.02212, |min_val| =  0.026

    Variable typecode_enc has 5 link(s):
        (dis_to_last_package -1): max_pval = 0.00791, |min_val| =  0.030
        (finish_time_minute -8): max_pval = 0.01158, |min_val| =  0.028
        (days_enc -2): max_pval = 0.01230, |min_val| =  0.028
        (time_avg_order -1): max_pval = 0.01494, |min_val| =  0.027
        (dis_to_last_package -9): max_pval = 0.02954, |min_val| =  0.024

    Variable from_city_name_enc has 0 link(s):

    Variable days_enc has 2 link(s):
        (days_enc -1): max_pval = 0.00000, |min_val| =  0.702
        (time_avg_order -3): max_pval = 0.01090, |min_val| =  0.028

    Variable finish_time_minute has 9 link(s):
        (finish_time_minute -1): max_pval = 0.00000, |min_val| =  0.311
        (finish_time_minute -2): max_pval = 0.00000, |min_val| =  0.195
        (finish_time_minute -3): max_pval = 0.00000, |min_val| =  0.106
        (finish_time_minute -4): max_pval = 0.00000, |min_val| =  0.098
        (finish_time_minute -5): max_pval = 0.00000, |min_val| =  0.058
        (finish_time_minute -6): max_pval = 0.00030, |min_val| =  0.040
        (time_to_last_package -1): max_pval = 0.00208, |min_val| =  0.034
        (finish_time_minute -9): max_pval = 0.00228, |min_val| =  0.034
        (finish_time_minute -7): max_pval = 0.01837, |min_val| =  0.026

##
## Step 2: PC algorithm with contemp. conditions and MCI tests
##

Parameters:

independence test = par_corr
tau_min = 1
tau_max = 10
pc_alpha = 0.05
contemp_collider_rule = majority
conflict_resolution = True
reset_lagged_links = False
max_conds_dim = None
max_conds_py = None
max_conds_px = None
max_conds_px_lagged = None
## Significant links at alpha = 0.05:


    Variable todo_task has 10 link(s):
        (todo_task -1): pval = 0.00000 | val =  0.195
        (todo_task -2): pval = 0.00000 | val =  0.149
        (todo_task -3): pval = 0.00000 | val =  0.128
        (todo_task -4): pval = 0.00000 | val =  0.088
        (todo_task -5): pval = 0.00000 | val =  0.082
        (todo_task -7): pval = 0.00000 | val =  0.063
        (todo_task -6): pval = 0.00001 | val =  0.050
        (todo_task -8): pval = 0.00001 | val =  0.049
        (todo_task -9): pval = 0.01187 | val =  0.028
        (todo_task -10): pval = 0.01282 | val =  0.028

    Variable todo_task_num has 0 link(s):

    Variable time_avg_order has 1 link(s):
        (dis_avg_day -2): pval = 0.01079 | val =  0.029

    Variable dis_avg_day has 5 link(s):
        (dis_avg_day -9): pval = 0.00004 | val = -0.046
        (typecode_enc -9): pval = 0.01287 | val =  0.028
        (relative_dis_to_last_package -3): pval = 0.01311 | val =  0.028
        (todo_task -6): pval = 0.03045 | val =  0.024
        (dis_to_last_package -4): pval = 0.03397 | val = -0.024

    Variable time_to_last_package has 6 link(s):
        (time_avg_order -7): pval = 0.00282 | val = -0.033
        (relative_dis_to_last_package -9): pval = 0.00431 | val =  0.032
        (time_avg_order -8): pval = 0.00595 | val = -0.031
        (dis_avg_day -8): pval = 0.00974 | val = -0.029
        (typecode_enc -2): pval = 0.01082 | val = -0.029
        (time_to_last_package -5): pval = 0.02863 | val =  0.024

    Variable dis_to_last_package has 3 link(s):
        (time_avg_order -2): pval = 0.00970 | val =  0.029
        (typecode_enc -2): pval = 0.01587 | val =  0.027
        (typecode_enc -9): pval = 0.02827 | val =  0.025

    Variable relative_dis_to_last_package has 3 link(s):
        (relative_dis_to_last_package -8): pval = 0.00000 | val =  0.053
        (relative_dis_to_last_package -9): pval = 0.00406 | val =  0.032
        (dis_to_last_package -6): pval = 0.02261 | val = -0.026

    Variable typecode_enc has 5 link(s):
        (dis_to_last_package -1): pval = 0.01021 | val =  0.029
        (time_avg_order -1): pval = 0.01300 | val =  0.028
        (dis_to_last_package -9): pval = 0.02787 | val = -0.025
        (days_enc -2): pval = 0.03343 | val =  0.024
        (finish_time_minute -8): pval = 0.04725 | val = -0.022

    Variable from_city_name_enc has 0 link(s):

    Variable days_enc has 2 link(s):
        (days_enc -1): pval = 0.00000 | val =  0.702
        (time_avg_order -3): pval = 0.01167 | val = -0.028

    Variable finish_time_minute has 7 link(s):
        (finish_time_minute -1): pval = 0.00000 | val =  0.310
        (finish_time_minute -2): pval = 0.00000 | val =  0.193
        (finish_time_minute -3): pval = 0.00000 | val =  0.104
        (finish_time_minute -4): pval = 0.00000 | val =  0.095
        (finish_time_minute -5): pval = 0.00000 | val =  0.055
        (time_to_last_package -1): pval = 0.00000 | val = -0.055
        (finish_time_minute -6): pval = 0.00184 | val =  0.035

finish_time(t−1) → finish_time(t) --> delay propagation with coefficient ≈ 0.31

time_to_last_package(t−1) → finish_time(t) --> has negatice coefficient = if last package is closer in time → finish time reduces & if last package is far → delay increases