In [1]:
# Load the cleaned orders data from CSV into a DataFrame
import pandas as pd
orders_full_data = pd.read_csv(
    filepath_or_buffer="orders_full_data_cleaned.csv",
)

In [2]:
# Parse the purchase timestamp column into pandas datetime values
# so that date arithmetic and max() work on real timestamps.
orders_full_data["order_purchase_timestamp"] = pd.to_datetime(
    arg=orders_full_data["order_purchase_timestamp"],
)

In [3]:
# Most recent purchase timestamp per unique customer.
# Result: one row per customer_unique_id with a last_purchase_date column.
last_purchase = (
    orders_full_data
    .groupby("customer_unique_id")["order_purchase_timestamp"]
    .max()
    .rename("last_purchase_date")  # name the aggregated column before flattening the index
    .reset_index()
)

In [4]:
# Reference (snapshot) date for recency calculations:
# the latest purchase timestamp present in the dataset.
snapshot_date = (
    orders_full_data
    .loc[:, "order_purchase_timestamp"]
    .max()
)

In [5]:
# Whole days elapsed between each customer's last purchase and the snapshot date
last_purchase["days_since_last_purchase"] = (
    snapshot_date - last_purchase["last_purchase_date"]
).dt.days

In [6]:
# Create Churn Label
# A customer is labelled as churned when strictly more than
# CHURN_THRESHOLD_DAYS days (about 6 months) have passed between their
# last purchase and the snapshot date.
CHURN_THRESHOLD_DAYS = 180  # tune here to change the churn definition

# Single assignment (no intermediate boolean column): 1 = churned, 0 = active
last_purchase["churn"] = (
    last_purchase["days_since_last_purchase"] > CHURN_THRESHOLD_DAYS
).astype(int)

In [7]:
# Join Churn Label Back to Orders Data
# Every order row receives the churn flag of its customer. The left join
# keeps all order rows; validate="many_to_one" makes pandas raise a
# MergeError if customer_unique_id is not unique in last_purchase, instead
# of silently duplicating order rows.
orders_labeled = orders_full_data.merge(
    last_purchase[["customer_unique_id", "churn"]],
    on="customer_unique_id",
    how="left",
    validate="many_to_one",
)
# Preview the result
orders_labeled.head()

Unnamed: 0,order_id,customer_id,order_status,order_purchase_timestamp,order_approved_at,order_delivered_carrier_date,order_delivered_customer_date,order_estimated_delivery_date,customer_unique_id,customer_zip_code_prefix,...,payment_type,payment_installments,payment_value,order_item_id,product_id,seller_id,shipping_limit_date,price,freight_value,churn
0,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,credit_card,1.0,18.12,1.0,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72,1
1,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,voucher,1.0,2.0,1.0,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72,1
2,e481f51cbdc54678b7cc49136f2d6af7,9ef432eb6251297304e76186b10a928d,delivered,2017-10-02 10:56:33,2017-10-02 11:07:15,2017-10-04 19:55:00,2017-10-10 21:25:13,2017-10-18 00:00:00,7c396fd4830fd04220f754e42b4e5bff,3149,...,voucher,1.0,18.59,1.0,87285b34884572647811a353c7ac498a,3504c0cb71d7fa48d967e0e4c94d59d9,2017-10-06 11:07:15,29.99,8.72,1
3,53cdb2fc8bc7dce0b6741e2150273451,b0830fb4747a6c6d20dea0b8c802d7ef,delivered,2018-07-24 20:41:37,2018-07-26 03:24:27,2018-07-26 14:31:00,2018-08-07 15:27:45,2018-08-13 00:00:00,af07308b275d755c9edb36a90c618231,47813,...,boleto,1.0,141.46,1.0,595fac2a385ac33a80bd5114aec74eb8,289cdb325fb7e7f891c38608bf9e0962,2018-07-30 03:24:27,118.7,22.76,0
4,47770eb9100c2d0c44946d9cf07ec65d,41ce2a54c0b03bf3443c3d931a367089,delivered,2018-08-08 08:38:49,2018-08-08 08:55:23,2018-08-08 13:50:00,2018-08-17 18:06:29,2018-09-04 00:00:00,3a653a41f6f9fc3d2a113cf8398680e8,75265,...,credit_card,3.0,179.12,1.0,aa4383b373c6aca5d8797843e5594415,4869f7a5dfa277a7dca6462dcf3b52b2,2018-08-13 08:55:23,159.9,19.22,0


In [8]:
# Select 5 random rows as a spot check of the churn label.
# A fixed random_state keeps the preview identical across
# Restart Kernel -> Run All, so the displayed rows are reproducible.
orders_labeled[["customer_unique_id", "order_purchase_timestamp", "churn"]].sample(5, random_state=42)

Unnamed: 0,customer_unique_id,order_purchase_timestamp,churn
83417,d64729c1f8834e961018eb9378724361,2018-01-21 13:11:51,1
73442,a4f095a4403c30562bc368a7dfc5d3e7,2017-10-17 12:45:51,1
19973,a7115a4c93766e22c9aeb8c22f094dba,2018-03-19 21:46:23,0
90701,f7ca6b044cc46ab9fe26471d63cfe270,2018-08-11 14:31:30,0
38029,7fde9b9f9d4350aa6d569b24c82c49ab,2017-07-19 01:08:24,1


In [9]:
# Persist the order-level data, now carrying the churn label, to CSV
# (index=False: the DataFrame index is not written as a column)
orders_labeled.to_csv(
    path_or_buf="orders_labeled.csv",
    index=False,
)