In [1]:
import pandas as pd
import sqlite3
from datetime import datetime

conn = sqlite3.connect("ecommerce.db")


In [3]:
query_customer_base = """
SELECT
    c.customer_unique_id,
    COUNT(DISTINCT o.order_id) AS total_orders,
    SUM(p.payment_value) AS total_spent,
    AVG(r.review_score) AS avg_review_score,
    MAX(o.order_purchase_timestamp) AS last_purchase_date
FROM orders o
JOIN customers c
    ON o.customer_id = c.customer_id
LEFT JOIN payments p
    ON o.order_id = p.order_id
LEFT JOIN reviews r
    ON o.order_id = r.order_id
GROUP BY c.customer_unique_id
"""
customer_base = pd.read_sql_query(query_customer_base, conn)


In [4]:
customer_base["last_purchase_date"] = pd.to_datetime(
    customer_base["last_purchase_date"]
)


In [7]:
reference_date = customer_base["last_purchase_date"].max()

customer_base["recency_days"] = (
    reference_date - customer_base["last_purchase_date"]
).dt.days


In [9]:
customer_base.rename(
    columns={
        "total_orders": "frequency",
        "total_spent": "monetary"
    },
    inplace=True
)


In [11]:
customer_base["is_repeat_customer"] = (
    customer_base["frequency"] > 1
).astype(int)


In [13]:
value_threshold = customer_base["monetary"].quantile(0.75)
customer_base["is_high_value"] = (
    customer_base["monetary"] >= value_threshold
).astype(int)


In [15]:
customer_base["avg_review_score"] = customer_base[
    "avg_review_score"
].fillna(customer_base["avg_review_score"].median())


In [17]:
customer_base[
    ["recency_days", "frequency", "monetary", "avg_review_score"]
].describe()


Unnamed: 0,recency_days,frequency,monetary,avg_review_score
count,96096.0,96096.0,96095.0,96096.0
mean,287.735691,1.034809,167.349193,4.091808
std,153.414676,0.214384,232.882541,1.338904
min,0.0,1.0,0.0,1.0
25%,163.0,1.0,63.16,4.0
50%,268.0,1.0,108.16,5.0
75%,397.0,1.0,184.075,5.0
max,772.0,17.0,13664.08,5.0


“Most customers purchase only once and do not return for long periods, indicating high churn risk. A small segment of repeat, high-spending customers drives a disproportionate share of revenue.”

This notebook engineers customer-level RFM and behavioral features for churn prediction.

In [26]:
customer_base.to_sql("customer_base", conn, if_exists="replace", index=False)


96096

In [28]:
import sqlite3

conn = sqlite3.connect("ecommerce.db")

customer_base.to_sql("customer_base", conn, if_exists="replace", index=False)

conn.close()


In [31]:
import os
print(os.path.abspath("ecommerce.db"))


/Users/ananyaravikumar/anaconda_projects/ecommerce-retention-system/notebooks/ecommerce.db
