**Linear Regression: Mathematical (Closed-Form Least-Squares) Approach**

In [0]:
from pyspark.sql import functions as F
# Load the per-reading feature table and keep only the columns the
# closed-form regression needs.
df_toner = spark.table("toner_regression_features")

# The toner percentage is exposed under the name "typical"; the other four
# columns pass through unchanged.
math_columns = ["deviceId", "color", "timestamp", "days_since_start"]
df_math_toner = df_toner.select(
    *math_columns,
    F.col("toner_pct_remaining").alias("typical"),
)
# Per (deviceId, color) sufficient statistics for fitting the line
# typical = m * days_since_start + c:
#   n      - number of rows in the group
#   sum_t  - sum of days_since_start
#   sum_y  - sum of typical
#   sum_ty - sum of days_since_start * typical
#   sum_t2 - sum of days_since_start squared
# plus the group's earliest timestamp as start_date.
# NOTE(review): count("*") counts every row while the sums skip NULLs, so the
# statistics only line up if neither input column contains NULLs - confirm
# against the feature table.
t_col = F.col("days_since_start")
y_col = F.col("typical")
toner_aggregates = [
    F.count("*").alias("n"),
    F.sum("days_since_start").alias("sum_t"),
    F.sum("typical").alias("sum_y"),
    F.sum(t_col * y_col).alias("sum_ty"),
    F.sum(t_col ** 2).alias("sum_t2"),
    F.min("timestamp").alias("start_date"),
]
df_stats_toner = df_math_toner.groupBy("deviceId", "color").agg(*toner_aggregates)
# Closed-form ordinary least squares per (deviceId, color):
#   m = (n * sum_ty - sum_t * sum_y) / (n * sum_t2 - sum_t^2)
#   c = (sum_y - m * sum_t) / n
slope_numerator = F.col("n") * F.col("sum_ty") - F.col("sum_t") * F.col("sum_y")
slope_denominator = F.col("n") * F.col("sum_t2") - F.col("sum_t") ** 2

# The denominator is 0 when a group has a single reading or when all of its
# readings share the same days_since_start. Mapping that 0 to NULL makes m
# (and therefore c) NULL for such groups instead of depending on the session's
# divide-by-zero behaviour (an error when spark.sql.ansi.enabled is true).
safe_slope_denominator = F.when(slope_denominator != 0, slope_denominator)

df_regression_toner = (
    df_stats_toner
    .withColumn("m", slope_numerator / safe_slope_denominator)
    .withColumn(
        "c",
        (F.col("sum_y") - F.col("m") * F.col("sum_t")) / F.col("n"),
    )
)
# The fitted line typical = m * t + c reaches 0 at t = -c / m, expressed in
# the same units as days_since_start.
# A slope of exactly 0 never reaches empty: map it to a NULL divisor so
# days_to_empty is NULL rather than depending on the session's divide-by-zero
# behaviour (an error when spark.sql.ansi.enabled is true).
nonzero_slope = F.when(F.col("m") != 0, F.col("m"))
df_prediction_toner = df_regression_toner.withColumn(
    "days_to_empty",
    -F.col("c") / nonzero_slope
)
# Project the empty date: start_date as epoch seconds plus days_to_empty
# converted to seconds.
# NOTE(review): this assumes days_since_start is measured from each group's
# earliest timestamp (start_date) - confirm against the feature table.
SECONDS_PER_DAY = 86400
predicted_end_epoch = (
    F.unix_timestamp(F.col("start_date")) +
    (F.col("days_to_empty") * F.lit(SECONDS_PER_DAY))
)
df_prediction1 = df_prediction_toner.withColumn(
    "predicted_end_date",
    # from_unixtime returns a formatted string; cast it so the column is a
    # real timestamp, matching the MLlib-based prediction in this notebook.
    F.from_unixtime(predicted_end_epoch).cast("timestamp")
)

# Only a negative slope (toner decreasing over time) gives a meaningful
# empty-date prediction.
df_prediction1.filter(F.col("m") < 0).display()



deviceId,color,n,sum_t,sum_y,sum_ty,sum_t2,start_date,m,c,days_to_empty,predicted_end_date
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,black,30,837.1456828703704,2408.0,63383.545810185184,29985.376968292258,2025-10-03T00:14:01.833Z,-0.5753023500526138,96.3203959563908,167.42569528454368,2026-03-19 10:27:01
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,cyan,19,540.8654166666668,1563.0,42434.05409722222,20145.94107593825,2025-10-08T18:08:41.101Z,-0.4335849998667876,94.60584903228244,218.19446950735997,2026-05-14 22:48:43
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,magenta,20,618.7375578703704,1646.0,48744.81574074074,24838.31039299648,2025-10-09T18:14:01.325Z,-0.3822144233884846,94.12452094551114,246.26103879351123,2026-06-13 00:29:54
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,yellow,16,499.7383680555556,1298.0,38857.65313657408,20606.806575198127,2025-10-09T18:14:01.325Z,-0.3368487391666217,91.64601494954368,272.0687486504473,2026-07-08 19:53:00
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,black,17,305.0610532407408,1496.0,25784.145104166662,8389.289464859692,2025-10-29T06:10:09.734Z,-0.3640525325739666,94.53283817776314,259.66812401876746,2026-07-15 22:12:14
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,cyan,3,82.44924768518518,225.0,6140.9922916666665,3177.76471098961,2025-11-03T21:10:38.132Z,-0.0468315856622417,76.28707633525205,1628.9663323691154,2030-04-20 20:22:09
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,magenta,3,67.49509259259258,219.0,4908.89181712963,1687.4052168319722,2025-11-10T15:08:02.838Z,-0.1080670881352244,75.43133270663294,698.0046747650466,2027-10-09 15:14:45
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,yellow,4,90.37185185185186,290.0,6504.649606481482,2497.151582582358,2025-11-04T18:12:30.312Z,-0.1038896534341337,74.84717509227244,720.4487898280046,2027-10-26 04:58:45
mn=QlA1MEM1NQ==:sn=NDMwMDYwOTcwMA==,black,62,4819.618622685186,2840.0,194518.8464583333,494231.754093603,2025-07-03T21:12:53.219Z,-0.2195341683074751,62.872112353071145,286.38873318805537,2026-04-16 06:32:39
mn=QlA1MEM1NQ==:sn=NDMwMDYwOTcwMA==,cyan,53,4329.982083333332,2371.0,154619.2961226852,469477.336743865,2025-07-02T15:14:29.185Z,-0.3377429828195278,72.32869932792667,214.1530779532889,2026-02-01 18:54:54


In [0]:
from pyspark.sql import functions as F
# Source rows for the MLlib fit.
df_lr_model = spark.table("toner_regression_features")

from pyspark.ml.feature import VectorAssembler

# MLlib estimators read their inputs from a single vector column, so wrap
# days_since_start in a one-element "features" vector.
assembler = VectorAssembler(inputCols=["days_since_start"], outputCol="features")
df_assembled = assembler.transform(df_lr_model)

# Keep the grouping keys, the reading timestamp, the toner percentage
# (renamed to "label") and the assembled feature vector.
df_ml = df_assembled.select(
    F.col("deviceId"),
    F.col("color"),
    F.col("timestamp"),
    F.col("toner_pct_remaining").alias("label"),
    F.col("features"),
)

from pyspark.ml.regression import LinearRegression
# Fit one linear regression per (deviceId, color) and collect
# (deviceId, color, slope, intercept, days_to_empty, start_date) tuples for
# the groups whose toner level is decreasing.

# A single job computes every group's row count and earliest timestamp, so the
# loop below only has to run the fit itself. Ordering the keys makes the order
# of `results` deterministic.
distinct_pairs = (
    df_ml
    .groupBy("deviceId", "color")
    .agg(
        F.count("*").alias("n_points"),
        F.min("timestamp").alias("start_date"),
    )
    .orderBy("deviceId", "color")
    .collect()
)

# The estimator only holds configuration, so one instance serves every group.
lr1 = LinearRegression(
    featuresCol="features",
    labelCol="label",
    fitIntercept=True
)

results = []
for row in distinct_pairs:
    device = row["deviceId"]
    color = row["color"]

    # Need at least 2 points for regression
    if row["n_points"] < 2:
        continue

    # eqNullSafe treats NULL keys as equal, which is how groupBy formed the
    # groups; a plain == would match no rows for a NULL deviceId/color.
    df_group = df_ml.filter(
        F.col("deviceId").eqNullSafe(device) &
        F.col("color").eqNullSafe(color)
    )

    model = lr1.fit(df_group)

    # coefficients[0] is a numpy scalar; store plain Python floats so the
    # tuples map cleanly onto DoubleType columns.
    m1 = float(model.coefficients[0])   # slope
    c1 = float(model.intercept)         # intercept

    # Skip invalid models: only a strictly negative slope ever reaches empty.
    # Written as `not m1 < 0` so a NaN slope is skipped as well.
    if not m1 < 0:
        continue

    start_date = row["start_date"]
    days_to_empty = -c1 / m1

    results.append(
        (device, color, m1, c1, days_to_empty, start_date)
    )

from pyspark.sql.types import (
    StructType, StructField,
    StringType, DoubleType, TimestampType
)
# Schema for the per-group regression result rows; every field is nullable.
schema1 = StructType([
    StructField(field_name, field_type, True)
    for field_name, field_type in (
        ("deviceId", StringType()),
        ("color", StringType()),
        ("slope_m", DoubleType()),
        ("intercept_c", DoubleType()),
        ("days_to_empty", DoubleType()),
        ("start_date", TimestampType()),
    )
])

# Turn the collected regression rows into a DataFrame and add the projected
# empty date: start_date in epoch seconds plus days_to_empty converted to
# seconds, formatted by from_unixtime and cast back to a timestamp.
end_epoch_seconds = F.unix_timestamp("start_date") + (F.col("days_to_empty") * 86400)
df_predictions1 = (
    spark.createDataFrame(results, schema1)
    .withColumn(
        "predicted_end_date",
        F.from_unixtime(end_epoch_seconds).cast("timestamp"),
    )
)

# Show only the columns relevant to the prediction.
report_columns = ["deviceId", "color", "days_to_empty", "start_date", "predicted_end_date"]
df_predictions1.select(*report_columns).display()


deviceId,color,days_to_empty,start_date,predicted_end_date
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,black,167.42569528454348,2025-10-03T00:14:01.833Z,2026-03-19T10:27:01.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,cyan,218.19446950736096,2025-10-08T18:08:41.101Z,2026-05-14T22:48:43.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,magenta,246.26103879351183,2025-10-09T18:14:01.325Z,2026-06-13T00:29:54.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5MzcwMA==,yellow,272.0687486504478,2025-10-09T18:14:01.325Z,2026-07-08T19:53:00.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,black,259.66812401876746,2025-10-29T06:10:09.734Z,2026-07-15T22:12:14.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,cyan,1628.9663323690856,2025-11-03T21:10:38.132Z,2030-04-20T20:22:09.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,magenta,698.0046747650362,2025-11-10T15:08:02.838Z,2027-10-09T15:14:45.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDY5OTYwMA==,yellow,720.4487898279989,2025-11-04T18:12:30.312Z,2027-10-26T04:58:45.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDYwOTcwMA==,black,286.3887331880554,2025-07-03T21:12:53.219Z,2026-04-16T06:32:39.000Z
mn=QlA1MEM1NQ==:sn=NDMwMDYwOTcwMA==,cyan,214.15307795328889,2025-07-02T15:14:29.185Z,2026-02-01T18:54:54.000Z
