### 🛠️ Task 1: Train a simple regression model

In [0]:
import mlflow
from mlflow import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score

In [0]:
# Materialise the gold-layer events table from Spark as a pandas DataFrame
gold_events = spark.table("ecommerce_catalog.default.events_gold")
df = gold_events.toPandas()

# Target column (`purchases`) and the model inputs (`views`, `carts`)
y = df["purchases"]
X = df[["views", "carts"]]

In [0]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

### Task 2: Log parameters, metrics, and model (MLflow)

In [0]:

# Load the gold-layer events table into pandas.
df = spark.table("ecommerce_catalog.default.events_gold").toPandas()

# Keep the feature list and split ratio in one place so the values logged to
# MLflow cannot drift from the values actually used for training.
feature_cols = ["views", "carts"]
test_size = 0.2

X = df[feature_cols]
y = df["purchases"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

with mlflow.start_run(run_name="linear_regression_v1"):

    # Parameters describing this run.
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("features", ",".join(feature_cols))
    mlflow.log_param("test_size", test_size)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the fitted model on the held-out rows.
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("r2_score", r2)

    # Log the fitted model itself so the run carries a model artifact in
    # addition to its params and metrics. The input example lets MLflow
    # infer the model signature.
    mlflow.sklearn.log_model(model, "model", input_example=X_train.head(5))


print(f"R² Score: {r2:.4f}")


R² Score: 0.9821


### Task 3: View in MLflow UI (done by opening the Experiments section under AI/ML)

### Task 4: Compare runs

In [0]:
# The `revenue` column contains NaN values, which LinearRegression cannot
# fit on, so missing values are replaced with 0 before training.
df = spark.table("ecommerce_catalog.default.events_gold").toPandas()

# NOTE(review): this also fills missing `purchases` (the target) with 0,
# i.e. unknown labels are treated as "no purchases" - confirm that is intended.
df = df.fillna({
    "revenue": 0,
    "views": 0,
    "carts": 0,
    "purchases": 0
})

# Keep the feature list and split ratio in one place so the values logged to
# MLflow cannot drift from the values actually used for training.
# NOTE(review): if `revenue` is computed from purchases, using it as a feature
# leaks the target into the inputs - verify against the table definition.
feature_cols = ["views", "carts", "revenue"]
test_size = 0.2

X = df[feature_cols]
y = df["purchases"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=test_size, random_state=42
)

with mlflow.start_run(run_name="linear_regression_v2"):

    # Parameters describing this run.
    mlflow.log_param("model_type", "LinearRegression")
    mlflow.log_param("features", ", ".join(feature_cols))
    mlflow.log_param("test_size", test_size)

    model = LinearRegression()
    model.fit(X_train, y_train)

    # Score the fitted model on the held-out rows.
    y_pred = model.predict(X_test)
    r2 = r2_score(y_test, y_pred)

    mlflow.log_metric("r2_score", r2)

    # Log the fitted model itself so the run carries a model artifact in
    # addition to its params and metrics. The input example lets MLflow
    # infer the model signature.
    mlflow.sklearn.log_model(model, "model", input_example=X_train.head(5))


print(f"R² Score: {r2:.4f}")



R² Score: 0.9807
