<a href="https://colab.research.google.com/github/anjelisa01/test_colab/blob/main/test1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Replace TOKEN and USERNAME
token = ""  # your GitHub token
username = "anjelisa01"
repo = "test_colab"

!git clone https://{username}:{token}@github.com/{username}/{repo}.git
%cd {repo}


Cloning into 'test_colab'...
remote: Enumerating objects: 3, done.[K
remote: Counting objects: 100% (3/3), done.[K
remote: Total 3 (delta 0), reused 0 (delta 0), pack-reused 0 (from 0)[K
Receiving objects: 100% (3/3), done.
/content/test_colab


In [17]:
# Install MLflow (if not already)
!pip install -q mlflow

# Imports
import pandas as pd
import numpy as np
from sklearn.datasets import load_diabetes
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold, SelectKBest, f_regression
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
import mlflow
import mlflow.sklearn
import joblib


In [22]:
def load_and_split_data():
    """Load the scikit-learn diabetes dataset and split it into train/test parts.

    The features and target are requested as pandas objects (``as_frame=True``).

    Returns:
        The result of ``train_test_split``: training features, test features,
        training target, test target. 20% of the rows are held out and the
        shuffle uses ``random_state=42``.
    """
    features, target = load_diabetes(return_X_y=True, as_frame=True)
    split = train_test_split(features, target, test_size=0.2, random_state=42)
    return split

def engineer_features(X):
    """Return a new frame equal to ``X`` plus a ``bmi_squared`` column.

    Args:
        X: DataFrame containing a ``bmi`` column. It is not modified.

    Returns:
        A copy of ``X`` in which ``bmi_squared`` holds ``bmi ** 2`` for each row.
    """
    return X.assign(bmi_squared=X["bmi"] ** 2)

def remove_low_variance_features(X_train, X_test, threshold=0.0001):
    """Filter both splits with a ``VarianceThreshold`` fitted on the training split.

    Args:
        X_train: Training features as a DataFrame (its ``columns`` are used to
            report the surviving feature names).
        X_test: Test features with the same columns as ``X_train``.
        threshold: Variance cut-off passed to ``VarianceThreshold``.

    Returns:
        A tuple ``(train, test, names)``: the transformed training data, the
        transformed test data, and the list of kept column names. If the
        selector raises ``ValueError``, a warning is printed and the unfiltered
        ``.values`` of both splits are returned with all column names.
    """
    variance_filter = VarianceThreshold(threshold=threshold)
    try:
        train_kept = variance_filter.fit_transform(X_train)
        test_kept = variance_filter.transform(X_test)
        kept_mask = variance_filter.get_support()
        kept_names = X_train.columns[kept_mask].tolist()
    except ValueError:
        # Fallback: hand back the data unfiltered when the selector rejects it.
        print("⚠️ No features passed variance threshold — skipping variance filter.")
        return X_train.values, X_test.values, X_train.columns.tolist()
    else:
        return train_kept, test_kept, kept_names


In [19]:
def run_baseline_models(X, y, selected_features):
    """Cross-validate three default regressors and return the two best.

    Ridge, RandomForestRegressor and SVR are each placed behind a
    ``StandardScaler`` and scored with 5-fold cross-validation. Every model gets
    its own MLflow run in the ``ML_Workflow_Functional`` experiment.

    Args:
        X: Training features.
        y: Training target.
        selected_features: Feature names; only recorded as an MLflow tag.

    Returns:
        A list with the two ``(model_name, cv_rmse)`` pairs that have the lowest
        RMSE, best first. The RMSE is the square root of the mean fold MSE.
    """
    candidates = {
        "Ridge": Ridge(),
        "RandomForest": RandomForestRegressor(random_state=42),
        "SVR": SVR()
    }

    mlflow.set_experiment("ML_Workflow_Functional")

    leaderboard = []
    for label, estimator in candidates.items():
        pipeline = Pipeline([("scale", StandardScaler()), ("model", estimator)])
        neg_mse_per_fold = cross_val_score(
            pipeline, X, y, cv=5, scoring='neg_mean_squared_error'
        )
        # The scorer returns negated MSE, so flip the sign before the root.
        cv_rmse = np.sqrt(-neg_mse_per_fold.mean())

        with mlflow.start_run(run_name=f"Baseline_{label}"):
            mlflow.log_param("model", label)
            mlflow.log_metric("cv_rmse", cv_rmse)
            mlflow.set_tag("features", str(selected_features))

        leaderboard.append((label, cv_rmse))

    # Lowest RMSE first; keep the two leaders.
    leaderboard.sort(key=lambda entry: entry[1])
    return leaderboard[:2]

In [24]:
def tune_top_models(top_models, X_train, y_train, X_test, y_test):
    """Grid-search each shortlisted model and return the best fitted pipeline.

    Each tunable model is wrapped in a scale -> SelectKBest -> model pipeline,
    tuned with 5-fold ``GridSearchCV`` on the training data, scored on the test
    data, and logged to MLflow under a ``Tuned_<name>`` run (parameters, test
    RMSE and the fitted pipeline).

    Args:
        top_models: Iterable of ``(name, score)`` pairs; only ``name`` is used.
            Names without a tuning config (anything other than ``"Ridge"`` and
            ``"RandomForest"``) are skipped with a printed notice.
        X_train: 2-D training features used for the grid search.
        y_train: Training target.
        X_test: Held-out features used to compute ``test_rmse``.
        y_test: Held-out target.

    Returns:
        The refitted ``best_estimator_`` with the lowest test RMSE, or ``None``
        when no entry of ``top_models`` has a tuning config.
    """
    configs = {
        "Ridge": {
            "model": Ridge(),
            "param_grid": {"model__alpha": [0.1, 1.0, 10.0]}
        },
        "RandomForest": {
            "model": RandomForestRegressor(random_state=42),
            "param_grid": {
                "model__n_estimators": [50, 100],
                "model__max_depth": [3, None]
            }
        }
    }

    # SelectKBest cannot keep more features than exist. Cap k so the pipeline
    # still works when upstream filtering leaves fewer than 8 columns.
    n_features = np.shape(X_train)[1]
    k_best = min(8, n_features)

    best_model = None
    best_score = float("inf")

    for name, _ in top_models:
        if name not in configs:
            # Make the skip visible instead of silently dropping the model.
            print(f"⚠️ No tuning config for '{name}' — skipping.")
            continue

        model = configs[name]["model"]
        grid = configs[name]["param_grid"]

        pipe = Pipeline([
            ("scale", StandardScaler()),
            ("select", SelectKBest(score_func=f_regression, k=k_best)),
            ("model", model)
        ])

        search = GridSearchCV(pipe, grid, cv=5, scoring="neg_mean_squared_error", n_jobs=-1)

        with mlflow.start_run(run_name=f"Tuned_{name}"):
            search.fit(X_train, y_train)
            best_pipe = search.best_estimator_
            y_pred = best_pipe.predict(X_test)
            rmse = np.sqrt(mean_squared_error(y_test, y_pred))

            mlflow.log_param("model", name)
            mlflow.log_params(search.best_params_)
            mlflow.log_metric("test_rmse", rmse)
            mlflow.sklearn.log_model(best_pipe, "model")

            # NOTE(review): the winner is chosen by test-set RMSE, so the
            # reported test_rmse of the returned model is an optimistic
            # estimate. Comparing search.best_score_ (CV) would avoid that.
            if rmse < best_score:
                best_score = rmse
                best_model = best_pipe

    return best_model

In [25]:
def run_workflow():
    """Run the whole workflow and save the best tuned pipeline to disk.

    Steps: load and split the data, add engineered features, drop low-variance
    features, rank the baseline models, tune the top models, then write the
    winner to ``best_model.pkl`` in the current working directory.

    Note that feature engineering and the variance filter are applied here,
    outside the saved pipeline, so data passed to the saved model later must
    have the same columns as the filtered training data.

    Raises:
        RuntimeError: If tuning returned no model, so that ``None`` is never
            pickled and reported as a success.
    """
    X_train_raw, X_test_raw, y_train, y_test = load_and_split_data()

    X_train_fe = engineer_features(X_train_raw)
    X_test_fe = engineer_features(X_test_raw)

    X_train_sel, X_test_sel, selected = remove_low_variance_features(X_train_fe, X_test_fe)

    top_models = run_baseline_models(X_train_sel, y_train, selected)
    best_model = tune_top_models(top_models, X_train_sel, y_train, X_test_sel, y_test)

    if best_model is None:
        raise RuntimeError("No model was tuned; nothing to save.")

    joblib.dump(best_model, "best_model.pkl")
    print("✅ Best model saved as 'best_model.pkl'")

# Execute the full workflow; on success it writes 'best_model.pkl' to the
# current working directory and prints a confirmation.
run_workflow()




✅ Best model saved as 'best_model.pkl'


In [26]:
import joblib

# Load the fitted pipeline that run_workflow() saved as 'best_model.pkl'.
# joblib.load unpickles the file, so only load model files you trust.
best_model = joblib.load("best_model.pkl")
print("Best model loaded successfully!")


Best model loaded successfully!


In [32]:
import pandas as pd
from sklearn.datasets import load_diabetes

# Reload the original (un-engineered) feature frame for demonstration.
X, _ = load_diabetes(return_X_y=True, as_frame=True)

# Use the first 5 rows as the prediction sample.
sample_data = X.head(5)

# NOTE(review): engineer_features() and the variance filter run inside
# run_workflow(), not inside the saved pipeline (scale -> select -> model), so
# the input here must have the same columns the pipeline was fitted on.
# The prediction cell below runs on these 10 raw columns, which suggests the
# variance filter dropped one of the 11 engineered columns, presumably
# bmi_squared. Confirm this before relying on it.
# If the fitted pipeline does expect bmi_squared, add it on a copy first, e.g.:
#   sample_data = engineer_features(sample_data)

print("Sample data used for prediction:")
print(sample_data)


Sample data used for prediction:
        age       sex       bmi        bp        s1        s2        s3  \
0  0.038076  0.050680  0.061696  0.021872 -0.044223 -0.034821 -0.043401   
1 -0.001882 -0.044642 -0.051474 -0.026328 -0.008449 -0.019163  0.074412   
2  0.085299  0.050680  0.044451 -0.005670 -0.045599 -0.034194 -0.032356   
3 -0.089063 -0.044642 -0.011595 -0.036656  0.012191  0.024991 -0.036038   
4  0.005383 -0.044642 -0.036385  0.021872  0.003935  0.015596  0.008142   

         s4        s5        s6  
0 -0.002592  0.019907 -0.017646  
1 -0.039493 -0.068332 -0.092204  
2 -0.002592  0.002861 -0.025930  
3  0.034309  0.022688 -0.009362  
4 -0.002592 -0.031988 -0.046641  


In [33]:
# Get predictions on the sample data.
# Relies on `best_model` and `sample_data` defined by the two cells above;
# the printed array holds one predicted value per row of sample_data.
predictions = best_model.predict(sample_data)
print("Predictions for the sample data:")
print(predictions)


Predictions for the sample data:
[207.58086739  89.90700731 176.47171505 163.973325   100.92764622]


