In [1]:
#### Preparing Training&Testing DataSet
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.datasets import fetch_california_housing

housing = fetch_california_housing()

# Load dataset: one column per feature, plus the regression target as "Price".
data = pd.DataFrame(housing.data, columns=housing.feature_names)
data["Price"] = housing.target
X = data.drop(columns=["Price"])
y = data["Price"]

# Split once. The same test_size/random_state as before yields the same row
# partition, and the CSV export below reuses it instead of splitting twice.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the raw (unscaled) splits to CSV for the SageMaker training job.
# The built-in XGBoost algorithm reads headerless CSV and treats the FIRST
# column as the label, so "Price" must lead the feature columns.
train_data = pd.concat([y_train, X_train], axis=1)
test_data = pd.concat([y_test, X_test], axis=1)
train_data.to_csv("train.csv", index=False, header=False)
test_data.to_csv("test.csv", index=False, header=False)

# Scale features: fit on the training split only, then apply to both splits.
# From here on X_train / X_test are NumPy arrays, not DataFrames.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [2]:
### Deploying the model to mlflow server
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from urllib.parse import urlparse

# Point the MLflow client at the local tracking server.
mlflow.set_tracking_uri(uri="http://127.0.0.1:5003")
# Select the experiment that the run below is recorded under.
mlflow.set_experiment("HousePricingPrediction")

# Hyperparameters are kept in one dict so the values used to build the model
# are exactly the values logged with the run.
rf_params = {"n_estimators": 100, "random_state": 42}

# Start MLflow run
with mlflow.start_run():
    # Train model on the scaled training features.
    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    # Evaluate model on the held-out split.
    y_pred = model.predict(X_test)
    print(y_pred)
    mse = mean_squared_error(y_test, y_pred)

    mlflow.log_params(rf_params)
    mlflow.log_metric("mse", mse)
    # Set a tag that we can use to remind ourselves what this run was for
    mlflow.set_tag("House Pricing Info", "Basic pricing model for california house pricing")
    # Log the fitted model as a run artifact and register it in the model registry.
    mlflow.sklearn.log_model(model, "model", registered_model_name="Best Price Prediction Model")


[0.50871   0.74404   4.9150873 ... 4.839239  0.71801   1.65353  ]




🏃 View run rogue-ram-137 at: http://127.0.0.1:5003/#/experiments/102411229856079510/runs/4c5b9f43d91042e2ad21f1e3e7e76cf7
🧪 View experiment at: http://127.0.0.1:5003/#/experiments/102411229856079510


In [11]:
# Look up the requirements file of the model logged by the most recent run in
# this kernel, rather than a hard-coded run ID that goes stale on every re-run.
last_run = mlflow.last_active_run()
if last_run is None:
    raise RuntimeError("No MLflow run found in this session; run the training cell first.")
mlflow.pyfunc.get_model_dependencies(f"runs:/{last_run.info.run_id}/model")

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

2025/03/19 20:34:46 INFO mlflow.pyfunc: To install the dependencies that were used to train the model, run the following command: '%pip install -r /var/folders/z_/ysww621d3nj_ygd42wyd3xw00000gn/T/tmpo0f3vhnw/model/requirements.txt'.


'/var/folders/z_/ysww621d3nj_ygd42wyd3xw00000gn/T/tmpo0f3vhnw/model/requirements.txt'

In [20]:
## Inference with the model from the model registry

import mlflow.sklearn
# Must match the registered_model_name used when the model was logged;
# the previous value pointed at a differently named (older) registry entry.
model_name="Best Price Prediction Model"
model_version="latest"

model_uri=f"models:/{model_name}/{model_version}"

# NOTE: this rebinds `model` from the freshly trained estimator to the
# registry copy, so later cells predict with the registered version.
model=mlflow.sklearn.load_model(model_uri)
model

Downloading artifacts:   0%|          | 0/5 [00:00<?, ?it/s]

In [21]:
# Score the held-out features with the currently bound `model` and display
# the resulting predictions as the cell output.
y_pred_new = model.predict(X_test)
y_pred_new

array([0.50871  , 0.74404  , 4.9150873, ..., 4.839239 , 0.71801  ,
       1.65353  ], shape=(4128,))

In [None]:
### Upload train/test dataset to S3
import boto3

bucket_name = "anikamoments140224"
s3 = boto3.client("s3")

# Each local file is uploaded under an object key equal to its file name.
for csv_name in ("train.csv", "test.csv"):
    s3.upload_file(csv_name, bucket_name, csv_name)

In [None]:
### Look up the container image URI for the built-in XGBoost algorithm
from sagemaker import image_uris

region = "us-east-1"
xgboost_image_uri = image_uris.retrieve(
    framework="xgboost",
    region=region,
    version="1.2-1",
)
print(xgboost_image_uri)

In [None]:
### Training the model in Sagemaker
import sagemaker
from sagemaker import get_execution_role
from sagemaker.inputs import TrainingInput
from sagemaker.estimator import Estimator

# SageMaker session and execution role used by the training job
sagemaker_session = sagemaker.Session()
role = get_execution_role()
bucket_name = "anikamoments140224"

# Input channels: the CSV files stored in the bucket
train_input = TrainingInput(f"s3://{bucket_name}/train.csv", content_type="csv")
test_input = TrainingInput(f"s3://{bucket_name}/test.csv", content_type="csv")

# Built-in XGBoost container image, resolved for the session's region
xgboost_container = sagemaker.image_uris.retrieve(
    "xgboost", sagemaker_session.boto_region_name, "1.2-1"
)
xgboost_estimator = Estimator(
    image_uri=xgboost_container,
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    output_path=f"s3://{bucket_name}/output",
    sagemaker_session=sagemaker_session,
)

# Hyperparameters for the training job
hyperparameters = {
    "objective": "reg:squarederror",
    "num_round": 100,
    "max_depth": 5,
    "eta": 0.1,
}
xgboost_estimator.set_hyperparameters(**hyperparameters)

# Launch training; test.csv is supplied as the "validation" channel
channels = {"train": train_input, "validation": test_input}
xgboost_estimator.fit(channels)


In [None]:
# Deploy the model to a SageMaker endpoint
from sagemaker.model_monitor import DataCaptureConfig

# Data capture is switched on at deploy time: a Model Monitor schedule
# analyses the requests/responses the endpoint captures to S3, so an endpoint
# deployed without capture gives the monitor nothing to work on.
predictor = xgboost_estimator.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
    endpoint_name="house-price-prediction-endpoint",
    data_capture_config=DataCaptureConfig(
        enable_capture=True,
        sampling_percentage=100,
        destination_s3_uri=f"s3://{bucket_name}/datacapture",
    ),
)


In [None]:
### Monitoring the model deployed in sagemaker
from sagemaker.model_monitor import DefaultModelMonitor
from sagemaker.model_monitor.dataset_format import DatasetFormat

# Create a model monitor
monitor = DefaultModelMonitor(
    role=role,
    instance_count=1,
    instance_type="ml.m5.large",
    volume_size_in_gb=20,
    max_runtime_in_seconds=1800,
)

# Compute baseline statistics and constraints from the training CSV.
# The dataset format (headerless CSV) belongs here; it is not a valid value
# for `statistics=` on the schedule, which expects baseline results.
# NOTE(review): Model Monitor assumes the baseline's column layout matches
# the captured inference data - confirm train.csv's layout before relying
# on the generated constraints.
monitor.suggest_baseline(
    baseline_dataset=f"s3://{bucket_name}/train.csv",
    dataset_format=DatasetFormat.csv(header=False),
    output_s3_uri=f"s3://{bucket_name}/monitoring/baseline",
    wait=True,
)

# Schedule monitoring of the endpoint against the baseline computed above.
monitor.create_monitoring_schedule(
    monitor_schedule_name="house-price-monitoring-schedule",
    endpoint_input=predictor.endpoint_name,
    output_s3_uri=f"s3://{bucket_name}/monitoring",
    statistics=monitor.baseline_statistics(),
    constraints=monitor.suggested_constraints(),
)
