In [13]:
# Install required packages
!pip install xgboost scikit-learn pandas pytest pytest-cov google-cloud-storage

# Load dataset, train model, save as JSON
import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Load penguins dataset (using seaborn for convenience)
import seaborn as sns
penguins = sns.load_dataset('penguins').dropna()

# Prepare features and target
X = penguins[['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g']]
y = penguins['species']

# Encode target
le = LabelEncoder()
y_enc = le.fit_transform(y)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42)

# Train XGBoost classifier
model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
model.fit(X_train, y_train)

# Save model to JSON file
model.save_model("penguin_model.json")

print("Model trained and saved as penguin_model.json")


Model trained and saved as penguin_model.json


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


In [14]:
# Authenticate with GCP in Colab
from google.colab import auth
auth.authenticate_user()

# Set your bucket name and blob name
GCS_BUCKET_NAME = "arjunaji"  # Replace with your bucket
GCS_BLOB_NAME = "penguin_model.json"

from google.cloud import storage

def upload_to_gcs(bucket_name, source_file_name, destination_blob_name):
    client = storage.Client()
    bucket = client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    blob.upload_from_filename(source_file_name)
    print(f"Uploaded {source_file_name} to gs://{bucket_name}/{destination_blob_name}")

upload_to_gcs(GCS_BUCKET_NAME, "penguin_model.json", GCS_BLOB_NAME)


Uploaded penguin_model.json to gs://arjunaji/penguin_model.json


In [15]:
!pip install fastapi "uvicorn[standard]" pydantic

from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field, validator
import xgboost as xgb
import numpy as np

app = FastAPI()

# Load model locally (simplified for Colab testing)
model = xgb.XGBClassifier()
model.load_model("penguin_model.json")

class PenguinFeatures(BaseModel):
    bill_length_mm: float = Field(..., ge=0)
    bill_depth_mm: float = Field(..., ge=0)
    flipper_length_mm: float = Field(..., ge=0)
    body_mass_g: float = Field(..., ge=0)

    @validator('*')
    def check_not_negative(cls, v):
        if v < 0:
            raise ValueError('Value must be non-negative')
        return v

@app.post("/predict")
def predict(features: PenguinFeatures):
    data = np.array([[features.bill_length_mm,
                      features.bill_depth_mm,
                      features.flipper_length_mm,
                      features.body_mass_g]])
    pred = model.predict(data)[0]
    return {"prediction": int(pred)}




/tmp/ipython-input-457933273.py:20: PydanticDeprecatedSince20: Pydantic V1 style `@validator` validators are deprecated. You should migrate to Pydantic V2 style `@field_validator` validators, see the migration guide for more details. Deprecated in Pydantic V2.0 to be removed in V3.0. See Pydantic V2 Migration Guide at https://errors.pydantic.dev/2.11/migration/
  @validator('*')


In [21]:
%%writefile test_api.py
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel, Field, validator
import xgboost as xgb
import numpy as np
from fastapi.testclient import TestClient

# Define the FastAPI app within the test file for testing purposes
app = FastAPI()

# Load model locally (simplified for Colab testing)
# Make sure the model file 'penguin_model.json' exists in the same directory
model = xgb.XGBClassifier()
try:
    model.load_model("penguin_model.json")
except xgb.core.XGBoostError:
    # Handle case where model file might not be found during testing
    print("Warning: penguin_model.json not found. Model loading skipped for tests.")
    model = None # Or a mock model if needed for tests that don't require loading

class PenguinFeatures(BaseModel):
    bill_length_mm: float = Field(..., ge=0)
    bill_depth_mm: float = Field(..., ge=0)
    flipper_length_mm: float = Field(..., ge=0)
    body_mass_g: float = Field(..., ge=0)

    @validator('*', pre=True) # Removed each_item=True
    def check_not_negative(cls, v):
        if isinstance(v, (int, float)) and v < 0:
            raise ValueError('Value must be non-negative')
        return v

@app.post("/predict")
def predict(features: PenguinFeatures):
    if model is None:
        raise HTTPException(status_code=500, detail="Model not loaded")
    data = np.array([[features.bill_length_mm,
                      features.bill_depth_mm,
                      features.flipper_length_mm,
                      features.body_mass_g]])
    pred = model.predict(data)[0]
    # Note: The original model output is a class index (0, 1, or 2).
    # If you want to return the actual species name, you'll need the LabelEncoder
    # or a mapping from index to species name available here as well.
    # For simplicity, returning the index as an integer.
    return {"prediction": int(pred)}


client = TestClient(app)

def test_predict_endpoint_valid_input():
    sample_data = {
        "bill_length_mm": 39.1,
        "bill_depth_mm": 18.7,
        "flipper_length_mm": 181,
        "body_mass_g": 3750
    }
    response = client.post("/predict", json=sample_data)
    assert response.status_code == 200
    assert "prediction" in response.json()
    # You might want to add checks on the predicted value range (0, 1, or 2)

def test_predict_endpoint_missing_field():
    sample_data = {
        "bill_length_mm": 39.1,
        "bill_depth_mm": 18.7,
        "flipper_length_mm": 181
        # Missing body_mass_g
    }
    response = client.post("/predict", json=sample_data)
    assert response.status_code == 422

def test_predict_endpoint_invalid_type():
    sample_data = {
        "bill_length_mm": "invalid",  # should be float
        "bill_depth_mm": 18.7,
        "flipper_length_mm": 181,
        "body_mass_g": 3750
    }
    response = client.post("/predict", json=sample_data)
    assert response.status_code == 422

def test_predict_endpoint_out_of_range():
    sample_data = {
        "bill_length_mm": 500,  # unrealistic but accepted if >=0
        "bill_depth_mm": 18.7,
        "flipper_length_mm": 181,
        "body_mass_g": -50  # negative not allowed
    }
    response = client.post("/predict", json=sample_data)
    assert response.status_code == 422  # Validator catches negative value

Overwriting test_api.py


In [22]:
!pytest --cov=.


platform linux -- Python 3.11.13, pytest-8.4.1, pluggy-1.6.0
rootdir: /content
plugins: cov-6.2.1, typeguard-4.4.4, anyio-4.10.0, langsmith-0.4.12
collected 4 items                                                              [0m

test_api.py [32m.[0m[32m.[0m[32m.[0m[32m.[0m[32m                                                         [100%][0m

_______________ coverage: platform linux, python 3.11.13-final-0 _______________

Name          Stmts   Miss  Cover
---------------------------------
test_api.py      47      4    91%
---------------------------------
TOTAL            47      4    91%
