In [1]:
#!/usr/bin/env python3
"""Test S3 (MinIO) Connection from Jupyter"""

import boto3
import pandas as pd
import os
from datetime import datetime

# Banner announcing the connectivity check.
rule = "=" * 50
print(rule)
print("Testing S3 (MinIO) Connection from Jupyter")
print(rule)

# Report which connection settings the environment provides. The secret key
# is never echoed: it is shown as '***' when set and 'Not set' otherwise.
print("\n1. Environment Configuration:")
for env_name in (
    'AWS_ACCESS_KEY_ID',
    'AWS_SECRET_ACCESS_KEY',
    'AWS_ENDPOINT_URL',
    'MLFLOW_TRACKING_URI',
):
    if env_name == 'AWS_SECRET_ACCESS_KEY':
        shown = '***' if os.environ.get(env_name) else 'Not set'
    else:
        shown = os.environ.get(env_name, 'Not set')
    print(f"   {env_name}: {shown}")

# Create S3 client
# Endpoint and credentials are read from the environment; when a variable is
# missing, the hard-coded development value next to it is used instead.
print("\n2. Creating S3 client...")
try:
    s3_client = boto3.client(
        's3',
        endpoint_url=os.environ.get('AWS_ENDPOINT_URL', 'http://minio:9000'),
        aws_access_key_id=os.environ.get('AWS_ACCESS_KEY_ID', 'minio'),
        aws_secret_access_key=os.environ.get('AWS_SECRET_ACCESS_KEY', 'minio123')
    )
    print("   ✅ S3 client created successfully!")
except Exception as e:
    print(f"   ❌ Error creating S3 client: {e}")
    # Re-raise rather than call exit(1): in a notebook exit() asks the kernel
    # to shut down, while re-raising stops the run and keeps the traceback.
    raise

# List buckets
# Listing the buckets is the first call that actually talks to the server,
# so it doubles as the connectivity and credentials check.
print("\n3. Listing buckets...")
try:
    buckets = s3_client.list_buckets()
    print("   Available buckets:")
    # .get() tolerates a response that carries no 'Buckets' entry.
    for bucket in buckets.get('Buckets', []):
        print(f"     - {bucket['Name']}")
except Exception as e:
    print(f"   ❌ Error listing buckets: {e}")
    # Re-raise rather than call exit(1): in a notebook exit() asks the kernel
    # to shut down, while re-raising stops the run and keeps the traceback.
    raise

# Create test data
# A five-row frame with hourly timestamps, used for the S3 round trip.
print("\n4. Creating test data...")
test_data = pd.DataFrame({
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since
    # pandas 2.2, while 'h' is also accepted by older releases.
    'timestamp': pd.date_range(start='2024-01-01', periods=5, freq='h'),
    'value': range(5),
    'category': ['A', 'B', 'A', 'B', 'A']
})
print("   Test data created:")
print(test_data.to_string(index=False))

# Save to S3
# Upload the test frame as CSV under a timestamped key in the 'features'
# bucket; `key` is kept so the object can be read back afterwards.
print("\n5. Saving data to S3...")
try:
    # to_csv() without a path returns a str; encode it so the bytes that are
    # uploaded do not depend on an implicit conversion.
    csv_bytes = test_data.to_csv(index=False).encode('utf-8')
    key = f"jupyter_test/test_data_{datetime.now().strftime('%Y%m%d_%H%M%S')}.csv"

    s3_client.put_object(
        Bucket='features',
        Key=key,
        Body=csv_bytes
    )
    print(f"   ✅ Data saved to s3://features/{key}")
except Exception as e:
    print(f"   ❌ Error saving to S3: {e}")
    # Re-raise rather than call exit(1): in a notebook exit() asks the kernel
    # to shut down, while re-raising stops the run and keeps the traceback.
    raise

# Read from S3
# Fetch the object stored under `key` and parse it back into a DataFrame.
print("\n6. Reading data from S3...")
try:
    response = s3_client.get_object(Bucket='features', Key=key)
    # The response body is handed to pandas directly as a file-like object.
    df_from_s3 = pd.read_csv(response['Body'])
    print("   ✅ Data read successfully!")
    print("   First 3 rows:")
    print(df_from_s3.head(3).to_string(index=False))
except Exception as e:
    print(f"   ❌ Error reading from S3: {e}")
    # Re-raise rather than call exit(1): in a notebook exit() asks the kernel
    # to shut down, while re-raising stops the run and keeps the traceback.
    raise

# List objects
# Show the tail of the listing under the jupyter_test/ prefix.
print("\n7. Listing objects in bucket...")
try:
    objects = s3_client.list_objects_v2(Bucket='features', Prefix='jupyter_test/')
    contents = objects.get('Contents', [])
    if contents:
        print("   Objects in s3://features/jupyter_test/:")
        # Only the last five entries of the returned page are shown.
        for obj in contents[-5:]:
            print(f"     - {obj['Key']} (Size: {obj['Size']} bytes)")
    else:
        print("   No objects found")
except Exception as e:
    print(f"   ❌ Error listing objects: {e}")
    # Propagate the failure: swallowing it here would let the success banner
    # below print even though this step did not pass.
    raise

# Reached only when every step above completed without raising.
print("\n" + "=" * 50)
print("✅ All S3 connection tests passed!")
print("=" * 50)

Testing S3 (MinIO) Connection from Jupyter

1. Environment Configuration:
   AWS_ACCESS_KEY_ID: Not set
   AWS_SECRET_ACCESS_KEY: Not set
   AWS_ENDPOINT_URL: Not set
   MLFLOW_TRACKING_URI: Not set

2. Creating S3 client...
   ✅ S3 client created successfully!

3. Listing buckets...
   Available buckets:
     - features
     - mlflow
     - models

4. Creating test data...
   Test data created:
          timestamp  value category
2024-01-01 00:00:00      0        A
2024-01-01 01:00:00      1        B
2024-01-01 02:00:00      2        A
2024-01-01 03:00:00      3        B
2024-01-01 04:00:00      4        A

5. Saving data to S3...
   ✅ Data saved to s3://features/jupyter_test/test_data_20250727_161908.csv

6. Reading data from S3...
   ✅ Data read successfully!
   First 3 rows:
          timestamp  value category
2024-01-01 00:00:00      0        A
2024-01-01 01:00:00      1        B
2024-01-01 02:00:00      2        A

7. Listing objects in bucket...
   Objects in s3://features/jupyt

In [2]:
#!/usr/bin/env python3
"""
MLOps Examples Notebook
=======================
This notebook contains examples of using:
1. MinIO S3 for data storage
2. MLflow for experiment tracking
3. Data processing pipelines
4. Model training and tracking
"""

import boto3
import pandas as pd
import numpy as np
import mlflow
import os
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# %% [markdown]
# ## 1. Setup Configuration

# Resolve the connection settings from the environment and fall back to the
# development values only when a variable is missing. setdefault() leaves
# values that the environment already provides untouched.
s3_endpoint_url = os.environ.get('AWS_ENDPOINT_URL', 'http://minio:9000')
os.environ.setdefault('AWS_ACCESS_KEY_ID', 'minio')
os.environ.setdefault('AWS_SECRET_ACCESS_KEY', 'minio123')
os.environ.setdefault('MLFLOW_S3_ENDPOINT_URL', s3_endpoint_url)

# Configure S3 client
s3_client = boto3.client(
    's3',
    endpoint_url=s3_endpoint_url,
    aws_access_key_id=os.environ['AWS_ACCESS_KEY_ID'],
    aws_secret_access_key=os.environ['AWS_SECRET_ACCESS_KEY']
)

# Configure MLflow
mlflow.set_tracking_uri(os.environ.get('MLFLOW_TRACKING_URI', 'http://mlflow:5001'))

print("Configuration completed!")
print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")

# %% [markdown]
# ## 2. Generate Sample Dataset

# Generate synthetic data
# The global NumPy seed is fixed so every run draws the same samples; the
# draws below must stay in this order to keep the values reproducible.
np.random.seed(42)
n_samples = 1000

data = pd.DataFrame({
    'feature_1': np.random.normal(100, 15, n_samples),
    'feature_2': np.random.exponential(2, n_samples),
    'feature_3': np.random.uniform(0, 100, n_samples),
    'category': np.random.choice(['A', 'B', 'C', 'D'], n_samples),
    # Lowercase 'h' is the hourly alias; uppercase 'H' is deprecated since
    # pandas 2.2, while 'h' is also accepted by older releases.
    'timestamp': pd.date_range(start='2024-01-01', periods=n_samples, freq='h')
})

# Create target variable with some relationship to features:
# a linear combination of the three numeric features plus Gaussian noise.
data['target'] = (
    0.5 * data['feature_1'] +
    2.0 * data['feature_2'] +
    0.1 * data['feature_3'] +
    np.random.normal(0, 5, n_samples)
)

print("Dataset shape:", data.shape)
print("\nFirst 5 rows:")
print(data.head())

# %% [markdown]
# ## 3. Save Data to S3

# Save raw data to S3
# `timestamp` identifies this run; it is also reused further down to name
# the predictions file.
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
raw_data_key = f"data/raw/synthetic_data_{timestamp}.parquet"

# Serialise to Parquet in memory: called without a path, to_parquet() returns
# the file contents as bytes, so no fixed scratch file under /tmp is written,
# left behind, or overwritten by another run.
parquet_bytes = data.to_parquet(index=False)
s3_client.put_object(
    Bucket='features',
    Key=raw_data_key,
    Body=parquet_bytes
)

print(f"✅ Raw data saved to s3://features/{raw_data_key}")

# %% [markdown]
# ## 4. Feature Engineering

# One-hot encode the categorical column, then derive the calendar features
# from the timestamp in a single chained expression.
data_encoded = pd.get_dummies(data, columns=['category'], prefix='category').assign(
    hour=lambda frame: frame['timestamp'].dt.hour,
    day_of_week=lambda frame: frame['timestamp'].dt.dayofweek,
    # dayofweek counts from Monday=0, so values 5 and 6 are the weekend.
    is_weekend=lambda frame: (frame['day_of_week'] >= 5).astype(int),
)

# Model inputs are every column except the raw timestamp and the label.
target = data_encoded['target']
features = data_encoded.drop(columns=['timestamp', 'target'])

print("Features shape after encoding:", features.shape)
print("\nFeature columns:")
print(features.columns.tolist())

# %% [markdown]
# ## 5. Train Model with MLflow Tracking

# Look the experiment up by name and reuse its id; it is created (with its
# tags) only when no experiment of that name exists yet.
experiment_name = "jupyter_ml_experiment"
experiment = mlflow.get_experiment_by_name(experiment_name)
experiment_id = (
    experiment.experiment_id
    if experiment is not None
    else mlflow.create_experiment(
        experiment_name,
        tags={"environment": "jupyter", "project": "mlops_demo"},
    )
)

print(f"Using experiment: {experiment_name} (ID: {experiment_id})")

# Split data
# 80/20 train/test split with a fixed seed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.2, random_state=42
)

# Train model with MLflow tracking
# Everything inside the `with` block is recorded against a single MLflow run.
with mlflow.start_run(experiment_id=experiment_id, run_name="rf_model_jupyter"):
    # Log parameters
    n_estimators = 100
    max_depth = 10
    min_samples_split = 5

    # One batched call instead of five separate log_param() calls.
    mlflow.log_params({
        "n_estimators": n_estimators,
        "max_depth": max_depth,
        "min_samples_split": min_samples_split,
        "n_features": X_train.shape[1],
        "n_samples_train": X_train.shape[0],
    })

    # Train model
    print("Training Random Forest model...")
    rf_model = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        random_state=42
    )
    rf_model.fit(X_train, y_train)

    # Make predictions
    y_pred_train = rf_model.predict(X_train)
    y_pred_test = rf_model.predict(X_test)

    # Calculate metrics
    train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
    train_r2 = r2_score(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)

    # Log metrics
    mlflow.log_metric("train_rmse", train_rmse)
    mlflow.log_metric("test_rmse", test_rmse)
    mlflow.log_metric("train_r2", train_r2)
    mlflow.log_metric("test_r2", test_r2)

    print(f"Train RMSE: {train_rmse:.2f}")
    print(f"Test RMSE: {test_rmse:.2f}")
    print(f"Train R²: {train_r2:.3f}")
    print(f"Test R²: {test_r2:.3f}")

    # Create and log feature importance plot
    feature_importance = pd.DataFrame({
        'feature': features.columns,
        'importance': rf_model.feature_importances_
    }).sort_values('importance', ascending=False).head(10)

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(data=feature_importance, x='importance', y='feature', ax=ax)
    ax.set_title('Top 10 Feature Importances')
    fig.tight_layout()
    # log_figure() stores the figure as a run artifact directly, so no fixed
    # scratch file under /tmp is needed.
    mlflow.log_figure(fig, 'feature_importance.png')
    # Close the figure explicitly so it does not stay open in the kernel.
    plt.close(fig)

    # Log the model
    mlflow.sklearn.log_model(
        rf_model,
        "random_forest_model",
        input_example=X_train.iloc[:5]
    )

    # Save predictions to S3
    predictions_df = pd.DataFrame({
        'actual': y_test,
        'predicted': y_pred_test,
        'error': y_test - y_pred_test
    })

    # The key reuses the run-level `timestamp` defined when the raw data was
    # saved.
    pred_key = f"predictions/rf_predictions_{timestamp}.csv"
    # to_csv() without a path returns a str; encode it explicitly for upload.
    predictions_csv = predictions_df.to_csv(index=False).encode('utf-8')
    s3_client.put_object(
        Bucket='models',
        Key=pred_key,
        Body=predictions_csv
    )
    # Record where the predictions were written so the run links to them.
    mlflow.log_param("predictions_s3_path", f"s3://models/{pred_key}")

    run_id = mlflow.active_run().info.run_id
    print(f"\n✅ MLflow run completed: {run_id}")

# %% [markdown]
# ## 6. List Recent Experiments and Runs

# Fetch at most five runs of this experiment and show only the identifying
# columns and the two test metrics.
runs = mlflow.search_runs(experiment_ids=[experiment_id], max_results=5)
summary_columns = ['run_id', 'start_time', 'metrics.test_rmse', 'metrics.test_r2']
print("\nRecent MLflow runs:")
print(runs[summary_columns].to_string(index=False))

# %% [markdown]
# ## 7. List S3 Objects Created

# For each bucket, request at most ten keys and print the last five of the
# entries that come back.
for bucket_name in ('features', 'models'):
    print(f"\nObjects in s3://{bucket_name}/:")
    response = s3_client.list_objects_v2(Bucket=bucket_name, MaxKeys=10)
    if 'Contents' in response:
        for obj in response['Contents'][-5:]:
            print(f"  - {obj['Key']} ({obj['Size']} bytes)")

# Closing banner and pointers for what to do next.
divider = "=" * 50
print("\n" + divider)
print("✅ MLOps workflow completed successfully!")
print(divider)
print("\nNext steps:")
for next_step in (
    "1. View experiment in MLflow UI: http://localhost:5001",
    "2. Browse S3 objects in MinIO: http://localhost:9001",
    "3. Create Airflow DAG to automate this workflow",
):
    print(next_step)

Configuration completed!
MLflow tracking URI: http://mlflow:5001
Dataset shape: (1000, 6)

First 5 rows:
    feature_1  feature_2  feature_3 category           timestamp     target
0  107.450712   0.366602  21.906881        A 2024-01-01 00:00:00  51.889656
1   97.926035   0.220898   3.672136        A 2024-01-01 01:00:00  50.159429
2  109.715328   2.023568  10.802575        A 2024-01-01 02:00:00  61.273821
3  122.845448   2.451590  33.886065        B 2024-01-01 03:00:00  63.505707
4   96.487699   0.064191  80.258568        B 2024-01-01 04:00:00  58.068972
✅ Raw data saved to s3://features/data/raw/synthetic_data_20250727_162018.parquet
Features shape after encoding: (1000, 10)

Feature columns:
['feature_1', 'feature_2', 'feature_3', 'category_A', 'category_B', 'category_C', 'category_D', 'hour', 'day_of_week', 'is_weekend']
Using experiment: jupyter_ml_experiment (ID: 2)
Training Random Forest model...




Train RMSE: 2.74
Test RMSE: 6.02
Train R²: 0.925
Test R²: 0.682





✅ MLflow run completed: da8d72bb1ba34d3b9fe510b9dd360d01
🏃 View run rf_model_jupyter at: http://mlflow:5001/#/experiments/2/runs/da8d72bb1ba34d3b9fe510b9dd360d01
🧪 View experiment at: http://mlflow:5001/#/experiments/2

Recent MLflow runs:
                          run_id                       start_time  metrics.test_rmse  metrics.test_r2
da8d72bb1ba34d3b9fe510b9dd360d01 2025-07-27 16:20:18.924000+00:00            6.02333         0.682454

Objects in s3://features/:
  - data/raw/synthetic_data_20250727_162018.parquet (50970 bytes)
  - jupyter_test/test_data_20250727_161847.csv (145 bytes)
  - jupyter_test/test_data_20250727_161908.csv (145 bytes)
  - processed/aggregated_data_2025-07-27.csv (168 bytes)
  - raw/sample_data_2025-07-27.csv (3079 bytes)

Objects in s3://models/:
  - predictions/rf_predictions_20250727_162018.csv (11060 bytes)

✅ MLOps workflow completed successfully!

Next steps:
1. View experiment in MLflow UI: http://localhost:5001
2. Browse S3 objects in MinIO: http:/