#### DataOps pipeline for ML model training

In [1]:
import pandas as pd
import numpy as np

# Set a random seed for reproducibility
np.random.seed(42)

# Generate synthetic data
n_samples = 1000
tenure_months = np.random.randint(1, 60, size=n_samples)
feature1 = np.random.randn(n_samples)  # Example feature 1
feature2 = np.random.randn(n_samples)  # Example feature 2
churn = np.random.randint(0, 2, size=n_samples)  # Binary target variable

# Create a DataFrame
data = pd.DataFrame({
    'tenure_months': tenure_months,
    'feature1': feature1,
    'feature2': feature2,
    'churn': churn
})

# Save the synthetic data to a CSV file
data.to_csv('customer_data.csv', index=False)

# Example: Simple DataOps pipeline for ML model training
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# Data ingestion
data = pd.read_csv('customer_data.csv')

# Data preprocessing
X = data.drop('churn', axis=1)
y = data['churn']

# Data quality check
assert X.isnull().sum().sum() == 0, "Missing values detected"

# Feature engineering
X['tenure_years'] = X['tenure_months'] / 12

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

model = LogisticRegression()
model.fit(X_train_scaled, y_train)

# Model evaluation
y_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
print(f"Model accuracy: {accuracy:.2f}")

# Monitoring
if accuracy < 0.8:
    print("Warning: Model performance below threshold")

# Version control and logging (placeholder)
print("Logging model version and performance metrics...")


Model accuracy: 0.54
Logging model version and performance metrics...


#### CI/CD pipeline with Git and DVC

In [1]:
# Install necessary packages
!pip install gitpython dvc

import os
import subprocess
from git import Repo

# Define repository URL and directory
repo_url = "https://github.com/airex-lab/ai-toolbox.git"
repo_dir = "/content/ai-toolbox"

# Function to initialize Git and DVC
def initialize_repo():
    if not os.path.exists(repo_dir):
        print(f"Cloning repository from {repo_url}...")
        Repo.clone_from(repo_url, repo_dir)
    os.chdir(repo_dir)
    if not os.path.exists(".git"):
        subprocess.run(["git", "init"], check=True)
    if not os.path.exists(".dvc"):
        subprocess.run(["dvc", "init"], check=True)

# Initialize repository
initialize_repo()

# Configure Git
subprocess.run(["git", "config", "--global", "user.name", "Colab User"], check=True)
subprocess.run(["git", "config", "--global", "user.email", "colab@example.com"], check=True)

# Create a placeholder dataset
dataset_path = "data/transactions.csv"
os.makedirs(os.path.dirname(dataset_path), exist_ok=True)
with open(dataset_path, "w") as f:
    f.write("TransactionID,Amount,Date\n1,100,2024-01-01\n2,150,2024-01-02\n")

# Track the dataset with DVC and Git
subprocess.run(["dvc", "add", dataset_path], check=True)
subprocess.run(["git", "add", f"{dataset_path}.dvc"], check=True)

# Check if there are changes to commit
status = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True)
if status.stdout:
    try:
        subprocess.run(["git", "commit", "-m", "Update transaction dataset"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Git commit failed: {e}")
else:
    print("No changes to commit for dataset.")

# Create a placeholder model
model_path = "models/fraud_detection_model.pkl"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
with open(model_path, "wb") as f:
    f.write(b"fake_model_data")

# Track the model with DVC and Git
subprocess.run(["dvc", "add", model_path], check=True)
subprocess.run(["git", "add", f"{model_path}.dvc"], check=True)

# Check if there are changes to commit
status = subprocess.run(["git", "status", "--porcelain"], capture_output=True, text=True)
if status.stdout:
    try:
        subprocess.run(["git", "commit", "-m", "Retrain fraud detection model"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Git commit failed: {e}")
else:
    print("No changes to commit for model.")

# Simulate running tests
def run_tests():
    # Simulate a test result
    return True

if run_tests():
    try:
        subprocess.run(["git", "push", "origin", "main"], check=True)
        subprocess.run(["dvc", "push"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Git push or DVC push failed: {e}")

    # Simulate deployment
    def deploy_model():
        # Simulate deployment steps
        print("Model deployed.")

    deploy_model()

    # In a production environment
    try:
        subprocess.run(["git", "pull", "origin", "main"], check=True)
        subprocess.run(["dvc", "pull"], check=True)
    except subprocess.CalledProcessError as e:
        print(f"Git pull or DVC pull failed: {e}")

    # Simulate loading and using the model
    def load_model():
        # Simulate loading a model
        print("Model loaded.")

    load_model()
else:
    print("Tests failed. Deployment aborted.")

Git push or DVC push failed: Command '['git', 'push', 'origin', 'main']' returned non-zero exit status 128.
Model deployed.
Model loaded.


#### Agile DataOps sprints

In [2]:
# Conceptual example of agile DataOps sprints

def update_data():
    print("Updating data...")

def train_basic_model():
    print("Training basic model...")

def deploy_to_test_environment():
    print("Deploying to test environment...")

def collect_feedback():
    print("Collecting feedback...")

def engineer_demographic_features():
    print("Engineering demographic features...")

def retrain_model_with_demographics():
    print("Retraining model with demographics...")

def setup_ab_testing_infrastructure():
    print("Setting up A/B testing infrastructure...")

def deploy_model_variants():
    print("Deploying model variants...")

def analyze_ab_test_results():
    print("Analyzing A/B test results...")

def select_best_performing_model():
    print("Selecting best performing model...")

def engineer_content_based_features():
    print("Engineering content-based features...")

def train_hybrid_model():
    print("Training hybrid model...")

def deploy_to_production():
    print("Deploying to production...")

def monitor_performance():
    print("Monitoring performance...")

def review_sprint_outcomes():
    print("Reviewing sprint outcomes...")

def plan_next_sprint():
    print("Planning next sprint...")

def sprint_1():
    # Goal: Implement basic collaborative filtering model
    update_data()
    train_basic_model()
    deploy_to_test_environment()
    collect_feedback()

def sprint_2():
    # Goal: Incorporate user demographics
    update_data()
    engineer_demographic_features()
    retrain_model_with_demographics()
    deploy_to_test_environment()
    collect_feedback()

def sprint_3():
    # Goal: Implement A/B testing framework
    setup_ab_testing_infrastructure()
    deploy_model_variants()
    analyze_ab_test_results()
    select_best_performing_model()

def sprint_4():
    # Goal: Optimize model for cold start problem
    engineer_content_based_features()
    train_hybrid_model()
    deploy_to_production()
    monitor_performance()

# Main agile loop
for sprint in [sprint_1, sprint_2, sprint_3, sprint_4]:
    sprint()
    review_sprint_outcomes()
    plan_next_sprint()


Updating data...
Training basic model...
Deploying to test environment...
Collecting feedback...
Reviewing sprint outcomes...
Planning next sprint...
Updating data...
Engineering demographic features...
Retraining model with demographics...
Deploying to test environment...
Collecting feedback...
Reviewing sprint outcomes...
Planning next sprint...
Setting up A/B testing infrastructure...
Deploying model variants...
Analyzing A/B test results...
Selecting best performing model...
Reviewing sprint outcomes...
Planning next sprint...
Engineering content-based features...
Training hybrid model...
Deploying to production...
Monitoring performance...
Reviewing sprint outcomes...
Planning next sprint...


#### role interactions

In [3]:
# Placeholder function implementations
def extract_data_from_source():
    print("Extracting data from source...")
    return "raw_data"

def clean_and_validate(raw_data):
    print(f"Cleaning and validating {raw_data}...")
    return "cleaned_data"

def store_in_data_warehouse(cleaned_data):
    print(f"Storing {cleaned_data} in data warehouse...")

def fetch_from_data_warehouse():
    print("Fetching data from data warehouse...")
    return "fetched_data"

def perform_analysis(data):
    print(f"Performing analysis on {data}...")
    return "insights"

def create_visualizations(insights):
    print(f"Creating visualizations for {insights}...")
    return "visualizations"

def present_to_stakeholders(visualizations):
    print(f"Presenting {visualizations} to stakeholders...")

def implement_ci_cd_pipeline():
    print("Implementing CI/CD pipeline...")

def automate_data_quality_checks():
    print("Automating data quality checks...")

def monitor_pipeline_performance():
    print("Monitoring pipeline performance...")

def design_data_model():
    print("Designing data model...")

def plan_data_integration_strategy():
    print("Planning data integration strategy...")

def ensure_scalability_and_security():
    print("Ensuring scalability and security...")

def define_business_objectives():
    print("Defining business objectives...")

def review_data_insights():
    print("Reviewing data insights...")

def make_data_driven_decisions():
    print("Making data-driven decisions...")

# Role-specific tasks
def data_engineer_task():
    # Set up data pipeline
    raw_data = extract_data_from_source()
    cleaned_data = clean_and_validate(raw_data)
    store_in_data_warehouse(cleaned_data)

def data_analyst_task():
    # Analyze data and create visualizations
    data = fetch_from_data_warehouse()
    insights = perform_analysis(data)
    visualizations = create_visualizations(insights)
    present_to_stakeholders(visualizations)

def dataops_engineer_task():
    # Set up automated testing and deployment
    implement_ci_cd_pipeline()
    automate_data_quality_checks()
    monitor_pipeline_performance()

def data_architect_task():
    # Design data system architecture
    design_data_model()
    plan_data_integration_strategy()
    ensure_scalability_and_security()

def business_stakeholder_task():
    # Provide business context and evaluate results
    define_business_objectives()
    review_data_insights()
    make_data_driven_decisions()

# Main DataOps workflow
def dataops_project():
    data_architect_task()
    data_engineer_task()
    dataops_engineer_task()
    data_analyst_task()
    business_stakeholder_task()

# Run the main DataOps project workflow
dataops_project()


Designing data model...
Planning data integration strategy...
Ensuring scalability and security...
Extracting data from source...
Cleaning and validating raw_data...
Storing cleaned_data in data warehouse...
Implementing CI/CD pipeline...
Automating data quality checks...
Monitoring pipeline performance...
Fetching data from data warehouse...
Performing analysis on fetched_data...
Creating visualizations for insights...
Presenting visualizations to stakeholders...
Defining business objectives...
Reviewing data insights...
Making data-driven decisions...
