## A seamless MLOps workflow, powered by Generative AI

In [40]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the input directory tree and print the full path of every file found,
# one per line, so the available data files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        # Join the directory being visited with the bare file name.
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/titanic/train.csv
/kaggle/input/titanic/test.csv
/kaggle/input/titanic/gender_submission.csv


In [41]:
#  1. Setup: Install and Import
!pip install -q langchain langchain-google-genai google-generativeai pandas scikit-learn matplotlib seaborn pyyaml rich

In [42]:
pip install -q -U google-genai

Note: you may need to restart the kernel to use updated packages.


In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import google.generativeai as genai
import os
import warnings
import requests
import json
warnings.filterwarnings("ignore")

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score

In [44]:
# -------------------- CONFIG --------------------
# Read the API key from the GEMINI_API_KEY environment variable so the real
# secret never has to be stored in the notebook. The placeholder is kept as a
# fallback so this cell still runs; API calls will be rejected until a real key
# is supplied.
GEMINI_API_KEY = os.environ.get("GEMINI_API_KEY", "Your-GEMINI-API_Key")
# REST endpoint of the `generateContent` method for the gemini-1.5-flash model.
GEMINI_API_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-1.5-flash:generateContent"
# Default HTTP headers: every request carries a JSON body.
HEADERS = {"Content-Type": "application/json"}


In [45]:
# -------------------- GEMINI HELPER --------------------
def query_gemini(prompt, timeout=60):
    """Send a prompt to the Gemini `generateContent` endpoint and return its text.

    Args:
        prompt (str): Text sent as the single part of the request contents.
        timeout (float): Seconds to wait for the HTTP response before giving up.
            Without a timeout a stalled connection would hang the notebook.

    Returns:
        str: The text of the first part of the first candidate in the response.

    Raises:
        requests.RequestException: On network failures or timeouts.
        RuntimeError: If the API answers with a non-success status, or if the
            response body contains no generated text (e.g. a blocked prompt).
    """
    payload = {"contents": [{"parts": [{"text": prompt}]}]}

    # The key is sent in a header instead of the query string so that it cannot
    # show up in URLs quoted by exception messages or written to log files.
    response = requests.post(
        GEMINI_API_URL,
        headers={**HEADERS, "x-goog-api-key": GEMINI_API_KEY},
        json=payload,
        timeout=timeout,
    )

    if not response.ok:
        # Surface the API's own error description instead of failing later
        # with an opaque KeyError on the missing 'candidates' field.
        raise RuntimeError(
            f"Gemini API request failed with HTTP {response.status_code}: "
            f"{response.text[:500]}"
        )

    body = response.json()
    try:
        return body['candidates'][0]['content']['parts'][0]['text']
    except (KeyError, IndexError, TypeError) as exc:
        raise RuntimeError(
            f"Gemini response contained no generated text: {str(body)[:500]}"
        ) from exc




In [46]:
#  Step 1: Load Data
def load_data(path="/kaggle/input/titanic/train.csv"):
    """Read a CSV file into a DataFrame.

    Args:
        path (str): Location of the CSV file. Defaults to the Titanic training
            set in the Kaggle input directory, so existing zero-argument calls
            behave as before.

    Returns:
        pandas.DataFrame: The parsed file contents.
    """
    return pd.read_csv(path)


In [47]:
# 🧹 Step 2: Preprocess Data

def preprocess_data(df):
    """Clean `df` by executing preprocessing code generated by Gemini.

    The model is asked for Python code that operates on a variable named `df`.
    That code is run against a copy of the input, so the caller's DataFrame is
    not modified.

    Args:
        df (pandas.DataFrame): The raw data to clean.

    Returns:
        pandas.DataFrame: Whatever the generated code left bound to `df`.

    Raises:
        ValueError: If, after execution, `df` is not a non-empty DataFrame.
    """
    # Ask Gemini for preprocessing steps that work on the in-memory DataFrame.
    prompt = """
You are a data scientist. Clean the Titanic dataset with steps like:

1. Handle missing values
2. Encode categorical variables
3. Drop irrelevant columns

The DataFrame is already loaded as df. DO NOT use read_csv(). Output Python code ONLY. No markdown or explanation.
"""
    suggestion = query_gemini(prompt)

    # Strip markdown code fences the model may have added despite the prompt.
    clean_code = suggestion.replace("```python", "").replace("```", "").strip()

    # Extra safeguard: drop any line that would reload the data from disk and
    # thereby discard the DataFrame passed in.
    clean_code = "\n".join(
        line for line in clean_code.splitlines()
        if 'read_csv' not in line
    )

    print("Preprocessing Code:\n", clean_code)

    # SECURITY NOTE: exec() runs model-generated code with full interpreter
    # privileges. The text filtering above is not a sandbox; only run this in a
    # disposable environment and review the printed code.
    exec_globals = {'df': df.copy()}
    exec(clean_code, exec_globals)

    # The generated code can rebind `df` to anything (e.g. `df = df.drop(...,
    # inplace=True)` leaves None), so validate before handing it downstream.
    result = exec_globals.get('df')
    if not isinstance(result, pd.DataFrame):
        raise ValueError(
            "Generated preprocessing code did not leave a DataFrame in `df` "
            f"(found {type(result).__name__})."
        )
    if result.empty:
        raise ValueError("Generated preprocessing code produced an empty DataFrame.")

    # The value printed is the (rows, columns) shape, so label it as such.
    print("📊 Shape after preprocessing:", result.shape)

    return result


In [48]:
def train_model(X_train, y_train):
    """Train a classifier by executing training code generated by Gemini.

    The model is asked for code that fits a RandomForestClassifier on the
    variables `X_train` and `y_train` and binds the result to `model`.

    Args:
        X_train: Training features, exposed to the generated code as `X_train`.
        y_train: Training labels, exposed to the generated code as `y_train`.

    Returns:
        The object the generated code bound to `model`, or, if no such name
        exists, the single estimator-like object it created.

    Raises:
        ValueError: If no model can be identified after execution.
    """
    # Function-scope import: `re` is not among the notebook's top-level imports.
    import re

    prompt = """
You are a machine learning engineer. Train a RandomForestClassifier using the variables X_train and y_train.
Do NOT generate synthetic data. Use X_train and y_train directly. Name the model as `model`.
Return ONLY the training code.
"""
    suggestion = query_gemini(prompt)

    # Strip markdown code fences the model may have added.
    clean_code = suggestion.replace("```python", "").replace("```", "").strip()

    # Drop lines that would replace the supplied data: synthetic-data helpers,
    # re-splitting, or an assignment whose targets are only `X` and/or `y`.
    # Anchoring the assignment pattern to the start of the line avoids removing
    # unrelated lines that merely contain "y =" (e.g. `accuracy = ...`).
    forbidden = re.compile(
        r"make_classification|train_test_split|^\s*(?:X|y)\s*(?:,\s*(?:X|y)\s*)*=(?!=)"
    )
    clean_code = "\n".join(
        line for line in clean_code.splitlines()
        if not forbidden.search(line)
    )

    # Normalise common alternative variable names to `model`. Word boundaries
    # keep longer identifiers that merely contain these names intact.
    clean_code = re.sub(r"\b(?:rf_classifier|clf)\b", "model", clean_code)

    print(" Training Code:\n", clean_code)

    # SECURITY NOTE: exec() runs model-generated code with full interpreter
    # privileges; the filtering above is not a sandbox.
    exec_globals = {'X_train': X_train, 'y_train': y_train}
    exec(clean_code, exec_globals)

    if 'model' in exec_globals:
        return exec_globals['model']

    # Fallback: the generated code used another variable name. Accept it only
    # when exactly one estimator-like instance exists, so the choice is never a
    # guess. Classes themselves (e.g. the imported RandomForestClassifier) are
    # excluded because they also expose `fit`/`predict` attributes.
    candidates = [
        value for name, value in exec_globals.items()
        if not name.startswith('__')
        and not isinstance(value, type)
        and hasattr(value, 'fit')
        and hasattr(value, 'predict')
    ]
    if len(candidates) == 1:
        return candidates[0]

    raise ValueError(" ERROR: `model` was not defined in the training code.")


In [49]:
def evaluate_model(model, X_test, y_test, enable_gemini_summary=True):
    """Score `model` on the test split and optionally get a Gemini commentary.

    Side effects: prints the metrics, writes them to `evaluation_log.txt`, and,
    when the summary is enabled, prints Gemini's feedback and writes it to
    `gemini_evaluation_summary.txt`. Both files are created relative to the
    current working directory.

    Args:
        model: Fitted estimator exposing `predict`.
        X_test: Test features passed to `model.predict`.
        y_test: True labels for the test features.
        enable_gemini_summary (bool): When True (the default, matching the
            previous hard-coded behaviour), ask Gemini to explain the metrics.

    Returns:
        tuple: `(acc, report)`, i.e. the accuracy score and the text
        classification report.
    """
    # Predict on the held-out data and compute the metrics.
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("✅ Accuracy:", acc)
    print("📊 Classification Report:\n", report)

    # Persist the raw metrics. The encoding is explicit so the output does not
    # depend on the platform's default.
    with open("evaluation_log.txt", "w", encoding="utf-8") as f:
        f.write(f"Accuracy: {acc}\n\n")
        f.write("Classification Report:\n")
        f.write(report)

    if enable_gemini_summary:
        prompt = f"""
You are a machine learning expert. Analyze the following classification report and accuracy score.

Accuracy: {acc}

Report:
{report}

Explain in simple terms how well the model is performing and suggest ways to improve it if needed.
"""
        gemini_feedback = query_gemini(prompt)
        print("💡 Gemini Summary:\n", gemini_feedback)

        # Model-written prose can contain arbitrary Unicode, which would fail
        # to encode under a non-UTF-8 default encoding.
        with open("gemini_evaluation_summary.txt", "w", encoding="utf-8") as f:
            f.write(gemini_feedback)

    return acc, report


In [50]:
def generate_report(acc, report, save_to_file=True, verbose=True):
    """
    Generates a human-readable evaluation report using Gemini.

    Args:
        acc (float): Accuracy of the model.
        report (str): Classification report from sklearn.
        save_to_file (bool): If True, saves the accuracy, the classification
            report and the summary to `final_model_summary.txt` in the current
            working directory.
        verbose (bool): If True, prints the summary.

    Returns:
        str: Gemini-generated summary report.
    """
    prompt = f"""
You are a machine learning evaluator. Here is a model performance summary:

Accuracy: {acc}

Classification Report:
{report}

Write a professional, human-readable evaluation report. Include insights into precision, recall, and F1-score, and suggest potential improvements if applicable.
"""

    gemini_summary = query_gemini(prompt)

    if verbose:
        print("📋 Gemini Evaluation Summary:\n")
        print(gemini_summary)

    if save_to_file:
        # The section headers contain emoji, so the file must be opened as
        # UTF-8; the platform default encoding (e.g. cp1252) would raise
        # UnicodeEncodeError on write.
        with open("final_model_summary.txt", "w", encoding="utf-8") as f:
            f.write("🎯 Accuracy: " + str(acc) + "\n\n")
            f.write("📊 Classification Report:\n" + report + "\n\n")
            f.write("🧠 Gemini Summary:\n" + gemini_summary)

    return gemini_summary


In [51]:
import datetime
import joblib

def run_pipeline(output_dir="/kaggle/working/output"):
    """Run the pipeline end to end: load, preprocess, split, train, evaluate, report, save.

    Progress is written to `mlops_log.txt` inside `output_dir`, and the trained
    model is saved there as `random_forest_model.pkl`. Any exception raised by
    a stage is caught, recorded in the log together with its traceback, and
    reported with a short console message; it is not re-raised.

    Args:
        output_dir (str): Directory for the log file and the saved model. It is
            created if missing. Defaults to the previously hard-coded location.

    Returns:
        None
    """
    # Function-scope import: `traceback` is not among the notebook's imports.
    import traceback

    os.makedirs(output_dir, exist_ok=True)
    log_path = os.path.join(output_dir, "mlops_log.txt")

    try:
        # Log messages contain emoji, so the encoding must be explicit; a
        # non-UTF-8 platform default would raise UnicodeEncodeError on write.
        with open(log_path, "w", encoding="utf-8") as log_file:

            def log(msg):
                """Append a timestamped line to the pipeline log."""
                log_file.write(f"[{datetime.datetime.now()}] {msg}\n")

            log("🚀 Starting MLOps Pipeline")

            df = load_data()
            log("✅ Loaded data.")

            df_clean = preprocess_data(df)
            log("🧹 Preprocessed data.")

            # The preprocessing code is model-generated and may have dropped
            # the target; fail with a clear message rather than a bare KeyError.
            if "Survived" not in df_clean.columns:
                raise ValueError(
                    "Target column 'Survived' is missing after preprocessing."
                )

            X = df_clean.drop("Survived", axis=1)
            y = df_clean["Survived"]
            # Hold out 20% of the rows for evaluation; the fixed seed keeps the
            # split reproducible between runs.
            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
            print("🔎 Training data shape:", X_train.shape)

            model = train_model(X_train, y_train)
            log("🤖 Trained model.")

            acc, eval_report = evaluate_model(model, X_test, y_test)
            log(f"🧪 Evaluation Accuracy: {acc}")
            log(eval_report)

            summary = generate_report(acc, eval_report)
            log("\n📋 Gemini-generated Summary:")
            log(summary)

            # Persist the fitted model next to the log.
            model_path = os.path.join(output_dir, "random_forest_model.pkl")
            joblib.dump(model, model_path)
            log(f"💾 Model saved to: {model_path}")

            print("✅ MLOps Pipeline Complete. Check output folder for logs, summary, and model.")

    except Exception as e:
        # Deliberate best-effort: keep the notebook running, but record the
        # full traceback so the failure can be diagnosed from the log.
        with open(log_path, "a", encoding="utf-8") as log_file:
            log_file.write(f"[{datetime.datetime.now()}]  ERROR: {str(e)}\n")
            log_file.write(traceback.format_exc())
        print(" Pipeline failed. Check logs for details.")


In [52]:
# Run the full pipeline: load, preprocess, train, evaluate, report, and save the model.
run_pipeline()

Preprocessing Code:
 import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Handle missing values
df['Age'].fillna(df['Age'].median(), inplace=True)
df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)
df.dropna(subset=['Fare'], inplace=True)

# Encode categorical variables
le = LabelEncoder()
for col in ['Sex', 'Embarked']:
    df[col] = le.fit_transform(df[col])

# Drop irrelevant columns
df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1, inplace=True)
📊 Columns after preprocessing: (891, 8)
🔎 Training data shape: (712, 7)
 Training Code:
 from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier()
model.fit(X_train, y_train)
✅ Accuracy: 0.8212290502793296
📊 Classification Report:
               precision    recall  f1-score   support

           0       0.85      0.85      0.85       105
           1       0.78      0.78      0.78        74

    accuracy                           0.82       179
   macro avg       0.82      0