In [None]:

# Step 1: Install Required Libraries
# !pip install pandas faiss-cpu sentence-transformers openai

In [66]:
import pandas as pd
import numpy as np
import faiss
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import normalize
from openai import OpenAI

# Step 1: Model synonym mapping (all lowercase)
MODEL_SYNONYMS = {
    "bizdevops": ["bizdevops", "biz"],
    "bucena": ["bucena", "bucena maturity model", "bucena mm"],
    "calms": ["calms","calm"],
    "cmmi": ["cmmi", "capability maturity model integration"],
    "capgemini": ["capgemini", "cap gemini"],
    "dmm4ssc": ["devops maturity model for small software companies","dmm4ssc"],
    "de feijter": ["de feijter", "defeijter"],
    "eficode": ["eficode","efi code","eficod"],
    "hp": ["hp", "hewlett-packard","hewlett packard"],
    "ibm": ["ibm", "international business machines"],
    "mmbdo (maturity model for bizdevops)": [
        "mmbdo", "mmbdo (maturity model for bizdevops)",
        "mmbdo (maturity model for biz)", "mmbdo(maturitymodelforbizdevops)",
        "mmbdo ( for bizdevops)"
    ],
    "mohamed": ["mohamed","mohammed"],
    "neubrand": ["neubrand","neu brand","newbrand","new brand"],
    "quantitative maturity assessment of devsecops": [
        "quantitative maturity assessment of devsecops",
        "quantitativematurityassessmentofdevsecops",
        "qmaodso"
    ],
    "radstaak": ["radstaak","radstak","rad staak","rad stak"],
    "teixeira": ["teixeira"],
    "xmatters": ["xmatters","xmatter"],
    "rmcosd": ["rmcosd","readiness model for cloud outsourcing software development" ],
    "dcmm": ["dcmm","devops capability maturity model" ],
    "opensamm" : ["opensamm","open software assurance maturity model","open samm","opensam","open sam"]
}

# Step 2: Load and clean datasets
df_models = pd.read_csv("devops_maturity_models.csv")
df_components = pd.read_csv("mmcmse.csv")

df_models.columns = df_models.columns.str.strip().str.lower()
df_components.columns = df_components.columns.str.strip().str.lower()

# Step 3: Normalize model name using synonyms
def normalize_model_name(name):
    name = str(name).lower().strip()
    for canonical, aliases in MODEL_SYNONYMS.items():
        if name == canonical or name in aliases:
            return canonical.title()
    return name.title()

# Step 4: Extract model names from context
def extract_model_name(context):
    context = str(context).lower()
    for canonical, aliases in MODEL_SYNONYMS.items():
        for alias in aliases:
            if alias in context:
                return canonical.title()
    return None

df_components["model"] = df_components["context"].apply(extract_model_name)

# Step 5: Create semantic chunks
def create_chunks(df_models, df_components):
    chunks = []

    df_models["maturity_model_name"] = df_models["maturity_model_name"].apply(normalize_model_name)
    df_components["model"] = df_components["model"].apply(normalize_model_name)

    for _, row in df_models.iterrows():
        model = row['maturity_model_name']
        if pd.isna(model):
            continue
        text_parts = [f"Maturity Model: {model}"]
        for col in ['dimension', 'subdimension', 'level', 'metric', 'success_factor', 'evaluation_method']:
            val = row.get(col)
            if pd.notna(val) and str(val).strip() != "":
                text_parts.append(f"{col.capitalize()}: {val}")
        if len(text_parts) > 1:
            full_text = "\n".join(text_parts)
            chunks.append({"model": model, "text": full_text, "source": "devops_maturity_models"})

    for _, row in df_components.iterrows():
        if pd.notna(row.get("context")) and pd.notna(row.get("keyword")):
            model = row.get("model", "Unknown")
            entry = f"Maturity Model: {model}\n{row['type'].capitalize()}: {row['keyword']}\nContext: {row['context']}"
            chunks.append({"model": model, "text": entry, "source": "mmcmse"})

    return pd.DataFrame(chunks)

# Step 6: Build FAISS index
def build_faiss_index(df_chunks, model_name="all-MiniLM-L6-v2"):
    embedder = SentenceTransformer(model_name)
    embeddings = embedder.encode(df_chunks["text"].tolist(), convert_to_tensor=False)
    embeddings = normalize(np.array(embeddings))
    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)
    return index, embeddings, embedder

# Step 7: Semantic search
def search_query(query, df_chunks, index, embedder, top_k=10):
    query_vec = embedder.encode([query], convert_to_tensor=False)
    query_vec = normalize(np.array(query_vec))
    D, I = index.search(query_vec, top_k)
    results = df_chunks.iloc[I[0]]

    # 🧹 Filter out missing/unknown models
    results = results[results["model"].notna() & (results["model"].str.lower() != "none") & (results["model"].str.strip() != "")]
    return results

# Step 8: RAG context generation
def generate_rag_context(query, df_chunks, index, embedder, top_k=30):
    # Normalize query for case-insensitive matching
    query_lower = query.lower()

    # Step 1: Match models from user query using synonyms (case-insensitive)
    matched_models = []
    for canonical_model, aliases in MODEL_SYNONYMS.items():
        for alias in aliases:
            if alias.lower() in query_lower:
                matched_models.append(canonical_model)
                break  # No need to check other aliases for this model

    # Step 2: If matched, return only those chunks
    if matched_models:
        # Normalize model names in df_chunks for comparison
        df_chunks["model_normalized"] = df_chunks["model"].str.lower().str.strip()

        allowed_models = [model.lower().strip() for model in matched_models]
        filtered_chunks = df_chunks[df_chunks["model_normalized"].isin(allowed_models)]

        # Remove duplicate rows by text
        filtered_chunks = filtered_chunks.drop_duplicates(subset="text")

        context = "\n---\n".join(filtered_chunks["text"].tolist())
        return context

    # Step 3: If no explicit model match, fall back to semantic search
    retrieved = search_query(query, df_chunks, index, embedder, top_k)
    context = "\n---\n".join(retrieved["text"].drop_duplicates().tolist())
    return context



# Step 9: OpenAI query with context
def query_openai_with_context(user_query, retrieved_context, api_key, model="gpt-4"):
    client = OpenAI(api_key=api_key)
    system_prompt = "You are a DevOps maturity model expert. Answer questions based on the following information."
    full_prompt = f"{retrieved_context}\n\nUser Question: {user_query}"
    response = client.chat.completions.create(
        model=model,
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": full_prompt}
        ],
        temperature=0.3
    )
    return response.choices[0].message.content

# Step 10: Pipeline runner
def run_rag_pipeline(user_query, api_key, top_k=10):
    df_chunks = create_chunks(df_models, df_components)
    index, embeddings, embedder = build_faiss_index(df_chunks)
    context = generate_rag_context(user_query, df_chunks, index, embedder, top_k=top_k)
    print("Retrieved Chunks:\n", context)
    print("\n---\nAnswer from OpenAI:\n")
    answer = query_openai_with_context(user_query, context, api_key)
    print(answer)

# Step 11: Interactive input loop
if __name__ == "__main__":
    api_key = "OpenAI_Key"
    while True:
        user_query = input("\nEnter your DevOps maturity model question (or type 'exit' to quit): ")
        if user_query.strip().lower() == 'exit':
            print("Exiting.")
            break
        run_rag_pipeline(user_query, api_key)



Enter your DevOps maturity model question (or type 'exit' to quit):  automation


Retrieved Chunks:
 

---
Answer from OpenAI:

As a DevOps maturity model expert, I can say that automation is a critical aspect of DevOps. It refers to the process of using technology to perform tasks with reduced human assistance. In the context of DevOps, automation can be applied to various stages of the software delivery lifecycle, including development, testing, deployment, and operations. 

In the DevOps maturity model, automation is often seen as a progression. Organizations may start with basic automation of build and deployment processes and gradually move towards more advanced practices like infrastructure as code, automated testing, and automated monitoring and recovery. 

The ultimate goal is to achieve Continuous Integration/Continuous Delivery (CI/CD), where code changes are automatically built, tested, and deployed to production. This not only speeds up the software delivery process but also reduces the risk of human error and improves the overall quality of the software


Enter your DevOps maturity model question (or type 'exit' to quit):  exit


Exiting.
