In [3]:
import os
import llama_cpp
import pandas as pd

In [24]:
import pandas as pd

# Load the dataset
df = pd.read_csv(r"D:\downloads\Mistral modeling\pmbok_prompt_completion_pairs.csv")

# Define output path
output_path = r"D:\downloads\Mistral modeling\train_data.txt"

# Format each row and write to a text file
with open(output_path, 'w', encoding='utf-8') as f:
    for _, row in df.iterrows():
        f.write("<SFT>\n")
        f.write(f"Prompt: {row['Prompt']}\n")
        f.write(f"Completion: {row['Completion']}\n\n")

print(f"Data successfully written to {output_path}")


Data successfully written to D:\downloads\Mistral modeling\train_data.txt


In [4]:
import networkx as nx
import pandas as pd

# Initialize the knowledge graph as a directed graph
G = nx.DiGraph()

# Debug message
print("Knowledge graph initialized.")


Knowledge graph initialized.


In [5]:
import fitz  # PyMuPDF for PDF text extraction

def extract_text_from_pdf(pdf_path):
    """
    Extract text from a PDF file.
    """
    text = ""
    try:
        with fitz.open(pdf_path) as pdf:
            for page_num in range(pdf.page_count):
                page = pdf[page_num]
                text += page.get_text("text")
        print("Successfully extracted text from PDF.")
    except Exception as e:
        print(f"Error reading PDF file: {e}")
        text = ""
    return text


In [6]:
# Input cell for activity source
activity_input = None  # Placeholder for activity text
pdf_path = "D:/downloads/file.pdf"  # Set PDF path here if using a PDF, or leave as an empty string

# Full task description as text input for dynamic activity setting
text_input = (
    "Provide a detailed 7-day mobile app development plan. Outline each day’s tasks, key milestones, and expected deliverables. "
    "Include a brief risk assessment with potential risks and mitigation strategies. Keep responses concise and brief."
)

# Conditional logic to determine whether to use PDF or text input
use_pdf = pdf_path != ""

# Set activity_input based on whether to use PDF or text input
if use_pdf:
    activity_input = extract_text_from_pdf(pdf_path)
    if not activity_input:
        print("Warning: PDF text extraction failed, falling back to text input.")
        activity_input = text_input
else:
    activity_input = text_input

print("Activity input set successfully:", activity_input[:200] + "..." if len(activity_input) > 100 else activity_input)


Successfully extracted text from PDF.
Activity input set successfully: Provide a detailed 7-day mobile app development plan. Outline each day’s tasks, key
milestones, and expected deliverables.
...


# Gnerating Graph

In [7]:
import pandas as pd
df = pd.read_csv("D:/downloads/Mistral modeling/body_of_knowledge_with_embeddings_saved.csv")

In [8]:
# Define a function to parse the 'Inputs' field, assuming it contains comma-separated values
def parse_inputs(inputs_string):
    return [item.strip() for item in inputs_string.split(",") if item.strip()]

# Populate the graph with nodes and edges based on the dataset
for idx, row in df.iterrows():
    # Add each process as a node in the graph with default attributes if missing
    process_id = row["Process_ID"]
    process_name = row.get("Process_Name", f"Unnamed_Process_{process_id}")
    knowledge_area = row["Knowledge_Area_Name"]
    process_description = row["Process_Description"]
    
    # Add process as a node with attributes for reasoning
    G.add_node(process_id, name=process_name, area=knowledge_area, description=process_description)
    
    # Parse 'Inputs' field for dependencies if it's a string
    inputs = []
    if isinstance(row["Inputs"], str):
        try:
            inputs = parse_inputs(row["Inputs"])
        except Exception as e:
            print(f"Error parsing Inputs for row {idx}: {e}")
            inputs = []

    # Add edges for each input item, ensuring they are hashable
    for input_item in inputs:
        if isinstance(input_item, str) or isinstance(input_item, int):  # Check if input is hashable
            G.add_edge(input_item, process_id, relationship="dependency")
        else:
            print(f"Skipped non-hashable input for process '{process_name}': {input_item}")

    # Debug message to confirm node and edge addition
    print(f"Added process '{process_name}' with dependencies: {inputs if inputs else 'None'}")

print("Knowledge graph populated with processes and dependencies.")


Added process 'Direct and Manage Project Work' with dependencies: ["[['Project Management Plan'", "'Assignments'", "'Agreements'", "'Project Documents'", "'Issues'", "'Organizational Process Assets']]"]
Added process 'Monitor and Control Project Work' with dependencies: ["[{'Attribute': 'Cost'", "'Value': '402500'}", "{'Attribute': 'Actual Cost'", "'Value': '402500'}", "{'Attribute': 'Actual Finish Date'", "'Value': '2017-11-27'}", "{'Attribute"]
Added process 'Identify Risks' with dependencies: ['Risks', 'Risk Owners']
Added process 'Monitor Stakeholder Engagement' with dependencies: ["[{'Input': 'Project Management Plan'", "'InputId': 1046", "'Output': 'Project Management Plan Update'", "'OutputId': 1075}", "{'Input': 'Project Documents'", "'InputId': 1054", "'Output': 'Project Management Plan Update"]
Added process 'Plan Resource Management' with dependencies: ["[{'Outputs': 'Resource Management Plan'", "'Description': 'The plan that describes how the project team will acquire", 'de

# Reasoing logic for LLama

In [9]:
# Define function to get dependencies of a given process
def get_dependencies(process_id):
    dependencies = list(G.predecessors(process_id))
    print(f"Dependencies for '{G.nodes[process_id]['name']}': {dependencies}")
    return dependencies

# Define function to get stakeholders for a process (based on edge relationships)
def get_stakeholders(process_id):
    stakeholders = [node for node, attr in G.nodes(data=True) if attr.get("area") == "Stakeholder Management"]
    print(f"Stakeholders for '{G.nodes[process_id]['name']}': {stakeholders}")
    return stakeholders

# Define function to identify risk-related dependencies (mocked for simplicity)
def get_risks(process_id):
    risks = [node for node, attr in G.nodes(data=True) if attr.get("area") == "Risk Management"]
    print(f"Risks affecting '{G.nodes[process_id]['name']}': {risks}")
    return risks

# Debugging statements to verify function outputs
print("Reasoning functions initialized.")


Reasoning functions initialized.


In [10]:
# Define a function for Llama to query the graph
def generate_graph_context(activity_input):
    # Find the process ID associated with the activity input
    process_ids = [pid for pid, data in G.nodes(data=True) if data['name'].lower() in activity_input.lower()]
    if not process_ids:
        print("No matching process found in the knowledge graph.")
        return "No relevant process found in the knowledge graph."

    process_id = process_ids[0]  # Assuming the first match
    dependencies = get_dependencies(process_id)
    stakeholders = get_stakeholders(process_id)
    risks = get_risks(process_id)

    # Construct context for Llama's input prompt
    context = (
        f"Activity: {activity_input}\n"
        f"Dependencies: {dependencies}\n"
        f"Stakeholders: {stakeholders}\n"
        f"Risks: {risks}\n"
    )

    # Debug message for constructed context
    print(f"Constructed graph context for Llama:\n{context}")
    return context


# Initializing data and model 

In [11]:
base_dir = "D:\\downloads\\Mistral modeling"

# Specify paths for the model and dataset
model_path = os.path.join(base_dir, "Llama-3.2-1B-Instruct-Q5_K_M.gguf")
dataset_path = os.path.join(base_dir, "pmbok_prompt_completion_pairs.csv")

# Initialize the llama-cpp model with the specified model path
llama = llama_cpp.Llama(model_path=model_path)

# Load the dataset
dataset = pd.read_csv(dataset_path)

llama_model_loader: loaded meta data with 35 key-value pairs and 147 tensors from D:\downloads\Mistral modeling\Llama-3.2-1B-Instruct-Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.type str              = model
llama_model_loader: - kv   2:                               general.name str              = Llama 3.2 1B Instruct
llama_model_loader: - kv   3:                           general.finetune str              = Instruct
llama_model_loader: - kv   4:                           general.basename str              = Llama-3.2
llama_model_loader: - kv   5:                         general.size_label str              = 1B
llama_model_loader: - kv   6:                            general.license str              = llama3.2
llama_model_loader: 

# Formatting

In [12]:

def prepare_prompt_completion(example):
    prompt = example["Prompt"]
    completion = example["Completion"]
    return f"{prompt}\n{completion}"

# Convert the dataset into prompt-completion pairs
formatted_prompts = [prepare_prompt_completion(row) for _, row in dataset.iterrows()]

# Fine tuning Llama-3.2-1B Qunatized version. 

In [18]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments
import torch

def run_finetuning(local_model_path, dataset_path):
    # Load the tokenizer and model from the local quantized model
    tokenizer = AutoTokenizer.from_pretrained(local_model_path)
    model = AutoModelForCausalLM.from_pretrained(local_model_path)
    
    # Check if GPU is available
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)

    # Load dataset
    dataset = load_dataset("csv", data_files=dataset_path)

    # Set training arguments tailored for a quantized model
    training_args = TrainingArguments(
        output_dir="./fine_tuned_model",
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        num_train_epochs=2,
        logging_steps=10,
        save_steps=500,
        fp16=True if device == "cuda" else False,
    )
    
    # Define trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
    )

    # Run training
    try:
        trainer.train()
        print("Fine-tuning completed successfully.")
    except Exception as e:
        print(f"An error occurred during fine-tuning: {e}")


In [21]:
# Set your local model path and dataset path
local_model_path = "Llama-3.2-1B-Instruct-Q5_K_M.gguf"
dataset_path = "pmbok_prompt_completion_pairs.csv"

# Call the function with both required arguments
run_finetuning(local_model_path, dataset_path)


OSError: Llama-3.2-1B-Instruct-Q5_K_M.gguf is not a local folder and is not a valid model identifier listed on 'https://huggingface.co/models'
If this is a private repository, make sure to pass a token having permission to this repo either by logging in with `huggingface-cli login` or by passing `token=<your_token>`

# Activity input and output generation 

In [32]:
def generate_graph_context(activity_input):
    # Lowercase input for case-insensitive matching
    input_keywords = set(activity_input.lower().split())

    # Find process IDs by matching keywords in `name` or `description`
    matching_process_ids = []
    for pid, data in G.nodes(data=True):
        name_keywords = set(data.get("name", "").lower().split())
        description_keywords = set(data.get("description", "").lower().split())
        
        # Check for any overlap between input keywords and node keywords
        if input_keywords & name_keywords or input_keywords & description_keywords:
            matching_process_ids.append(pid)
    
    if not matching_process_ids:
        # Return a fallback context if no match is found
        return "No relevant process found in the knowledge graph."

    # Use the first matched process ID for retrieving relevant information
    process_id = matching_process_ids[0]
    
    # Retrieve and filter dependencies, stakeholders, and risks (keep only string values)
    dependencies = [dep for dep in get_dependencies(process_id) if isinstance(dep, str)]
    stakeholders = [stake for stake in get_stakeholders(process_id) if isinstance(stake, str)]
    risks = [risk for risk in get_risks(process_id) if isinstance(risk, str)]

    # Construct the context summary for Llama's prompt
    context = (
        f"Activity: {activity_input}\n"
        f"Dependencies: {dependencies}\n"
        f"Stakeholders: {stakeholders}\n"
        f"Risks: {risks}\n"
    )
    
    return context  # Only return the constructed context


In [38]:
def generate_response_with_graph(activity_input):
    # Retrieve relevant context from the knowledge graph based on the input
    graph_context = generate_graph_context(activity_input)

    # Tailored prompt to ensure relevance to `activity_input`
    prompt = (
        f"{graph_context}\n\n"
        f"Activity: {activity_input}\n\n"
        "Please generate a practical project plan specific to the activity, including:\n"
        "- Timeline with Key Milestones\n"
        "- Resource Requirements\n"
        "- Dependencies\n"
        "- Brief Risk Assessment with mitigation suggestions\n"
        "- Stakeholder Communication Needs (if relevant)\n\n"
        "Provide actionable, specific information directly related to the activity."
    )
    
    # Print prompt for debugging
    print("Final Prompt to Llama:\n", prompt)

    # Generate response with adjusted parameters for relevance and specificity
    response = llama(
        prompt,
        max_tokens=1900,  # Adjusted for focused output within 900 tokens
        temperature=0,  # Reduced temperature for coherent, concise output
        top_p=0.1,
        presence_penalty=0.8,  # Encourage thorough coverage of all sections
        frequency_penalty=0.3   # Minimize repetitive sections
    )

    # Extract and print the model response text
    output_text = response["choices"][0]["text"]
    print("Model Response:", output_text)
    
    # Validate the response includes the key sections
    required_sections = [
        "Objective and Scope",
        "Timeline with Key Milestones",
        "Resource Requirements",
        "Dependencies",
        "Risk Assessment",
    ]
    
    # Identify missing sections for review
    missing_sections = [section for section in required_sections if section not in output_text]
    if missing_sections:
        print("Warning: The response is missing sections:", missing_sections)
    
    return output_text

# Run the updated function using dynamic `activity_input`
Fine_tuned_Response = generate_response_with_graph(activity_input)


Dependencies for 'Direct and Manage Project Work': ["[['Project Management Plan'", "'Assignments'", "'Agreements'", "'Project Documents'", "'Issues'", "'Organizational Process Assets']]", '[7] Communications Management Plan', '[8] Organizational Process Assets', '[9] Project Documents', "['Work performance data'", "'Work performance information'", "'Team performance reports']"]
Stakeholders for 'Direct and Manage Project Work': [49, 108, 24, 119, 9]
Risks affecting 'Direct and Manage Project Work': [85, 5, 62, 59, 139, 76, 107, 10, 140, 118, 17, 57, 30, 1141, 1031]
Final Prompt to Llama:
 Activity: Provide a detailed 7-day mobile app development plan. Outline each day’s tasks, key
milestones, and expected deliverables.

Dependencies: ["[['Project Management Plan'", "'Assignments'", "'Agreements'", "'Project Documents'", "'Issues'", "'Organizational Process Assets']]", '[7] Communications Management Plan', '[8] Organizational Process Assets', '[9] Project Documents', "['Work performance

Llama.generate: 190 prefix-match hit, remaining 1 prompt tokens to eval
llama_perf_context_print:        load time =    4083.57 ms
llama_perf_context_print: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:        eval time =       0.00 ms /   321 runs   (    0.00 ms per token,      inf tokens per second)
llama_perf_context_print:       total time =   17664.94 ms /   322 tokens


Model Response:  Here is the detailed plan:


**Day 1: Project Management Plan and Assignments**

* **Project Management Plan**: Define project scope, objectives, timelines, budget, resources, and deliverables.
* **Assignments**: Identify team members, their roles, responsibilities, and expectations.
* **Agreements**: Establish communication channels, issue resolution processes, and conflict resolution procedures.
* **Project Documents**: Create a project charter, scope statement, and change management plan.
* **Work performance data**: Collect initial data on team performance, including task completion rates and feedback.

Expected Deliverables:

* Project Management Plan document
* Assignments document
* Agreements document
* Project Charter
* Scope Statement
* Change Management Plan

**Day 2: Organizational Process Assets**

* **Organizational Process Assets**: Identify and document existing processes, procedures, and standards.
* **Project Documents**: Update project documents to r

# Extract Relevant Information from Llama’s Output

In [32]:
import re

def extract_entities_from_output(output_text):
    """
    Extract tasks, dependencies, risks, and stakeholders from Llama's output.
    This function uses regex to identify key phrases for simplicity.
    """
    tasks = re.findall(r"Tasks?: (.+?)(?:\n|$)", output_text)
    dependencies = re.findall(r"Dependencies?: (.+?)(?:\n|$)", output_text)
    risks = re.findall(r"Risks?: (.+?)(?:\n|$)", output_text)
    stakeholders = re.findall(r"Stakeholders?: (.+?)(?:\n|$)", output_text)

    # Debug output for extracted entities
    print("Extracted Tasks:", tasks)
    print("Extracted Dependencies:", dependencies)
    print("Extracted Risks:", risks)
    print("Extracted Stakeholders:", stakeholders)
    
    return tasks, dependencies, risks, stakeholders


# Apply Ponderation Techniques

In [37]:
# Define keyword sets for relevance scoring
task_keywords = {"task", "milestone", "deliverable", "complete"}
dependency_keywords = {"dependency", "requirement", "before", "after"}
risk_keywords = {"risk", "issue", "challenge", "mitigation"}
stakeholder_keywords = {"stakeholder", "team", "manager", "client", "involved"}

def score_entity(entity, keywords):
    """
    Assign a relevance score based on the presence of keywords.
    Higher scores for entities with multiple relevant keywords.
    """
    words = set(entity.lower().split())
    score = sum(1 for word in words if word in keywords)
    return score


In [38]:
def ponderate_entities(entities, keywords):
    """
    Filter and rank entities based on contextual relevance using keyword scoring.
    """
    scored_entities = [(entity, score_entity(entity, keywords)) for entity in entities]
    # Filter out entities with a score of 0 (no relevance)
    scored_entities = [(entity, score) for entity, score in scored_entities if score > 0]
    # Sort by score in descending order
    ranked_entities = sorted(scored_entities, key=lambda x: x[1], reverse=True)
    
    # Debug output for scored entities
    print("Scored and Ranked Entities:", ranked_entities)
    
    # Return only the entity names, not the scores, for final output
    return [entity for entity, score in ranked_entities]



# Graph expansion if applicable

In [39]:
def expand_graph_with_llama_output(output_text):
    # Extract entities from Llama’s output
    tasks, dependencies, risks, stakeholders = extract_entities_from_output(output_text)
    
    # Ponderate each category with contextual scoring
    tasks = ponderate_entities(tasks, task_keywords)
    dependencies = ponderate_entities(dependencies, dependency_keywords)
    risks = ponderate_entities(risks, risk_keywords)
    stakeholders = ponderate_entities(stakeholders, stakeholder_keywords)
    
    # Add tasks as new nodes
    for task in tasks:
        G.add_node(task, type="task")
        print(f"Added task node: {task}")
    
    # Add dependencies as edges between tasks if applicable
    for dependency in dependencies:
        task_links = dependency.split(" and ")
        if len(task_links) == 2:
            G.add_edge(task_links[0].strip(), task_links[1].strip(), relationship="dependency")
            print(f"Added dependency edge: {task_links[0].strip()} -> {task_links[1].strip()}")
    
    # Add risks and link them to tasks
    for risk in risks:
        G.add_node(risk, type="risk")
        for task in tasks:
            G.add_edge(risk, task, relationship="risk_impact")
            print(f"Added risk impact edge: {risk} -> {task}")
    
    # Add stakeholders and link them to tasks
    for stakeholder in stakeholders:
        G.add_node(stakeholder, type="stakeholder")
        for task in tasks:
            G.add_edge(stakeholder, task, relationship="involvement")
            print(f"Added stakeholder involvement edge: {stakeholder} -> {task}")


In [None]:
# Example: Generate a response and expand the graph with new information
activity_input = "Identify additional tasks and risks for the mobile app project lifecycle"
response_text = generate_response_with_graph(activity_input)  # Assuming this generates a response

# Expand the graph using Llama's output
expand_graph_with_llama_output(response_text)


# Refinig output with a more perfomrant models

In [None]:
import os
from groq import Groq

# Loading API
with open("pmkey.txt", "r") as file:
    api_key = file.read().strip()
os.environ["GROQ_API_KEY"] = api_key

# Initialize Groq client
client = Groq(api_key=os.environ.get("GROQ_API_KEY"))

# Define a function to refine the already-generated model output using Groq API
def refine_with_groq(model_output):
    messages = [
        {
            "role": "user",
            "content": f"Refine the following project plan to be more grounded in reality and well-structured:\n\n{model_output}"
        }
    ]
    
    # Send request to Groq API
    chat_completion = client.chat.completions.create(
        messages=messages,
        model="llama3-8b-8192" 
    )
    
    # Retrieve and print the refined output
    refined_output = chat_completion.choices[0].message.content
    print("Refined Output from Groq API:", refined_output)
    return refined_output

# Call refine_with_groq to refine this stored response
refined_output = refine_with_groq(Fine_tuned_Response)


# Human evaluation and other Metrics

In [9]:
from rouge import Rouge
from sentence_transformers import SentenceTransformer, util

reference_text = """
Day 1: Define project scope, objectives, and team roles. 
Day 2: Requirements gathering and stakeholder analysis...
"""  

# ROUGE Score
rouge = Rouge()
rouge_scores = rouge.get_scores(refined_output, reference_text)
print("ROUGE Scores:", rouge_scores)

# Cosine Similarity
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
output_embedding = model.encode(refined_output, convert_to_tensor=True)
reference_embedding = model.encode(reference_text, convert_to_tensor=True)
cosine_similarity = util.pytorch_cos_sim(output_embedding, reference_embedding)
print("Cosine Similarity:", cosine_similarity.item())


ROUGE Scores: [{'rouge-1': {'r': 0.5333333333333333, 'p': 0.034334763948497854, 'f': 0.06451612789574664}, 'rouge-2': {'r': 0.0625, 'p': 0.002347417840375587, 'f': 0.004524886180053751}, 'rouge-l': {'r': 0.5333333333333333, 'p': 0.034334763948497854, 'f': 0.06451612789574664}}]


  attn_output = torch.nn.functional.scaled_dot_product_attention(


Cosine Similarity: 0.5321035981178284
