# In [None]:
import pandas as pd
import json
import faiss
from sentence_transformers import SentenceTransformer
from datetime import datetime

# Load the CVE dataset into a DataFrame. Each row is later turned into a set of
# prompt/response pairs by generate_input_output_pairs().
file_path = "./data/normalized_cve_data.csv"  # Replace with your dataset file path
cve_data = pd.read_csv(file_path)

# Initialize the SentenceTransformer model for text embeddings. It is used
# further down to encode every generated prompt before indexing with FAISS.
# NOTE(review): loading a model by name presumably fetches it on first use --
# confirm the target environment has network access or a local cache.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Helpers and entry point for generating input-output pairs
def _clean_field(row, key, default):
    """Return ``row[key]``, or *default* when the key is absent or the value is missing.

    ``Series.get`` only falls back to its default for an absent key. An empty
    CSV cell arrives as NaN, which would otherwise be rendered as "nan" in the
    generated text and would crash string methods such as ``.strip``.
    """
    value = row.get(key, default)
    if pd.api.types.is_scalar(value) and pd.isna(value):
        return default
    return value


def _add_pair(pairs, created_prompts, prompt, response):
    """Append ``{"input": prompt, "output": response}`` unless *prompt* was already emitted."""
    if prompt in created_prompts:
        return
    pairs.append({"input": prompt, "output": response})
    created_prompts.add(prompt)


def generate_input_output_pairs(data):
    """Build prompt/response training pairs from a CVE DataFrame.

    For every row a fixed family of prompts is generated (summary, question,
    risk assessment, categorization, JSON extraction, report, and
    vendor/product/device/firmware/date queries). A prompt text is emitted at
    most once, so row-independent prompts (greeting, top-5 filter) and prompts
    shared by several rows (same device, firmware or publication date) appear
    a single time.

    Args:
        data: pandas DataFrame with one CVE per row. Columns are looked up by
            name ("CVE_ID", "Description", "Device", "EPSS_Score",
            "CVSS Scores", "Severity " (with trailing space), "Published Date",
            "Update Date", ...). Absent columns and empty (NaN) cells fall back
            to a placeholder. The columns "EPSS_Score", "CVSS Scores",
            "Update Date" and "Device" must exist when ``data`` is non-empty,
            because they are used for sorting and filtering.

    Returns:
        A list of ``{"input": ..., "output": ...}`` dicts in generation order.
        ``output`` is a string for most prompts, a dict for the JSON/report
        prompts and a list of row records for the filter prompts. An empty
        ``data`` yields an empty list.
    """
    pairs = []
    # Prompts already emitted (to avoid duplicates).
    created_prompts = set()
    total_rows = len(data)
    # Date prompt -> (pair dict, CVE IDs), so later rows sharing a publication
    # date extend the existing answer instead of being silently dropped.
    published_entries = {}

    # enumerate() gives a true 1-based position; the DataFrame index label may
    # be non-contiguous or non-numeric, so it cannot be used as a counter.
    for position, (_, row) in enumerate(data.iterrows(), start=1):
        cve_id = _clean_field(row, "CVE_ID", "Unknown ID")
        description = _clean_field(row, "Description", "No description provided.")
        device = _clean_field(row, "Device", "Unknown Device")
        epss_score = _clean_field(row, "EPSS_Score", "N/A")
        cvss_scores = _clean_field(row, "CVSS Scores", "N/A")
        # NOTE(review): the column name carries a trailing space -- confirm it
        # matches the dataset header. str() guards against non-string cells.
        severity = str(_clean_field(row, "Severity ", "Unknown Severity")).strip("[]'")
        published_date = _clean_field(row, "Published Date", "Unknown Date")
        update_date = _clean_field(row, "Update Date", "Unknown Date")
        problem_type = _clean_field(row, "Problem Type", "Unknown Problem Type")
        # Additional fields
        exploitability = _clean_field(row, "Exploitability Scores", "N/A")
        access_vector = _clean_field(row, "Vectors", "N/A")
        authentication = _clean_field(row, "Score Sources", "N/A")
        impact = _clean_field(row, "Impact", "N/A")
        # Year is the part before the first "-"; empty when the date is missing.
        year = str(_clean_field(row, "Published Date", "")).split("-")[0]
        vendor = _clean_field(row, "Vendor", "N/A")
        firmware_version = _clean_field(row, "Firmware Version", "N/A")
        product = _clean_field(row, "Product", "N/A")

        # Full record shared by the JSON-extraction and report prompts.
        details = {
            "CVE_ID": cve_id,
            "Description": description,
            "Device": device,
            "EPSS_Score": epss_score,
            "CVSS Scores": cvss_scores,
            "Severity": severity,
            "Published Date": published_date,
            "Update Date": update_date,
            "Problem Type": problem_type,
            "Exploitability": exploitability,
            "Access Vector": access_vector,
            "Authentication": authentication,
            "Impact": impact,
            "Year": year,
            "Vendor": vendor,
            "Firmware Version": firmware_version,
            "Product": product,
        }

        # 1. Greeting and Introduction (row-independent, emitted once)
        greeting_input = "Hello, can you tell me about TargetedVulnAi?"
        greeting_output = "Hello! I am a part of the TargetedVulnAi project, where we focus on providing automated analysis and summaries of CVE data, specifically related to cameras, routers, switches, and NVRs. How can I assist you today?"
        _add_pair(pairs, created_prompts, greeting_input, greeting_output)

        # 2. Summarization Prompt (Including CVE_ID and Update Date)
        summarization_input = f"Summarize CVE {cve_id}: {description} (CVSS: {cvss_scores}, EPSS: {epss_score}, Exploitability: {exploitability})"
        summarization_output = f"Summary: CVE {cve_id} involves a {problem_type} affecting {device}. Severity: {severity}, CVSS: {cvss_scores}, Exploitability: {exploitability}. Last updated on {update_date}. Description: {description}"
        _add_pair(pairs, created_prompts, summarization_input, summarization_output)

        # 3. Question Generation Prompt
        question_input = f"Create a detailed question based on the following vulnerability: {description} (CVSS: {cvss_scores}, EPSS: {epss_score})"
        question_output = f"What are the potential impacts of CVE {cve_id} on {device}? How severe is this vulnerability (CVSS: {cvss_scores}, EPSS: {epss_score})? Last updated on {update_date}."
        _add_pair(pairs, created_prompts, question_input, question_output)

        # 4. Risk Assessment Prompt (Including Problem Type)
        risk_assessment_input = f"Write a comprehensive risk assessment report for CVE {cve_id}, considering the following details: Description: {description}, CVSS Score: {cvss_scores}, EPSS Score: {epss_score}, Exploitability: {exploitability}, Impact: {impact}, Device: {device}."
        risk_assessment_output = f"Risk Assessment for CVE {cve_id}: Severity: {severity}, CVSS: {cvss_scores}, EPSS: {epss_score}, Exploitability: {exploitability}. Published: {published_date}, Last updated: {update_date}. Description: {description}. Impacts: {impact}. Device: {device}. Mitigation steps: [Add appropriate steps here]."
        _add_pair(pairs, created_prompts, risk_assessment_input, risk_assessment_output)

        # 5. Categorization Prompt (Considering Problem Type)
        categorization_input = f"Categorize CVE {cve_id} by Problem Type and Device: {description}, CVSS: {cvss_scores}, EPSS: {epss_score}, Exploitability: {exploitability}."
        categorization_output = f"Problem Type: {problem_type}, Device: {device}, CVSS: {cvss_scores}, EPSS: {epss_score}, Exploitability: {exploitability}"
        _add_pair(pairs, created_prompts, categorization_input, categorization_output)

        # 6. JSON Extraction Prompt (Including all fields and details)
        json_input = f"Extract and format the following CVE details as JSON: CVE {cve_id}, Description: {description}, CVSS: {cvss_scores}, EPSS: {epss_score}, Device: {device}, Impact: {impact}, Exploitability: {exploitability}."
        _add_pair(pairs, created_prompts, json_input, dict(details))

        # 7. Complex Query - Filtering by EPSS, CVSS, and Update Date.
        # The prompt is constant, so the full-dataset sort runs only the first
        # time; the result is limited to 5 rows to match the prompt text.
        complex_query_input = "Filter the CVEs based on EPSS score, CVSS score, and update date. Show the top 5."
        if complex_query_input not in created_prompts:
            top_cves = data.sort_values(
                by=["EPSS_Score", "CVSS Scores", "Update Date"], ascending=False
            ).head(5)
            _add_pair(pairs, created_prompts, complex_query_input, top_cves.to_dict(orient='records'))

        # 8. Report Generation Prompt - For Each CVE (With All Fields)
        report_input = f"Generate a detailed report for CVE {cve_id} including all the relevant details."
        _add_pair(pairs, created_prompts, report_input, dict(details))

        # 9. Queries for Vendor
        vendor_input = f"What vendor is affected by CVE {cve_id}?"
        vendor_output = f"CVE {cve_id} affects the vendor: {vendor}."
        _add_pair(pairs, created_prompts, vendor_input, vendor_output)

        vendor_comparison_input = f"Compare CVE {cve_id} with other CVEs from the same vendor."
        vendor_comparison_output = f"CVE {cve_id} is from {vendor}. Other CVEs from {vendor} may share similar characteristics like {description}."
        _add_pair(pairs, created_prompts, vendor_comparison_input, vendor_comparison_output)

        vendor_mitigation_input = f"What mitigation steps are available for CVE {cve_id} from {vendor}?"
        vendor_mitigation_output = f"Mitigation steps for CVE {cve_id} from {vendor}: [Vendor-specific steps]."
        _add_pair(pairs, created_prompts, vendor_mitigation_input, vendor_mitigation_output)

        # 10. Queries for Product
        product_input = f"Which product is affected by CVE {cve_id}?"
        product_output = f"CVE {cve_id} affects the product: {product}."
        _add_pair(pairs, created_prompts, product_input, product_output)

        product_mitigation_input = f"What are the mitigation strategies for CVE {cve_id} affecting {product}?"
        product_mitigation_output = f"Mitigation strategies for CVE {cve_id} affecting {product}: [Product-specific mitigation details]."
        _add_pair(pairs, created_prompts, product_mitigation_input, product_mitigation_output)

        # 11. Queries for Device
        device_input = f"Which type of device is affected by CVE {cve_id}?"
        device_output = f"CVE {cve_id} affects the device: {device}."
        _add_pair(pairs, created_prompts, device_input, device_output)

        # Filter the dataset only the first time a device is seen.
        device_filter_input = f"Filter CVEs that affect {device}."
        if device_filter_input not in created_prompts:
            same_device = data[data["Device"] == device]
            _add_pair(pairs, created_prompts, device_filter_input, same_device.to_dict(orient='records'))

        # 12. Queries for Firmware Version
        firmware_input = f"Which firmware versions are affected by CVE {cve_id}?"
        firmware_output = f"CVE {cve_id} affects firmware version: {firmware_version}."
        _add_pair(pairs, created_prompts, firmware_input, firmware_output)

        firmware_comparison_input = f"Compare CVEs based on firmware version {firmware_version}."
        firmware_comparison_output = f"CVE {cve_id} with firmware version {firmware_version} is compared with others for potential vulnerabilities."
        _add_pair(pairs, created_prompts, firmware_comparison_input, firmware_comparison_output)

        # 13. Queries for Dates (Published and Updated).
        # One pair per date; every CVE sharing that date is added to its answer
        # so the list is complete rather than naming only the first CVE seen.
        date_published_input = f"What CVEs were published on {published_date}?"
        if date_published_input not in created_prompts:
            date_pair = {"input": date_published_input, "output": ""}
            pairs.append(date_pair)
            created_prompts.add(date_published_input)
            published_entries[date_published_input] = (date_pair, [])
        date_pair, date_ids = published_entries[date_published_input]
        if cve_id not in date_ids:
            date_ids.append(cve_id)
            id_list = ", ".join(str(item) for item in date_ids)
            date_pair["output"] = f"CVEs published on {published_date}: [{id_list}]"

        # Status Update
        print(f"Processed CVE {cve_id} - Progress: {position}/{total_rows} CVEs")

    return pairs

# Generate pairs for the entire dataset (no predefined size)
input_output_pairs = generate_input_output_pairs(cve_data)

# Convert to DataFrame: one row per pair, built from dicts keyed "input" and "output".
# NOTE: if no pairs were generated the frame has no "input" column and the
# lookup below raises KeyError.
df = pd.DataFrame(input_output_pairs)

# Prepare FAISS index
# First, we need to get the embeddings for the 'input' column
input_texts = df['input'].tolist()

# Encode the inputs into vectors
# NOTE(review): assumes encode() returns a 2-D array of shape
# (len(input_texts), embedding_dim); the .shape[1] and .astype calls below
# rely on that -- confirm against the sentence-transformers version in use.
input_embeddings = model.encode(input_texts)

# Create a FAISS index
dimension = input_embeddings.shape[1]  # Embedding dimension size
index = faiss.IndexFlatL2(dimension)  # L2 distance metric for similarity search
# Vectors are added in df row order, so position i in the index corresponds to
# row i of df (and of the CSV written below).
index.add(input_embeddings.astype('float32'))  # Add embeddings to the index

# Save the FAISS index to a file
faiss.write_index(index, './data/faiss_index.index')

# Save the generated input-output pairs to a CSV file
# NOTE(review): some "output" values are dicts or lists of records rather than
# strings; they are presumably written in their Python string form, not as
# JSON -- confirm this is the format downstream consumers expect.
output_file = "./data/generated_input_output_pairs.csv"
df.to_csv(output_file, index=False)
print(f"Generated {len(input_output_pairs)} input-output pairs saved to {output_file}")
