In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Input table; later cells read its 'cve', 'cwe', 'func_before', 'filename'
# and 'recommendation' columns.
recommendations_csv = './mdr-gpt4o_bigvul_recommendations_progress.csv'
df = pd.read_csv(recommendations_csv)

In [None]:
# Number of rows loaded from the input CSV.
df.shape[0]

In [None]:
# Print a summary of the loaded frame (columns, non-null counts, dtypes).
df.info()

# Load model

In [None]:
from langchain_openai import ChatOpenAI

# Settings for the chat model that generates the fixes, collected in one place
# so they are easy to inspect and tweak.
# NOTE(review): the CSV file names used in this notebook say "gpt4o" while the
# model configured here is "gpt-4o-mini" — confirm this is intended.
llm_settings = {
    "model": "gpt-4o-mini",
    "temperature": 0,
    "max_tokens": 16384,
    "timeout": None,
    "max_retries": 2,
}

llm = ChatOpenAI(**llm_settings)

# Template

In [None]:
from langchain.prompts import ChatPromptTemplate

# System turn: fixed role description for the model.
SYSTEM_MESSAGE = "You are a software engineer and security expert who specializes in generating fixes for vulnerable code affected by different CWEs and CVEs."

# Human turn: sectioned instructions; `{context}` is the only placeholder and
# is filled in per example when the template is invoked.
HUMAN_MESSAGE = """
        # CONTEXT #
        You are a software engineer and security expert who specializes in generating fixes for vulnerable code affected by different CWEs and CVEs.
        
        # OBJECTIVE #
        Generate a fix for the given vulnerable code based on the provided context.
        
        # STYLE #
        Provide the fixed code snippet only, following best practices for secure and efficient coding.
        
        # TONE #
        Professional and technical.
        
        # AUDIENCE #
        Software engineers and security experts.
        
        # RESPONSE FORMAT #
        The response should be a single corrected code snippet without any additional explanations or comments.
        
        # PROMPT #
        Based on the following vulnerable code and the given recommendation, generate a fixed version of the code:
        {context}
    """

messages = [
    ("system", SYSTEM_MESSAGE),
    ("human", HUMAN_MESSAGE),
]

prompt_template = ChatPromptTemplate.from_messages(messages)

# Response

In [None]:
def generate_fix_recommendation(code_before, recommendation, cve, cwe):
    """Ask the LLM for a fixed version of one vulnerable code sample.

    The four arguments are interpolated into a single text block, which is
    substituted for the `{context}` placeholder of the module-level
    `prompt_template` and sent to the module-level `llm`.

    Args:
        code_before: The vulnerable code to be fixed.
        recommendation: The fix recommendation to follow.
        cve: CVE identifier shown to the model.
        cwe: CWE identifier shown to the model.

    Returns:
        The `content` attribute of the model's response.
    """
    context_block = f"""
        Vulnerable code:
        CWE: {cwe}
        CVE: {cve}

        Code:
        {code_before}

        Recommendation:
        {recommendation}
    """

    # Render the chat prompt, call the model, and hand back only the text.
    return llm.invoke(prompt_template.invoke({"context": context_block})).content
        

# Load progress

In [None]:
import os

progress_file = 'gpt4o_bigvul_fixes.csv'

# Start from an empty state; it is replaced below when an earlier run left a
# progress file behind.
results = []
processed_ids = set()

if os.path.exists(progress_file):
    results_df = pd.read_csv(progress_file)
    # Ids are "<cve>_<cwe>" strings, the same shape the processing loop builds.
    cve_text = results_df['cve'].astype(str)
    cwe_text = results_df['cwe'].astype(str)
    processed_ids = set(cve_text + "_" + cwe_text)
    results = results_df.to_dict('records')

In [None]:
# Number of newly generated fixes between checkpoint writes. Every fix costs an
# LLM call, so by default the checkpoint is refreshed after each one.
SAVE_EVERY = 1


def _write_checkpoint(records, path):
    """Persist all records to `path` and return them as a DataFrame.

    The CSV is written to a sibling temp file first and then swapped in with
    os.replace, so an interrupt during the write cannot leave a truncated
    progress file behind.
    """
    frame = pd.DataFrame(records)
    tmp_path = f"{path}.tmp"
    frame.to_csv(tmp_path, index=False)
    os.replace(tmp_path, path)
    return frame


unsaved = 0  # fixes generated since the last checkpoint write
try:
    for index, row in tqdm(df.iterrows(), total=df.shape[0], desc="Processing"):
        current_id = f"{row['cve']}_{row['cwe']}"

        # Skip examples already present in the progress file.
        # NOTE(review): processed_ids is keyed on CVE/CWE only and is not
        # updated inside this loop, so if df holds several rows with the same
        # pair they are all processed in one uninterrupted run but skipped
        # after a resume — confirm whether the key should be more specific.
        if current_id in processed_ids:
            continue

        print(f"Running for {current_id}")

        code_before = row['func_before']

        # Generate fixes
        fix = generate_fix_recommendation(code_before, row['recommendation'], row['cve'], row['cwe'])

        # Store the results
        results.append({
            'cve': row['cve'],
            'cwe': row['cwe'],
            'func_before': code_before,
            'filename': row['filename'],
            'recommendation': row['recommendation'],
            'fix': fix,
        })
        unsaved += 1

        # Checkpoint based on work actually done, not on the DataFrame index
        # label (which also advances for skipped rows).
        if unsaved >= SAVE_EVERY:
            results_df = _write_checkpoint(results, progress_file)
            unsaved = 0
finally:
    # Flush anything still pending if the loop ends or is interrupted between
    # checkpoints (only possible when SAVE_EVERY > 1).
    if unsaved:
        results_df = _write_checkpoint(results, progress_file)