In [None]:
pip install openai

In [1]:
import openai

In [2]:
openai.api_key = 'key'

In [3]:
!pip install chardet



In [None]:
# basic test

In [4]:
response = openai.chat.completions.create(  # ✅ Use the new API format
    model="gpt-4o",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Can you explain how GPT-4o works?"}
    ]
)

assistant_reply = response.choices[0].message.content  # ✅ Updated response handling
print(assistant_reply)

GPT-4o is an iteration of the GPT (Generative Pre-trained Transformer) series developed by OpenAI. While specific technical details of GPT-4o may not be explicitly available, I can provide a general understanding based on how GPT models typically work and any improvements that might be expected from a more advanced version:

1. **Transformer Architecture**: Like its predecessors, GPT-4o is built on the transformer architecture, which uses mechanisms like self-attention and feed-forward neural networks to process and generate text. This architecture allows the model to understand the context and relationships between words in a sentence.

2. **Pre-training and Fine-tuning**:
   - **Pre-training**: The model is pre-trained on a large corpus of text from the internet. During this phase, it learns to predict the next word in a sentence, thereby capturing grammar, facts, reasoning abilities, and some level of world knowledge.
   - **Fine-tuning**: After pre-training, GPT-4o is fine-tuned on

In [23]:
import os
import openai
import pandas as pd
import json
import time
import chardet  # Detects file encoding

# Define paths
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\prompts"
LLM_NAME = "gpt-4o"  # Change this if using a different LLM
RESULTS_DIR = os.path.join(PROMPT_DIR, LLM_NAME)
os.makedirs(RESULTS_DIR, exist_ok=True)  # Ensure directory exists

# Prepare storage for results
results = []
error_logs = []

# List all prompt files
prompt_files = [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")]

for idx, prompt_file in enumerate(prompt_files, start=1):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect file encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']
    
    # Read the prompt content with detected encoding
    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        print(f"❌ Encoding issue with {prompt_file}: {e}")
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue  # Skip this file and move to next

    # Print prompt details
    prompt_length = len(prompt_content)
    print(f"\n📜 [{idx}/{len(prompt_files)}] Processing: {prompt_file} (Length: {prompt_length} characters)")

    # **Skip processing if the prompt is too long**
    MAX_LENGTH = 100000  # Adjust this limit as needed
    if prompt_length > MAX_LENGTH:
        print(f"⚠️ Skipping {prompt_file} because it exceeds {MAX_LENGTH} characters.")
        error_logs.append([prompt_file, "Skipped", "Prompt too long"])
        continue  # Skip to next prompt

    # Send the prompt to GPT-4o
    try:
        response = openai.chat.completions.create(
            model=LLM_NAME,
            messages=[
                {"role": "system", "content": "You are a cybersecurity expert analyzing software vulnerabilities. "
                                              "Provide your response **only in the following strict JSON format**:\n"
                                              "{\n"
                                              '    "answer": "Yes" or "No",\n'
                                              '    "reasoning": "Concise but detailed explanation."\n'
                                              "}\n"
                                              "Do not include any extra text outside this JSON structure."},
                {"role": "user", "content": prompt_content}
            ]
        )

        # Extract response
        assistant_reply = response.choices[0].message.content.strip()

        # ✅ **Remove surrounding triple backticks (` ```json ... ``` `)**
        if assistant_reply.startswith("```json"):
            assistant_reply = assistant_reply[7:]  # Remove starting ```json
        if assistant_reply.endswith("```"):
            assistant_reply = assistant_reply[:-3]  # Remove ending ```

        # Parse JSON response
        try:
            response_json = json.loads(assistant_reply)  # Ensure it's valid JSON
            answer = response_json.get("answer", "N/A")
            reasoning = response_json.get("reasoning", "N/A")

            # Store results
            results.append([prompt_file, answer, reasoning])
            print(f"✅ Successfully processed {prompt_file}")

        except json.JSONDecodeError:
            print(f"❌ Invalid JSON response for {prompt_file}: {assistant_reply[:500]}")
            error_logs.append([prompt_file, "Invalid JSON", assistant_reply])
            results.append([prompt_file, "Error", "Invalid JSON response"])

    except Exception as e:
        print(f"❌ Error processing {prompt_file}: {e}")
        error_logs.append([prompt_file, "API Error", str(e)])
        results.append([prompt_file, "Error", str(e)])

    # **Longer delay to prevent API rate limits**
    time.sleep(10)  # Adjust if needed

# Convert results to DataFrame
df_results = pd.DataFrame(results, columns=["Prompt File", "Answer", "Reasoning"])

# Save to Excel
xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_results.xlsx")
df_results.to_excel(xlsx_path, index=False)

# Save error logs separately
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    error_xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_errors.xlsx")
    df_errors.to_excel(error_xlsx_path, index=False)
    print(f"\n⚠️ Errors saved separately in: {error_xlsx_path}")

print(f"\n✅ All responses saved successfully in: {xlsx_path}")



📜 [1/50] Processing: aircrack-ng_aircrack-ng_da087238963c1239fdabd47dc1b65279605aca70.txt (Length: 1868 characters)
✅ Successfully processed aircrack-ng_aircrack-ng_da087238963c1239fdabd47dc1b65279605aca70.txt

📜 [2/50] Processing: bdwgc_bdwgc_6a93f8e5bcad22137f41b6c60a1c7384baaec2b3.txt (Length: 2613 characters)
✅ Successfully processed bdwgc_bdwgc_6a93f8e5bcad22137f41b6c60a1c7384baaec2b3.txt

📜 [3/50] Processing: bubblewrap_bubblewrap_d7fc532c42f0e9bf427923bab85433282b3e5117.txt (Length: 3268 characters)
❌ Invalid JSON response for bubblewrap_bubblewrap_d7fc532c42f0e9bf427923bab85433282b3e5117.txt: ```{"answer": "Yes", "reasoning": "The previous fix introduced a mandatory use of setsid(), which could break applications that relied on the older behavior of not creating a new session. By making the call to setsid() optional with the --new-session flag, the future candidate commit addresses a potential compatibility issue for applications that did not require a new terminal session. Wh

In [24]:
# modified to handle errors

In [28]:
import os
import openai
import pandas as pd
import json
import time
import chardet  # Detects file encoding

# Define paths
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\prompts"
LLM_NAME = "gpt-4o"  # Change this if using a different LLM
RESULTS_DIR = os.path.join(PROMPT_DIR, LLM_NAME)
os.makedirs(RESULTS_DIR, exist_ok=True)  # Ensure directory exists

# Prepare storage for results
results = []
error_logs = []

# List all prompt files
prompt_files = [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")]

for idx, prompt_file in enumerate(prompt_files, start=1):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect file encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']
    
    # Read the prompt content with detected encoding
    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        print(f"❌ Encoding issue with {prompt_file}: {e}")
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue  # Skip this file and move to next

    # Print prompt details
    prompt_length = len(prompt_content)
    print(f"\n📜 [{idx}/{len(prompt_files)}] Processing: {prompt_file} (Length: {prompt_length} characters)")

    # **Skip processing if the prompt is too long**
    MAX_LENGTH = 100000  # Adjust this limit as needed
    if prompt_length > MAX_LENGTH:
        print(f"⚠️ Skipping {prompt_file} because it exceeds {MAX_LENGTH} characters.")
        results.append([prompt_file, "Skipped", "Prompt too long"])
        continue  # Skip to next prompt

    # **Retry logic for invalid JSON responses**
    retry_attempts = 3
    for attempt in range(retry_attempts):
        try:
            response = openai.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "system", "content": "You are a cybersecurity expert analyzing software vulnerabilities. "
                                                  "Provide your response **only in the following strict JSON format**:\n"
                                                  "{\n"
                                                  '    "answer": "Yes" or "No",\n'
                                                  '    "reasoning": "Concise but detailed explanation."\n'
                                                  "}\n"
                                                  "Do not include any extra text outside this JSON structure."},
                    {"role": "user", "content": prompt_content}
                ]
            )

            # Extract response
            assistant_reply = response.choices[0].message.content.strip()

            # ✅ **Attempt to clean JSON formatting**
            if assistant_reply.startswith("```json"):
                assistant_reply = assistant_reply[7:]  # Remove starting ```json
            if assistant_reply.endswith("```"):
                assistant_reply = assistant_reply[:-3]  # Remove ending ```

            try:
                response_json = json.loads(assistant_reply)  # Ensure it's valid JSON
                answer = response_json.get("answer", "N/A")
                reasoning = response_json.get("reasoning", "N/A")

                # Store results
                results.append([prompt_file, answer, reasoning])
                print(f"✅ Successfully processed {prompt_file}")
                break  # **Exit retry loop on success**

            except json.JSONDecodeError:
                print(f"⚠️ Attempt {attempt + 1}/{retry_attempts}: Invalid JSON response for {prompt_file}, retrying...")

        except Exception as e:
            print(f"❌ Error processing {prompt_file}: {e}")
            error_logs.append([prompt_file, "API Error", str(e)])
            results.append([prompt_file, "Error", str(e)])
            break  # Stop retrying for API errors

        # **Delay before retrying**
        time.sleep(10)

    else:
        # If all retry attempts fail, log as invalid JSON
        print(f"❌ All {retry_attempts} attempts failed for {prompt_file}. Marking as error.")
        error_logs.append([prompt_file, "Invalid JSON", assistant_reply])
        results.append([prompt_file, "Error", "Invalid JSON response"])

    # **Longer delay to prevent API rate limits**
    time.sleep(10)  # Adjust if needed

# Convert results to DataFrame
df_results = pd.DataFrame(results, columns=["Prompt File", "Answer", "Reasoning"])

# Save to Excel
xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_results.xlsx")
df_results.to_excel(xlsx_path, index=False)

# Save error logs separately
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    error_xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_errors.xlsx")
    df_errors.to_excel(error_xlsx_path, index=False)
    print(f"\n⚠️ Errors saved separately in: {error_xlsx_path}")

print(f"\n✅ All responses saved successfully in: {xlsx_path}")



📜 [1/50] Processing: aircrack-ng_aircrack-ng_da087238963c1239fdabd47dc1b65279605aca70.txt (Length: 1868 characters)
✅ Successfully processed aircrack-ng_aircrack-ng_da087238963c1239fdabd47dc1b65279605aca70.txt

📜 [2/50] Processing: bdwgc_bdwgc_6a93f8e5bcad22137f41b6c60a1c7384baaec2b3.txt (Length: 2613 characters)
✅ Successfully processed bdwgc_bdwgc_6a93f8e5bcad22137f41b6c60a1c7384baaec2b3.txt

📜 [3/50] Processing: bubblewrap_bubblewrap_d7fc532c42f0e9bf427923bab85433282b3e5117.txt (Length: 3268 characters)
✅ Successfully processed bubblewrap_bubblewrap_d7fc532c42f0e9bf427923bab85433282b3e5117.txt

📜 [4/50] Processing: cgminer_cgminer_e1c5050734123973b99d181c45e74b2cbb00272e.txt (Length: 5451 characters)
✅ Successfully processed cgminer_cgminer_e1c5050734123973b99d181c45e74b2cbb00272e.txt

📜 [5/50] Processing: chromium_chromium_dcd538eb3daf6c52d3ebef0a7afea758f6c657c8.txt (Length: 4327 characters)
✅ Successfully processed chromium_chromium_dcd538eb3daf6c52d3ebef0a7afea758f6c657c8.txt



In [None]:
# prompts2

In [13]:
import os
import openai
import pandas as pd
import json
import time
import chardet  # Detects file encoding

# Define paths
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\prompts2"
LLM_NAME = "gpt-4o"  # Change this if using a different LLM
RESULTS_DIR = os.path.join(PROMPT_DIR, LLM_NAME)
os.makedirs(RESULTS_DIR, exist_ok=True)  # Ensure directory exists

# Prepare storage for results
results = []
error_logs = []

# List all prompt files
prompt_files = [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")]

for idx, prompt_file in enumerate(prompt_files, start=1):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect file encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']
    
    # Read the prompt content with detected encoding
    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        print(f"❌ Encoding issue with {prompt_file}: {e}")
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue  # Skip this file and move to next

    # Print prompt details
    prompt_length = len(prompt_content)
    print(f"\n📜 [{idx}/{len(prompt_files)}] Processing: {prompt_file} (Length: {prompt_length} characters)")

    # **Skip processing if the prompt is too long**
    MAX_LENGTH = 100000  # Adjust this limit as needed
    if prompt_length > MAX_LENGTH:
        print(f"⚠️ Skipping {prompt_file} because it exceeds {MAX_LENGTH} characters.")
        results.append([prompt_file, "Skipped", "Prompt too long"])
        continue  # Skip to next prompt

    # **Retry logic for invalid JSON responses**
    retry_attempts = 3
    for attempt in range(retry_attempts):
        try:
            response = openai.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "system", "content": "You are a cybersecurity expert analyzing software vulnerabilities. "
                                                  "Provide your response **only in the following strict JSON format**:\n"
                                                  "{\n"
                                                  '    "answer": "Yes" or "No",\n'
                                                  '    "reasoning": "Concise but detailed explanation."\n'
                                                  "}\n"
                                                  "Do not include any extra text outside this JSON structure."},
                    {"role": "user", "content": prompt_content}
                ]
            )

            # Extract response
            assistant_reply = response.choices[0].message.content.strip()

            # ✅ **Attempt to clean JSON formatting**
            if assistant_reply.startswith("```json"):
                assistant_reply = assistant_reply[7:]  # Remove starting ```json
            if assistant_reply.endswith("```"):
                assistant_reply = assistant_reply[:-3]  # Remove ending ```

            try:
                response_json = json.loads(assistant_reply)  # Ensure it's valid JSON
                answer = response_json.get("answer", "N/A")
                reasoning = response_json.get("reasoning", "N/A")

                # Store results
                results.append([prompt_file, answer, reasoning])
                print(f"✅ Successfully processed {prompt_file}")
                break  # **Exit retry loop on success**

            except json.JSONDecodeError:
                print(f"⚠️ Attempt {attempt + 1}/{retry_attempts}: Invalid JSON response for {prompt_file}, retrying...")

        except Exception as e:
            print(f"❌ Error processing {prompt_file}: {e}")
            error_logs.append([prompt_file, "API Error", str(e)])
            results.append([prompt_file, "Error", str(e)])
            break  # Stop retrying for API errors

        # **Delay before retrying**
        time.sleep(10)

    else:
        # If all retry attempts fail, log as invalid JSON
        print(f"❌ All {retry_attempts} attempts failed for {prompt_file}. Marking as error.")
        error_logs.append([prompt_file, "Invalid JSON", assistant_reply])
        results.append([prompt_file, "Error", "Invalid JSON response"])

    # **Longer delay to prevent API rate limits**
    time.sleep(10)  # Adjust if needed

# Convert results to DataFrame
df_results = pd.DataFrame(results, columns=["Prompt File", "Answer", "Reasoning"])

# Save to Excel
xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_results.xlsx")
df_results.to_excel(xlsx_path, index=False)

# Save error logs separately
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    error_xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_errors.xlsx")
    df_errors.to_excel(error_xlsx_path, index=False)
    print(f"\n⚠️ Errors saved separately in: {error_xlsx_path}")

print(f"\n✅ All responses saved successfully in: {xlsx_path}")



📜 [1/50] Processing: aircrack-ng_aircrack-ng_da087238963c1239fdabd47dc1b65279605aca70.txt (Length: 2146 characters)
❌ Error processing aircrack-ng_aircrack-ng_da087238963c1239fdabd47dc1b65279605aca70.txt: module 'openai' has no attribute 'chat'


KeyboardInterrupt: 

In [None]:
# temp 0.3

In [6]:
import os
import openai
import pandas as pd
import json
import time
import chardet  # Detects file encoding

# Define paths
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\newDiverseSample"
LLM_NAME = "gpt-4o"  # Change this if using a different LLM
RESULTS_DIR = os.path.join(PROMPT_DIR, LLM_NAME)
os.makedirs(RESULTS_DIR, exist_ok=True)  # Ensure directory exists

# Prepare storage for results
results = []
error_logs = []

# List all prompt files
prompt_files = [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")]

for idx, prompt_file in enumerate(prompt_files, start=1):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect file encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']
    
    # Read the prompt content with detected encoding
    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        print(f"❌ Encoding issue with {prompt_file}: {e}")
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue  # Skip this file and move to next

    # Print prompt details
    prompt_length = len(prompt_content)
    print(f"\n📜 [{idx}/{len(prompt_files)}] Processing: {prompt_file} (Length: {prompt_length} characters)")

    # **Skip processing if the prompt is too long**
    MAX_LENGTH = 100000  # Adjust this limit as needed
    if prompt_length > MAX_LENGTH:
        print(f"⚠️ Skipping {prompt_file} because it exceeds {MAX_LENGTH} characters.")
        results.append([prompt_file, "Skipped", "Prompt too long"])
        continue  # Skip to next prompt

    # **Retry logic for invalid JSON responses**
    retry_attempts = 3
    for attempt in range(retry_attempts):
        try:
            response = openai.chat.completions.create(
                model=LLM_NAME,
                temperature=0.3,
                messages=[
                    {"role": "system", "content": "You are a cybersecurity expert analyzing software vulnerabilities. "
                                                  "Provide your response **only in the following strict JSON format**:\n"
                                                  "{\n"
                                                  '    "answer": "Yes" or "No",\n'
                                                  '    "reasoning": "Concise but detailed explanation."\n'
                                                  "}\n"
                                                  "Do not include any extra text outside this JSON structure."},
                    {"role": "user", "content": prompt_content}
                ]
            )

            # Extract response
            assistant_reply = response.choices[0].message.content.strip()

            # ✅ **Attempt to clean JSON formatting**
            if assistant_reply.startswith("```json"):
                assistant_reply = assistant_reply[7:]  # Remove starting ```json
            if assistant_reply.endswith("```"):
                assistant_reply = assistant_reply[:-3]  # Remove ending ```

            try:
                response_json = json.loads(assistant_reply)  # Ensure it's valid JSON
                answer = response_json.get("answer", "N/A")
                reasoning = response_json.get("reasoning", "N/A")

                # Store results
                results.append([prompt_file, answer, reasoning])
                print(f"✅ Successfully processed {prompt_file}")
                break  # **Exit retry loop on success**

            except json.JSONDecodeError:
                print(f"⚠️ Attempt {attempt + 1}/{retry_attempts}: Invalid JSON response for {prompt_file}, retrying...")

        except Exception as e:
            print(f"❌ Error processing {prompt_file}: {e}")
            error_logs.append([prompt_file, "API Error", str(e)])
            results.append([prompt_file, "Error", str(e)])
            break  # Stop retrying for API errors

        # **Delay before retrying**
        time.sleep(10)

    else:
        # If all retry attempts fail, log as invalid JSON
        print(f"❌ All {retry_attempts} attempts failed for {prompt_file}. Marking as error.")
        error_logs.append([prompt_file, "Invalid JSON", assistant_reply])
        results.append([prompt_file, "Error", "Invalid JSON response"])

    # **Longer delay to prevent API rate limits**
    time.sleep(10)  # Adjust if needed

# Convert results to DataFrame
df_results = pd.DataFrame(results, columns=["Prompt File", "Answer", "Reasoning"])

# Save to Excel
xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_resultsTempSetting.xlsx")
df_results.to_excel(xlsx_path, index=False)

# Save error logs separately
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    error_xlsx_path = os.path.join(RESULTS_DIR, f"{LLM_NAME}_errorsTempSetting.xlsx")
    df_errors.to_excel(error_xlsx_path, index=False)
    print(f"\n⚠️ Errors saved separately in: {error_xlsx_path}")

print(f"\n✅ All responses saved successfully in: {xlsx_path}")



📜 [1/46] Processing: 1682_libxkbcommon_libxkbcommon_842e4351c2c97de6051cab6ce36b4a81e709a0e1_parser.c_changes_v9_v10.txt.txt (Length: 10295 characters)
✅ Successfully processed 1682_libxkbcommon_libxkbcommon_842e4351c2c97de6051cab6ce36b4a81e709a0e1_parser.c_changes_v9_v10.txt.txt

📜 [2/46] Processing: 1697_libxml2_libxml2_899a5d9f0ed13b8e32449a08a361e0de127dd961_parser.c_changes_v342_v343.txt.txt (Length: 3175 characters)
✅ Successfully processed 1697_libxml2_libxml2_899a5d9f0ed13b8e32449a08a361e0de127dd961_parser.c_changes_v342_v343.txt.txt

📜 [3/46] Processing: 1700_libzip_libzip_9b46957ec98d85a572e9ef98301247f39338a3b5_zip_open.c_changes_v6_v7.txt.txt (Length: 3189 characters)
✅ Successfully processed 1700_libzip_libzip_9b46957ec98d85a572e9ef98301247f39338a3b5_zip_open.c_changes_v6_v7.txt.txt

📜 [4/46] Processing: 2464_linux_linux_1ee0a224bc9aad1de496c795f96bc6ba2c394811_io_ti.c_changes_v32_v33.txt.txt (Length: 15190 characters)
✅ Successfully processed 2464_linux_linux_1ee0a224bc9

In [None]:
# all prompt range wise

In [None]:
import os
import openai
import pandas as pd
import json
import time
import chardet  # Detects file encoding
import re  # For extracting numerical ranges

# Initialize OpenAI client (✅ New API format)
client = openai.Client(api_key="key")  # Replace with actual API key

# Define paths
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\PromptsFull"
LLM_NAME = "gpt-4o"

# 🟢 **SET YOUR RANGE HERE (Manually update this)**
RANGE_INPUT = "8701-9057"  # Example: "100-200", "200-n"

# Create a directory for storing results inside PromptsFull
RESULTS_DIR = os.path.join(PROMPT_DIR, LLM_NAME)
os.makedirs(RESULTS_DIR, exist_ok=True)  # Ensure directory exists

# Function to extract range
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None  # `None` means all remaining
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# Prepare storage for results
results = []
error_logs = []

# List all prompt files, sorted numerically
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

# Filter files based on manually set range
if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]  # Adjust for 0-based index

for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect file encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    # Read the prompt content with detected encoding
    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # Retry mechanism (up to 3 times)
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(  # ✅ Updated API call
                model=LLM_NAME,
                messages=[
                    {"role": "system", "content": "You are a cybersecurity expert analyzing software vulnerabilities. "
                                                  "Provide your response **only in the following strict JSON format**:\n"
                                                  "{\n"
                                                  '    "answer": "Yes" or "No",\n'
                                                  '    "reasoning": "Concise but detailed explanation.",\n'
                                                  '    "confidence": 1-10\n'
                                                  "}\n"
                                                  "Do not include any extra text outside this JSON structure."},
                    {"role": "user", "content": prompt_content}
                ]
            )

            # ✅ Handle markdown output
            assistant_reply = response.choices[0].message.content.strip()
            if assistant_reply.startswith("```json"):
                assistant_reply = assistant_reply[7:]
            if assistant_reply.endswith("```"):
                assistant_reply = assistant_reply[:-3]

            # ✅ Parse JSON response
            try:
                response_json = json.loads(assistant_reply)
                answer = response_json.get("answer", "N/A")
                reasoning = response_json.get("reasoning", "N/A")
                confidence = response_json.get("confidence", "N/A")  # ✅ Extract confidence

                # Store results
                results.append([prompt_file, answer, reasoning, confidence])
                break  # **Exit retry loop on success**

            except json.JSONDecodeError:
                error_logs.append([prompt_file, "Invalid JSON", assistant_reply])

        except openai.RateLimitError:
            time.sleep(120)  # ✅ Increased wait time to 120 seconds

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "Error", "Final failure after retries", "N/A"])

    # **Print progress every 50 records**
    if idx % 50 == 0:
        print(f"✅ Processed {idx} records so far...")

    # **Longer delay to prevent API rate limits**
    time.sleep(10)  # Adjust if needed

# Convert results to DataFrame
df_results = pd.DataFrame(results, columns=["Prompt File", "Answer", "Reasoning", "Confidence"])

# Define filename based on range
xlsx_filename = f"{LLM_NAME}_results_{RANGE_INPUT.replace('-', '_')}.xlsx"
xlsx_path = os.path.join(RESULTS_DIR, xlsx_filename)
df_results.to_excel(xlsx_path, index=False)

# Save error logs separately
if error_logs:
    error_xlsx_filename = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}.xlsx"
    error_xlsx_path = os.path.join(RESULTS_DIR, error_xlsx_filename)
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    df_errors.to_excel(error_xlsx_path, index=False)

print(f"\n✅ Successfully processed all files in range {RANGE_INPUT}. Results saved in: {xlsx_path}")
if error_logs:
    print(f"⚠️ Errors logged in: {error_xlsx_path}")


✅ Processed 8750 records so far...
✅ Processed 8800 records so far...
✅ Processed 8850 records so far...
✅ Processed 8900 records so far...
✅ Processed 8950 records so far...
✅ Processed 9000 records so far...
✅ Processed 9050 records so far...


In [None]:
# fetching functionality, semantics

In [3]:
import os
import openai
import pandas as pd
import json
import time
import chardet
import re

# === OpenAI client setup ===
client = openai.Client(api_key="key")  # Replace with your actual key
LLM_NAME = "gpt-4o"

# === Directories and range config ===
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompts2"
RESULTS_DIR = PROMPT_DIR  # Save output in same directory

RANGE_INPUT = "101-390"  # Update this range manually

# === Extract range ===
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# === List and filter prompt files ===
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]

# === Storage for outputs and errors ===
results = []
error_logs = []

# === Process each prompt ===
for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # === Send to OpenAI with retries ===
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "user", "content": prompt_content}
                ]
            )
            summary = response.choices[0].message.content.strip()
            results.append([prompt_file, summary])
            break  # ✅ Success

        except openai.RateLimitError:
            time.sleep(120)

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "ERROR: failed after 3 retries"])

    # === Print progress every 50 ===
    if idx % 50 == 0:
        print(f"✅ Processed {idx} files...")

    time.sleep(10)  # Adjust if needed to avoid rate limits

# === Save summary results ===
df_results = pd.DataFrame(results, columns=["Prompt File", "Semantic Summary"])
filename_out = f"{LLM_NAME}_semantic_summaries_{RANGE_INPUT.replace('-', '_')}.xlsx"
df_results.to_excel(os.path.join(RESULTS_DIR, filename_out), index=False)

# === Save error logs if any ===
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    filename_err = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}.xlsx"
    df_errors.to_excel(os.path.join(RESULTS_DIR, filename_err), index=False)

# === Final output ===
print(f"\n✅ Finished processing range {RANGE_INPUT}.")
print(f"📝 Results saved to: {os.path.join(RESULTS_DIR, filename_out)}")
if error_logs:
    print(f"⚠️ Errors saved to: {os.path.join(RESULTS_DIR, filename_err)}")


✅ Processed 150 files...
✅ Processed 200 files...
✅ Processed 250 files...
✅ Processed 300 files...
✅ Processed 350 files...

✅ Finished processing range 101-390.
📝 Results saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompts2\gpt-4o_semantic_summaries_101_390.xlsx


In [None]:
# merge GPT responses

In [5]:
import os
import pandas as pd
import re

# Directory where your range-based files are stored
RESULTS_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompts2"

# Output filename
merged_filename = "gpt-4o_semantic_summaries_merged.xlsx"
merged_path = os.path.join(RESULTS_DIR, merged_filename)

# Find all matching summary files (excluding 'merged')
summary_files = [
    f for f in os.listdir(RESULTS_DIR)
    if f.startswith("gpt-4o_semantic_summaries_") and f.endswith(".xlsx") and "merged" not in f
]

# Extract starting index from filenames for sorting
def extract_start_index(filename):
    match = re.search(r"gpt-4o_semantic_summaries_(\d+)_\d+\.xlsx", filename)
    return int(match.group(1)) if match else float('inf')

# Sort files based on starting index of their range
summary_files_sorted = sorted(summary_files, key=extract_start_index)

# Merge all into one DataFrame
df_list = []
for file in summary_files_sorted:
    path = os.path.join(RESULTS_DIR, file)
    try:
        df = pd.read_excel(path)
        df_list.append(df)
        print(f"✅ Loaded: {file}")
    except Exception as e:
        print(f"⚠️ Skipped {file} due to error: {e}")

# Save merged file
if df_list:
    merged_df = pd.concat(df_list, ignore_index=True)
    merged_df.to_excel(merged_path, index=False)
    print(f"\n✅ Merged {len(df_list)} files into: {merged_path}")
else:
    print("❌ No matching summary files found.")


✅ Loaded: gpt-4o_semantic_summaries_1_10.xlsx
✅ Loaded: gpt-4o_semantic_summaries_11_100.xlsx
✅ Loaded: gpt-4o_semantic_summaries_101_390.xlsx

✅ Merged 3 files into: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompts2\gpt-4o_semantic_summaries_merged.xlsx


In [None]:
# for no responses

In [10]:
import os
import openai
import pandas as pd
import json
import time
import chardet
import re

# === OpenAI client setup ===
client = openai.Client(api_key="key")  # Replace with your actual key
LLM_NAME = "gpt-4o"

# === Directories and range config ===
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\no\prompts2"
RESULTS_DIR = PROMPT_DIR  # Save output in same directory

RANGE_INPUT = "201-432"  # Update this range manually as needed

# === Extract range ===
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# === List and filter prompt files ===
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]

# === Storage for outputs and errors ===
results = []
error_logs = []

# === Process each prompt ===
for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # === Send to OpenAI with retries ===
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "user", "content": prompt_content}
                ]
            )
            summary = response.choices[0].message.content.strip()
            results.append([prompt_file, summary])
            break  # ✅ Success

        except openai.RateLimitError:
            time.sleep(120)

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "ERROR: failed after 3 retries"])

    # === Print progress every 50 ===
    if idx % 50 == 0:
        print(f"✅ Processed {idx} files...")

    time.sleep(10)  # Adjust if needed to avoid rate limits

# === Save summary results ===
df_results = pd.DataFrame(results, columns=["Prompt File", "Semantic Summary"])
filename_out = f"{LLM_NAME}_semantic_summaries_{RANGE_INPUT.replace('-', '_')}_no.xlsx"
df_results.to_excel(os.path.join(RESULTS_DIR, filename_out), index=False)

# === Save error logs if any ===
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    filename_err = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}_no.xlsx"
    df_errors.to_excel(os.path.join(RESULTS_DIR, filename_err), index=False)

# === Final output ===
print(f"\n✅ Finished processing range {RANGE_INPUT}.")
print(f"📝 Results saved to: {os.path.join(RESULTS_DIR, filename_out)}")
if error_logs:
    print(f"⚠️ Errors saved to: {os.path.join(RESULTS_DIR, filename_err)}")


✅ Processed 250 files...
✅ Processed 300 files...
✅ Processed 350 files...
✅ Processed 400 files...

✅ Finished processing range 201-432.
📝 Results saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\no\prompts2\gpt-4o_semantic_summaries_201_432_no.xlsx


In [None]:
# merge outputs for "No"

In [12]:
import os
import pandas as pd
import re

# Directory where "no" response files are stored
RESULTS_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\no\prompts2"

# Output filename
merged_filename = "gpt-4o_semantic_summaries_merged_no.xlsx"
merged_path = os.path.join(RESULTS_DIR, merged_filename)

# Find all matching summary files for "no" responses (excluding merged)
summary_files = [
    f for f in os.listdir(RESULTS_DIR)
    if f.startswith("gpt-4o_semantic_summaries_") and f.endswith("_no.xlsx") and "merged" not in f
]

# Extract starting index from filenames for sorting
def extract_start_index(filename):
    match = re.search(r"gpt-4o_semantic_summaries_(\d+)_\d+_no\.xlsx", filename)
    return int(match.group(1)) if match else float('inf')

# Sort files based on starting index
summary_files_sorted = sorted(summary_files, key=extract_start_index)

# Merge all into one DataFrame
df_list = []
for file in summary_files_sorted:
    path = os.path.join(RESULTS_DIR, file)
    try:
        df = pd.read_excel(path)
        df_list.append(df)
        print(f"✅ Loaded: {file}")
    except Exception as e:
        print(f"⚠️ Skipped {file} due to error: {e}")

# Save merged file
if df_list:
    merged_df = pd.concat(df_list, ignore_index=True)
    merged_df.to_excel(merged_path, index=False)
    print(f"\n✅ Merged {len(df_list)} files into: {merged_path}")
else:
    print("❌ No matching summary files found.")


✅ Loaded: gpt-4o_semantic_summaries_1_10_no.xlsx
✅ Loaded: gpt-4o_semantic_summaries_11_200_no.xlsx
✅ Loaded: gpt-4o_semantic_summaries_201_432_no.xlsx

✅ Merged 3 files into: M:\FULL_DATA_COLLECTED\FINAL_DATASET\no\prompts2\gpt-4o_semantic_summaries_merged_no.xlsx


In [None]:
# x->y prompt for yes

In [14]:
import os
import openai
import pandas as pd
import json
import time
import chardet
import re

# === OpenAI client setup ===
client = openai.Client(api_key="key")  # Replace with your actual key
LLM_NAME = "gpt-4o"

# === Directories and range config ===
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\Yes"
RESULTS_DIR = PROMPT_DIR

RANGE_INPUT = "21-390"  # ✅ Update this range as needed

# === Extract range ===
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# === List and filter prompt files ===
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]

# === Storage for outputs and errors ===
results = []
error_logs = []

# === Process each prompt ===
for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # === Send to OpenAI with retries ===
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "user", "content": prompt_content}
                ]
            )
            summary = response.choices[0].message.content.strip()
            results.append([prompt_file, summary])
            break  # ✅ Success

        except openai.RateLimitError:
            time.sleep(120)

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "ERROR: failed after 3 retries"])

    # === Print progress every 50 ===
    if idx % 50 == 0:
        print(f"✅ Processed {idx} files...")

    time.sleep(10)  # Adjust if needed to avoid rate limits

# === Save summary results ===
df_results = pd.DataFrame(results, columns=["Prompt File", "Semantic Summary"])
filename_out = f"{LLM_NAME}_semantic_summaries_{RANGE_INPUT.replace('-', '_')}_yes.xlsx"
df_results.to_excel(os.path.join(RESULTS_DIR, filename_out), index=False)

# === Save error logs if any ===
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    filename_err = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}_yes.xlsx"
    df_errors.to_excel(os.path.join(RESULTS_DIR, filename_err), index=False)

# === Final output ===
print(f"\n✅ Finished processing range {RANGE_INPUT}.")
print(f"📝 Results saved to: {os.path.join(RESULTS_DIR, filename_out)}")
if error_logs:
    print(f"⚠️ Errors saved to: {os.path.join(RESULTS_DIR, filename_err)}")


✅ Processed 50 files...
✅ Processed 100 files...
✅ Processed 150 files...
✅ Processed 200 files...
✅ Processed 250 files...
✅ Processed 300 files...
✅ Processed 350 files...

✅ Finished processing range 21-390.
📝 Results saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\Yes\gpt-4o_semantic_summaries_21_390_yes.xlsx


In [None]:
# merge

In [16]:
import os
import pandas as pd

# === Input directory with GPT YES outputs
results_dir = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\Yes"
output_file = os.path.join(results_dir, "gpt4o_semantic_summaries_final_yes.xlsx")

# === Gather all output files
files = [
    f for f in os.listdir(results_dir)
    if f.startswith("gpt-4o_semantic_summaries_") and f.endswith("_yes.xlsx")
]

merged = []

# === Load and tag each file
for file in files:
    path = os.path.join(results_dir, file)
    try:
        df = pd.read_excel(path)
        df.insert(1, "Target", 1)  # ✅ Insert 'Target=1' as second column
        merged.append(df)
        print(f"✅ Merged: {file} ({len(df)} rows)")
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# === Combine and save
if merged:
    final = pd.concat(merged, ignore_index=True)
    final.to_excel(output_file, index=False)
    print(f"\n✅ Final merged YES file saved to:\n{output_file}")
    print(f"📊 Total rows: {len(final)}")
else:
    print("⚠️ No files found or loaded.")


✅ Merged: gpt-4o_semantic_summaries_1_20_yes.xlsx (20 rows)
✅ Merged: gpt-4o_semantic_summaries_21_390_yes.xlsx (370 rows)

✅ Final merged YES file saved to:
M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\Yes\gpt4o_semantic_summaries_final_yes.xlsx
📊 Total rows: 390


In [None]:
# No

In [18]:
import os
import openai
import pandas as pd
import json
import time
import chardet
import re

# === OpenAI client setup ===
client = openai.Client(api_key="key")  # Replace with your actual key
LLM_NAME = "gpt-4o"

# === Directories and range config ===
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\No"
RESULTS_DIR = PROMPT_DIR

RANGE_INPUT = "11-432"  # ✅ Update this range as needed

# === Extract range ===
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# === List and filter prompt files ===
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]

# === Storage for outputs and errors ===
results = []
error_logs = []

# === Process each prompt ===
for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # === Send to OpenAI with retries ===
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "user", "content": prompt_content}
                ]
            )
            summary = response.choices[0].message.content.strip()
            results.append([prompt_file, summary])
            break  # ✅ Success

        except openai.RateLimitError:
            time.sleep(120)

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "ERROR: failed after 3 retries"])

    # === Print progress every 50 ===
    if idx % 50 == 0:
        print(f"✅ Processed {idx} files...")

    time.sleep(10)  # Adjust if needed to avoid rate limits

# === Save summary results ===
df_results = pd.DataFrame(results, columns=["Prompt File", "Semantic Summary"])
filename_out = f"{LLM_NAME}_semantic_summaries_{RANGE_INPUT.replace('-', '_')}_no.xlsx"
df_results.to_excel(os.path.join(RESULTS_DIR, filename_out), index=False)

# === Save error logs if any ===
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    filename_err = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}_no.xlsx"
    df_errors.to_excel(os.path.join(RESULTS_DIR, filename_err), index=False)

# === Final output ===
print(f"\n✅ Finished processing range {RANGE_INPUT}.")
print(f"📝 Results saved to: {os.path.join(RESULTS_DIR, filename_out)}")
if error_logs:
    print(f"⚠️ Errors saved to: {os.path.join(RESULTS_DIR, filename_err)}")


✅ Processed 50 files...
✅ Processed 100 files...
✅ Processed 150 files...
✅ Processed 200 files...
✅ Processed 250 files...
✅ Processed 300 files...
✅ Processed 350 files...
✅ Processed 400 files...

✅ Finished processing range 11-432.
📝 Results saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\No\gpt-4o_semantic_summaries_11_432_no.xlsx


In [None]:
# Merge

In [20]:
import os
import re
import pandas as pd

# === Input and output paths
results_dir = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\No"
output_file = os.path.join(results_dir, "gpt4o_semantic_summaries_final_no.xlsx")

# === Collect files
files = [
    f for f in os.listdir(results_dir)
    if f.startswith("gpt-4o_semantic_summaries_") and f.endswith("_no.xlsx")
]

# === Sort by numeric range start (e.g., from '1_10' or '11_432')
def extract_start_index(filename):
    match = re.search(r"_(\d+)_\d+_no\.xlsx", filename)
    return int(match.group(1)) if match else float('inf')

files_sorted = sorted(files, key=extract_start_index)

# === Load and merge
merged = []
for file in files_sorted:
    path = os.path.join(results_dir, file)
    try:
        df = pd.read_excel(path)
        df.insert(1, "Target", 0)  # ✅ Insert as second column
        merged.append(df)
        print(f"✅ Merged: {file} ({len(df)} rows)")
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# === Save final file
if merged:
    final = pd.concat(merged, ignore_index=True)
    final.to_excel(output_file, index=False)
    print(f"\n✅ Final merged NO file saved to:\n{output_file}")
    print(f"📊 Total rows: {len(final)}")
else:
    print("⚠️ No valid files found.")


✅ Merged: gpt-4o_semantic_summaries_1_10_no.xlsx (10 rows)
✅ Merged: gpt-4o_semantic_summaries_11_432_no.xlsx (422 rows)

✅ Final merged NO file saved to:
M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\No\gpt4o_semantic_summaries_final_no.xlsx
📊 Total rows: 432


In [None]:
# merge yes and no

In [22]:
import os
import pandas as pd

# === Base directory
base_dir = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo"

# === Input files
yes_file = os.path.join(base_dir, "Yes", "gpt4o_semantic_summaries_final_yes.xlsx")
no_file  = os.path.join(base_dir, "No", "gpt4o_semantic_summaries_final_no.xlsx")

# === Output file
output_file = os.path.join(base_dir, "gpt4o_semantic_summaries_final_with_target.xlsx")

# === Load both
df_yes = pd.read_excel(yes_file)
df_no = pd.read_excel(no_file)

# === Ensure consistent column order
columns = ["Prompt File", "Target", "Semantic Summary"]
df_yes = df_yes[columns]
df_no = df_no[columns]

# === Concatenate YES first, then NO
final_df = pd.concat([df_yes, df_no], ignore_index=True)

# === Save
final_df.to_excel(output_file, index=False)

print(f"\n✅ YES + NO merged in order (YES first).")
print(f"📁 Saved to: {output_file}")
print(f"📊 Total rows: {len(final_df)}")



✅ YES + NO merged in order (YES first).
📁 Saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt3YesNo\gpt4o_semantic_summaries_final_with_target.xlsx
📊 Total rows: 822


In [1]:
# x-> y response
# non judgemental answer
# yes 

In [3]:
import os
import openai
import pandas as pd
import json
import time
import chardet
import re

# === OpenAI client setup ===
client = openai.Client(api_key="key")  # Replace if needed
LLM_NAME = "gpt-4o"

# === Directories and range config ===
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\Yes"
RESULTS_DIR = PROMPT_DIR
RANGE_INPUT = "11-390"  # ✅ Update as needed

# === Extract range ===
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# === List and filter prompt files ===
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]

# === Storage for outputs and errors ===
results = []
error_logs = []

# === Process each prompt ===
for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # === Send to OpenAI with retries ===
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "user", "content": prompt_content}
                ]
            )
            task_action = response.choices[0].message.content.strip()
            results.append([prompt_file, task_action])
            break  # ✅ Success

        except openai.RateLimitError:
            time.sleep(120)

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "ERROR: failed after 3 retries"])

    # === Print progress every 50 ===
    if idx % 50 == 0:
        print(f"✅ Processed {idx} files...")

    time.sleep(10)  # Adjust to control rate

# === Save results to Excel
df_results = pd.DataFrame(results, columns=["Prompt File", "Task-Action Response"])
filename_out = f"{LLM_NAME}_task_action_{RANGE_INPUT.replace('-', '_')}_yes.xlsx"
df_results.to_excel(os.path.join(RESULTS_DIR, filename_out), index=False)

# === Save error logs
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    filename_err = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}_yes.xlsx"
    df_errors.to_excel(os.path.join(RESULTS_DIR, filename_err), index=False)

# === Final output summary
print(f"\n✅ Finished processing range {RANGE_INPUT}.")
print(f"📝 Results saved to: {os.path.join(RESULTS_DIR, filename_out)}")
if error_logs:
    print(f"⚠️ Errors saved to: {os.path.join(RESULTS_DIR, filename_err)}")


✅ Processed 50 files...
✅ Processed 100 files...
✅ Processed 150 files...
✅ Processed 200 files...
✅ Processed 250 files...
✅ Processed 300 files...
✅ Processed 350 files...

✅ Finished processing range 11-390.
📝 Results saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\Yes\gpt-4o_task_action_11_390_yes.xlsx


In [None]:
# merge

In [4]:
import os
import re
import pandas as pd

# === Input and output paths
results_dir = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\Yes"
output_file = os.path.join(results_dir, "gpt4o_task_action_final_yes.xlsx")

# === Collect files
files = [
    f for f in os.listdir(results_dir)
    if f.startswith("gpt-4o_task_action_") and f.endswith("_yes.xlsx")
]

# === Sort by numeric range start (e.g., from '1_10' or '11_432')
def extract_start_index(filename):
    match = re.search(r"_(\d+)_\d+_yes\.xlsx", filename)
    return int(match.group(1)) if match else float('inf')

files_sorted = sorted(files, key=extract_start_index)

# === Load and merge
merged = []
for file in files_sorted:
    path = os.path.join(results_dir, file)
    try:
        df = pd.read_excel(path)
        df.insert(1, "Target", 1)  # ✅ Insert as second column
        merged.append(df)
        print(f"✅ Merged: {file} ({len(df)} rows)")
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# === Save final file
if merged:
    final = pd.concat(merged, ignore_index=True)
    final.to_excel(output_file, index=False)
    print(f"\n✅ Final merged YES file saved to:\n{output_file}")
    print(f"📊 Total rows: {len(final)}")
else:
    print("⚠️ No valid files found.")


✅ Merged: gpt-4o_task_action_1_10_yes.xlsx (10 rows)
✅ Merged: gpt-4o_task_action_11_390_yes.xlsx (380 rows)

✅ Final merged YES file saved to:
M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\Yes\gpt4o_task_action_final_yes.xlsx
📊 Total rows: 390


In [None]:
# no

In [6]:
import os
import openai
import pandas as pd
import json
import time
import chardet
import re

# === OpenAI client setup ===
client = openai.Client(api_key="key")  # Replace if needed
LLM_NAME = "gpt-4o"

# === Directories and range config ===
PROMPT_DIR = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\No"
RESULTS_DIR = PROMPT_DIR
RANGE_INPUT = "11-432"  # ✅ Update as needed

# === Extract range ===
def extract_range(range_str):
    match = re.match(r"(\d+)-(\d+|n)", range_str)
    if match:
        start, end = match.groups()
        start = int(start)
        end = int(end) if end.isdigit() else None
        return start, end
    return None, None

start_idx, end_idx = extract_range(RANGE_INPUT)

# === List and filter prompt files ===
prompt_files = sorted(
    [f for f in os.listdir(PROMPT_DIR) if f.endswith(".txt")],
    key=lambda x: int(re.search(r'\d+', x).group()) if re.search(r'\d+', x) else float('inf')
)

if start_idx is not None:
    prompt_files = prompt_files[start_idx - 1:end_idx]

# === Storage for outputs and errors ===
results = []
error_logs = []

# === Process each prompt ===
for idx, prompt_file in enumerate(prompt_files, start=start_idx):
    prompt_path = os.path.join(PROMPT_DIR, prompt_file)

    # Detect encoding
    with open(prompt_path, "rb") as f:
        raw_data = f.read()
        encoding = chardet.detect(raw_data)['encoding']

    try:
        with open(prompt_path, "r", encoding=encoding) as f:
            prompt_content = f.read().strip()
    except Exception as e:
        error_logs.append([prompt_file, "Encoding Error", str(e)])
        continue

    # === Send to OpenAI with retries ===
    retries = 3
    for attempt in range(1, retries + 1):
        try:
            response = client.chat.completions.create(
                model=LLM_NAME,
                messages=[
                    {"role": "user", "content": prompt_content}
                ]
            )
            task_action = response.choices[0].message.content.strip()
            results.append([prompt_file, task_action])
            break  # ✅ Success

        except openai.RateLimitError:
            time.sleep(120)

        except openai.APIError as e:
            error_logs.append([prompt_file, "API Error", str(e)])

        except Exception as e:
            error_logs.append([prompt_file, "Unexpected Error", str(e)])

        if attempt == retries:
            results.append([prompt_file, "ERROR: failed after 3 retries"])

    # === Print progress every 50 ===
    if idx % 50 == 0:
        print(f"✅ Processed {idx} files...")

    time.sleep(10)  # Adjust to control rate

# === Save results to Excel
df_results = pd.DataFrame(results, columns=["Prompt File", "Task-Action Response"])
filename_out = f"{LLM_NAME}_task_action_{RANGE_INPUT.replace('-', '_')}_no.xlsx"
df_results.to_excel(os.path.join(RESULTS_DIR, filename_out), index=False)

# === Save error logs
if error_logs:
    df_errors = pd.DataFrame(error_logs, columns=["Prompt File", "Error Type", "Details"])
    filename_err = f"{LLM_NAME}_errors_{RANGE_INPUT.replace('-', '_')}_no.xlsx"
    df_errors.to_excel(os.path.join(RESULTS_DIR, filename_err), index=False)

# === Final output summary
print(f"\n✅ Finished processing range {RANGE_INPUT}.")
print(f"📝 Results saved to: {os.path.join(RESULTS_DIR, filename_out)}")
if error_logs:
    print(f"⚠️ Errors saved to: {os.path.join(RESULTS_DIR, filename_err)}")


✅ Processed 50 files...
✅ Processed 100 files...
✅ Processed 150 files...
✅ Processed 200 files...
✅ Processed 250 files...
✅ Processed 300 files...
✅ Processed 350 files...
✅ Processed 400 files...

✅ Finished processing range 11-432.
📝 Results saved to: M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\No\gpt-4o_task_action_11_432_no.xlsx


In [None]:
# merge

In [7]:
import os
import re
import pandas as pd

# === Input and output paths
results_dir = r"M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\No"
output_file = os.path.join(results_dir, "gpt4o_task_action_final_no.xlsx")

# === Collect all matching GPT result files
files = [
    f for f in os.listdir(results_dir)
    if f.startswith("gpt-4o_task_action_") and f.endswith("_no.xlsx")
]

# === Sort by numeric range start (e.g., from '1_10' or '11_432')
def extract_start_index(filename):
    match = re.search(r"_(\d+)_\d+_no\.xlsx", filename)
    return int(match.group(1)) if match else float('inf')

files_sorted = sorted(files, key=extract_start_index)

# === Load and merge
merged = []
for file in files_sorted:
    path = os.path.join(results_dir, file)
    try:
        df = pd.read_excel(path)
        df.insert(1, "Target", 0)  # ✅ Insert Target column = 0 for non-introducing
        merged.append(df)
        print(f"✅ Merged: {file} ({len(df)} rows)")
    except Exception as e:
        print(f"⚠️ Error reading {file}: {e}")

# === Save final merged file
if merged:
    final = pd.concat(merged, ignore_index=True)
    final.to_excel(output_file, index=False)
    print(f"\n✅ Final merged NO file saved to:\n{output_file}")
    print(f"📊 Total rows: {len(final)}")
else:
    print("⚠️ No valid files found.")


✅ Merged: gpt-4o_task_action_1_10_no.xlsx (10 rows)
✅ Merged: gpt-4o_task_action_11_432_no.xlsx (422 rows)

✅ Final merged NO file saved to:
M:\FULL_DATA_COLLECTED\FINAL_DATASET\prompt4YesNo\No\gpt4o_task_action_final_no.xlsx
📊 Total rows: 432
