## Setup

In [10]:
import json
import re
import os
from tqdm import tqdm
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed

import openai
from openai import OpenAI

In [2]:
with open('/home/agatha/Desktop/MA3/sem proj/api_key_nlp_lab.txt', 'r') as file:
    OPENAI_API_KEY = file.read().strip()

client = OpenAI(
    api_key=OPENAI_API_KEY
)

## Load data

In [3]:
path = '/home/agatha/Desktop/SAIL/feature-intervention-for-unlearning/data/cyber-retain-corpus.jsonl'
df = pd.read_json(path, lines=True)
df.columns = ['example']
df.head()


Unnamed: 0,example
0,py-spy\nblack\npyright\nrequirements-parser\nf...
1,"""softfloat"": Unaltered files from SoftFloat v3..."
2,[1]John Hauser\n______________________________...
3,About OpenBSD\n* [1]Project Goals\n* [2]Hardwa...
4,# snarkos-display\n\n[![Crates.io](https://img...


## Generate answers

In [11]:
def generate_prompt(example):
    return f"Can you explain this:\n\n '{example}'"

def generate_response(example):

    prompt = generate_prompt(example)

    response = client.chat.completions.create(
        model='gpt-4o-mini',
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

def process_row(row):
    return generate_response(row['example'])

def safe_process_row(row):
    try:
        return generate_response(row['example'])
    except Exception as e:
        # Return a descriptive error message
        return f"Error processing row: {e}"

In [8]:
# tqdm.pandas()

# df['answer'] = df['example'].progress_apply(generate_response)

In [12]:
results = []
with ThreadPoolExecutor(max_workers=10) as executor:
    future_to_row = {executor.submit(safe_process_row, row): row for row in df.to_dict('records')}
    for future in tqdm(as_completed(future_to_row), total=len(future_to_row)):
        try:
            result = future.result()  # Get the result of the future
            results.append(result)
        except Exception as e:
            # Append an error message for any unexpected exceptions
            results.append(f"Unexpected error: {e}")

100%|██████████| 4473/4473 [57:32<00:00,  1.30it/s]  


In [13]:
df['answer'] = results
df.head()

Unnamed: 0,example,answer
0,py-spy\nblack\npyright\nrequirements-parser\nf...,The text you've provided appears to be a READM...
1,"""softfloat"": Unaltered files from SoftFloat v3...",This passage provides information about the us...
2,[1]John Hauser\n______________________________...,This document provides an overview of the Ion ...
3,About OpenBSD\n* [1]Project Goals\n* [2]Hardwa...,This text appears to be an introductory overvi...
4,# snarkos-display\n\n[![Crates.io](https://img...,"The text provides an overview of OpenBSD, a fr..."


## Saving

In [14]:
df.to_json('chat_cyber_retain.jsonl', orient='records', lines=True)

## Analysis

In [44]:
for answer in df['answer']:
    print('----------------------------------')
    print(answer)

----------------------------------
This content appears to be an excerpt from a technical blog post or documentation focused on using Windows Management Instrumentation (WMI) as a means of evasion in cybersecurity contexts, particularly for malware or malicious software. Here’s a breakdown of its structure and key points:

### Structure Overview

1. **Front Matter:**
   - Contains metadata such as layout, title, and categories relevant to the webpage or blog post.

2. **Table of Contents:**
   - Lists sections covered in the document, allowing for quick navigation. Topics include WMI detection methods, background information, various evasion techniques, and other supportive information like countermeasures.

3. **Main Content Sections:**
   - **WMI Detection Methods:** Discusses how WMI can be used to gather OS and hardware information.
   - **Background:** Describes the general process for using WMI queries, including initialization and execution steps.
   - **Evasion Techniques:** 
 

## debug

In [51]:
mini_df = df.sample(n=5)
mini_df['answer'] = mini_df['example'].progress_apply(generate_response)

100%|██████████| 5/5 [00:40<00:00,  8.06s/it]


In [53]:
mini_df.to_json('mini_chat_cyber_retain.jsonl', orient='records', lines=True)