In [None]:
%%capture
!pip install -U langchain langchain_experimental openai

In [None]:
# set environment variables
import os
from getpass import getpass

# Ask for the key interactively, and only when it is not already present in the
# environment, so the secret is never written into the notebook source.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Imports

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel
from datetime import datetime
from typing import List, Optional
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

# Schema for generating Grouping




In [None]:
# Output schema passed to the data generator as `output_schema`; every generated
# item is expected to validate against these fields.
# NOTE(review): the field names look like the STIX 2.1 "grouping" object — confirm.
# NOTE(review): documented with # comments rather than a class docstring on
# purpose; a docstring may be surfaced as the schema description — confirm
# before adding one.
class Grouping(BaseModel):
    # Required fields (no default value).
    type: str
    spec_version: str
    id: str
    # Timestamps are kept as plain strings; the few-shot examples use the
    # form '2023-01-01T00:00:00Z'.
    created: str
    modified: str
    context: str
    # A single string; the few-shot examples use comma-separated identifiers.
    object_refs: str
    # Optional fields default to None when absent.
    name: Optional[str] = None
    description: Optional[str] = None
    created_by_ref: Optional[str] = None


# Sample data used as few-shot examples

In [None]:
# Few-shot examples for the prompt: each entry is a dict with a single
# "example" key holding one hand-written Grouping record as flat text.
examples = [
    {"example": record_text}
    for record_text in (
        """Type: grouping, Context: 'threat-report', Object_refs: 'report123, report124', Name: 'APT29 Threat Report Group', Description: 'A grouping of threat reports related to the activities of APT29.', Created: '2023-01-01T00:00:00Z', Modified: '2023-01-02T00:00:00Z'""",
        """Type: grouping, Context: 'attack-pattern-collection', Object_refs: 'attack123, attack124, attack125', Name: 'SQL Injection Techniques', Description: 'A collection of attack patterns and techniques related to SQL injection.', Created: '2023-02-15T00:00:00Z', Modified: '2023-02-16T00:00:00Z'""",
        """Type: grouping, Context: 'malware-analysis', Object_refs: 'analysis123, analysis124', Name: 'Ransomware Analysis Group', Description: 'Grouping of malware analysis reports focusing on ransomware.', Created: '2023-03-10T00:00:00Z', Modified: '2023-03-11T00:00:00Z'""",
        """Type: grouping, Context: 'incident-response', Object_refs: 'incident123, incident124', Name: 'Phishing Incident Cluster', Description: 'Cluster of related phishing incidents reported across different sectors.', Created: '2023-04-05T00:00:00Z', Modified: '2023-04-06T00:00:00Z'""",
        """Type: grouping, Context: 'tool-collection', Object_refs: 'tool123, tool124, tool125', Name: 'Forensic Tools', Description: 'A collection of tools used in digital forensics.', Created: '2023-05-20T00:00:00Z', Modified: '2023-05-21T00:00:00Z'""",
        """Type: grouping, Context: 'vulnerability-dataset', Object_refs: 'vuln123, vuln124, vuln125, vuln126', Name: 'Critical Infrastructure Vulnerabilities', Description: 'Dataset of vulnerabilities affecting critical infrastructure.', Created: '2023-06-15T00:00:00Z', Modified: '2023-06-16T00:00:00Z'""",
        """Type: grouping, Context: 'campaign', Object_refs: 'campaign123', Name: 'Election Security Campaign', Description: 'A grouping of resources and information related to securing elections.', Created: '2023-07-01T00:00:00Z', Modified: '2023-07-02T00:00:00Z'""",
        """Type: grouping, Context: 'indicator-collection', Object_refs: 'indicator123, indicator124, indicator125, indicator126', Name: 'APT40 Indicators', Description: 'Collection of indicators related to APT40 activities.', Created: '2023-08-23T00:00:00Z', Modified: '2023-08-24T00:00:00Z'""",
        """Type: grouping, Context: 'observables', Object_refs: 'observable123, observable124', Name: 'Network Traffic Anomalies', Description: 'Group of observed data that denotes anomalies in network traffic.', Created: '2023-09-10T00:00:00Z', Modified: '2023-09-11T00:00:00Z'""",
    )
]

# Prompt Template for GPT-4

In [None]:
# Template applied to each few-shot entry: it emits the "example" text as-is.
# Note: this rebinds OPENAI_TEMPLATE, a name the imports cell also brings in
# from langchain_experimental.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Few-shot prompt: library-provided prefix, the rendered examples, then the
# library-provided suffix, which is filled from "subject" and "extra".
prompt_template = FewShotPromptTemplate(
    examples=examples,
    example_prompt=OPENAI_TEMPLATE,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

# Data Generator

In [None]:
# Combine the few-shot prompt, the chat model and the Grouping schema into a
# single generator object used by the cells that follow.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    llm=ChatOpenAI(model="gpt-4-turbo-preview", temperature=1),
    output_schema=Grouping,
)

# Generate synthetic data (subject, extra instructions, number of runs)

In [None]:
# Invoke the generator with runs=5; "subject" and "extra" fill the matching
# prompt variables.
synthetic_results = synthetic_data_generator.generate(
    runs=5,
    subject="Grouping",
    extra=(
        "Choose a unique and unconventional context and objects refs for each Grouping. "
        "Avoid common or typical contexts."
    ),
)

AuthenticationError: Error code: 401 - {'error': {'message': 'Incorrect API key provided: sk-lb6rH***************************************j1c5. You can find your API key at https://platform.openai.com/account/api-keys.', 'type': 'invalid_request_error', 'param': None, 'code': 'invalid_api_key'}}

In [None]:
# Sanity check: number of items returned by the generate() call.
len(synthetic_results)

# Display Data

In [None]:
# Show the raw generated objects as the cell output.
synthetic_results

# Display as a DataFrame

In [None]:
import pandas as pd

# Attributes of each generated Grouping to export, in column order.
grouping_columns = ["type", "name", "description", "context", "object_refs"]

# One dict per generated object, restricted to the columns above.
synthetic_data = [
    {column: getattr(item, column) for column in grouping_columns}
    for item in synthetic_results
]

# Passing columns= explicitly keeps the expected headers (and their order)
# even when synthetic_results is empty.
synthetic_df = pd.DataFrame(synthetic_data, columns=grouping_columns)

# Display the DataFrame
synthetic_df

In [None]:
# Save the DataFrame to a CSV file
# NOTE(review): the file name says "campaign" although the frame holds Grouping
# rows; it is kept unchanged here so existing consumers of this path keep
# working — rename once that is confirmed safe.
synthetic_df.to_csv('campaign_data.csv', index=False)  # index=False prevents adding an extra index column
# The message now names the data actually written (Grouping, not threat actors).
print("Grouping data saved to 'campaign_data.csv'")