In [None]:
%%capture
!pip install -U langchain langchain_experimental openai

In [None]:
# Make the OpenAI API key available through the process environment.
import os
from getpass import getpass

# Keep a key that is already exported; otherwise ask for it interactively so that
# neither a secret nor an empty placeholder is ever stored in the notebook.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Imports

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel
from datetime import datetime
from typing import List, Optional
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

# Schema for generating Campaign


In [None]:
# Output schema handed to the data generator: one STIX 2.1-style campaign record.
# The first six fields are required; the remaining ones default to None when the
# generated record does not include them. Timestamps and identifiers are kept as
# plain strings, in the same textual form used by the few-shot examples.
class Campaign(BaseModel):
    type: str
    spec_version: str
    id: str
    created: str
    modified: str
    name: str
    created_by_ref: Optional[str] = None
    description: Optional[str] = None
    # A campaign may be known under several alternative names, so this is a list
    # of strings rather than a single string.
    aliases: Optional[List[str]] = None
    first_seen: Optional[str] = None
    last_seen: Optional[str] = None
    objective: Optional[str] = None

# Sample data used as few-shot examples

In [None]:
# Few-shot seed records. Each entry is wrapped as {"example": <text>} so it can be
# rendered through a template whose only input variable is "example".
examples = [
    {"example": record_text}
    for record_text in (
        """Type: campaign, spec_version: 2.1, id: campaign--8e2e2d2b-17d4-4cbf-938f-98ee46b3cd3f, created: 2016-04-06T20:03:00.000Z, modified: 2016-04-06T20:03:00.000Z, name: Green Group Attacks Against Finance, created_by_ref: identity--f431f809-377b-45e0-aa1c-6a4751cae5ff, description: Campaign by Green Group against a series of targets in the financial services sector.""",
        """Type: campaign, spec_version: 2.1, id: campaign--e5268b6e-4931-42f1-b379-87f48eb41b1e, created: 2016-08-08T15:50:10.983Z, modified: 2016-08-08T15:50:10.983Z, name: Operation Bran Flakes, description: A concerted effort to insert false information into the BPP's web pages., first_seen: 2016-01-08T12:50:40.123Z, objective: Hack www.bpp.bn""",
        """Type: campaign, spec_version: 2.1, id: campaign--1d8897a7-fdc2-4e59-afc9-becbe04df727, created: 2016-08-08T15:50:10.983Z, modified: 2016-08-08T15:50:10.983Z, name: Operation Raisin Bran, description: A DDOS campaign to flood BPP web servers., first_seen: 2016-02-07T19:45:32.126Z""",
        """Type: campaign, spec_version: 2.1, id: campaign--752c225d-d6f6-4456-9130-d9580fd4007b, created: 2015-05-15T09:12:16.432Z, modified: 2015-05-15T09:12:16.432Z, name: admin@338, description: Active since 2008, this campaign mostly targets the financial services industry, though we have also seen activity in the telecom, government, and defense sectors.""",
        """Type: campaign, spec_version: 2.1, id: campaign--721976f9-56d7-4749-8c69-b3ac7c315f05, created: 2015-05-15T09:12:16.432Z, modified: 2015-05-15T09:12:16.432Z, name: menuPass, first_seen: 2009-12-14T00:00:00.000000Z""",
        """Type: campaign, spec_version: 2.1, id: campaign--d02a1560-ff69-49f4-ac34-919b8aa4b91e, created: 2015-05-15T09:12:16.432Z, modified: 2015-05-15T09:12:16.432Z, name: th3bug, description: This ongoing campaign targets a number of industries but appears to prefer targets in higher education and the healthcare sectors.""",
    )
]

# Prompt Template for GPT-4

In [None]:
# Per-example template: each sample record is rendered verbatim through the single
# "example" slot. Note that this assignment rebinds the OPENAI_TEMPLATE name that
# the imports cell also brings in.
OPENAI_TEMPLATE = PromptTemplate(
    template="{example}",
    input_variables=["example"],
)

# Few-shot prompt assembled from the library-provided prefix and suffix around the
# sample records; it expects "subject" and "extra" to be supplied at generation time.
prompt_template = FewShotPromptTemplate(
    example_prompt=OPENAI_TEMPLATE,
    examples=examples,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

# Data Generator

In [None]:
# Wire the few-shot prompt, the chat model and the Campaign schema into one generator.
# temperature=1 is presumably chosen to encourage varied records -- confirm if tuning.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    llm=ChatOpenAI(model="gpt-4-turbo-preview", temperature=1),
    output_schema=Campaign,
)

  warn_deprecated(


# Generate synthetic data

In [None]:
# Ask the generator for five records. "subject" and "extra" are the two input
# variables declared on the few-shot prompt.
synthetic_results = synthetic_data_generator.generate(
    runs=5,
    subject="campaign",
    extra=(
        "Choose a unique and unconventional name for each campaign. "
        "Avoid common or typical names."
    ),
)

In [None]:
# Sanity check: how many records the generator returned.
len(synthetic_results)

5

# Display Data

In [None]:
# Show the raw generated objects; as the last expression of the cell, the list is
# rendered by the notebook.
synthetic_results

[Campaign(type='campaign', spec_version='2.1', id='campaign--b4d5efee-f7a8-4a35-9a5b-c26b5b7a8db2', created='2023-01-24T14:22:35.623Z', modified='2023-01-24T14:22:35.623Z', name='Emerald Typhoon', created_by_ref=None, description='A sophisticated cyber espionage operation focusing on geopolitical intelligence gathering within the energy sector.', aliases=None, first_seen='2022-11-10T09:30:45.000Z', last_seen='2023-01-20T11:45:30.000Z', objective='Collecting intelligence on energy supply chains and geopolitical developments affecting the sector.'),
 Campaign(type='campaign', spec_version='2.1', id='campaign--b8e1d249-ae64-4d16-8c8c-8beb58ae96c8', created='2023-03-05T12:33:04.000Z', modified='2023-03-05T12:33:04.000Z', name='Project Neon Shadow', created_by_ref=None, description='A global initiative aimed at undermining the security frameworks of critical infrastructure through the dissemination of advanced persistent threats (APTs).', aliases=None, first_seen='2022-12-15T10:15:30.000Z',

# Display as a DataFrame

In [None]:
import pandas as pd

# Flatten every generated object into a plain dict that keeps only the columns shown
# in the table (spec_version, id, created, modified and created_by_ref are left out).
synthetic_data = [
    {
        column: getattr(campaign, column)
        for column in (
            "type",
            "name",
            "description",
            "aliases",
            "first_seen",
            "last_seen",
            "objective",
        )
    }
    for campaign in synthetic_results
]

# One row per generated campaign.
synthetic_df = pd.DataFrame(synthetic_data)

# Print the container type, then let the notebook render the frame itself.
print(type(synthetic_df))
synthetic_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,type,name,description,aliases,first_seen,last_seen,objective
0,campaign,Emerald Typhoon,A sophisticated cyber espionage operation focu...,,2022-11-10T09:30:45.000Z,2023-01-20T11:45:30.000Z,Collecting intelligence on energy supply chain...
1,campaign,Project Neon Shadow,A global initiative aimed at undermining the s...,,2022-12-15T10:15:30.000Z,2023-02-28T17:50:00.000Z,To expose vulnerabilities in critical infrastr...
2,campaign,Quantum Paradox,A highly covert operation aimed at infiltratin...,,2023-02-01T08:00:00.000Z,2023-06-10T18:30:00.000Z,To secure a significant advantage in quantum c...
3,campaign,Aether Nexus,An elusive campaign aimed at siphoning proprie...,,2022-07-15T07:00:00.000Z,2023-07-31T14:00:00.000Z,Gaining unauthorized access to and extraction ...
4,campaign,Neon Mirage,A clandestine initiative focused on manipulati...,,2023-01-01T10:00:00.000Z,2023-06-30T18:00:00.000Z,To manipulate market perceptions and financial...


In [None]:
# Save the DataFrame to a CSV file.
# The filename is defined once so the confirmation message cannot drift from the
# path that was actually written.
output_path = 'campaign_data.csv'
# index=False prevents pandas from adding an extra index column.
synthetic_df.to_csv(output_path, index=False)
# Report what was written: this frame holds campaign records.
print(f"Campaign data saved to '{output_path}'")

Threat actor data saved to 'campaign_data.csv'
