In [None]:
%%capture
!pip install -U langchain langchain_experimental openai

In [None]:
# set environment variables
import os
from getpass import getpass

# Do not hard-code the key in the notebook and do not clobber a key that is
# already exported in the environment; prompt (without echo) only when missing.
if not os.environ.get("OPENAI_API_KEY"):
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

# Imports

In [None]:
from langchain.prompts import FewShotPromptTemplate, PromptTemplate
from langchain.chat_models import ChatOpenAI
from langchain.pydantic_v1 import BaseModel
from datetime import datetime
from typing import List, Optional
from langchain_experimental.tabular_synthetic_data.base import SyntheticDataGenerator
from langchain_experimental.tabular_synthetic_data.openai import create_openai_data_generator, OPENAI_TEMPLATE
from langchain_experimental.tabular_synthetic_data.prompts import SYNTHETIC_FEW_SHOT_SUFFIX, SYNTHETIC_FEW_SHOT_PREFIX

# Schema for generating Infrastructure




In [None]:
# Output schema for one generated record; passed as `output_schema` to
# create_openai_data_generator below.
# NOTE(review): field names appear to follow the STIX 2.1 Infrastructure object — confirm.
# Comments are used instead of a class docstring so the model's schema is left untouched.
class Infrastructure(BaseModel):
    # Fields without a default are required.
    type: str
    spec_version: str
    id: str
    created: str
    modified: str
    name: str
    # Fields below are optional and default to None when not supplied.
    description: Optional[str] = None
    # The next three are typed as a single string; the sample output in this
    # notebook stores multiple values as comma-separated text.
    infrastructure_types: Optional[str] = None
    aliases: Optional[str] = None
    kill_chain_phases: Optional[str] = None
    first_seen: Optional[str] = None
    last_seen: Optional[str] = None




# Sample data used as few-shot examples

In [None]:
# Few-shot examples shown to the model. Every entry uses the same
# "key: value" layout (including the `name:` label), a well-formed
# `infrastructure--<UUID>` identifier, valid timestamps, and a `created`
# time that is not later than `modified`, so the model is not taught
# malformed records.
examples = [
  {"example": """Type: infrastructure, spec_version: 2.1, id: infrastructure--38c47d93-d984-4fd9-b87b-d69d0841628d, created: 2016-05-07T11:22:30.000Z, modified: 2016-05-07T11:22:30.000Z, name: Poison Ivy C2"""},
  {"example": """Type: infrastructure, spec_version: 2.1, id: infrastructure--46c31a39-e832-4da8-a31b-b28c0834921d, created: 2018-06-08T18:30:29.000Z, modified: 2020-11-30T17:18:17.000Z, name: Virgina Beach Vibes"""},
  {"example": """Type: infrastructure, spec_version: 2.1, id: infrastructure--83f18d02-a183-49a9-b1c4-d14e2849229f, created: 2018-08-10T10:29:19.000Z, modified: 2020-11-19T20:10:15.000Z, name: Pembroke Squad"""},
  {"example": """Type: infrastructure, spec_version: 2.1, id: infrastructure--56b37e19-c239-482d-94a2-f19b2384721c, created: 2012-05-19T21:19:25.000Z, modified: 2015-10-23T23:19:27.000Z, name: BlackHats District 13"""},
  {"example": """Type: infrastructure, spec_version: 2.1, id: infrastructure--47c83a91-d384-45b2-84e2-a39d9492834d, created: 2012-04-17T23:21:38.000Z, modified: 2018-07-17T12:18:30.000Z, name: BlackHats District 12"""}
]

# Prompt Template for GPT-4

In [None]:
# Formatter applied to each few-shot entry: it substitutes the entry's
# "example" value into the template.
# NOTE: this rebinds the OPENAI_TEMPLATE name that is also imported in the imports cell.
OPENAI_TEMPLATE = PromptTemplate(template="{example}", input_variables=["example"])

# Few-shot prompt assembled from the library-provided prefix/suffix and the
# sample records; "subject" and "extra" are filled in at generation time.
prompt_template = FewShotPromptTemplate(
    example_prompt=OPENAI_TEMPLATE,
    examples=examples,
    prefix=SYNTHETIC_FEW_SHOT_PREFIX,
    suffix=SYNTHETIC_FEW_SHOT_SUFFIX,
    input_variables=["subject", "extra"],
)

# Data Generator

In [None]:
# Chat model used for generation, configured with temperature=1.
chat_model = ChatOpenAI(model='gpt-4-turbo-preview', temperature=1)

# Generator that ties together the few-shot prompt, the chat model and the
# Infrastructure output schema.
synthetic_data_generator = create_openai_data_generator(
    prompt=prompt_template,
    llm=chat_model,
    output_schema=Infrastructure,
)

# Parameters

In [None]:
# Free-text guidance supplied as the prompt's "extra" variable.
generation_hint = "Choose a unique and unconventional description, infrastructure type, alias, kill chain phase, first seen, and last seen for each Infrastructure. Avoid common or typical names."

# Run the generator; `runs` controls how many generation passes are requested.
synthetic_results = synthetic_data_generator.generate(
    runs=1,
    extra=generation_hint,
    subject="Infrastructure",
)

In [None]:
# Count the records returned by the generate() call.
len(synthetic_results)

1

# Display Data

In [None]:
# Show the raw generated objects as the cell's output.
synthetic_results

[Infrastructure(type='infrastructure', spec_version='2.1', id='infrastructure--b2c49d82-4c5f-42e2-b89a-e6b2a0c4e5d7', created='2023-01-22T15:42:19.000Z', modified='2023-04-11T10:15:28.000Z', name='Neon Gridlock', description='A decentralized web of quantum encrypted communication nodes, designed to evade conventional detection methods.', infrastructure_types='quantum-encrypted-network', aliases='QuantumNet, Grid Shadow', kill_chain_phases='reconnaissance, delivery', first_seen='2022-11-05T08:30:00.000Z', last_seen='2023-03-30T18:45:00.000Z')]

# Display as a DataFrame

In [None]:
import pandas as pd

# Attributes of each generated record that become DataFrame columns, in order.
column_names = (
    'type',
    'name',
    'description',
    'infrastructure_types',
    'aliases',
    'kill_chain_phases',
    'first_seen',
    'last_seen',
)

# One plain dict per generated object, keyed by column name.
synthetic_data = [
    {column: getattr(record, column) for column in column_names}
    for record in synthetic_results
]

# Build the table in a single step from the list of row dicts.
synthetic_df = pd.DataFrame(synthetic_data)

# Print the container type, then let the frame render as the cell output.
print(type(synthetic_df))
synthetic_df

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,type,name,description,infrastructure_types,aliases,kill_chain_phases,first_seen,last_seen
0,infrastructure,Neon Gridlock,A decentralized web of quantum encrypted commu...,quantum-encrypted-network,"QuantumNet, Grid Shadow","reconnaissance, delivery",2022-11-05T08:30:00.000Z,2023-03-30T18:45:00.000Z


In [None]:
# Persist the table as CSV in the current working directory.
csv_path = 'infrastructure_data.csv'
# index=False keeps the DataFrame's row index out of the file.
synthetic_df.to_csv(csv_path, index=False)
print("Infrastructure data saved to 'infrastructure_data.csv'")

Infrastructure data saved to 'infrastructure_data.csv'
