In [1]:
import pandas as pd
import anthropic
import os
from dotenv import load_dotenv

In [2]:
load_dotenv()
ANTRHOPIC_API_KEY = os.environ.get("ANTHROPIC_API_KEY")

In [218]:
SYSTEM_PROMPT = "You are a helpful assistant designed to generate synthetic realistic data. Answer as a list of json"
USER_PROMPT = "Tell me a description for a business classified in any NAICS code whose first 2 digits are exactly '[NAICS_CODE]' and its corresponding complete (6 digits) NAICS code. Each description must be around 40 words. Repeat it for 20 different business. The output should be a json with the keys naics and description."

In [219]:
client = anthropic.Anthropic(
    api_key=ANTRHOPIC_API_KEY,
)


def make_api_call(naics_code: str) -> list[dict]:
    message = client.messages.create(
        model="claude-3-sonnet-20240229",
        max_tokens=4000,
        temperature=0.7,
        system=SYSTEM_PROMPT,
        messages=[
            {"role": "user", "content": USER_PROMPT.replace("[NAICS_CODE]", naics_code)}
        ],
    )
    return message

In [13]:
response = make_api_call("11")

In [14]:
eval(response.content[0].text)

[{'naics': '111998',
  'description': 'A diversified agricultural operation that cultivates a variety of crops, including grains, vegetables, and fruits, utilizing sustainable farming practices to promote soil health and biodiversity.'},
 {'naics': '112990',
  'description': 'A livestock farm specializing in raising and breeding cattle, sheep, and goats for meat and dairy production, emphasizing ethical and humane animal husbandry practices.'},
 {'naics': '113110',
  'description': 'A commercial timber harvesting operation that sustainably manages and selectively logs forests, ensuring responsible forest management and environmental conservation.'},
 {'naics': '114111',
  'description': 'A finfish farming enterprise that breeds and cultivates various species of fish in controlled aquaculture environments, adhering to strict quality and sustainability standards.'},
 {'naics': '114112',
  'description': 'A shellfish farming operation that cultivates and harvests oysters, clams, mussels, 

In [260]:
minority_classes = [21, 55, 92, 22, 49, 11]
minority_classes = [31, 52, 44]
minority_classes = [str(x) for x in minority_classes]

In [261]:
data_list = []

In [262]:
for code in minority_classes:
    for i in range(5):
        response = make_api_call(code)
        synthetic_data = eval(response.content[0].text)
        data_list.append(synthetic_data)

In [264]:
len(data_list)

15

In [265]:
df_list = []

In [266]:
for data in data_list:
    df = pd.DataFrame(data)
    df_list.append(df)

In [267]:
df_total = pd.concat(df_list)

In [268]:
df_total.head()

Unnamed: 0,naics,description
0,311811,This establishment specializes in the producti...
1,311920,This company is engaged in the processing and ...
2,312120,"This establishment operates a brewery, produci..."
3,313210,"This textile mill is involved in the spinning,..."
4,314110,This manufacturing facility specializes in the...


In [269]:
df_total.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300 entries, 0 to 19
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   naics        300 non-null    object
 1   description  300 non-null    object
dtypes: object(2)
memory usage: 7.0+ KB


In [270]:
df_total["NAICS_2"] = df_total["naics"].str[:2]

In [272]:
df_total["NAICS_2"].value_counts()

naics_2
44    100
52    100
31     76
32     18
33      6
Name: count, dtype: int64

In [273]:
df_total.rename(
    columns={"naics": "NAICS", "description": "BUSINESS_DESCRIPTION"}, inplace=True
)

In [274]:
df_total.to_parquet("naics_synthetic2.parquet", index=False)

In [275]:
clases_molestas = [11, 21, 22, 55, 92, 81]

Join sythetic data


In [276]:
syn = pd.read_csv("naics_synthetic.csv")
syn1 = pd.read_parquet("naics_synthetic1.parquet")
syn2 = pd.read_parquet("naics_synthetic2.parquet")

In [277]:
total = pd.concat([syn, syn1, syn2])

In [278]:
total["naics_2"] = total["NAICS"].astype(str).str[:2].astype(int)

In [279]:
total["naics_2"].value_counts()

naics_2
55    260
21    240
22    240
11    210
92    200
49    109
52    100
44    100
48     91
31     76
32     18
33      6
Name: count, dtype: int64

In [280]:
total.to_csv("total_naics_synthetic.csv", index=False)