In [1]:
import pandas as pd
import numpy as np

## Agent contains these parameters:
- Name.
- Occupation.
- Age.
- Income.
- Hobbies.

### Each parameter has a raw value as well as a categorical value.

- Name (str, cat)
- Occupation (str, cat)
- Age (double, cat)
- Income (double, cat)
- Hobbies (str, cat)

_`cat` can be a string. It is not required during inference, only during analysis._

In [2]:
class AgentSpec():
    """Container for the properties of one agent.

    Each ``*_ppt`` argument is a mapping that provides a ``"raw_value"`` and a
    ``"cat_value"`` entry; ``id`` is stored exactly as passed in.
    """

    def __init__(self, id, name_ppt, occ_ppt, age_ppt, income_ppt, hobbies_ppt):
        """Store the identifier and the five property mappings unchanged."""
        self.id = id
        self.name_ppt, self.occ_ppt = name_ppt, occ_ppt
        self.age_ppt, self.income_ppt = age_ppt, income_ppt
        self.hobbies_ppt = hobbies_ppt

    def convert_to_json(self):
        """Return a flat, insertion-ordered dict describing the agent.

        The dict starts with ``"id"`` and then holds, for every property,
        ``<key>`` (its raw value) followed by ``<key>_group`` (its category).
        """
        # Output key prefix paired with the property mapping it is read from.
        labelled_properties = (
            ("name", self.name_ppt),
            ("occupation", self.occ_ppt),
            ("age", self.age_ppt),
            ("income", self.income_ppt),
            ("hobbies", self.hobbies_ppt),
        )
        record = {"id": self.id}
        for key, ppt in labelled_properties:
            record[key] = ppt["raw_value"]
            record[key + "_group"] = ppt["cat_value"]
        return record

## Name.

In [3]:
import glob

# glob returns matches in a file-system dependent order. Sort them so the
# file order is stable: the group label given to each name further down is
# derived purely from its position, which follows the order of these files.
name_files = sorted(glob.glob("dump/us/names*.txt"))
name_files

['dump/us\\names_0.txt',
 'dump/us\\names_1.txt',
 'dump/us\\names_2.txt',
 'dump/us\\names_3.txt']

In [6]:
import re

def get_names_from_file(file, encoding=None):
    """Extract the names from a numbered-list text file.

    Only lines shaped like ``<digits>.<one whitespace char><name>`` whose name
    starts with a word character are used; every other line is ignored.

    Parameters
    ----------
    file : str
        Path of the text file to read.
    encoding : str, optional
        Text encoding passed to ``open``. ``None`` (the default) keeps the
        platform default, i.e. the behaviour of calling ``open(file, "r")``.

    Returns
    -------
    list of str
        The names in file order, without the numbering prefix or the
        trailing newline.
    """
    # The capture group takes everything after the "<n>. " prefix, so a name
    # that itself contains ". " (e.g. "Mary J. Smith") is kept whole, and a
    # tab after the dot no longer causes an IndexError.
    names_regex = re.compile(r"^\d+\.\s(\w.*)")
    names = []
    with open(file, "r", encoding=encoding) as f:
        for line in f:
            matched = names_regex.match(line)
            if matched:
                names.append(matched.group(1))
    return names

In [7]:
# One list of parsed names per input file, in the same order as `name_files`.
names = list(map(get_names_from_file, name_files))

# Show how many names were parsed from each file.
for parsed_names in names:
    print(len(parsed_names))

1000
1000
1000
999


In [9]:
## sample 750 names from each to make a list of 3000 names.
import random

# Draw 750 entries without replacement from every per-file list ...
names = [random.sample(per_file_names, 750) for per_file_names in names]

# ... and concatenate the draws, keeping the per-file order.
flat_names = []
for drawn_names in names:
    flat_names.extend(drawn_names)

In [58]:
# One group label per block of 750 consecutive entries of `flat_names`.
name_groups = ["as", "af", "hi", "eu"]

name_properties = []
for position, raw_name in enumerate(flat_names):
    group_label = name_groups[position // 750]
    name_properties.append({"raw_value": raw_name, "cat_value": group_label})

## Income.

In [12]:
# Load the tab-separated income-group table and preview its first rows.
income_file = "dump/us/income.txt"
df = pd.read_csv(filepath_or_buffer=income_file, sep="\t")
df.head()

Unnamed: 0,Income Group,Percentile Range,Annual Income Range (Example)
0,Low Income,Bottom 20%,"<$30,000"
1,Lower-Middle Income,20th–40th percentile,"$30,000–$50,000"
2,Middle Income,40th–60th percentile,"$50,000–$80,000"
3,Upper-Middle Income,60th–80th percentile,"$80,000–$150,000"
4,High Income,80th–95th percentile,"$150,000–$250,000"


In [18]:
# "low-high" bounds for the income groups, one entry per row of `df`,
# in row order.
income_range_labels = [
    "0-30000",
    "30000-50000",
    "50000-80000",
    "80000-150000",
    "150000-250000",
    "250000-1000000",
]
df["income_range"] = income_range_labels
df.head()

Unnamed: 0,Income Group,Percentile Range,Annual Income Range (Example),income_range
0,Low Income,Bottom 20%,"<$30,000",0-30000
1,Lower-Middle Income,20th–40th percentile,"$30,000–$50,000",30000-50000
2,Middle Income,40th–60th percentile,"$50,000–$80,000",50000-80000
3,Upper-Middle Income,60th–80th percentile,"$80,000–$150,000",80000-150000
4,High Income,80th–95th percentile,"$150,000–$250,000",150000-250000


In [19]:
# Pair every income group name with its "low-high" range string.
zipped = [
    (group_name, range_label)
    for group_name, range_label in zip(df["Income Group"], df["income_range"])
]

In [20]:
## Choose 3000 random incomes.

## Step - 1 select a group with equal probability.
## Step - 2 select a random income from the group.

# Parse every "low-high" range string once instead of on each draw.
income_bounds = []
for group_name, range_label in zipped:
    low_text, high_text = range_label.split("-")
    income_bounds.append((group_name, int(low_text), int(high_text)))

income_ppts = []

for i in range(3000):
    # Draw over all defined groups. A hard-coded range(5) would never pick
    # the last of the six ranges attached to the income table.
    idx = np.random.choice(len(income_bounds))
    income_group, low, high = income_bounds[idx]
    # randint excludes `high`, so a boundary value belongs to one group only.
    # int() keeps the stored value a plain Python int for json.dump.
    income = int(np.random.randint(low, high))
    income_ppts.append({"raw_value": income, "cat_value": income_group})

## Age.

In [21]:
# Load the tab-separated age-group table and preview its first rows.
age_file = "dump/us/age.txt"
df = pd.read_csv(filepath_or_buffer=age_file, sep="\t")
df.head()

Unnamed: 0,Age Group (Years),Count
0,17.0–28.0,8898
1,28.0–37.0,7783
2,37.0–48.0,8241
3,48.0–90.0,7639


In [25]:
# Remove the "," separators from each Count entry and store it as a float
# (each entry must support str.replace, i.e. Count is expected to hold text).
df['count_double'] = [float(raw_count.replace(",", "")) for raw_count in df['Count']]

In [26]:
# Share of each age group in the total, computed from the numeric
# `count_double` column rather than the raw `Count` column.
count_total = df['count_double'].sum()
df['probs'] = df['count_double'] / count_total
df.head()

Unnamed: 0,Age Group (Years),Count,count_double,probs
0,17.0–28.0,8898,8898.0,0.273272
1,28.0–37.0,7783,7783.0,0.239028
2,37.0–48.0,8241,8241.0,0.253094
3,48.0–90.0,7639,7639.0,0.234606


In [32]:
## Choose 3000 random ages.

## Step-1: Choose a group with probability proportional to the count.
## Step-2: Choose a random age from the group.

age_group_labels = df['Age Group (Years)']
age_ppts = []

for i in range(3000):
    # Weighted pick of a row position among the four age groups.
    idx = np.random.choice(range(4), p=df['probs'])
    age_range = age_group_labels[idx]
    # The label looks like "17.0–28.0": split on the en dash, then truncate
    # each bound to an int.
    age_splits = list(map(lambda bound: int(float(bound)), age_range.split("–")))
    lower_age, upper_age = age_splits[0], age_splits[1]
    # randint excludes the upper bound.
    age = np.random.randint(lower_age, upper_age)
    age_ppts.append(dict(raw_value=age, cat_value=age_range))

## Occupation

In [33]:
# Sort the matches so that taking element 0 selects the same file on every
# platform if more than one path ever matches the pattern.
occ_gloss_file = sorted(glob.glob("dump/us/gloss*"))
occ_gloss_file

['dump/us\\glossary_professions.json']

In [34]:
import json

# JSON text is UTF-8 by specification; naming the encoding avoids relying on
# the platform's default codec when reading the glossary.
with open(occ_gloss_file[0], "r", encoding="utf-8") as f:
    gloss_dict = json.load(f)

gloss_dict.keys()

dict_keys(['prof_specialty', 'craft_repair', 'exec_managerial', 'adm_clerical', 'sales', 'handlers_cleaners_farming_fishing'])

In [46]:
## 3000 Values.

## Step-1: Choose a group with equal probability.
## Step-2: Choose a random occupation from the group.

# The group names do not change inside the loop, so list them once.
occ_group_names = list(gloss_dict.keys())

occ_ppts = []
for i in range(3000):
    occ_group = str(np.random.choice(occ_group_names))
    occ = str(np.random.choice(gloss_dict[occ_group]))
    occ_ppts.append(dict(raw_value=occ, cat_value=occ_group))

## Hobbies

In [39]:
hob_f = "dump/hobbies.json"

# JSON text is UTF-8 by specification; naming the encoding avoids relying on
# the platform's default codec when reading the hobbies file.
with open(hob_f, "r", encoding="utf-8") as f:
    hobbies = json.load(f)

hobbies.keys()

dict_keys(['creative', 'physical', 'intellectual', 'social', 'collecting'])

In [40]:
## 3000 Values.
## Step-1: Choose a category.
## Step-2: Choose 2 raw values from the category.

# The category names do not change inside the loop, so list them once.
hobby_categories = list(hobbies.keys())

hobbies_properties = []
for i in range(3000):
    hob_group = str(np.random.choice(hobby_categories))
    # Two different entries of the chosen category, joined into one phrase.
    hob_select = random.sample(hobbies[hob_group], 2)
    hobbies_properties.append(
        {"raw_value": " and ".join(hob_select), "cat_value": hob_group}
    )

len(hobbies_properties), hobbies_properties[0]

(3000,
 {'raw_value': 'learning languages and writing algorithms',
  'cat_value': 'intellectual'})

## Creation of agents.

In [60]:
# Combine the i-th entry of every property list into the agent with id i.
agents = [
    AgentSpec(
        i,
        name_properties[i],
        occ_ppts[i],
        age_ppts[i],
        income_ppts[i],
        hobbies_properties[i],
    )
    for i in range(3000)
]

In [61]:
# Number of agents built, displayed as a one-element tuple.
# To inspect the first record instead, evaluate: agents[0].convert_to_json()
(len(agents),)

(3000,)

In [62]:
## Save to file.

import os

output_path = "raw_agents/us_exp_agents.json"

# Create the target folder when it is missing, so the cell also works on a
# fresh checkout instead of failing with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, "w", encoding="utf-8") as f:
    json.dump([agent.convert_to_json() for agent in agents], f, indent=4)