In [1]:
import pandas as pd
import numpy as np

## Agent contains these parameters:
- Name.
- Occupation.
- Age.
- Income.
- Hobbies.

### Each parameter has a raw value as well as a categorical value.

- Name (str, cat)
- Occupation (str, cat)
- Age (int, cat)
- Income (int, cat)
- Hobbies (str, cat)

_`cat` is the categorical label (a string); it is not required during inference, only during analysis._

In [124]:
class AgentSpec():
    """One synthetic agent: an id plus five properties.

    Every ``*_ppt`` argument is a mapping with two keys:
    ``"raw_value"`` (the concrete value) and ``"cat_value"`` (its category label).
    """

    def __init__(self, id, name_ppt, occ_ppt, age_ppt, income_ppt, hobbies_ppt):
        """Store the id and the five property mappings unchanged."""
        self.id = id
        self.name_ppt, self.occ_ppt = name_ppt, occ_ppt
        self.age_ppt, self.income_ppt = age_ppt, income_ppt
        self.hobbies_ppt = hobbies_ppt

    def convert_to_json(self):
        """Return a flat dict: ``id`` followed by a raw/group key pair per property."""
        # (raw key, group key, property mapping) in the order the keys are emitted.
        layout = (
            ("name", "name_group", self.name_ppt),
            ("occupation", "occupation_group", self.occ_ppt),
            ("age", "age_group", self.age_ppt),
            ("income", "income_group", self.income_ppt),
            ("hobbies", "hobbies_group", self.hobbies_ppt),
        )
        record = {"id": self.id}
        for raw_key, group_key, ppt in layout:
            record[raw_key] = ppt["raw_value"]
            record[group_key] = ppt["cat_value"]
        return record

## Name.

In [6]:
import glob

# glob.glob() gives no ordering guarantee (it depends on the file system), but the
# position of each file in this list later decides which region label its names
# receive. Sorting makes that mapping deterministic across machines.
name_files = sorted(glob.glob("dump/india/name*.txt"))
name_files

['dump/india\\names_0.txt',
 'dump/india\\names_1.txt',
 'dump/india\\names_2.txt',
 'dump/india\\names_3.txt',
 'dump/india\\names_4.txt',
 'dump/india\\names_5.txt']

In [8]:
# Read the first names file into memory: one list entry per raw line, newline included.
with open(name_files[0], "r") as f:
    lines = f.readlines()

In [31]:
## Get all lines with [num]. [first_name] [middle_name?] [last_name] regex pattern.
import re

# The capture group takes everything after "<num>.<whitespace>", so names that
# contain ". " themselves (e.g. initials) are kept whole, and a tab after the
# number no longer raises IndexError as `line.split(". ")[1]` did.
names_regex = re.compile(r"^\d+\.\s(\w.*)")
names = [hit.group(1).rstrip() for hit in map(names_regex.match, lines) if hit]

In [33]:
# Number of names parsed from the first file; the trailing comma makes the cell display a 1-tuple.
len(names), # names[50:55]

(1000,)

In [34]:
def get_names_from_file(file):
    """Return the names listed in a numbered text file.

    A line counts as a name entry when it looks like ``<number>.<whitespace><name>``.
    All other lines (headers, blanks, prose) are ignored.

    Args:
        file: Path of the text file to read.

    Returns:
        list[str]: The names in file order, without the numbering and without
        trailing whitespace or the newline.
    """
    # Capture the whole remainder of the line instead of splitting on ". ":
    # splitting truncated names containing ". " (e.g. "A. P. J. Abdul Kalam" -> "A")
    # and raised IndexError when the separator after the number was not a space.
    names_regex = re.compile(r"^\d+\.\s(\w.*)")
    names = []
    with open(file, "r") as f:
        for line in f:
            hit = names_regex.match(line)
            if hit:
                names.append(hit.group(1).rstrip())
    return names

In [35]:
# Parse every names file; `names` becomes a list of lists, one inner list per file.
names = list(map(get_names_from_file, name_files))
for per_file_names in names:
    print(len(per_file_names))

1000
1000
1000
1000
1000
1000


In [36]:
## Sample 3000 // 6 == 500 names from each entry list.
import random

# Draw 500 names without replacement from each file's list, then rebind `names`
# to the sampled lists (the unsampled lists are no longer reachable afterwards).
sampled = []
for name_list in names:
    sampled.append(random.sample(name_list, 500))
names = sampled

for k in names:
    print(len(k))

500
500
500
500
500
500


In [40]:
# Concatenate the per-file lists into one flat list, preserving file order.
flat_names = []
for name_list in names:
    flat_names.extend(name_list)

In [56]:
name_groups = ["north", "north-east", "central", "east", "south", "west"]

# Names are laid out in consecutive runs of 500, so `i // 500` is the index of
# the run and therefore of the region label.
name_properties = []
for i, name in enumerate(flat_names):
    name_properties.append({"raw_value": name, "cat_value": name_groups[i // 500]})

In [57]:
# Sanity check: total number of name records and the keys each record carries.
len(name_properties), name_properties[0].keys()

(3000, dict_keys(['raw_value', 'cat_value']))

## Occupation

In [45]:
# Take the first file under dump/india/ whose name starts with "occ".
occ_file = glob.glob("dump/india/occ*.txt")[0]
occ_file

'dump/india\\occupation.txt'

In [47]:
# Load the tab-separated occupation table and preview its first rows.
df = pd.read_csv(occ_file, sep="\t")
df.head()

Unnamed: 0,Sector,Percentage of Workforce,Key Occupations
0,Agriculture,44.8%,"Farming, fishing, forestry, livestock management"
1,Industry,24%,"Construction (75 million workers), manufacturi..."
2,Services,31.2%,"Trade and hotels (70 million workers), IT/BPO,..."


In [48]:
import json

# Take the first JSON file under dump/india/ whose name starts with "gloss".
json_gloss = glob.glob("dump/india/gloss*.json")[0]
json_gloss

'dump/india\\glossary_professions.json'

In [49]:
## Load the glossary file.
# The parsed JSON is kept in `glossary`; later cells index it by sector name.
with open(json_gloss, "r") as f:
    glossary = json.load(f)

In [53]:
# Show the top-level keys of the glossary (the sector names).
list(glossary.keys())

['Agriculture', 'Industry', 'Services']

In [60]:
### 3000 values.
### Step-1: Choose category with probability: 45%, 24% and 31%.
### Step-2: Choose raw value from category based on equal probability.

# Bind each probability to its sector by name. Passing a bare list to `p=` tied
# the weights to whatever key order the glossary JSON happened to have.
sector_probs = {"Agriculture": 0.45, "Industry": 0.24, "Services": 0.31}
sectors = list(sector_probs)
weights = [sector_probs[sector] for sector in sectors]

# Fail early and clearly if the glossary lacks one of the weighted sectors.
missing_sectors = [sector for sector in sectors if sector not in glossary]
if missing_sectors:
    raise KeyError(f"glossary has no entries for sectors: {missing_sectors}")

occupation_properties = []
for i in range(3000):
    occ_group = str(np.random.choice(sectors, p=weights))
    occ_raw = str(np.random.choice(glossary[occ_group]))
    occupation_properties.append({"raw_value": occ_raw, "cat_value": occ_group})

In [61]:
# Sanity check: number of occupation records and one example record.
len(occupation_properties), occupation_properties[0]

(3000, {'raw_value': 'Forestry Technician', 'cat_value': 'Agriculture'})

## Age.

In [62]:
# Take the first file under dump/india/ whose name starts with "age".
age_file = glob.glob("dump/india/age*.txt")[0]
age_file

'dump/india\\age.txt'

In [63]:
# Load the tab-separated age table. This rebinds `df`, replacing the previously loaded table.
df = pd.read_csv(age_file, sep="\t")
df.head()

Unnamed: 0,Age Group,Males (millions),Females (millions)
0,0–4,12.5,11.5
1,5–9,13.0,12.0
2,10–14,13.5,12.5
3,15–19,12.5,11.5
4,20–24,12.0,11.0


In [65]:
# List every age-group label so the slice positions used in the next step can be read off.
print(df["Age Group"].to_list())

['0–4', '5–9', '10–14', '15–19', '20–24', '25–29', '30–34', '35–39', '40–44', '45–49', '50–54', '55–59', '60–64', '65–69', '70–74', '75–79', '80–84', '85–89', '90–94', '95–99', '100+']


In [67]:
## Keep between '20-24' and '65-69' age groups only.
# Positions 4..13 of the label list are '20–24' through '65–69'.
# NOTE: this cell rebinds `df`, so re-running it slices the already-filtered
# frame again; reload the age table first if it must be re-run.
age_groups = df["Age Group"].to_list()[4:14]
# .copy() detaches the filtered rows from the original frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
df = df[df["Age Group"].isin(age_groups)].copy()
df.head()

Unnamed: 0,Age Group,Males (millions),Females (millions)
4,20–24,12.0,11.0
5,25–29,11.5,11.0
6,30–34,11.0,10.5
7,35–39,10.5,10.0
8,40–44,10.0,9.5


In [69]:
# Combined population per age group (males + females, in millions).
df['Total'] = df['Males (millions)'] + df['Females (millions)']
df.head()

Unnamed: 0,Age Group,Males (millions),Females (millions),Total
4,20–24,12.0,11.0,23.0
5,25–29,11.5,11.0,22.5
6,30–34,11.0,10.5,21.5
7,35–39,10.5,10.0,20.5
8,40–44,10.0,9.5,19.5


In [71]:
# Normalise the totals so the kept age groups form a probability distribution (sums to 1).
sum_total = df['Total'].sum()
df['Probability'] = df['Total'] / sum_total
df.head()

Unnamed: 0,Age Group,Males (millions),Females (millions),Total,Probability
4,20–24,12.0,11.0,23.0,0.125341
5,25–29,11.5,11.0,22.5,0.122616
6,30–34,11.0,10.5,21.5,0.117166
7,35–39,10.5,10.0,20.5,0.111717
8,40–44,10.0,9.5,19.5,0.106267


In [79]:
# Scratch check: split the first age-group label on the en dash to get its integer bounds.
s, e = (int(bound) for bound in age_groups[0].split('–'))
s, e

(20, 24)

In [80]:
### 3000 Values.

## Step-1: Choose Age-Group with probability based on 'Probability' column.
## Step-2: Choose raw value from Age-Group based on equal probability.

# The weights are read once; their row order matches `age_groups` because both
# come from the same filtered frame.
age_group_probs = df['Probability']

age_properties = []
for _ in range(3000):
    age_group = str(np.random.choice(age_groups, p=age_group_probs))
    lower, upper = (int(bound) for bound in age_group.split('–'))
    # randint's upper bound is exclusive, hence +1 to include the group's last year.
    age_properties.append({"raw_value": np.random.randint(lower, upper + 1), "cat_value": age_group})

In [81]:
# Sanity check: number of age records and one example record.
len(age_properties), age_properties[0]

(3000, {'raw_value': 26, 'cat_value': '25–29'})

## Income

In [113]:
# Take the first file under dump/india/ whose name starts with "income".
income_file = glob.glob("dump/india/income*.txt")[0]
income_file

'dump/india\\income.txt'

In [114]:
# Load the tab-separated income table. This rebinds `df`, replacing the previously loaded table.
df = pd.read_csv(income_file, sep="\t")
df.head()

Unnamed: 0,Income Group,Annual Income Range (INR),Approximate Percentage of Population
0,Low-Income Group,"Less than ₹1,00,000",~25%
1,Lower-Middle Class,"₹1,00,000 - ₹3,00,000",~30%
2,Middle Class,"₹3,00,000 - ₹10,00,000",~30%
3,Upper-Middle Class,"₹10,00,000 - ₹25,00,000",~10%
4,High-Income Group,"More than ₹25,00,000",~5%


In [115]:
# Keep the column names in a list so later cells can refer to them by position.
cols = list(df.columns)
cols

['Income Group',
 'Annual Income Range (INR)',
 'Approximate Percentage of Population']

In [116]:
def _percent_cell_to_prob(cell):
    """Convert a cell such as '~25%' to the fraction 0.25."""
    digits = str(cell).split("~")[1].split("%")[0]
    return int(digits) / 100


# cols[2] is the approximate-percentage column of the income table.
df['income_group_probs'] = df[cols[2]].apply(_percent_cell_to_prob)
df.head()

Unnamed: 0,Income Group,Annual Income Range (INR),Approximate Percentage of Population,income_group_probs
0,Low-Income Group,"Less than ₹1,00,000",~25%,0.25
1,Lower-Middle Class,"₹1,00,000 - ₹3,00,000",~30%,0.3
2,Middle Class,"₹3,00,000 - ₹10,00,000",~30%,0.3
3,Upper-Middle Class,"₹10,00,000 - ₹25,00,000",~10%,0.1
4,High-Income Group,"More than ₹25,00,000",~5%,0.05


In [117]:
# Hand-written "low, high" bounds (INR) for each income group, in table row order.
# The two open-ended groups get explicit limits: 0 as the floor and 10000000 as the cap.
df['income_range'] = ["0, 100000", "100000, 300000", "300000, 1000000", "1000000, 2500000", "2500000, 10000000"]

In [118]:
# Preview the income table with the added probability and range columns.
df.head()

Unnamed: 0,Income Group,Annual Income Range (INR),Approximate Percentage of Population,income_group_probs,income_range
0,Low-Income Group,"Less than ₹1,00,000",~25%,0.25,"0, 100000"
1,Lower-Middle Class,"₹1,00,000 - ₹3,00,000",~30%,0.3,"100000, 300000"
2,Middle Class,"₹3,00,000 - ₹10,00,000",~30%,0.3,"300000, 1000000"
3,Upper-Middle Class,"₹10,00,000 - ₹25,00,000",~10%,0.1,"1000000, 2500000"
4,High-Income Group,"More than ₹25,00,000",~5%,0.05,"2500000, 10000000"


In [119]:
# Pair each "low, high" range string with its income-group label, in table row order.
zipped = list(zip(df['income_range'], df['Income Group']))
zipped

[('0, 100000', 'Low-Income Group'),
 ('100000, 300000', 'Lower-Middle Class'),
 ('300000, 1000000', 'Middle Class'),
 ('1000000, 2500000', 'Upper-Middle Class'),
 ('2500000, 10000000', 'High-Income Group')]

In [120]:
## 3000 values.
## Step-1: Choose income group with income_group_probs.
## Step-2: Choose raw value in the income_range

# The weights are read once; their row order matches `zipped`.
group_probs = df['income_group_probs']

income_properties = []
for _ in range(3000):
    picked = int(np.random.choice(range(len(zipped)), p=group_probs))
    bounds, income_group = zipped[picked]
    income_s, income_e = map(int, bounds.split(", "))
    # randint's upper bound is exclusive, so the drawn income is in [income_s, income_e).
    income_properties.append({"raw_value": np.random.randint(income_s, income_e), "cat_value": income_group})

## Hobbies

In [83]:
# Take the first JSON file directly under dump/ whose name starts with "h".
hob_f = glob.glob("dump/h*.json")[0]
hob_f

'dump\\hobbies.json'

In [84]:
# Parse the hobbies JSON into `hobbies` and show its top-level keys (the hobby categories).
with open(hob_f, "r") as f:
    hobbies = json.load(f)

hobbies.keys()

dict_keys(['creative', 'physical', 'intellectual', 'social', 'collecting'])

In [85]:
## 3000 Values.
## Step-1: Choose a category.
## Step-2: Choose 2 raw values from the category.

hobby_categories = list(hobbies.keys())

hobbies_properties = []
for _ in range(3000):
    # Categories are drawn uniformly (no weights are passed).
    hob_group = str(np.random.choice(hobby_categories))
    # Two distinct hobbies from that category, joined into one string.
    hob_pair = random.sample(hobbies[hob_group], 2)
    hobbies_properties.append({"raw_value": " and ".join(hob_pair), "cat_value": hob_group})

len(hobbies_properties), hobbies_properties[0]

(3000,
 {'raw_value': 'trading cards and toy collecting', 'cat_value': 'collecting'})

## Creation of agents.

In [86]:
import os

# Create the output directory; exist_ok=True makes re-running this cell harmless.
os.makedirs("raw_agents", exist_ok=True)

In [125]:
# Build agent i from the i-th entry of each property list; the index is also its id.
agents = [
    AgentSpec(
        i,
        name_properties[i],
        occupation_properties[i],
        age_properties[i],
        income_properties[i],
        hobbies_properties[i],
    )
    for i in range(3000)
]

In [129]:
# Number of agents built; the trailing comma makes the cell display a 1-tuple.
len(agents), # agents[0].convert_to_json()

(3000,)

In [130]:
## save to file.
import json

# Serialise every agent to its flat dict and write them as one JSON array.
with open("raw_agents/india_exp_agents.json", "w") as f:
    agent_records = []
    for agent in agents:
        agent_records.append(agent.convert_to_json())
    json.dump(agent_records, f, indent=4)