In [8]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

In [9]:
# Configuration for the synthetic labor dataset
RAND_SEED = 42    # seed handed to both `random` and `numpy.random`
NUM_USERS = 100   # number of synthetic users to generate
# Intended total number of records across all users (not per user).
# NOTE(review): not referenced by the cells visible here, which draw a random
# per-user count instead — confirm whether this is still needed.
NUM_RECORDS = 1000

# Sizes of the categorical vocabularies
NUMBER_OF_OCCUPATIONS = 100  # unique occupation categories
NUMBER_OF_CITIES = 25        # unique cities
NUMBER_OF_INDUSTRIES = 47    # unique industries

# Categories are plain integer codes 0 .. N-1
OCCUPATIONS = list(range(NUMBER_OF_OCCUPATIONS))
INDUSTRIES = list(range(NUMBER_OF_INDUSTRIES))
CITIES = list(range(NUMBER_OF_CITIES))

# Bounds on the number of records drawn for each user
MIN_RECORDS_PER_USER = 5
MAX_RECORDS_PER_USER = 30

# Earliest possible date of any record in the synthetic dataset
START_DATE = datetime(2020, 1, 1)

# Helper: random dates at whole-day offsets from a start date
def random_dates(start, num_days, num_dates):
    """Return a list of `num_dates` dates drawn at random after `start`.

    Each date is `start` plus a whole number of days obtained from
    `random.randint(0, num_days)`, so both an offset of 0 and an offset of
    `num_days` can occur. Draws come from the global `random` module state,
    so seed it beforehand for reproducible output.
    """
    drawn = []
    for _ in range(num_dates):
        offset_days = random.randint(0, num_days)
        drawn.append(start + timedelta(days=offset_days))
    return drawn

def random_birthdates(start, end, n, seed=0, replace=True):
    """Sample `n` calendar days from the daily range `start`..`end`.

    Parameters:
        start, end: bounds passed to `pd.date_range` to build the candidate days.
        n: number of dates to sample.
        seed: forwarded to `Series.sample` as `random_state` (defaults to 0).
        replace: whether the same day may be picked more than once.

    Returns:
        A pandas Series of the sampled dates; because it is sampled from
        `DatetimeIndex.to_series()`, its index holds the same dates as its values.
    """
    candidate_days = pd.date_range(start, end)
    day_pool = candidate_days.to_series()
    picked = day_pool.sample(n, replace=replace, random_state=seed)
    return picked

In [10]:
# Re-seed both generators so this cell produces the same draws on every run
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

# Assemble the user base: sequential ids, a sampled birthday and a random sex.
# NOTE(review): random_birthdates is called without `seed`, so birthdays use its
# default (seed=0) rather than RAND_SEED — confirm this is intended.
user_columns = {
    "USER_ID": range(1, NUM_USERS + 1),
    "BIRTHDAY": random_birthdates(datetime(1950, 1, 1), datetime(1994, 1, 1), NUM_USERS),
    "SEX": np.random.choice(["Male", "Female"], NUM_USERS),
}
# The sampled BIRTHDAY Series is indexed by date; reset_index(drop=True)
# replaces the frame's index with 0 .. N-1.
users = pd.DataFrame(user_columns).reset_index(drop=True)

# Show a random sample of 10 users as a sanity check
users.sample(10)

Unnamed: 0,USER_ID,BIRTHDAY,SEX
53,54,1962-12-19,Female
49,50,1973-08-03,Female
78,79,1980-12-15,Male
56,57,1951-12-16,Female
98,99,1958-02-04,Female
29,30,1967-01-08,Female
1,2,1979-07-27,Female
82,83,1961-12-02,Male
5,6,1963-04-22,Female
35,36,1971-07-27,Male


In [11]:
# Write the user table to CSV with USER_ID as the first column; skipping the
# positional index yields the same file as indexing by USER_ID first.
users.to_csv("data/users.csv", index=False)

In [12]:
# Re-seed both generators so the records do not depend on draws made by earlier cells
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

# Expand every user into a random number of records
records = []
for user_id in users["USER_ID"]:
    # Each user gets between MIN_RECORDS_PER_USER and MAX_RECORDS_PER_USER records (inclusive)
    records_for_user = random.randint(MIN_RECORDS_PER_USER, MAX_RECORDS_PER_USER)
    for _ in range(records_for_user):
        # One candidate date, up to ten 365-day years after START_DATE. The
        # random.choice over this single-element list is kept as-is: dropping
        # it would change how the RNG is consumed and hence the generated data.
        date_pool = random_dates(START_DATE, 365 * 10, 1)
        records.append({
            "USER_ID": user_id,
            "RECORD_DATE": random.choice(date_pool),
            "CITY": random.choice(CITIES),
            "OCCUPATION": random.choice(OCCUPATIONS),
            "INDUSTRY": random.choice(INDUSTRIES),
            # Income drawn from a log-normal distribution
            "INCOME": np.random.lognormal(mean=10, sigma=0.3),
        })

# Build the frame and order it by user, then by record date
# (the original row labels are kept, so the index is no longer monotonic)
df_records = pd.DataFrame(records).sort_values(by=["USER_ID", "RECORD_DATE"])

# Display the first 20 rows to check
df_records.head(20)

Unnamed: 0,USER_ID,RECORD_DATE,CITY,OCCUPATION,INDUSTRY,INCOME
3,1,2020-05-02,6,29,32,34783.866673
21,1,2020-09-28,18,91,20,20583.953807
13,1,2020-10-11,21,29,18,12406.997744
17,1,2020-10-19,17,93,15,24204.024495
14,1,2020-11-22,3,48,17,13128.289049
9,1,2021-01-31,11,77,16,25919.944599
0,1,2021-04-01,23,35,15,25565.888102
11,1,2021-05-26,2,70,18,19154.272446
23,1,2021-08-08,4,31,35,14365.367357
18,1,2021-10-31,12,34,40,16774.124628


In [13]:
# Persist the sorted records without the positional index column.
# NOTE(review): the original note here read "move to the data/rawdata folder" —
# presumably a reminder to relocate this file; confirm.
df_records.to_csv(path_or_buf="data/synth_labor.csv", index=False)