In [1]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# SEEDS
random.seed(0)
np.random.seed(0)

# Generate Synthetic User Data

In [2]:
# Params
NUM_USERS = 100
NUM_RECORDS = 1000  # total records, not per user
NUMBER_OF_OCCUPATIONS = 100 # total number of unique occupation categories
NUMBER_OF_CITIES = 50 # total number of unique cities
OCCUPATIONS = [f"Occupation_{i+1}" for i in range(100)]
CITIES = [f"City_{i+1}" for i in range(100)]
START_DATE = datetime(2020, 1, 1)

# Helper function to generate random dates
def random_dates(start, num_days, num_dates):
    return [start + timedelta(days=random.randint(0, num_days)) for _ in range(num_dates)]

def random_birthdates(start, end, n, seed=0, replace=True):
    dates = pd.date_range(start, end).to_series()
    return dates.sample(n, replace=replace, random_state=seed)

In [3]:
# Generate random user base
users = pd.DataFrame({
    "user_id": range(1, NUM_USERS + 1),
    "birthday": random_birthdates(datetime(1950, 1,1), datetime(1994, 1,1), NUM_USERS),
    "sex": np.random.choice(["male", "female"], NUM_USERS),
})


In [4]:
# Visualise the records from the user base
users.sample(10)

Unnamed: 0,user_id,birthday,sex
1988-11-25,95,1988-11-25,female
1983-03-23,89,1983-03-23,female
1984-05-23,75,1984-05-23,male
1993-01-14,8,1993-01-14,female
1962-12-19,54,1962-12-19,female
1973-12-02,55,1973-12-02,female
1963-04-22,6,1963-04-22,female
1952-01-26,45,1952-01-26,female
1977-02-01,27,1977-02-01,female
1974-08-17,66,1974-08-17,male


# Generate Synthetic Labor Data

In [6]:
# Expand users into records
records = []
for _, user in users.iterrows():
    num_records = random.randint(2, 50)  # Each user has between 12 to 50 records
    for _ in range(num_records):
        record = {
            "user_id": user["user_id"],
            "record_date": random.choice(random_dates(START_DATE, 365 * 2, 1)),
            "birthday": user["birthday"],
            "sex": user["sex"],
            "city": random.choice(CITIES),
            "occupation": random.choice(OCCUPATIONS),
            "salary": random.randint(2, 100000)  # Arbitrary salary range for example
        }
        records.append(record)

# Create DataFrame
df_records = pd.DataFrame(records)
# Sort by UserID and the record date
df_records.sort_values(by=["user_id", "record_date"], inplace=True)
df_records.head(20) # Display the first 20 rows to check

Unnamed: 0,user_id,record_date,birthday,sex,city,occupation,salary
8,1,2020-03-04,1957-06-25,male,City_12,Occupation_93,52276
3,1,2020-04-07,1957-06-25,male,City_69,Occupation_91,78894
24,1,2020-04-28,1957-06-25,male,City_76,Occupation_81,43946
4,1,2020-05-30,1957-06-25,male,City_13,Occupation_94,9667
17,1,2020-07-07,1957-06-25,male,City_24,Occupation_5,80319
25,1,2020-07-13,1957-06-25,male,City_3,Occupation_94,35527
15,1,2020-07-27,1957-06-25,male,City_57,Occupation_12,78158
7,1,2020-07-28,1957-06-25,male,City_57,Occupation_67,34145
2,1,2020-08-11,1957-06-25,male,City_37,Occupation_18,99066
10,1,2020-09-06,1957-06-25,male,City_91,Occupation_9,25045


In [7]:
# Save synthetic data
df_records.to_csv("data/records.csv", index=False)