In [61]:
import dask.dataframe as dd
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# SEEDS
random.seed(0)
np.random.seed(0)

# Generate Synthetic User Data

In [64]:
# Params
NUM_USERS = 100
NUM_RECORDS = 1000  # total records, not per user
NUMBER_OF_OCCUPATIONS = 100 # total number of unique occupation categories
NUMBER_OF_CITIES = 50 # total number of unique cities
OCCUPATIONS = [f"Occupation_{i+1}" for i in range(100)]
CITIES = [f"City_{i+1}" for i in range(100)]
INCOME_QUANTILES = [f"Income_{i+1}" for i in range(100)]
START_DATE = datetime(2020, 1, 1)

# Helper function to generate random dates
def random_dates(start, num_days, num_dates):
    return [start + timedelta(days=random.randint(0, num_days)) for _ in range(num_dates)]

def random_birthdates(start, end, n, seed=0, replace=True):
    dates = pd.date_range(start, end).to_series()
    return dates.sample(n, replace=replace, random_state=seed)

In [65]:
# SEEDS
random.seed(0)
np.random.seed(0)
# Generate random user base
users = pd.DataFrame({
    "user_id": range(1, NUM_USERS + 1),
    "birthday": random_birthdates(datetime(1950, 1,1), datetime(1994, 1,1), NUM_USERS),
    "sex": np.random.choice(["Male", "Female"], NUM_USERS),
})

users["birth_month"] = users.apply(lambda x: "Month_" + str(x["birthday"].month), axis=1) 
users["birth_year"] =  users.apply(lambda x: "Year_" + str(x["birthday"].year), axis=1) 
users.set_index("user_id").to_csv("data/users.csv")
# Visualise the records from the user base
users.sample(10)

Unnamed: 0,user_id,birthday,sex,birth_month,birth_year
1988-11-25,95,1988-11-25,Female,Month_11,Year_1988
1983-03-23,89,1983-03-23,Female,Month_3,Year_1983
1984-05-23,75,1984-05-23,Male,Month_5,Year_1984
1993-01-14,8,1993-01-14,Female,Month_1,Year_1993
1962-12-19,54,1962-12-19,Female,Month_12,Year_1962
1973-12-02,55,1973-12-02,Female,Month_12,Year_1973
1963-04-22,6,1963-04-22,Female,Month_4,Year_1963
1952-01-26,45,1952-01-26,Female,Month_1,Year_1952
1977-02-01,27,1977-02-01,Female,Month_2,Year_1977
1974-08-17,66,1974-08-17,Male,Month_8,Year_1974


# Generate Synthetic Labor Data

In [72]:
# SEEDS
random.seed(0)
np.random.seed(0)
# Expand users into records
records = []
for _, user in users.iterrows():
    num_records = random.randint(2, 100)  # Each user has between 12 to 100 records
    for _ in range(num_records):
        record = {
            "user_id": user["user_id"],
            "record_date": random.choice(random_dates(START_DATE, 365 * 10, 1)),
            "city": random.choice(CITIES),
            "occupation": random.choice(OCCUPATIONS),
            "income": random.choice(INCOME_QUANTILES)  # Arbitrary salary range for example
        }
        records.append(record)

# Create DataFrame
df_records = pd.DataFrame(records)
# Sort by UserID and the record date
df_records.sort_values(by=["user_id", "record_date"], inplace=True)
df_records.head(20) # Display the first 20 rows to check

Unnamed: 0,user_id,record_date,city,occupation,income
36,1,2020-09-07,City_6,Occupation_77,Income_13
8,1,2020-09-12,City_12,Occupation_93,Income_52
35,1,2020-10-21,City_45,Occupation_56,Income_24
5,1,2020-10-29,City_61,Occupation_72,Income_13
28,1,2021-02-16,City_90,Occupation_29,Income_6
24,1,2021-04-17,City_76,Occupation_81,Income_43
41,1,2021-04-25,City_86,Occupation_23,Income_2
48,1,2021-06-26,City_50,Occupation_96,Income_54
3,1,2021-07-26,City_80,Occupation_33,Income_69
44,1,2021-09-20,City_59,Occupation_95,Income_11


In [73]:
# Save synthetic data
df_records.to_csv("data/records.csv", index=False)

# Create Vocab

In [74]:
special_tokens = ["[PAD]",
            "[CLS]",
            "[SEP]",
            "[MASK]",
            "[PLCH0]",  # placeholder tokens, that one can use for the finetuning tasks
            "[PLCH1]",
            "[PLCH2]",
            "[PLCH3]",
            "[PLCH4]",
            "[UNK]"]
data_tokens = users["sex"].unique().tolist() + \
users["birth_month"].unique().tolist() + \
users["birth_year"].unique().tolist()  + \
df_records["city"].unique().tolist()   + \
df_records["occupation"].unique().tolist() + \
df_records["income"].unique().tolist()

vocab = pd.DataFrame({"token": special_tokens + data_tokens})
vocab.to_csv("data/vocab.csv", index=True)

In [75]:
vocab.head(15) # Display the first 15 rows to check

Unnamed: 0,token
0,[PAD]
1,[CLS]
2,[SEP]
3,[MASK]
4,[PLCH0]
5,[PLCH1]
6,[PLCH2]
7,[PLCH3]
8,[PLCH4]
9,[UNK]


In [76]:
## test
# Load the data

In [95]:
sentences = df_records.groupby("user_id").get_group(1)[["city", "occupation", "income"]].values.tolist()

In [96]:
sentences

[['City_6', 'Occupation_77', 'Income_13'],
 ['City_12', 'Occupation_93', 'Income_52'],
 ['City_45', 'Occupation_56', 'Income_24'],
 ['City_61', 'Occupation_72', 'Income_13'],
 ['City_90', 'Occupation_29', 'Income_6'],
 ['City_76', 'Occupation_81', 'Income_43'],
 ['City_86', 'Occupation_23', 'Income_2'],
 ['City_50', 'Occupation_96', 'Income_54'],
 ['City_80', 'Occupation_33', 'Income_69'],
 ['City_59', 'Occupation_95', 'Income_11'],
 ['City_44', 'Occupation_68', 'Income_33'],
 ['City_51', 'Occupation_12', 'Income_48'],
 ['City_92', 'Occupation_16', 'Income_62'],
 ['City_57', 'Occupation_12', 'Income_77'],
 ['City_57', 'Occupation_67', 'Income_34'],
 ['City_87', 'Occupation_3', 'Income_70'],
 ['City_46', 'Occupation_79', 'Income_37'],
 ['City_91', 'Occupation_9', 'Income_25'],
 ['City_91', 'Occupation_29', 'Income_48'],
 ['City_25', 'Occupation_24', 'Income_5'],
 ['City_70', 'Occupation_36', 'Income_18'],
 ['City_41', 'Occupation_79', 'Income_82'],
 ['City_34', 'Occupation_9', 'Income_2

In [100]:
sequence = [s + ["[SEP]"] for s in sentences]

In [101]:
sequence

[['City_6', 'Occupation_77', 'Income_13', '[SEP]'],
 ['City_12', 'Occupation_93', 'Income_52', '[SEP]'],
 ['City_45', 'Occupation_56', 'Income_24', '[SEP]'],
 ['City_61', 'Occupation_72', 'Income_13', '[SEP]'],
 ['City_90', 'Occupation_29', 'Income_6', '[SEP]'],
 ['City_76', 'Occupation_81', 'Income_43', '[SEP]'],
 ['City_86', 'Occupation_23', 'Income_2', '[SEP]'],
 ['City_50', 'Occupation_96', 'Income_54', '[SEP]'],
 ['City_80', 'Occupation_33', 'Income_69', '[SEP]'],
 ['City_59', 'Occupation_95', 'Income_11', '[SEP]'],
 ['City_44', 'Occupation_68', 'Income_33', '[SEP]'],
 ['City_51', 'Occupation_12', 'Income_48', '[SEP]'],
 ['City_92', 'Occupation_16', 'Income_62', '[SEP]'],
 ['City_57', 'Occupation_12', 'Income_77', '[SEP]'],
 ['City_57', 'Occupation_67', 'Income_34', '[SEP]'],
 ['City_87', 'Occupation_3', 'Income_70', '[SEP]'],
 ['City_46', 'Occupation_79', 'Income_37', '[SEP]'],
 ['City_91', 'Occupation_9', 'Income_25', '[SEP]'],
 ['City_91', 'Occupation_29', 'Income_48', '[SEP]']