In [4]:
import pandas as pd
import random
from faker import Faker

# --- Constants ---------------------------------------------------------
# Single-letter gender codes stored in the "gender" column.
GENDER_MALE = "M"
GENDER_FEMALE = "F"
GENDER_NON_BINARY = "N"

# Codes a profile may be searching for.
# M: Male, F: Female, B: Bisexual, N: Non Binary, A: All
SEARCHING_GENDER = ["M", "F", "B", "N", "A"]

# Bounds used when drawing a random age.
MIN_AGE = 18
MAX_AGE = 50

# Bounds used when drawing how many interests a profile gets.
INTEREST_MIN_SIZE = 3
INTEREST_MAX_SIZE = 6

# Number of rows to generate.
DS_SIZE = 100_000

# Accumulator: one dict per generated profile row.
ds_list = []

# Generator used for fake first and last names.
faker = Faker()

# Reference data read from CSV files next to this script.
# City list taken from https://simplemaps.com/data/us-cities
cities_df = pd.read_csv("./us_cities.csv")
# Only the "interest" column is needed, as a plain Python list.
interests = pd.read_csv("./interests.csv")["interest"].tolist()

for i in range(DS_SIZE):
    row = {}

    match random.randrange(1, 4):
        case 1:
            row["name"] = faker.first_name_male()
            row["surname"] = faker.last_name_male()
            row["gender"] = GENDER_MALE
        case 2:
            row["name"] = faker.first_name_female()
            row["surname"] = faker.last_name_female()
            row["gender"] = GENDER_FEMALE
        case _:
            row["name"] = faker.first_name_nonbinary()
            row["surname"] = faker.last_name_nonbinary()
            row["gender"] = GENDER_NON_BINARY

    row["searching_gender"] = random.choice(SEARCHING_GENDER)
    row["age"] = random.randrange(MIN_AGE, MAX_AGE)
    row_interests_size = random.randrange(INTEREST_MIN_SIZE, INTEREST_MAX_SIZE)

    row['interests'] = ', '.join(random.sample(interests, row_interests_size))
    cities_random_idx = random.randrange(cities_df.index.start, cities_df.index.stop)
    row['latitude'] = cities_df['lat'][cities_random_idx]
    row['longitude'] = cities_df['lng'][cities_random_idx]

    ds_list.append(row)

# Randomise the row order in place, then turn the dicts into a table.
random.shuffle(ds_list)
df = pd.DataFrame(ds_list)

# Share of rows per category, expressed as a percentage.
gender_pct = df["gender"].value_counts(normalize=True) * 100
searching_gender_pct = df["searching_gender"].value_counts(normalize=True) * 100

print(f"Gender distribution: \n{gender_pct}")
print(f"Searching Gender distribution: \n{searching_gender_pct}")

# Write the dataset with a header row and without the index column.
df.to_csv("dataset.csv", header=True, index=False)

Gender distribution: 
F    33.556
M    33.234
N    33.210
Name: gender, dtype: float64
Searching Gender distribution: 
F    20.155
A    20.039
B    20.037
N    19.967
M    19.802
Name: searching_gender, dtype: float64
