In [1]:
import pandas as pd
import random
from faker import Faker

# CONSTANTS SECTION
# Sex codes -- M: Male, F: Female, B: Bisexual
SEX_MALE = "M"
SEX_FEMALE = "F"
SEX_ORIENTATION = [SEX_MALE, SEX_FEMALE, "B"]

# Bounds handed to the random age draw
MIN_AGE = 18
MAX_AGE = 40

# Bounds handed to the random draw of how many interests each row gets
INTEREST_MIN_SIZE = 1
INTEREST_MAX_SIZE = 5

# Number of rows to generate
DS_SIZE = 10 ** 5

# Seed both random sources so that Restart & Run All regenerates the same
# dataset; change SEED to obtain a different one.
SEED = 42
random.seed(SEED)

faker = Faker()
Faker.seed(SEED)

# load datasets
# taken from https://simplemaps.com/data/us-cities
cities_df = pd.read_csv("./us_cities.csv")
# Materialise the coordinates once as (lat, lng) pairs: picking a pair by
# position works for any DataFrame index and avoids two label lookups per row.
city_coords = list(zip(cities_df['lat'], cities_df['lng']))

# dict.fromkeys drops duplicate interests while keeping the file order, so a
# single person can never be assigned the same interest twice.
interests = list(dict.fromkeys(pd.read_csv("./interests.csv")['interest'].tolist()))

# random.sample cannot draw more items than the population holds.
max_interests = min(INTEREST_MAX_SIZE, len(interests))
if max_interests < INTEREST_MIN_SIZE:
    raise ValueError(
        f"interests.csv holds {len(interests)} unique interests, "
        f"fewer than INTEREST_MIN_SIZE={INTEREST_MIN_SIZE}"
    )

ds_list = []

for i in range(DS_SIZE):
    row = {}

    # Alternate by parity so the sexes are split exactly 50/50.
    if i % 2 == 0:
        row["name"] = faker.first_name_male()
        row["surname"] = faker.last_name_male()
        row["sex"] = SEX_MALE
    else:
        row["name"] = faker.first_name_female()
        row["surname"] = faker.last_name_female()
        row["sex"] = SEX_FEMALE

    row["searching_sex"] = random.choice(SEX_ORIENTATION)
    # randint includes both endpoints, so MAX_AGE and max_interests can
    # actually be drawn (randrange would silently exclude them).
    row["age"] = random.randint(MIN_AGE, MAX_AGE)
    row_interests_size = random.randint(INTEREST_MIN_SIZE, max_interests)

    row['interests'] = ', '.join(random.sample(interests, row_interests_size))
    row['latitude'], row['longitude'] = random.choice(city_coords)

    ds_list.append(row)

# Shuffle so the male/female alternation is not visible in the row order.
random.shuffle(ds_list)
df = pd.DataFrame(ds_list)

print(f"Sex distribution: \n{df['sex'].value_counts(normalize=True) * 100}")
print(f"Searching sex distribution: \n{df['searching_sex'].value_counts(normalize=True) * 100}")

df.to_csv('dataset.csv', index=False, header=True)

Sex distribution: 
M    50.0
F    50.0
Name: sex, dtype: float64
Searching sex distribution: 
M    33.53
F    33.44
B    33.03
Name: searching_sex, dtype: float64
