In [1]:
import pandas as pd
import random
from faker import Faker

# CONSTANTS SECTION
# Gender codes -- M: Male, F: Female, N: Non Binary.
# SEARCHING_GENDER additionally allows B: Bisexual and A: All.
GENDER_MALE, GENDER_FEMALE, GENDER_NON_BINARY = "M", "F", "N"
SEARCHING_GENDER = ["M", "F", "B", "N", "A"]
MIN_AGE, MAX_AGE = 18, 50  # both bounds are inclusive
INTEREST_MIN_SIZE, INTEREST_MAX_SIZE = 3, 6  # both bounds are inclusive
DS_SIZE = 500_000
SEED = 42  # fixed seed so that re-running the cell regenerates the same dataset

# Seed both random sources used below: the stdlib RNG and Faker's shared RNG.
random.seed(SEED)
Faker.seed(SEED)
faker = Faker()

# load datasets
# taken from https://simplemaps.com/data/us-cities
cities_df = pd.read_csv("./us_cities.csv")
interests = pd.read_csv("./interests.csv")['interest'].tolist()

# Extract the (lat, lng) pairs once. Picking from a plain list avoids two
# pandas scalar lookups per generated row and does not depend on the frame
# having a RangeIndex.
city_coords = list(zip(cities_df['lat'], cities_df['lng']))

# Faker name generators keyed by the gender code stored in the dataset.
name_generators = {
    GENDER_MALE: (faker.first_name_male, faker.last_name_male),
    GENDER_FEMALE: (faker.first_name_female, faker.last_name_female),
    GENDER_NON_BINARY: (faker.first_name_nonbinary, faker.last_name_nonbinary),
}
genders = list(name_generators)

ds_list = []

for _ in range(DS_SIZE):
    # Each of the three genders is drawn with equal probability.
    gender = random.choice(genders)
    make_first_name, make_last_name = name_generators[gender]

    # randint includes the upper bound, so MAX_AGE and INTEREST_MAX_SIZE are
    # reachable (randrange silently excluded them).
    row_interests_size = random.randint(INTEREST_MIN_SIZE, INTEREST_MAX_SIZE)
    latitude, longitude = random.choice(city_coords)

    ds_list.append({
        "name": make_first_name(),
        "surname": make_last_name(),
        "gender": gender,
        "searching_gender": random.choice(SEARCHING_GENDER),
        "age": random.randint(MIN_AGE, MAX_AGE),
        # Distinct interests, stored as a single ", "-separated string.
        "interests": ', '.join(random.sample(interests, row_interests_size)),
        "latitude": latitude,
        "longitude": longitude,
    })

random.shuffle(ds_list)
df = pd.DataFrame(ds_list)

# Sanity check: print the percentage share of each category.
print(f"Gender distribution: \n{df['gender'].value_counts(normalize=True) * 100}")
print(f"Searching Gender distribution: \n{df['searching_gender'].value_counts(normalize=True) * 100}")

df.to_csv('dataset.csv', index=False, header=True)

Gender distribution: 
N    33.4180
M    33.3676
F    33.2144
Name: gender, dtype: float64
Searching Gender distribution: 
F    20.0932
M    20.0758
A    19.9492
N    19.9436
B    19.9382
Name: searching_gender, dtype: float64


In [5]:
import re  # needed for re.split, used to tokenize the interests string
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer  # bag-of-words vectorizer

# Each profile's interests are stored as one ", "-separated string, so the
# tokenizer splits on that exact separator (multi-word interests such as
# "Air hockey" stay intact). lowercase=False keeps the original capitalisation.
# token_pattern is unused when a custom tokenizer is supplied; passing None
# explicitly avoids scikit-learn's "token_pattern will not be used" warning.
vect = CountVectorizer(
    tokenizer=lambda text: re.split(", ", text),
    token_pattern=None,
    lowercase=False,
)
# Sparse count matrix: one row per entry of df['interests'], one column per
# distinct interest found.
label = vect.fit_transform(df['interests'])
vect.get_feature_names_out()




array(['Acting', 'Advertising', 'Air hockey', 'American football', 'Art',
       'Art history', 'Astronomy', 'Astrophysics', 'Badminton', 'Baking',
       'Ballet', 'Ballroom dancing', 'Bar tending', 'Baseball',
       'Basketball', 'Biking', 'Billiards', 'Blacksmithing',
       'Board games', 'Boating', 'Bouldering', 'Bowling', 'Boxing',
       'Calligraphy', 'Card games', 'Carpentry', 'Ceramics',
       'Choreography', 'Computer programming', 'Cooking', 'Cybersecurity',
       'Cycling', 'DJing', 'Dancing', 'Darts', 'Data analysis', 'Diving',
       'Drawing', 'Equestrian', 'Escape rooms', 'Ethical hacking',
       'Fashion design', 'Field hockey', 'Filmmaking', 'Filmography',
       'Financial analysis', 'Fishing', 'Fitness', 'Floristry',
       'Food criticism', 'Food styling', 'Football', 'Gaming',
       'Gardening', 'Geocaching', 'Glassblowing', 'Golf', 'Guitar',
       'Gymnastics', 'Hang gliding', 'Hiking', 'Horseback riding',
       'Hunting', 'Ice skating', 'Improv', 'Intere

In [6]:
# Turn the sparse interest counts into a dense frame with one column per
# interest, append those columns to the profiles, and drop the original
# comma-separated "interests" column they replace.
df_interests = pd.DataFrame(
    label.toarray(),
    columns=vect.get_feature_names_out(),
)
df = pd.concat([df, df_interests], axis=1).drop(columns="interests")
df

Unnamed: 0,name,surname,gender,searching_gender,age,latitude,longitude,Acting,Advertising,Air hockey,...,Watercolor,Web design,Weightlifting,Windsurfing,Wine tasting,Woodworking,Wrestling,Writing,Yachting,Yoga
0,Suzanne,Fox,N,N,32,36.5367,-95.9264,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,Matthew,Coleman,M,B,45,39.2920,-75.0097,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,Elizabeth,Jones,N,A,39,41.9170,-104.2955,0,0,1,...,0,0,0,0,0,0,0,0,0,0
3,Jenna,Perez,N,B,35,44.1810,-94.0391,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,Michael,Smith,M,N,23,32.8560,-116.9040,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
499995,Laura,Smith,F,B,46,43.9987,-95.8575,0,0,0,...,0,0,0,0,0,0,0,0,0,0
499996,Alicia,Gray,F,N,19,40.7974,-76.4284,0,0,0,...,0,0,0,0,0,0,0,0,0,0
499997,Martin,Anderson,M,M,31,38.5116,-76.6797,0,0,0,...,0,0,0,0,0,0,0,0,0,0
499998,Lindsay,Hunt,F,N,21,42.0208,-95.9662,0,0,0,...,0,0,1,0,0,0,0,0,0,0
