In [1]:
import numpy as np
import pandas as pd
# NOTE(review): `lux` is not referenced by name in the cells shown here;
# presumably it is imported for its effect on DataFrame display. Confirm
# before removing.
import lux
from faker import Faker
# Module-level Faker instance for the 'fr-CA' locale. The address and NAS
# generator functions in this notebook create their own local Faker instances
# rather than using this one.
fake = Faker(['fr-CA'])
# Bare expression: it is not the last line of the cell, so its value is not
# displayed.
fake.locales
# Provider class whose name-list attributes are looked up with getattr() in
# random_names.
from faker.providers.person.fr_CA import Provider

In [2]:
def random_names(name_type, size):
    """
    Draw `size` person names at random and return them as an ndarray.

    name_type: a string naming the attribute of the fr_CA ``Provider`` class
        that holds the pool of names, e.g. "first_names" or "last_names".
    size: number of names to draw (passed straight to ``np.random.choice``,
        which samples with replacement by default, so names may repeat).
    """
    pool = getattr(Provider, name_type)
    sampled = np.random.choice(pool, size=size)
    return sampled

In [3]:
def random_genders(size, p=None):
    """
    Generate a `size`-length ndarray of gender codes.

    size: output size/shape, passed straight to ``np.random.choice``.
    p: optional sequence of four probabilities for the codes
        ("M", "F", "O", ""), in that order. When omitted (None) or empty,
        the defaults (0.49, 0.49, 0.01, 0.01) are used. Any array-like is
        accepted, including NumPy arrays.
    """
    # Test for "not provided" explicitly instead of relying on truthiness:
    # `not p` raises ValueError when p is a multi-element NumPy array.
    if p is None or np.size(p) == 0:
        # default probabilities
        p = (0.49, 0.49, 0.01, 0.01)
    gender = ("M", "F", "O", "")
    return np.random.choice(gender, size=size, p=p)

In [4]:
def random_dates(start, end, size):
    """
    Return `size` random calendar dates drawn from the range [start, end).

    `start` and `end` are pandas Timestamps. Dates are drawn at whole-day
    resolution; the upper bound is exclusive because ``np.random.randint``
    excludes its `high` argument.
    Adapted from: https://stackoverflow.com/a/50668285
    """
    # Timestamp.value is expressed in nanoseconds since the Unix epoch, so
    # floor-dividing by the number of nanoseconds in a day yields a day count.
    nanos_per_day = 24 * 60 * 60 * 10**9
    first_day, last_day = (ts.value // nanos_per_day for ts in (start, end))
    day_numbers = np.random.randint(first_day, last_day, size)
    return pd.to_datetime(day_numbers, unit="D")

In [5]:
def random_address(size):
    """
    Generate a `size`-length ndarray of fake Quebec postal addresses.

    Each entry is built from a Faker 'en_CA' address: the text before the
    first ", " is kept, its line break is turned into ", ", and
    ", QC <postal code>" is appended, where the postal code comes from
    ``postalcode_in_province("QC")``.

    Every generated address is returned exactly once, in generation order.
    The values are already random, so they are not resampled: drawing from
    the list with replacement would drop some addresses and repeat others.
    """
    fake_en = Faker('en_CA')
    addresses = []
    for _ in range(size):
        # Keep only the part before the first ", " and flatten it to one line.
        street_and_city = fake_en.address().split(", ")[0].replace('\n', ', ')
        postal_code = fake_en.postalcode_in_province("QC")
        addresses.append(street_and_city + ", QC " + postal_code)

    return np.array(addresses)

In [6]:
def random_nas(size):
    """
    Generate a `size`-length ndarray of fake NAS values.

    Each value is produced by the Faker 'en_CA' ``ssn()`` provider.

    Every generated value is returned exactly once, in generation order.
    The values are already random, so they are not resampled: drawing from
    the list with replacement would repeat some identifiers and drop others.
    Faker itself may still produce the same value twice by chance.
    """
    fake_en = Faker('en_CA')
    return np.array([fake_en.ssn() for _ in range(size)])

In [7]:
# Number of records to write to the CSV. This example generates 1000, but
# much larger datasets can also be generated relatively quickly.
size = 1000

# Build every column up front and assemble the frame in a single call.
# Dict order fixes both the column order and the order of the random draws.
df = pd.DataFrame({
    'First': random_names('first_names', size),
    'Last': random_names('last_names', size),
    'Gender': random_genders(size),
    'Birthdate': random_dates(
        start=pd.to_datetime('1940-01-01'),
        end=pd.to_datetime('2008-01-01'),
        size=size,
    ),
    'Address': random_address(size),
    'NAS': random_nas(size),
})

# With the default arguments the row index is written as the first,
# unnamed CSV column.
df.to_csv('fake-file.csv')
df

Unnamed: 0,First,Last,Gender,Birthdate,Address,NAS
0,Joséphine,Tardif,F,1950-10-20,"535 Richardson Spring Apt. 961, Dakotaview, QC...",347 207 656
1,Olivier,Chartrand,F,2005-05-07,"57978 Perry Lake, Copelandtown, QC G3P 8H1",674 620 745
2,Jessica,Richer,F,2002-05-07,"23828 Wallace Canyon, Johnstonburgh, QC G2N 9H2",665 074 258
3,Nicole,Legault,F,1998-02-24,"091 Stephanie Motorway Suite 079, Johnton, QC ...",331 147 801
4,Virginie,Thibault,M,1996-07-10,"22483 Bailey Hollow Suite 832, Williamsmouth, ...",273 802 645
...,...,...,...,...,...,...
995,Denise,Blais,M,1969-11-13,"085 Brandi Grove, Lake Alexandrahaven, QC G8E 8E8",710 582 537
996,Timothée,Duval,F,1967-02-26,"439 Ali Fields, New Daniel, QC G7B3T9",173 823 048
997,Emmanuelle,Croteau,M,1972-04-25,"29612 James Flats Suite 393, Lake James, QC G5...",480 731 835
998,Frédéric,Moreau,F,1993-11-05,"856 Wilkerson Inlet Apt. 099, East Monica, QC ...",856 250 287
