# Generating fake specimen data

In this notebook, we will generate fake specimen data using the [faker](https://faker.readthedocs.io/en/master/) library. 


In [None]:
!{sys.executable} -m pip install -qq pandas faker


In [None]:
import random
import numpy as np
import pandas as pd
import ipywidgets as widgets


In [None]:
import faker

# Shared Faker generator; later cells call it to produce the identifier strings
# (`ean8`) and the collection timestamps (`date_time_between_dates`).
# No seed is set in this cell.
fake = faker.Faker()


In [None]:
# Specimen types offered by the sample-type selector below.
SAMPLETYPES = [
    "Blood",
    "Urine",
    "Serum",
]


In [None]:
# Slider for the number of specimen rows to generate: 100 to 10,000 in steps of 100.
# Arguments that merely restate the IntSlider defaults (enabled, horizontal,
# integer readout) are left out.
DATASET_SIZE = widgets.IntSlider(
    value=1000,
    min=100,
    max=10000,
    step=100,
    description="Dataset size:",
    continuous_update=False,  # only update once the handle is released
)
DATASET_SIZE


IntSlider(value=1000, continuous_update=False, description='Dataset size:', max=10000, min=100, step=100)

In [None]:
# Multi-select widget for the sample types to include; every type starts selected
sample_type = widgets.SelectMultiple(
    description="Sample Type:",
    options=SAMPLETYPES,
    value=SAMPLETYPES,
)
display(sample_type)


SelectMultiple(description='Sample Type:', index=(0, 1, 2), options=('Blood', 'Urine', 'Serum'), value=('Blood…

In [None]:
# Range slider for the lowest and highest "amount left" values (in uL) allowed in the
# generated data. Despite its name, this widget holds both the minimum and the maximum.
# Arguments that merely restate the IntRangeSlider defaults are left out.
amount_left_min = widgets.IntRangeSlider(
    value=[0, 500],
    min=0,
    max=500,
    step=50,
    description="Amount Left (uL):",
    continuous_update=False,  # only update once the handle is released
)
display(amount_left_min)


IntRangeSlider(value=(0, 500), continuous_update=False, description='Amount Left (uL):', max=500, step=50)

In [None]:
# Candidate "amount left" values: every slider step between the selected bounds, inclusive.
# The stride is read from the slider itself so this list cannot drift from the widget's
# step setting.
AMOUNTLEFT_LIST = list(
    range(
        amount_left_min.value[0],
        amount_left_min.value[1] + 1,  # +1 so the selected upper bound is included
        amount_left_min.step,
    )
)
AMOUNTLEFT_LIST


[0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500]

In [None]:
# Earliest collection date allowed in the generated data (initially 2020-01-01)
min_collect_date = widgets.DatePicker(
    value=pd.to_datetime("2020-01-01"),
    description="Minimum Collection Date",
)
display(min_collect_date)


DatePicker(value=Timestamp('2020-01-01 00:00:00'), description='Minimum Collection Date', step=1)

In [None]:
# Latest collection date allowed in the generated data (initially the time this cell runs)
max_collect_date = widgets.DatePicker(
    value=pd.to_datetime("today"),
    description="Maximum Collection Date",
)
display(max_collect_date)


DatePicker(value=Timestamp('2025-02-03 13:01:26.938152'), description='Maximum Collection Date', step=1)

In [None]:
%%time
specimens = pd.DataFrame(
    {
        # Lab and Subject/Patient IDs will be random 8-digit strings
        "LABID": [fake.ean8() for i in range(DATASET_SIZE.value)],
        "SUBJECTID": [fake.ean8() for i in range(DATASET_SIZE.value)],
        "CONTAINERID": [fake.ean8() for i in range(DATASET_SIZE.value)],
        # Accession numbers will be random 8-digit strings starting with "A"
        "ACCESSION": ["A" + fake.ean8() for i in range(DATASET_SIZE.value)],
        # Sample type will be one of the three options in the list above
        "SAMPLETYPE": [
            random.choice(sample_type.value) for i in range(DATASET_SIZE.value)
        ],
        # Amount left will be a random number between 0 and 500
        "AMOUNTLEFT": [
            random.choice(AMOUNTLEFT_LIST) for i in range(DATASET_SIZE.value)
        ],
        "AMOUNT_UNITS": "µL",
        # Date collected will be a random date in the last ~25 years
        "DATE_COLLECTED": [
            fake.date_time_between_dates(min_collect_date.value, max_collect_date.value)
            for i in range(DATASET_SIZE.value)
        ],
    }
)


73.5 ms ± 883 μs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [None]:
# Range slider for the shortest and longest delay, in hours, between a specimen being
# collected and being received. Arguments that merely restate the IntRangeSlider
# defaults are left out.
delay = widgets.IntRangeSlider(
    value=[1, 168],
    min=0,
    max=256,
    step=1,
    description="Delay in Receipt (hours):",
    continuous_update=False,  # only update once the handle is released
)
display(delay)


IntRangeSlider(value=(1, 168), continuous_update=False, description='Delay in Receipt (hours):', max=256)

In [None]:
# Add a received date that trails the collection date by a random delay taken from the
# "Delay in Receipt" slider (bounds in hours, both inclusive; drawn in whole minutes).
MIN_DELAY_HOURS, MAX_DELAY_HOURS = delay.value

# np.random.randint excludes its upper bound, so add 1: this makes the maximum delay
# reachable and keeps the call valid when minimum and maximum are set to the same value.
# The size comes from the frame itself so this cell still works if the dataset-size
# slider was moved after `specimens` was built.
receive_delay_minutes = np.random.randint(
    MIN_DELAY_HOURS * 60, MAX_DELAY_HOURS * 60 + 1, size=len(specimens)
)
specimens["DATE_RECEIVED"] = specimens["DATE_COLLECTED"] + pd.to_timedelta(
    receive_delay_minutes, unit="m"
)


In [None]:
# Show the generated frame, including the DATE_RECEIVED column added above.
specimens


Unnamed: 0,LABID,SUBJECTID,CONTAINERID,ACCESSION,SAMPLETYPE,AMOUNTLEFT,AMOUNT_UNITS,DATE_COLLECTED,DATE_RECEIVED
0,02551463,66344605,21859571,A44994013,Blood,300,µL,2022-10-28 20:58:06,2022-11-02 04:53:06
1,05890934,76705410,75701796,A37764395,Urine,450,µL,2022-09-17 22:13:08,2022-09-24 21:14:08
2,02845258,99080747,11340683,A29878307,Serum,450,µL,2023-02-27 22:47:58,2023-03-01 07:22:58
3,29771752,96622506,69978616,A96679586,Blood,100,µL,2023-06-03 22:59:32,2023-06-07 16:07:32
4,53032140,90718762,12976140,A05790722,Blood,350,µL,2020-01-31 20:32:10,2020-02-07 16:52:10
...,...,...,...,...,...,...,...,...,...
995,76966958,14292774,46543851,A44838805,Urine,400,µL,2021-07-17 04:18:45,2021-07-21 14:21:45
996,08276025,13882723,92084803,A13070663,Serum,300,µL,2020-08-01 10:17:40,2020-08-07 09:59:40
997,29777549,26931142,29983803,A41022917,Blood,150,µL,2020-12-14 17:02:00,2020-12-18 08:57:00
998,09112209,94079142,56561593,A11544456,Serum,100,µL,2022-01-20 04:54:35,2022-01-24 10:50:35
