# Generating fake specimen data

In this notebook, we will generate fake specimen data using the [faker](https://faker.readthedocs.io/en/master/) library. 


In [25]:
import sys
# Install the third-party packages this notebook needs. `sys.executable` is the
# path of the interpreter running this kernel, so pip is invoked through that
# interpreter; `-qq` keeps pip's output quiet.
!{sys.executable} -m pip install -qq pandas faker

In [26]:
import random
import numpy as np
import pandas as pd


In [27]:
import faker

# Seed every random source this notebook draws from, so that Restart & Run All
# reproduces the same dataset: faker (ids and dates), the stdlib `random`
# module (sample types and amounts) and NumPy's global generator (receive delays).
RANDOM_SEED = 42
faker.Faker.seed(RANDOM_SEED)
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Generator used for all faker-provided values below.
fake = faker.Faker()


In [29]:
# Specimen types that the SAMPLETYPE column is drawn from.
SAMPLETYPES = [
    "Blood",
    "Urine",
    "Serum",
]


In [30]:
# Number of fake specimen rows to generate.
DATASET_SIZE = 1_000


In [31]:
# Build the fake specimen table, one row per specimen.
# LABID and ACCESSION are drawn through `fake.unique`, so no value repeats
# within either column (plain `fake.ean8()` can return the same code twice,
# which would give two specimens the same identifier). SUBJECTID and
# CONTAINERID stay unconstrained -- assuming several specimens may share a
# subject or a container; TODO confirm.
specimens = pd.DataFrame(
    {
        "LABID": [fake.unique.ean8() for _ in range(DATASET_SIZE)],
        "SUBJECTID": [fake.ean8() for _ in range(DATASET_SIZE)],
        "SAMPLETYPE": [random.choice(SAMPLETYPES) for _ in range(DATASET_SIZE)],
        "CONTAINERID": [fake.ean8() for _ in range(DATASET_SIZE)],
        "ACCESSION": ["A" + fake.unique.ean8() for _ in range(DATASET_SIZE)],
        "AMOUNTLEFT": [
            random.choice([0, 150, 250, 350, 400, 500]) for _ in range(DATASET_SIZE)
        ],
        # A scalar value is broadcast to every row by the DataFrame constructor.
        "AMOUNT_UNITS": "µL",
        "DATE_COLLECTED": [fake.date_time_this_century() for _ in range(DATASET_SIZE)],
    }
)

# Guard the per-specimen identifiers against accidental duplicates.
assert specimens["LABID"].is_unique
assert specimens["ACCESSION"].is_unique


In [32]:
# Bounds, in hours, for the gap between a specimen's collection and its receipt.
MIN_DELAY_HOURS = 4
MAX_DELAY_HOURS = 7 * 24  # one week, i.e. 168 hours


In [33]:
# Add date received that is between 4 and 168 hours (one week) after date collected.
# `np.random.randint` excludes its upper bound, so 1 is added to make a delay of
# exactly MAX_DELAY_HOURS possible. The delays are kept in a local array rather
# than a scratch column, so nothing has to be dropped from `specimens` afterwards.
receive_delay_minutes = np.random.randint(
    MIN_DELAY_HOURS * 60, MAX_DELAY_HOURS * 60 + 1, size=len(specimens)
)
specimens["DATE_RECEIVED"] = specimens["DATE_COLLECTED"] + pd.to_timedelta(
    receive_delay_minutes, unit="m"
)


In [34]:
# Display the generated table; pandas elides the middle rows of a long frame.
specimens


Unnamed: 0,LABID,SUBJECTID,SAMPLETYPE,CONTAINERID,ACCESSION,AMOUNTLEFT,AMOUNT_UNITS,DATE_COLLECTED,DATE_RECEIVED
0,11302247,68958541,Blood,20429966,A60370686,0,µL,2020-08-28 08:13:08,2020-08-30 18:48:08
1,69456015,09854222,Blood,85723801,A23357792,0,µL,2020-04-07 11:48:41,2020-04-08 19:52:41
2,68783297,51833770,Serum,14788604,A74224142,250,µL,2001-05-06 14:48:36,2001-05-08 14:11:36
3,44441494,09859012,Urine,38260308,A94937381,150,µL,2012-08-21 22:47:17,2012-08-25 10:05:17
4,29573950,40351261,Serum,92076778,A84706843,350,µL,2018-06-06 19:28:48,2018-06-10 15:08:48
...,...,...,...,...,...,...,...,...,...
995,47369993,66861850,Blood,33085982,A14376078,350,µL,2010-12-02 23:31:10,2010-12-05 00:39:10
996,19635378,32805888,Blood,65098080,A82723095,350,µL,2018-09-28 20:29:28,2018-09-29 01:47:28
997,12218653,79915731,Urine,86639460,A05247196,0,µL,2021-03-10 10:14:56,2021-03-17 09:27:56
998,99295165,53573698,Blood,52362613,A72341193,150,µL,2023-01-10 12:10:30,2023-01-11 18:08:30
