# Generate BCPP requisition and aliquot datasets

This notebook creates two files:
1. `lab_requisition_{timestamp}.csv`
2. `lab_aliquot_{timestamp}.csv`

## Usage:

* Run the notebook server in a valid BCPP VENV
* Start the `jupyter notebook` server in the root of the main project (e.g. bcpp/) or, if the main project is part of the VENV, from any folder with this notebook file.
* the database connection is that configured in `bcpp.settings.py`. To change this, create your own `settings.py` and specify with `manage.py`'s `--settings` option.
* start the server using:

        python manage.py shell_plus --notebook
        
## Tunnel to DB

Open a tunnel to the live DB:

    ssh -f <user>@<server> -L5002:localhost:3306 -N

Change the mysql settings (e.g. in your `/etc/bcpp/mysql.conf`):

    port: 5002
    name: <database name>
    user: <readonly user>
    password: <password>
    


In [1]:
import numpy as np
import pandas as pd
import sys

from arrow import Arrow
from bcpp_community import communities
from bcpp_status.models import StatusHistory
from bcpp_subject.models import SubjectVisit, SubjectConsent
from copy import copy
from datetime import datetime
from django.db import connection
from edc_constants.constants import YES, NO, NEG, UNK
from edc_pdutils.model_to_dataframe import ModelToDataframe, SubjectModelToDataframe, Helper
from pprint import pprint
from edc_base.model_mixins.constants import DEFAULT_BASE_FIELDS
from edc_lab.model_mixins.requisition import RequisitionStatusMixin

date_format = '%Y-%m-%d %H:%M:%S.%f'
export_date_format = '%Y-%m-%d'
local_tz = 'Africa/Gaborone'
yes_no = {True: YES, False: NO}
timestamp = datetime.today().strftime('%Y%m%d%H%M%S')


delimiter = ',' # '|'
start_date = datetime(2013, 10, 1)

In [2]:
class StatusHistoryModelToDataframe(ModelToDataframe):
    columns = {fld.name: fld.name for fld in StatusHistory._meta.get_fields()}    

In [3]:
helper = Helper()

In [4]:
df_subjects = SubjectModelToDataframe(model='bcpp_subject.subjectconsent').dataframe
df_subjects = df_subjects.drop_duplicates()
# df_subjects.info()

In [5]:
df_subjects.head()

Unnamed: 0,subject_identifier,gender,dob
0,066-26540006-2,F,1993-07-26
1,066-26150345-3,F,1991-01-22
2,066-26310014-2,F,1961-11-10
3,066-25110020-4,F,1974-11-24
4,066-25870017-4,F,1961-07-01


In [6]:
# start with subject visit model
cols = ['subject_identifier', 'report_date', 'visit_code', 'consent_version', 'survey', 'household_member_id', ]
df = ModelToDataframe(model='bcpp_subject.subjectvisit', drop_sys_columns=True).dataframe
df = df.rename(columns={'id': 'subject_visit_id'})
df = df.set_index('subject_visit_id')
df['report_date'] = helper.to_local_datetime(df['report_datetime'])
df['report_date'] = df['report_date'].dt.normalize()
df = df[cols]
df_original = df.copy()

# only keep records after start_date
# df = df[df['report_date'] >= start_date]

In [7]:
# subjectconsent
model = 'bcpp_subject.subjectconsent'
cols = ['subject_identifier', 'gender', 'dob', 'consent_datetime', 'version']
df_consent = SubjectModelToDataframe(model=model, columns=cols).dataframe
df_consent['consent_datetime'] = helper.to_local_datetime(df_consent['consent_datetime'])
df_consent['dob'] = helper.date_to_local_datetime(df_consent['dob'])

# remove subject identifier as UUID
df_consent = df_consent[df_consent['subject_identifier'].str.len() != 32]

# drop duplicates (because of versions)
df_consent = df_consent.sort_values(['subject_identifier'])
df_consent = df_consent.drop_duplicates(['subject_identifier'], keep='first')
df_consent = df_consent[['subject_identifier', 'gender', 'dob']]

In [8]:
# merge subjectconsent to main df
df = df.reset_index()
df = pd.merge(df, df_consent, on='subject_identifier', how='left')

In [9]:
# set index for joins with CRFs
df = df.set_index('subject_visit_id')

In [10]:
# subjectrequisition
status_cols = [f.name for f in RequisitionStatusMixin._meta.get_fields()
               if f not in ['processed_datetime', 'packed_datetime', 'shipped_datetime']]
cols = ['requisition_identifier', 'panel_name', 'requisition_datetime', 'is_drawn', 'reason_not_drawn',
        'drawn_datetime', 'specimen_type', 'study_site', 'study_site_name']
cols.extend([f.name for f in RequisitionStatusMixin._meta.get_fields()])
df_crf = helper.get_crf_dataframe(
    model='bcpp_subject.subjectrequisition', cols=cols)
df_crf['requisition_datetime'] = helper.to_local_datetime(df_crf['requisition_datetime'])
df_crf['drawn_datetime'] = helper.to_local_datetime(df_crf['drawn_datetime'])
df_crf['received_datetime'] = helper.to_local_datetime(df_crf['received_datetime'])
df_crf['received'] = df_crf['received'].map(yes_no, na_action='ignore')
df_crf['processed'] = df_crf['processed'].map(yes_no, na_action='ignore')
df_crf['packed'] = df_crf['packed'].map(yes_no, na_action='ignore')
df_crf['shipped'] = df_crf['shipped'].map(yes_no, na_action='ignore')

# join to main df
df = df.join(df_crf)

# show grouping
# df.groupby('circumcised').size()

In [11]:
df.head()

Unnamed: 0_level_0,subject_identifier,report_date,visit_code,consent_version,survey,household_member_id,gender,dob,requisition_identifier,panel_name,...,study_site,study_site_name,received,received_datetime,processed,processed_datetime,packed,packed_datetime,shipped,shipped_datetime
subject_visit_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0000ded4-3abd-4a7a-8a9e-4df4104e36d5,066-11860016-2,2013-11-16 00:00:00+02:00,T0,?,bcpp-survey.bcpp-year-1.bhs.ranaka,dbeeff70-e29b-4299-9c8f-abc9b66823f1,M,1994-05-08 00:00:00+02:00,86A92K7,Microtube,...,11,ranaka,Yes,2013-11-16 17:13:11+02:00,Yes,NaT,Yes,,No,
0003414b-a8b1-4727-ad3a-129df95ccb4d,066-16240013-5,2014-09-26 00:00:00+02:00,T0,?,bcpp-survey.bcpp-year-1.bhs.lentsweletau,82e8e37f-1eed-4369-a17c-249c4944469e,M,1955-07-17 00:00:00+02:00,24R3NZN,Microtube,...,16,lentsweletau,Yes,2014-09-26 12:15:34+02:00,No,NaT,No,,No,
00040a36-4c01-41dc-9b9d-b6057d0b06f9,066-30220019-3,2015-05-17 00:00:00+02:00,T0,?,bcpp-survey.bcpp-year-1.bhs.tati_siding,688273ac-f944-4fa4-8a5b-af18b9864214,F,1991-07-02 00:00:00+02:00,22VDZ9N,Microtube,...,30,tati_siding,Yes,2015-05-17 16:21:08+02:00,No,NaT,No,,No,
00043eb2-7261-45cd-bf39-36bd689c9319,066-26270001-1,2016-04-07 00:00:00+02:00,T1,4,bcpp-survey.bcpp-year-2.ahs.mmadinare,ba3da451-0e40-48eb-bd05-d84cb3f930ad,F,1979-01-01 00:00:00+02:00,312Z7YD,Viral Load,...,26,mmadinare,Yes,2016-04-07 13:11:11+02:00,No,NaT,No,,No,
0004bb3d-fb2e-43fb-a2b5-8f5de412f3ef,066-16400037-2,2017-04-01 00:00:00+02:00,T2,5,bcpp-survey.bcpp-year-3.ahs.lentsweletau,b6d84fe6-764e-4819-a185-37109c6075db,F,1965-10-14 00:00:00+02:00,38DBDE9,Microtube,...,16,lentsweletau,Yes,2017-04-01 17:54:01.246000+02:00,Yes,NaT,No,,No,


In [12]:
# 'bcpp_status.statushistory'
df_status = StatusHistoryModelToDataframe(model='bcpp_status.statushistory').dataframe
cols = ['subject_identifier', 'status_date', 'timepoint', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status', ]
df_status = df_status[cols]
df_status = df_status.rename(columns={'status_date': 'report_date', 'timepoint': 'visit_code'})
df_status['report_date'] = helper.date_to_local_datetime(df_status['report_date'])
df_status['final_hiv_status_date'] = helper.date_to_local_datetime(df_status['final_hiv_status_date'])
df_status = df_status.groupby(['subject_identifier', 'report_date']).last()
df_status = df_status.reset_index()
df_status = df_status[['subject_identifier',  'report_date', 'visit_code', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status']]
df = pd.merge(df, df_status, on=['subject_identifier', 'report_date', 'visit_code'], how='left')


In [13]:
# remove invalid sites
df = df[-df['study_site_name'].isin(['bhp', 'test_community'])]
df = df.drop(['specimen_type', 'household_member_id', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status'], axis=1)

In [14]:
# fix reason_not_drawn
is_blank = (df.reason_not_drawn.notnull()) & (df.is_drawn == YES)
df.loc[is_blank, 'reason_not_drawn'] = np.nan

In [15]:
df.head()

Unnamed: 0,subject_identifier,report_date,visit_code,consent_version,survey,gender,dob,requisition_identifier,panel_name,requisition_datetime,...,study_site,study_site_name,received,received_datetime,processed,processed_datetime,packed,packed_datetime,shipped,shipped_datetime
0,066-11860016-2,2013-11-16 00:00:00+02:00,T0,?,bcpp-survey.bcpp-year-1.bhs.ranaka,M,1994-05-08 00:00:00+02:00,86A92K7,Microtube,2013-11-16 15:18:51+02:00,...,11,ranaka,Yes,2013-11-16 17:13:11+02:00,Yes,NaT,Yes,,No,
1,066-16240013-5,2014-09-26 00:00:00+02:00,T0,?,bcpp-survey.bcpp-year-1.bhs.lentsweletau,M,1955-07-17 00:00:00+02:00,24R3NZN,Microtube,2014-09-26 13:15:32+02:00,...,16,lentsweletau,Yes,2014-09-26 12:15:34+02:00,No,NaT,No,,No,
2,066-30220019-3,2015-05-17 00:00:00+02:00,T0,?,bcpp-survey.bcpp-year-1.bhs.tati_siding,F,1991-07-02 00:00:00+02:00,22VDZ9N,Microtube,2015-05-17 17:07:15+02:00,...,30,tati_siding,Yes,2015-05-17 16:21:08+02:00,No,NaT,No,,No,
3,066-26270001-1,2016-04-07 00:00:00+02:00,T1,4,bcpp-survey.bcpp-year-2.ahs.mmadinare,F,1979-01-01 00:00:00+02:00,312Z7YD,Viral Load,2016-04-07 13:11:56+02:00,...,26,mmadinare,Yes,2016-04-07 13:11:11+02:00,No,NaT,No,,No,
4,066-16400037-2,2017-04-01 00:00:00+02:00,T2,5,bcpp-survey.bcpp-year-3.ahs.lentsweletau,F,1965-10-14 00:00:00+02:00,38DBDE9,Microtube,2017-04-01 16:50:58+02:00,...,16,lentsweletau,Yes,2017-04-01 17:54:01.246000+02:00,Yes,NaT,No,,No,


In [16]:
# export requisitions to CSV as a single file
path = f'~/lab_requisitions_{timestamp}.csv'
df.to_csv(path, index=True, date_format=export_date_format, sep=delimiter)
sys.stdout.write(f'* {path}\n')

* ~/lab_requisitions_20171004143918.csv


In [17]:
# df.groupby('study_site_name').size()
# df.groupby('is_drawn').size()
# df.groupby('panel_name').size()
# df.groupby('visit_code').size()
# df.groupby('survey').size()
# df.groupby('gender').size()
# print(df['report_date'].min())
# print(df['report_date'].max())
# print(df['drawn_datetime'].min())
# print(df['drawn_datetime'].max())
# print(df['requisition_datetime'].min())
# print(df['requisition_datetime'].max())
# print(df['dob'].min())
# print(df['dob'].max())


In [18]:
model = 'edc_lab.aliquot'
cols = None
df_aliquot = ModelToDataframe(model=model).dataframe
df_aliquot['aliquot_datetime'] = helper.to_local_datetime(df_aliquot['aliquot_datetime'])

In [19]:
# df_aliquot.groupby('hostname_created').size()

In [20]:
# drop sys and other unwanted columns
base_cols = copy(DEFAULT_BASE_FIELDS)
base_cols.pop(base_cols.index('hostname_created'))
base_cols
# columns = DEFAULT_BASE_FIELDS + ['slug', 'comment', 'shipped']
# df_aliquot = df_aliquot.drop(columns, axis=1)

['id',
 'created',
 'modified',
 'user_created',
 'user_modified',
 'hostname_modified',
 'revision',
 'device_created',
 'device_modified']

In [21]:
df_aliquot['is_primary'] = df_aliquot['is_primary'].map(yes_no, na_action='ignore')
df_aliquot['medium'] = df_aliquot['medium'].str.lower()

In [23]:
# add column to indicate missing requisitions
cols = ['requisition_identifier',
        'panel_name', 'requisition_datetime',
        'gender', 'dob', 'study_site', 'study_site_name']
df1 = pd.merge(df_aliquot, df[cols],
               on='requisition_identifier', how='left')
df_aliquot['missing_requisition'] = df1['panel_name'].isnull()
df_aliquot['is_primary'] = df_aliquot['is_primary'].map(yes_no, na_action='ignore')
df_aliquot.head()
df_aliquot.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63448 entries, 0 to 63447
Data columns (total 32 columns):
created                   63448 non-null datetime64[ns]
modified                  63448 non-null datetime64[ns]
user_created              63448 non-null object
user_modified             63448 non-null object
hostname_created          63448 non-null object
hostname_modified         63448 non-null object
revision                  63448 non-null object
device_created            63448 non-null object
device_modified           63448 non-null object
id                        63448 non-null object
slug                      63448 non-null object
aliquot_identifier        63448 non-null object
parent_identifier         63448 non-null object
identifier_prefix         63448 non-null object
subject_identifier        63448 non-null object
requisition_identifier    63448 non-null object
aliquot_datetime          63448 non-null datetime64[ns, Africa/Gaborone]
is_primary                0 non-nu

In [None]:
# export aliquots to CSV as a single file
path = f'~/lab_aliquots_{timestamp}.csv'
df_aliquot.to_csv(path, index=True, date_format=export_date_format, sep=delimiter)
sys.stdout.write(f'* {path}\n')

In [24]:
print(df_aliquot.groupby('medium').size())
print('---')

print(df_aliquot.groupby('aliquot_type').size())
print('---')

print(df_aliquot.groupby('alpha_code').size())
print('---')

print(df_aliquot.groupby('numeric_code').size())
print('---')

print(df_aliquot.groupby('condition').size())
print('---')

print(df_aliquot.groupby('missing_requisition').size())
print('---')

medium
other        1
tube     63447
dtype: int64
---
aliquot_type
Buffy Coat     20132
Plasma         29215
Whole Blood    14101
dtype: int64
---
alpha_code
BC    20132
PL    29215
WB    14101
dtype: int64
---
numeric_code
02    14101
12    20132
36    29215
dtype: int64
---
condition
10    63448
dtype: int64
---
missing_requisition
False    63318
True       130
dtype: int64
---


In [25]:
df1 = df_aliquot[df_aliquot['missing_requisition']]
df1 = df1.sort_values(['aliquot_datetime'])

In [27]:
df1.groupby(['hostname_created').size()

hostname_created
lerala                     3
maunatlala.bhp.org.bw    112
mmathethe.bhp.org.bw      15
dtype: int64

In [28]:
df1.groupby(['hostname_created', 'device_created']).size()

hostname_created       device_created
lerala                                     3
maunatlala.bhp.org.bw  98                112
mmathethe.bhp.org.bw                      15
dtype: int64

In [33]:
df2 = df1[df1['device_created'] == '']
df2[['subject_identifier', 'created', 'device_created', 'hostname_created']]

Unnamed: 0,subject_identifier,created,device_created,hostname_created
34417,066-205400081-2,2017-05-18 06:08:16.335,,mmathethe.bhp.org.bw
10674,066-205400081-2,2017-05-18 06:08:24.316,,mmathethe.bhp.org.bw
61760,066-205400081-2,2017-05-18 06:08:24.442,,mmathethe.bhp.org.bw
29533,066-205400081-2,2017-05-18 06:08:24.503,,mmathethe.bhp.org.bw
35738,066-205400081-2,2017-05-18 06:08:24.608,,mmathethe.bhp.org.bw
45149,066-205400081-2,2017-05-18 06:08:24.669,,mmathethe.bhp.org.bw
33980,066-205400081-2,2017-05-18 06:09:46.213,,mmathethe.bhp.org.bw
11642,066-205400081-2,2017-05-18 06:09:50.491,,mmathethe.bhp.org.bw
17317,066-205400081-2,2017-05-18 06:09:50.615,,mmathethe.bhp.org.bw
38738,066-205400081-2,2017-05-18 06:09:50.803,,mmathethe.bhp.org.bw


In [None]:
missing = [
    '06676M8ZHK02011602',
'06665MFYE402011602',
'06676EURU402011603',
'066714K7RW02011603',
'06665WC99A02011603',
'066656HCN402011602',
'06671A93F602011602',
'06676N923V02011603']

In [None]:
df1 = df_aliquot[df_aliquot['aliquot_identifier'].isin(missing)]

In [None]:
df1 = df[pd.notnull(df['requisition_identifier'])]
df1['hostname'] = df1['hostname']
df1.groupby(['hostname_created']).size()
#df1[df1['requisition_identifier'].str.startswith('06676')]


In [None]:
df1 = pd.read_csv('~/Documents/bcpp/moyo/aliquot.csv')

In [None]:
df1.head()

In [None]:
df1[df1['aliquot_identifier'].isin(missing)]