In [None]:
import numpy as np
import pandas as pd
import sys

from arrow import Arrow
from bcpp_community import communities
from bcpp_status.models import StatusHistory
from bcpp_subject.models import SubjectVisit, SubjectConsent
from datetime import datetime
from django.db import connection
from edc_constants.constants import YES, NO, NEG, UNK
from edc_pdutils.model_to_dataframe import ModelToDataframe
from pprint import pprint
from edc_base.model_mixins.constants import DEFAULT_BASE_FIELDS
from edc_lab.model_mixins.requisition import RequisitionStatusMixin

# pattern handed to pd.to_datetime when parsing date strings
date_format = '%Y-%m-%d %H:%M:%S.%f'
# date-only pattern handed to DataFrame.to_csv when exporting
export_date_format = '%Y-%m-%d'
local_tz = 'Africa/Gaborone'
# recode booleans to the EDC YES/NO constants
yes_no = {
    True: YES,
    False: NO,
}
# run timestamp, embedded in the names of the exported files
timestamp = format(datetime.today(), '%Y%m%d%H%M%S')


delimiter = ','  # '|'
start_date = datetime(2013, 10, 1)

In [None]:
class StatusHistoryModelToDataframe(ModelToDataframe):
    """ModelToDataframe whose `columns` maps every StatusHistory
    field name to itself.
    """
    columns = {
        name: name
        for name in (field.name for field in StatusHistory._meta.get_fields())
    }

class SubjectModelToDataframe(ModelToDataframe):
    """ModelToDataframe with a small default column mapping that can
    be replaced per instance through the `columns` argument.
    """

    # default mapping, used when no `columns` are passed to __init__
    columns = {
        'subject_identifier': 'subject_identifier',
        'gender': 'gender',
        'dob': 'dob',
    }

    def __init__(self, columns=None, **kwargs):
        # a non-empty `columns` iterable overrides the class-level mapping;
        # it must be in place before the parent initializer runs
        if columns:
            self.columns = {name: name for name in columns}
        super().__init__(**kwargs)

In [None]:
def get_crf_dataframe(model=None, cols=None):
    """Returns a CRF dataframe characterized by having an index set
    to 'subject_visit_id'.

    If `cols` is given, only those columns are kept (plus
    'subject_visit_id', which is needed for the index). The caller's
    `cols` list is left untouched.
    """
    df_crf = ModelToDataframe(model=model, drop_sys_columns=True).dataframe
    if cols:
        # work on a copy: appending to `cols` directly would leak the extra
        # column name back into the caller's list
        selected = list(cols)
        if 'subject_visit_id' not in selected:
            selected.append('subject_visit_id')
        df_crf = df_crf[selected]
    return df_crf.set_index('subject_visit_id')

def to_local_datetime(s):
    """Returns a datetime series converted to `local_tz`.

    Naive values are taken to be UTC. Values that are already
    tz-aware are converted directly, because `tz_localize` raises
    a TypeError on tz-aware data.
    """
    if s.dt.tz is None:
        s = s.dt.tz_localize('UTC')
    return s.dt.tz_convert(local_tz)

def date_to_local_datetime(s):
    """Returns a series localized to `local_tz` given naive dates.

    Values are parsed with `date_format` and then stamped with
    `local_tz` as-is: the wall-clock time is kept, nothing is
    shifted from UTC.
    """
    # `box` was removed from pd.to_datetime in pandas 1.0, and `utc=True`
    # produces tz-aware values that tz_localize() below would reject with a
    # TypeError, so neither is passed.
    s = pd.to_datetime(s, format=date_format, exact=True)
    s = s.dt.tz_localize(local_tz)
    return s

def get_yesno_etc(value):
    """Returns a value as Yes/No/not_sure given
    1, 0 True, False, etc.

    Anything that converts to an int is first normalized to that
    int's string form. Values that do not match a known code are
    returned as they are after that step (so 5 comes back as '5',
    while None, NaN and free text come back unchanged).
    """
    try:
        value = str(int(value))
    except (TypeError, ValueError, OverflowError):
        # None, NaN, inf and non-numeric text cannot be cast to int;
        # leave them alone instead of aborting a whole .map()/.apply()
        pass
    if not isinstance(value, str):
        # only strings can match one of the codes below
        return value
    codes = {
        '1': YES,
        'True': YES,
        '0': NO,
        'False': NO,
        '2': 'not_sure',
    }
    return codes.get(value, value)

In [None]:
# distinct rows from subjectconsent, using the class default column mapping
df_subjects = (
    SubjectModelToDataframe(model='bcpp_subject.subjectconsent')
    .dataframe
    .drop_duplicates()
)
# df_subjects.info()

In [None]:
# the subject visit model is the base frame, indexed on subject_visit_id
df = (
    ModelToDataframe(model='bcpp_subject.subjectvisit', drop_sys_columns=True)
    .dataframe
    .rename(columns={'id': 'subject_visit_id'})
    .set_index('subject_visit_id')
)
# report_date is report_datetime in local time, truncated to midnight
df['report_date'] = to_local_datetime(df['report_datetime']).dt.normalize()
cols = ['subject_identifier', 'report_date', 'visit_code', 'consent_version', 'survey', 'household_member_id', ]
df = df[cols]
# untouched snapshot of the visit frame, taken before any merge
df_original = df.copy()

# only keep records after start_date
# df = df[df['report_date'] >= start_date]

In [None]:
# subjectconsent
model = 'bcpp_subject.subjectconsent'
cols = ['subject_identifier', 'gender', 'dob', 'consent_datetime', 'version']
df_consent = SubjectModelToDataframe(model=model, columns=cols).dataframe
df_consent = df_consent.assign(
    consent_datetime=to_local_datetime(df_consent['consent_datetime']),
    dob=date_to_local_datetime(df_consent['dob']),
)

df_consent = (
    df_consent
    # a 32-character subject identifier is a UUID placeholder; drop those rows
    .loc[lambda frame: frame['subject_identifier'].str.len() != 32]
    # one row per subject (a subject has one consent per version)
    .sort_values(['subject_identifier'])
    .drop_duplicates(['subject_identifier'], keep='first')
    [['subject_identifier', 'gender', 'dob']]
)

In [None]:
# bring gender / dob onto every visit row; subject_visit_id travels as a
# plain column while merging on subject_identifier
df = df.reset_index().merge(df_consent, on='subject_identifier', how='left')

In [None]:
# CRF frames are indexed on subject_visit_id, so index the main frame the same way
df = df.set_index(keys='subject_visit_id')

In [None]:
# subjectrequisition
# Status fields of the requisition, minus the three datetimes that are not
# wanted in the export. The filter must compare field *names*: comparing the
# field objects themselves against strings never matches, so nothing was
# being excluded, and the filtered list was not used at all.
excluded_status_cols = ['processed_datetime', 'packed_datetime', 'shipped_datetime']
status_cols = [f.name for f in RequisitionStatusMixin._meta.get_fields()
               if f.name not in excluded_status_cols]
cols = ['requisition_identifier', 'panel_name', 'requisition_datetime', 'is_drawn', 'reason_not_drawn',
        'drawn_datetime', 'specimen_type', 'study_site', 'study_site_name']
cols.extend(status_cols)
df_crf = get_crf_dataframe(model='bcpp_subject.subjectrequisition', cols=cols)

# express the datetime columns in local time
for col in ['requisition_datetime', 'drawn_datetime', 'received_datetime']:
    df_crf[col] = to_local_datetime(df_crf[col])

# recode the boolean status flags to YES/NO, leaving nulls as they are
for col in ['received', 'processed', 'packed', 'shipped']:
    df_crf[col] = df_crf[col].map(yes_no, na_action='ignore')

# join to main df
df = df.join(df_crf)

# show grouping
# df.groupby('circumcised').size()

In [None]:
# preview: visit rows with consent demographics and requisition columns joined on
df.head()

In [None]:
# 'bcpp_status.statushistory'
df_status = StatusHistoryModelToDataframe(model='bcpp_status.statushistory').dataframe
cols = ['subject_identifier', 'status_date', 'timepoint', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status', ]
df_status = df_status[cols]
# rename to the key names used by the main df so the two can be merged
df_status = df_status.rename(columns={'status_date': 'report_date', 'timepoint': 'visit_code'})
df_status['report_date'] = date_to_local_datetime(df_status['report_date'])
df_status['final_hiv_status_date'] = date_to_local_datetime(df_status['final_hiv_status_date'])
# one row per subject and report date: the last non-null value of each column
df_status = df_status.groupby(['subject_identifier', 'report_date']).last()
df_status = df_status.reset_index()
df_status = df_status[['subject_identifier',  'report_date', 'visit_code', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status']]
# pd.merge on columns ignores (and discards) the index, so carry
# subject_visit_id through the merge as a column and restore it afterwards;
# otherwise the export would write a meaningless 0..n index
df = df.reset_index()
df = pd.merge(df, df_status, on=['subject_identifier', 'report_date', 'visit_code'], how='left')
df = df.set_index('subject_visit_id')


In [None]:
# keep only rows whose study site is not one of the non-study sites
df = df[~df['study_site_name'].isin(['bhp', 'test_community'])]
# columns not wanted in the requisition export
df = df.drop(
    columns=[
        'specimen_type',
        'household_member_id',
        'final_hiv_status',
        'final_hiv_status_date',
        'final_arv_status',
    ],
)

In [None]:
# a specimen recorded as drawn should not carry a reason for not being drawn:
# blank out reason_not_drawn on those rows
is_blank = (df['is_drawn'] == YES) & df['reason_not_drawn'].notnull()
df.loc[is_blank, 'reason_not_drawn'] = np.nan

In [None]:
# preview: the frame after the site filter and the reason_not_drawn cleanup
df.head()

In [None]:
# write the requisition frame to one timestamped CSV in the home directory
path = f'~/lab_requisitions_{timestamp}.csv'
df.to_csv(
    path,
    sep=delimiter,
    date_format=export_date_format,
    index=True,
)
sys.stdout.write(f'* {path}\n')

In [None]:
# df.groupby('study_site_name').size()
# df.groupby('is_drawn').size()
# df.groupby('panel_name').size()
# df.groupby('visit_code').size()
# df.groupby('survey').size()
# df.groupby('gender').size()
# print(df['report_date'].min())
# print(df['report_date'].max())
# print(df['drawn_datetime'].min())
# print(df['drawn_datetime'].max())
# print(df['requisition_datetime'].min())
# print(df['requisition_datetime'].max())
# print(df['dob'].min())
# print(df['dob'].max())


In [None]:
# edc_lab.aliquot: load the model, then express aliquot_datetime in local time
model = 'edc_lab.aliquot'
cols = None
df_aliquot = ModelToDataframe(model=model).dataframe
df_aliquot = df_aliquot.assign(
    aliquot_datetime=to_local_datetime(df_aliquot['aliquot_datetime']),
)

In [None]:
# remove the EDC base (system) fields plus a few columns not wanted in the export
columns = DEFAULT_BASE_FIELDS + ['slug', 'comment', 'shipped']
df_aliquot = df_aliquot.drop(columns=columns)

In [None]:
# recode is_primary to YES/NO (nulls untouched) and lower-case the medium
df_aliquot = df_aliquot.assign(
    is_primary=df_aliquot['is_primary'].map(yes_no, na_action='ignore'),
    medium=df_aliquot['medium'].str.lower(),
)

In [None]:
# add column to indicate missing requisitions
# An aliquot is flagged when no requisition row with a panel_name shares its
# requisition_identifier. The membership test keeps the result aligned with
# df_aliquot row for row; assigning from a merged frame relies on the merge
# returning exactly one row per aliquot in the same order.
known_requisitions = df.loc[df['panel_name'].notnull(), 'requisition_identifier'].dropna()
df_aliquot['missing_requisition'] = ~df_aliquot['requisition_identifier'].isin(known_requisitions)
# NOTE: is_primary is deliberately not mapped through yes_no here. It has
# already been recoded to YES/NO, and mapping those strings again would turn
# every value into NaN.
df_aliquot.info()

In [None]:
# export aliquots to CSV as a single file
path = f'~/lab_aliquots_{timestamp}.csv'
# write df_aliquot, not the requisition frame `df`, to the aliquots file
df_aliquot.to_csv(path, index=True, date_format=export_date_format, sep=delimiter)
sys.stdout.write(f'* {path}\n')

In [None]:
# row counts per value for each categorical aliquot column, separated by a rule
for group_col in ['medium', 'aliquot_type', 'alpha_code', 'numeric_code',
                  'condition', 'missing_requisition']:
    print(df_aliquot.groupby(group_col).size())
    print('---')