In [1]:
import numpy as np
import pandas as pd
import sys

from arrow import Arrow
from bcpp_community import communities
from bcpp_status.models import StatusHistory
from bcpp_subject.models import SubjectVisit, SubjectConsent
from datetime import datetime
from django.db import connections
from edc_constants.constants import YES, NO, NEG, UNK
from edc_pdutils.model_to_dataframe import ModelToDataframe, SubjectModelToDataframe
from edc_pdutils.model_to_dataframe import Helper as HelperBase, missing_subject_identifiers
from pprint import pprint

# --- report parameters ---
# requisition panel name used to select viral load samples
VIRAL_LOAD = 'Viral Load'
# visits reported before this date are left out of the export
start_date = datetime(year=2017, month=1, day=1)
# timezone handed to the Helper for localizing dates and datetimes
local_tz = 'Africa/Gaborone'

# --- date formats ---
# format handed to the Helper
date_format = '%Y-%m-%d %H:%M:%S.%f'
# format used for dates written to CSV
export_date_format = '%Y-%m-%d'

# --- consent source ---
# True: read consents from consent_csv_filename instead of the (slow) model query
load_consent_from_csv = True
consent_csv_filename = '~/Documents/bcpp/referral/consent20170927142648.csv'

# --- CSV output ---
# True: also write one CSV file per community
split_csv_by_community = True
# field separator of the referral CSV files
delimiter = '|'

In [49]:
# Read the MySQL options file used by the default Django connection and
# show its settings (without the password) so the target DB can be checked
# before any data is loaded.
configfile = connections['default'].settings_dict['OPTIONS']['read_default_file']
with open(configfile, 'r') as f:
    data = f.read()
values = {}
for line in data.split('\n'):
    # drop any trailing comment, then split on the first "=" only so that
    # values containing "=" are kept whole
    key, sep, value = line.split('#')[0].partition('=')
    if not sep:
        # blank lines, section headers and bare flags have no "=": skip them
        continue
    values[key.strip()] = value.strip()
# never echo the password; tolerate option files that do not define one
values.pop('password', None)
# a port other than 3306 together with user=root is refused as an
# unsafe connection to a remote DB
if values.get('port') != '3306' and values.get('user') == 'root':
    raise ValueError(
        'Connection to remote DB is with user=root! Use a readonly account.')
pprint(values)

{'database': 'edc',
 'default-character-set': 'utf8',
 'host': '127.0.0.1',
 'port': '5002',
 'user': 'dmc'}


In [2]:
class MemberModelToDataframe(ModelToDataframe):
    """ModelToDataframe whose ``columns`` maps four household member
    field names to themselves.
    """
    # NOTE(review): presumably limits the fields read from the model --
    # confirm against edc_pdutils.ModelToDataframe
    columns = {
        'id': 'id',
        'spouse_of_citizen': 'spouse_of_citizen',
        'citizen': 'citizen',
        'study_resident': 'study_resident',
    }
    
class StatusHistoryModelToDataframe(ModelToDataframe):
    """ModelToDataframe whose ``columns`` maps every StatusHistory
    field name to itself.
    """
    columns = dict(
        (field.name, field.name) for field in StatusHistory._meta.get_fields())

class LocatorModelToDataframe(ModelToDataframe):
    """ModelToDataframe whose ``columns`` maps three locator
    field names to themselves.
    """
    columns = {
        'id': 'id',
        'subject_identifier': 'subject_identifier',
        'may_follow_up': 'may_follow_up',
    }

In [3]:
def get_cdc_columns(dataframe=None):
    """Returns a list of column names.

    ``subject_identifier`` and ``community`` come first, followed by
    the remaining columns in sorted order. ``household_member_id``,
    ``consent_version`` and ``survey`` are left out.

    dataframe: the frame whose columns are listed. Defaults to the
        notebook-level ``df`` so existing calls without arguments
        keep working.

    Raises ValueError if any of the five columns named above is
    not in the frame.
    """
    if dataframe is None:
        # fall back to the notebook global, as the original did
        dataframe = df
    leading = ['subject_identifier', 'community']
    dropped = ['household_member_id', 'consent_version', 'survey']
    available = list(dataframe.columns)
    missing = [name for name in dropped + leading if name not in available]
    if missing:
        raise ValueError(f'Expected column(s) not in dataframe: {missing}')
    excluded = set(leading + dropped)
    return leading + sorted(name for name in available if name not in excluded)

In [4]:
class Helper(HelperBase):
    """Helper extended with a tb symptoms column builder."""

    def tb_symptoms(self, row):
        """Returns the tb symptoms of a row joined by semi-colon.

        Each symptom has a fixed position in the string and is left
        blank unless the row's value is YES, for example
        ``cough;;night_sweat;;``. Returns NaN if no symptom is YES.
        """
        names = ('cough', 'lymph_nodes', 'night_sweat', 'cough_blood', 'weight_loss')
        reported = [name if row[name] == YES else '' for name in names]
        if not any(reported):
            return np.nan
        return ';'.join(reported)

In [5]:
# helper instance used by the cells below to localize dates/datetimes
# and to load CRF dataframes
helper = Helper(local_tz=local_tz, date_format=date_format)

In [6]:
# dataframe of the subject consent model with duplicate rows removed
df_subjects = (
    SubjectModelToDataframe(model='bcpp_subject.subjectconsent')
    .dataframe
    .drop_duplicates()
)
# df_subjects.info()

In [7]:
# start with subject visit model, indexed by the visit id
cols = ['subject_identifier', 'report_date', 'visit_code', 'consent_version', 'survey', 'household_member_id', ]
df = (
    ModelToDataframe(model='bcpp_subject.subjectvisit', drop_sys_columns=True)
    .dataframe
    .rename(columns={'id': 'subject_visit_id'})
    .set_index('subject_visit_id')
)
# localize the report datetime, then reset its time part to midnight
df = df.assign(report_date=helper.to_local_datetime(df['report_datetime']))
df = df.assign(report_date=df['report_date'].dt.normalize())
df = df[cols]
# keep a copy of all visits, whatever their report date
df_original = df.copy()

# only keep records on or after start_date
df = df.loc[df['report_date'] >= start_date]

In [None]:
# df_original.groupby('visit_code').size()

In [None]:
# df.groupby('visit_code').size()

In [None]:
# unique list of all known subject identifiers (any report date)
total_subject_identifiers = df_original['subject_identifier'].drop_duplicates()
# unique list of subject identifiers reported on or after start_date
subject_identifiers = df['subject_identifier'].drop_duplicates()
# assert len(subject_identifiers) == len(df['subject_identifier'])
print(f'There are {len(subject_identifiers)}/{len(total_subject_identifiers)} subject identifiers were reported on after {start_date}.')

In [None]:
# Validate all subject identifiers from subject_visit are in registered subject
# find_missing_subject_identifiers(
#     model='edc_registration.registeredsubject', subject_identifiers=subject_identifiers)

In [None]:
# Validate all subject identifiers from subject_visit are in subject consent
# find_missing_subject_identifiers(
#     model='bcpp_subject.subjectconsent', subject_identifiers=subject_identifiers)

In [None]:
model = 'bcpp_subject.subjectreferral'
cols = ['subject_referred', 'referral_code', 'referral_clinic', 'referral_appt_date', 'scheduled_appt_date']
df_crf = helper.get_crf_dataframe(model=model, cols=cols)

# remove not referred
df_crf = df_crf[df_crf['referral_code'] != 'not_referred']
# remove pending referrals where the subject was not referred or refused
df_pending = df_crf[(df_crf['referral_code'] == 'pending') & (df_crf['subject_referred'].isin([NO, 'refused']))]
df_crf = df_crf.drop(df_pending.index, axis=0)

# localize dates and datetimes
df_crf['referral_appt_date'] = helper.to_local_datetime(df_crf['referral_appt_date'])
df_crf['scheduled_appt_date'] = helper.date_to_local_datetime(df_crf['scheduled_appt_date'])

# replace referral_appt_date with scheduled_appt_date if scheduled_appt_date;
# done column-wise instead of with a per-row apply, which also keeps
# working when df_crf is empty
df_crf['referral_appt_date'] = df_crf['scheduled_appt_date'].where(
    df_crf['scheduled_appt_date'].notnull(), df_crf['referral_appt_date'])
# join to main df
df = df.join(df_crf)
# filter out those not referred (visits without a referral row are null after the join)
df = df[df['subject_referred'].notnull()]
# df.head()

In [None]:
# import subject consent model
if load_consent_from_csv:
    # Read the identifier columns as text: left to type inference,
    # all-numeric identity values would be parsed as numbers, and the
    # later ``.str`` access on subject_identifier needs strings.
    df_consent = pd.read_csv(
        consent_csv_filename,
        dtype={'subject_identifier': str, 'identity': str})
    # the CSV branch converts consent_datetime with the date helper
    df_consent['consent_datetime'] = helper.date_to_local_datetime(df_consent['consent_datetime'])
else:
    # !!! is slow because it gets the encrypted field "identity"
    model = 'bcpp_subject.subjectconsent'
    cols = ['subject_identifier', 'gender', 'dob', 'consent_datetime', 'version', 'identity', 'identity_type']
    # query_filter = {'consent_datetime__gte': Arrow.fromdatetime(start_date).datetime}
    consents = SubjectModelToDataframe(model=model, columns=cols)
    df_consent = consents.dataframe
    df_consent['consent_datetime'] = helper.to_local_datetime(df_consent['consent_datetime'])
# dob is converted the same way for both sources
df_consent['dob'] = helper.date_to_local_datetime(df_consent['dob'])

In [None]:
# df_consent = df_consent.set_index('subject_identifier')

# drop rows whose subject identifier is 32 characters long (a UUID),
# then keep the earliest consent of each subject and only the
# columns needed for the merge
df_consent = (
    df_consent
    .loc[df_consent['subject_identifier'].str.len() != 32]
    .sort_values(['subject_identifier', 'consent_datetime'])
    .drop_duplicates(subset=['subject_identifier'], keep='first')
    [['subject_identifier', 'gender', 'dob', 'consent_datetime', 'identity', 'identity_type']]
)

In [None]:
# export consents to CSV (only when they were loaded from the model)
if not load_consent_from_csv:
    timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
    df_consent.to_csv(
        f'~/consent{timestamp}.csv',
        sep=',',
        index=False,
        date_format=export_date_format)

In [None]:
# merge to main df; the visit id index becomes a column first so that
# it is not lost in the merge
df = df.reset_index().merge(
    df_consent[['subject_identifier', 'gender', 'dob', 'consent_datetime', 'identity', 'identity_type']],
    how='left',
    on='subject_identifier')

In [None]:
# set index for joins with CRFs
# (subject_visit_id was turned into a column by reset_index before the consent merge)
df = df.set_index('subject_visit_id')

In [None]:
# circumcision
df_crf = helper.get_crf_dataframe(
    model='bcpp_subject.circumcision', cols=['circumcised'])

# clean up column; apply the helper to the single column directly
# instead of building a Series for every row
df_crf['circumcised'] = df_crf['circumcised'].apply(helper.get_yesno_etc)

# join to main df
df = df.join(df_crf)

# show grouping
# df.groupby('circumcised').size()

In [None]:
# residencymobility: add the permanent_resident column
df_crf = helper.get_crf_dataframe(
    cols=['permanent_resident'],
    model='bcpp_subject.residencymobility')

# join to main df
df = df.join(other=df_crf)

# show grouping
# df.groupby('permanent_resident').size()

In [None]:
# subjectrequisition
df_crf = helper.get_crf_dataframe(
    model='bcpp_subject.subjectrequisition', cols=['panel_name', 'is_drawn', 'drawn_datetime'])
# keep viral load requisitions only; copy so that the column added
# below is written to an independent frame and not to a slice
df_crf = df_crf[df_crf['panel_name'] == VIRAL_LOAD].copy()
df_crf['vl_sample_drawn_date'] = helper.to_local_datetime(df_crf['drawn_datetime'])
df_crf = df_crf.rename(columns={'is_drawn': 'vl_sample_drawn'})
df_crf = df_crf[['vl_sample_drawn', 'vl_sample_drawn_date']]

# join to main df
df = df.join(df_crf)

# show grouping
# df.groupby('vl_sample_drawn').size()

In [None]:
# 'bcpp_subject.tbsymptoms'
model = 'bcpp_subject.tbsymptoms'
cols = ['cough', 'lymph_nodes', 'night_sweat', 'cough_blood', 'weight_loss']
df_crf = helper.get_crf_dataframe(model=model, cols=cols)
# tb_symptoms is a method of the Helper class, so it has to be called
# on the helper instance; a bare tb_symptoms(row) raises NameError
df_crf['tb_symptoms'] = df_crf.apply(helper.tb_symptoms, axis=1)
df_crf = df_crf[['tb_symptoms']]

# join to main df
df = df.join(df_crf)
# show grouping
# print(df.groupby('tb_symptoms').size())

In [None]:
# 'bcpp_subject.hivcareadherence': clinic_receiving_from is added as arv_clinic
model = 'bcpp_subject.hivcareadherence'
df_crf = (
    helper.get_crf_dataframe(model=model, cols=['clinic_receiving_from'])
    .rename(columns={'clinic_receiving_from': 'arv_clinic'})
)
# join to main df
df = df.join(other=df_crf)

In [None]:
# 'bcpp_subject.reproductivehealth': add the currently_pregnant column
model = 'bcpp_subject.reproductivehealth'
df_crf = helper.get_crf_dataframe(
    cols=['currently_pregnant'],
    model=model)
# join to main df
df = df.join(other=df_crf)

In [None]:
# 'bcpp_subject.pimacd4': result columns are added with a cd4_ prefix
model = 'bcpp_subject.pimacd4'
df_crf = (
    helper.get_crf_dataframe(model=model, cols=['result_value', 'result_datetime'])
    .rename(columns={
        'result_value': 'cd4_result_value',
        'result_datetime': 'cd4_result_datetime'})
)
# localize the result datetime
df_crf = df_crf.assign(
    cd4_result_datetime=helper.to_local_datetime(df_crf['cd4_result_datetime']))
# join to main df
df = df.join(other=df_crf)

In [None]:
# add community: the text after the last "." of survey.
# Done with the vectorized string accessor instead of a per-row apply
# over the whole frame.
df['community'] = df['survey'].str.rsplit('.', n=1).str[-1]
# drop rows whose community is 'botswana'
df = df[df['community'] != 'botswana']

In [None]:
# add pair: look up the pair of each community
pairs = {name: community.pair for name, community in communities.items()}
df_pairs = pd.DataFrame(
    list(pairs.items()), columns=['community', 'pair'])

# inner merge: rows whose community has no pair entry are dropped
df = df.merge(df_pairs, on='community')

In [None]:
# household member
model = 'member.householdmember'
df_members = MemberModelToDataframe(model=model).dataframe
df_members = df_members.rename(columns={
    'id': 'household_member_id',
    'spouse_of_citizen': 'citizen_spouse',
    'study_resident': 'part_time_resident'})
# df_members = df_members.set_index('household_member_id')

# clean up the three response columns; one loop over the columns
# replaces three copies of the same per-row apply
for column in ['citizen', 'citizen_spouse', 'part_time_resident']:
    df_members[column] = df_members[column].apply(helper.get_yesno_etc)

df = pd.merge(df, df_members, on='household_member_id', how='left')

In [None]:
# 'bcpp_status.statushistory'
cols = ['subject_identifier', 'status_date', 'timepoint', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status', ]
# rename the status columns to the names used for the merge keys in df
df_status = (
    StatusHistoryModelToDataframe(model='bcpp_status.statushistory')
    .dataframe[cols]
    .rename(columns={'status_date': 'report_date', 'timepoint': 'visit_code'})
)
df_status = df_status.assign(
    report_date=helper.date_to_local_datetime(df_status['report_date']))
df_status = df_status.assign(
    final_hiv_status_date=helper.date_to_local_datetime(df_status['final_hiv_status_date']))
# one row per subject and report date
df_status = (
    df_status
    .groupby(['subject_identifier', 'report_date'])
    .last()
    .reset_index()
)
df_status = df_status[['subject_identifier',  'report_date', 'visit_code', 'final_hiv_status', 'final_hiv_status_date', 'final_arv_status']]
df = df.merge(df_status, how='left', on=['subject_identifier', 'report_date', 'visit_code'])

In [None]:
# remove subjects with missing consent (15 recs);
# identity is null when the left merge found no consent row.
# Copy so that later .loc assignments write to an independent frame.
df = df[df['identity'].notnull()].copy()

In [None]:
# set final_arv_status to null for NEGs/UNKs with arv status (13 recs)
null_status = (
    df['final_hiv_status'].isin([NEG, UNK])
    & df['final_arv_status'].notnull()
)
df.loc[null_status, 'final_arv_status'] = np.nan

In [None]:
# export to CSV as a single, timestamped file and report its path
timestamp = datetime.now().strftime('%Y%m%d%H%M%S')
path = f'~/referral_{timestamp}.csv'
df.to_csv(
    path,
    sep=delimiter,
    index=False,
    date_format=export_date_format)
sys.stdout.write(f'* {path}\n')

In [None]:
# Write a CSV file for each community
if split_csv_by_community:
    timestamp = datetime.today().strftime('%Y%m%d%H%M%S')
    # the column order is the same for every file, so work it out once
    cdc_columns = get_cdc_columns()
    sys.stdout.write('Files:\n')
    # groupby yields each community together with its rows, so the
    # frame does not have to be filtered again for every community
    for community, df_community in df.groupby('community'):
        path = f'~/referral_{community}_{timestamp}.csv'
        df_community.to_csv(
            path,
            columns=cdc_columns,
            index=False,
            date_format=export_date_format,
            sep=delimiter)
        sys.stdout.write(f'* {path}\n')
    sys.stdout.write(f'Date format: {export_date_format}')
    sys.stdout.write('\nHeader:\n')
    # the header of every file is the column list itself; no need to
    # read a file back (which relied on a path left over from the loop)
    sys.stdout.write(delimiter.join(cdc_columns))

In [None]:
# number of rows per final_hiv_status
df.groupby(['final_hiv_status']).size()

In [None]:
# number of rows per final_arv_status
df.groupby(['final_arv_status']).size()

In [None]:
# number of rows per community
df.groupby('community').size()

In [None]:
# number of rows per pair
df.groupby('pair').size()