## Add an identity column to a SAS dataset

In [None]:
import numpy as np
import pandas as pd
import os
import re

from dateutil.parser import parse
from edc_constants.constants import FEMALE, MALE, NAIVE, ON_ART, DEFAULTER, NEG, POS, IND
from edc_constants.constants import YES, NO
from edc_pdutils.utils import identity256

# --- input locations ---
# Folder holding the two CSV extracts; the exports are written back to it too.
csv_path = '/Users/erikvw/Documents/bcpp/cdc/201710/'
file_ahs, file_y2 = 'CPC_AHS_PAIRS1_7_12OCT17.csv', 'year2CPC_ALL_12OCT17.csv'
# Consent export providing the identity for each subject_identifier.
consent = '/Users/erikvw/Documents/bcpp/consent_data/consent20170927142648.csv'

# --- export / cleaning settings ---
export_date_format = '%Y-%m-%d'
delimiter = '|'
na_value = np.nan

# Code-to-constant lookups for the remap cell further down. Both these
# lookups and the remap statements are currently commented out.
# hiv_status = {0: NEG, 1: POS, 3: IND}
# art_status = {1: NAIVE, 2: DEFAULTER, 3: ON_ART}
# gender = {1: MALE, 2: FEMALE}
# yes_no = {0: YES, 1: NO}


In [None]:
def convert_sas_date(value):
    """Convert a ``DDMMMYYYY`` date string (e.g. ``12OCT2017``) to a datetime.

    Null values (as judged by ``pd.notnull``) are returned unchanged.
    Any other value must be a string of two digits, three upper-case
    letters and four digits; it is split into day, month and year and
    handed to ``dateutil.parser.parse``.

    Raises:
        ValueError: if the value is not a string or does not match the
            expected pattern.
    """
    # Guard clause: nothing to convert for missing values.
    if not pd.notnull(value):
        return value
    try:
        matched = re.match('^[0-9]{2}[A-Z]{3}[0-9]{4}$', value)
    except TypeError:
        # re.match rejects non-string input; report it as a bad date.
        raise ValueError(f'Invalid date format. Got {value}.')
    if not matched:
        raise ValueError(f'Invalid date format. Got {value}.')
    # The pattern fixes the field widths, so plain slicing is enough.
    day, month, year = value[:2], value[2:5], value[5:]
    return parse(f'{day} {month} {year}')


In [None]:
# Read the AHS extract from csv_path and preview the first rows.
ahs_csv = os.path.join(csv_path, file_ahs)
df_ahs = pd.read_csv(ahs_csv)
df_ahs.head()

In [None]:
# convert date columns to datetime
# Apply per column (Series.apply) rather than per row (DataFrame.apply with
# axis=1): only one column is read at a time, so building a row Series for
# every record is wasted work.
date_columns = ['interview_date', 'cd4_date', 'prev_result_date', 'final_hiv_status_date']
for col in date_columns:
    df_ahs[col] = df_ahs[col].apply(convert_sas_date)

In [None]:
# misc: fill missing values with na_value, then lower-case the 'Pair' and
# 'Intervention' column names.
df_ahs = (
    df_ahs
    .fillna(value=na_value)
    .rename(columns={'Pair': 'pair', 'Intervention': 'intervention'})
)

In [None]:
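# remap coded values to edc_constants labels (disabled: every statement below is commented out, as are the lookup dicts it needs)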
# df_ahs['community'] = df_ahs['community'].apply(lambda row: row['community'].str.lower())
# df_ahs['gender'] = df_ahs['gender'].apply(gender.get)
# df_ahs['prev_result'] = df_ahs['prev_result'].apply(hiv_status.get)
# df_ahs['final_hiv_status'] = df_ahs['final_hiv_status'].apply(hiv_status.get)
# df_ahs['self_reported_result'] = df_ahs['self_reported_result'].apply(hiv_status.get)
# df_ahs['final_arv_status'] = df_ahs['final_arv_status'].apply(art_status.get)
# df_ahs['prev_result_known'] = df_ahs['prev_result_known'].apply(yes_no.get)
# df_ahs['cd4_avail'] = df_ahs['cd4_avail'].apply(yes_no.get)
# df_ahs['referred'] = df_ahs['referred'].apply(yes_no.get)
# df_ahs['pregnant'] = df_ahs['pregnant'].apply(yes_no.get)
# df_ahs['circumcised'] = df_ahs['circumcised'].apply(yes_no.get)
# df_ahs['working'] = df_ahs['working'].apply(yes_no.get)

In [None]:
# Read the year-2 extract from csv_path and preview the first rows.
y2_csv = os.path.join(csv_path, file_y2)
df_y2 = pd.read_csv(y2_csv)
df_y2.head()

In [None]:
# convert date columns to datetime
# Apply per column (Series.apply) rather than per row (DataFrame.apply with
# axis=1): only one column is read at a time, so building a row Series for
# every record is wasted work.
date_columns = ['interview_date', 'cd4_date', 'prev_result_date', 'final_hiv_status_date']
for col in date_columns:
    df_y2[col] = df_y2[col].apply(convert_sas_date)

In [None]:
# Print the column summary (dtypes and non-null counts) for df_y2.
df_y2.info()

In [None]:
# Read the consent export (source of the identity column) and preview it.
df_consent = pd.read_csv(filepath_or_buffer=consent)
df_consent.head()

In [None]:
# Attach each subject's identity from the consent data (left merge keeps
# every AHS row, matched or not).
# De-duplicate the lookup pairs first so that a subject listed more than once
# in the consent export with the same identity cannot multiply AHS rows.
df_ahs = pd.merge(
    df_ahs,
    df_consent[['subject_identifier', 'identity']].drop_duplicates(),
    on='subject_identifier', how='left')

In [None]:
# Display the df_ahs rows whose identity is null after the merge (no matching
# consent row, or a consent row without an identity).
df_ahs[pd.isnull(df_ahs['identity'])]

In [None]:
# Build identity256 by calling the edc_pdutils identity256 helper on every row
# (the extra positional argument names the source column), then remove the
# raw identity column from the frame.
df_ahs['identity256'] = df_ahs.apply(identity256, axis=1, args=('identity',))
df_ahs = df_ahs.drop(columns=['identity'])

In [None]:
# Preview the first rows of df_ahs.
df_ahs.head()

In [None]:
# Attach each subject's identity from the consent data (left merge keeps
# every year-2 row, matched or not).
# De-duplicate the lookup pairs first so that a subject listed more than once
# in the consent export with the same identity cannot multiply year-2 rows.
df_y2 = pd.merge(
    df_y2,
    df_consent[['subject_identifier', 'identity']].drop_duplicates(),
    on='subject_identifier', how='left')

In [None]:
# Display the df_y2 rows whose identity is null after the merge (no matching
# consent row, or a consent row without an identity).
df_y2[pd.isnull(df_y2['identity'])]

In [None]:
# Build identity256 by calling the edc_pdutils identity256 helper on every row
# (the extra positional argument names the source column), then remove the
# raw identity column from the frame.
df_y2['identity256'] = df_y2.apply(identity256, axis=1, args=('identity',))
df_y2 = df_y2.drop(columns=['identity'])

In [None]:
# Preview the first rows of df_y2.
df_y2.head()

In [None]:
# Write df_ahs back to csv_path twice: comma-delimited (suffix _C) and
# pipe-delimited (suffix _P).
# os.path.splitext removes only the extension, and the loop variable is named
# `sep` so the module-level `delimiter` setting is not rebound by the loop.
file_ahs_new = f"{os.path.splitext(file_ahs)[0]}_identity"
for sep, suffix in [(',', '_C'), ('|', '_P')]:
    df_ahs.to_csv(
        os.path.join(csv_path, f'{file_ahs_new}{suffix}.csv'),
        index=False,
        date_format=export_date_format,
        sep=sep)

In [None]:
# Write df_y2 back to csv_path twice: comma-delimited (suffix _C) and
# pipe-delimited (suffix _P).
# os.path.splitext removes only the extension, and the loop variable is named
# `sep` so the module-level `delimiter` setting is not rebound by the loop.
file_y2_new = f"{os.path.splitext(file_y2)[0]}_identity"
for sep, suffix in [(',', '_C'), ('|', '_P')]:
    df_y2.to_csv(
        os.path.join(csv_path, f'{file_y2_new}{suffix}.csv'),
        index=False,
        date_format=export_date_format,
        sep=sep)

In [None]:
# Frequency checks on df_ahs: each cell below displays the number of rows per
# distinct value of one column. With pandas' default groupby behaviour, rows
# whose grouping value is missing are left out of the counts.
df_ahs.groupby('visit_code').size()

In [None]:
df_ahs.groupby('community').size()

In [None]:
df_ahs.groupby('pair').size()

In [None]:
df_ahs.groupby('intervention').size()

In [None]:
# Summary statistics for the interview_date column.
df_ahs['interview_date'].describe()

In [None]:
df_ahs.groupby('final_arv_status').size()

In [None]:
df_ahs.groupby('final_hiv_status').size()

In [None]:
df_ahs.groupby('prev_result').size()

In [None]:
df_ahs.groupby('prev_result_known').size()

In [None]:
df_ahs.groupby('cd4_avail').size()

In [None]:
df_ahs.groupby('marital_status').size()

In [None]:
df_ahs.groupby('self_reported_result').size()

In [None]:
df_ahs.groupby('referred').size()

In [None]:
df_ahs.groupby('pregnant').size()

In [None]:
df_ahs.groupby('circumcised').size()

In [None]:
df_ahs.groupby('working').size()

In [None]:
df_ahs.groupby('timepoint').size()