In [26]:
import os

import redcap

from cc_utilities.command_line.sync_redcap_to_commcare import get_redcap_state
from cc_utilities.redcap_sync import (
    collapse_checkbox_columns,
    normalize_phone_cols,
    set_external_id_column,
    upload_complete_records,
    upload_incomplete_records,
    split_complete_and_incomplete_records,
)

In [27]:
# Connection settings are read from the environment so that no credentials
# live in the notebook; any variable that is unset comes back as None.
redcap_api_url = os.environ.get("REDCAP_API_URL")
redcap_api_key = os.environ.get("REDCAP_API_KEY")
commcare_api_key = os.environ.get("COMMCARE_API_KEY")
commcare_user_name = os.environ.get("COMMCARE_USERNAME")
commcare_project_name = os.environ.get("COMMCARE_PROJECT")

# Options consumed by the cells further down.
state_file = "redcap_test.yaml"  # handed to get_redcap_state()
sync_all = True
phone_cols = []  # handed to normalize_phone_cols()
external_id_col = "cdms_id"  # handed to set_external_id_column()

# Echo the non-secret settings as a sanity check (the API keys are not printed).
print(f"REDCap url: '{redcap_api_url}'")
print(f"CommCare Project: '{commcare_project_name}' as user: '{commcare_user_name}'")

REDCap url: 'https://redcap.phila.gov/api/'
CommCare Project: 'philly-covid19-intqa' as user: 'gsimardmoore@caktusgroup.com'


In [34]:
# Fetch the REDCap records as a DataFrame

# Load the saved state for state_file. `state` is not referenced again by
# any of the cells visible in this notebook.
state = get_redcap_state(state_file)

# Options PyCap forwards to pandas.read_csv():
#   index_col=False -- keeps read_csv() from promoting the first column
#       ("record_id") to the index; it is not unique, and it is simpler to
#       work with as an ordinary column anyway.
#   dtype=str -- loads every value as text, so pandas cannot coerce ints to
#       floats and put unnecessary decimal points into the data that is
#       uploaded to CommCare.
read_csv_options = {"index_col": False, "dtype": str}

redcap_project = redcap.Project(redcap_api_url, redcap_api_key)
# format="df" asks PyCap to return a pandas DataFrame.
redcap_records = redcap_project.export_records(
    format="df",
    df_kwargs=read_csv_options,
)

redcap_records

Unnamed: 0,record_id,cdms_id,specimen_collection_date,first_name,last_name,primary_language,primary_language_other,phone_home,commcare_email_address,dob,...,tested_positive_last_year,integration_process,integration_reject,pcc_resolution_status,cdms_id_test,first_name_test,phone_test,onset,exposure_test,exposure_type_test
0,1,123456789.0,,HannahTestingRecord,,,,,,,...,,,,,,,,2020-10-27,,
1,2,987654321.0,,TestRecord2,,,,,,,...,,,,,,,,,,
2,3,223456789.0,,VinodTestingRecord,,,,,,,...,,,,,,,,2020-10-28,,
3,4,223456789.0,,Vinod2TestingRecord,,,,,,,...,,,,,,,,2020-10-28,,
4,5,100500819.0,,Test,,,,,,,...,,,,,,,,,,
5,6,100300338.0,,Derrick,,,,,,,...,,,,,,,,,,
6,7,100300340.0,,Rodney,,,,,,,...,,,,,,,,,,
7,8,100501819.0,,Test,,,,,,,...,,,,,,,,,,
8,9,,,,,,,,,,...,,,,,,,,,,
9,10,,,,,,,,,,...,,,,,,,,,,


In [29]:
# Run the exported records through normalize_phone_cols() with the configured
# phone_cols (an empty list in this notebook's configuration cell) and bind
# the result to cases_df, the frame the following cells build on.
cases_df = normalize_phone_cols(redcap_records, phone_cols)
# Preview only the first rows rather than the whole frame.
cases_df.head()

Unnamed: 0,record_id,cdms_id,specimen_collection_date,first_name,last_name,primary_language,primary_language_other,phone_home,commcare_email_address,dob,...,tested_positive_last_year,integration_process,integration_reject,pcc_resolution_status,cdms_id_test,first_name_test,phone_test,onset,exposure_test,exposure_type_test
0,1,123456789,,HannahTestingRecord,,,,,,,...,,,,,,,,2020-10-27,,
1,2,987654321,,TestRecord2,,,,,,,...,,,,,,,,,,
2,3,223456789,,VinodTestingRecord,,,,,,,...,,,,,,,,2020-10-28,,
3,4,223456789,,Vinod2TestingRecord,,,,,,,...,,,,,,,,2020-10-28,,
4,5,100500819,,Test,,,,,,,...,,,,,,,,,,


In [35]:
# Pass cases_df through set_external_id_column() using external_id_col
# ("cdms_id" in the configuration cell). In the output recorded below, the
# result carries an extra `external_id` column whose values match `cdms_id`.
# NOTE(review): the recorded output also lacks the two rows that had no
# cdms_id -- presumably the helper drops them; confirm in cc_utilities.redcap_sync.
# cases_df is rebound to its own transformed value, so re-running this cell
# feeds the already-processed frame back into the helper.
cases_df = set_external_id_column(cases_df, external_id_col)
cases_df

Unnamed: 0,record_id,cdms_id,specimen_collection_date,first_name,last_name,primary_language,primary_language_other,phone_home,commcare_email_address,dob,...,integration_process,integration_reject,pcc_resolution_status,cdms_id_test,first_name_test,phone_test,onset,exposure_test,exposure_type_test,external_id
0,1,123456789,,HannahTestingRecord,,,,,,,...,,,,,,,2020-10-27,,,123456789
1,2,987654321,,TestRecord2,,,,,,,...,,,,,,,,,,987654321
2,3,223456789,,VinodTestingRecord,,,,,,,...,,,,,,,2020-10-28,,,223456789
3,4,223456789,,Vinod2TestingRecord,,,,,,,...,,,,,,,2020-10-28,,,223456789
4,5,100500819,,Test,,,,,,,...,,,,,,,,,,100500819
5,6,100300338,,Derrick,,,,,,,...,,,,,,,,,,100300338
6,7,100300340,,Rodney,,,,,,,...,,,,,,,,,,100300340
7,8,100501819,,Test,,,,,,,...,,,,,,,,,,100501819


In [39]:
# Preview of a step taken from split_complete_and_incomplete_records:
# discard every column that has no value in any row. The result is only
# displayed here; it is not assigned, so cases_df itself is left unchanged.
cases_df.dropna(axis="columns", how="all")

Unnamed: 0,record_id,cdms_id,first_name,hard_to_isolate_reasons___food,hard_to_isolate_reasons___unable_to_distance,hard_to_isolate_reasons___prescription,hard_to_isolate_reasons___med_appts,hard_to_isolate_reasons___employer,hard_to_isolate_reasons___mental_health_resources,hard_to_isolate_reasons___nothing,...,race___amer_indian_alaskan,race___asian,race___hawaiian_pi,race___other,race___unknown,ethnicity___hispanic,ethnicity___not_hispanic,ethnicity___unknown,onset,external_id
0,1,123456789,HannahTestingRecord,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2020-10-27,123456789
1,2,987654321,TestRecord2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,987654321
2,3,223456789,VinodTestingRecord,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2020-10-28,223456789
3,4,223456789,Vinod2TestingRecord,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2020-10-28,223456789
4,5,100500819,Test,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100500819
5,6,100300338,Derrick,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100300338
6,7,100300340,Rodney,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100300340
7,8,100501819,Test,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100501819


In [40]:
# Split cases_df into two frames; the helper's two return values are unpacked
# in order into complete_records and incomplete_records.
# NOTE(review): in the recorded outputs the complete rows are exactly the ones
# with an `onset` value -- confirm the actual criterion in cc_utilities.redcap_sync.
complete_records, incomplete_records = split_complete_and_incomplete_records(cases_df)
# Display the complete records.
complete_records

Unnamed: 0,record_id,cdms_id,first_name,hard_to_isolate_reasons___food,hard_to_isolate_reasons___unable_to_distance,hard_to_isolate_reasons___prescription,hard_to_isolate_reasons___med_appts,hard_to_isolate_reasons___employer,hard_to_isolate_reasons___mental_health_resources,hard_to_isolate_reasons___nothing,...,race___amer_indian_alaskan,race___asian,race___hawaiian_pi,race___other,race___unknown,ethnicity___hispanic,ethnicity___not_hispanic,ethnicity___unknown,onset,external_id
0,1,123456789,HannahTestingRecord,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2020-10-27,123456789
2,3,223456789,VinodTestingRecord,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2020-10-28,223456789
3,4,223456789,Vinod2TestingRecord,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,2020-10-28,223456789


In [41]:
# Display the other half of the split: the records returned as incomplete.
incomplete_records

Unnamed: 0,record_id,cdms_id,first_name,hard_to_isolate_reasons___food,hard_to_isolate_reasons___unable_to_distance,hard_to_isolate_reasons___prescription,hard_to_isolate_reasons___med_appts,hard_to_isolate_reasons___employer,hard_to_isolate_reasons___mental_health_resources,hard_to_isolate_reasons___nothing,...,race___amer_indian_alaskan,race___asian,race___hawaiian_pi,race___other,race___unknown,ethnicity___hispanic,ethnicity___not_hispanic,ethnicity___unknown,onset,external_id
1,2,987654321,TestRecord2,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,987654321
4,5,100500819,Test,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100500819
5,6,100300338,Derrick,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100300338
6,7,100300340,Rodney,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100300340
7,8,100501819,Test,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,,100501819


In [None]:
# upload_complete_records(
#     cases_df, commcare_api_key, commcare_project_name, commcare_user_name
# )

In [42]:
# The real upload call is left commented out:
# upload_incomplete_records(
#     cases_df, commcare_api_key, commcare_project_name, commcare_user_name
# )

# Build one single-row DataFrame per incomplete record. For each row, the
# missing/NA values are dropped from the Series and the remainder is turned
# back into a DataFrame, so each frame holds only that record's populated
# columns. Every frame is kept in the list (previously each iteration
# overwrote the one before it, so only the last record could be inspected).
# NOTE(review): presumably this mirrors the per-record preparation done inside
# upload_incomplete_records() -- confirm against cc_utilities.redcap_sync.
per_record_frames = [
    row.dropna().to_frame().transpose()
    for _, row in incomplete_records.iterrows()
]

# Preview the frame for the last record. Falling back to a zero-row slice
# keeps `data` defined, and this cell runnable, when there are no incomplete
# records at all (the loop form raised NameError in that case).
data = per_record_frames[-1] if per_record_frames else incomplete_records.head(0)

data

Unnamed: 0,record_id,cdms_id,first_name,hard_to_isolate_reasons___food,hard_to_isolate_reasons___unable_to_distance,hard_to_isolate_reasons___prescription,hard_to_isolate_reasons___med_appts,hard_to_isolate_reasons___employer,hard_to_isolate_reasons___mental_health_resources,hard_to_isolate_reasons___nothing,...,race___black,race___amer_indian_alaskan,race___asian,race___hawaiian_pi,race___other,race___unknown,ethnicity___hispanic,ethnicity___not_hispanic,ethnicity___unknown,external_id
7,8,100501819,Test,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,100501819
