# MIS-CPIMS Data Import Scripts

Import pandas library and read exported CPARA data from MIS (XLSX). Display all columns for exploration.

In [18]:
import pandas as pd
# Reads USAID 4The Child cpara data
cpara=pd.read_excel("Data/CPARA.xlsx", sheet_name="Sheet1")
# cpara.columns

In [19]:
# reads the mapping for U4TC mapping for HH_id, cpims_ovc_id, caregiver id
cpims_hhs=pd.read_csv("Data/Household ID OVC Mapping.csv")

In [20]:
# Reads individual cpara data for U4TC
cpara_ovc_questions = pd.read_excel("Data/OVC_Questions.xlsx", sheet_name="Sheet1")

In [21]:
# Generate UID whose length is equivalent to the U4TC cpara data
import uuid

record_counts = cpara['record_id'].count()

def generate_uid(count):
    uid_list = []
    for i in range (0, count):
        uuid_gen = uuid.uuid4()
        # print(f'generated uid: {uuid_gen} inte: {i}')
        uid_list.append(uuid_gen)
    return uid_list    

df_uid = pd.DataFrame(generate_uid(record_counts))
print(record_counts)

32516


In [22]:
# Reads the cpims questions mapping
cpims_questions=pd.read_excel("Data/ovc_care_questions.xlsx")

In [23]:
# Mergers cpara data with household id, cpims id on caregiver_id
cpara_data1=pd.merge(cpara, cpims_hhs, how='left', left_on='cpims_id', right_on='caregiver_id')
# Merges cpara data with the event id uids
cpara_data2 = pd.merge(cpara_data1, df_uid, left_index=True, right_index=True)
# cpara_data2.columns

In [24]:
# output the uids mapped to hh, cg, cpims
df_uid_event = cpara_data2[['household','caregiver_id', 'cpims_ovc_id', 0]]
df_uid_events = df_uid_event.rename(columns={0: 'event_id'})

df_uid_events.to_csv('event_id_uids.csv')

In [25]:
# cpara individual table merges with with household id, cpims id on caregiver_id
cpara_individual=pd.merge(cpara_ovc_questions, cpims_hhs, how='left', left_on='caregiver_id', right_on='caregiver_id')
# cpara_individual.head(2)

In [26]:
cpara_data = cpara_data2.rename(columns={"assessment_date": "date_of_event","client_time": "timestamp_created","cpims_id": "caregiver_id_ustc",0:  "event_id","household": "household_id","cpims_ovc_id": "person_id"})

cpara_data['cpara_id'] = ''
cpara_data['date_of_previous_event'] = ''
cpara_data['is_void'] = 'f'
cpara_data['timestamp_updated'] = ''
# cpara_data.columns

Align the MIS data from rows to columns => vertical and only display the first two records. Move the columns that should not be considered as var.

In [27]:
# converts the horizontal table of cpara data to vertical table for manipulation
cpara_unpivot=pd.melt(cpara_data, id_vars=["record_id","date_of_event","case_manager","caregiver_id_ustc","child_headed","has_hei","has_pbf","has_svac","healthy","stable","safe","schooled","entry_time","action_by","designation","lip","entry_by_name","timestamp_created","deleted","modified","gps","device","cpimsId","names","caregiverhivstatus","cbo_id","cbo","county","subcounty","ward","chv_id","chv",'Unnamed: 0',"person_id","caregiver_id","household_id","event_id","cpara_id","date_of_previous_event","is_void","timestamp_updated"])

# cpara_data.columns

In [28]:
cpara_unpivot.columns

Index(['record_id', 'date_of_event', 'case_manager', 'caregiver_id_ustc',
       'child_headed', 'has_hei', 'has_pbf', 'has_svac', 'healthy', 'stable',
       'safe', 'schooled', 'entry_time', 'action_by', 'designation', 'lip',
       'entry_by_name', 'timestamp_created', 'deleted', 'modified', 'gps',
       'device', 'cpimsId', 'names', 'caregiverhivstatus', 'cbo_id', 'cbo',
       'county', 'subcounty', 'ward', 'chv_id', 'chv', 'Unnamed: 0',
       'person_id', 'caregiver_id', 'household_id', 'event_id', 'cpara_id',
       'date_of_previous_event', 'is_void', 'timestamp_updated', 'variable',
       'value'],
      dtype='object')

In [29]:
# filters bencmark questions and pivots them.
ovc_care_benchmark = cpara_unpivot[cpara_unpivot["variable"].str.contains('benchmark')]#.pivot(index=['record_id', 'cpims_id','assessment_date','entry_time', 'modified', 'cpims_ovc_id', 'household', 0], columns='variable', values=['value'])
# ovc_care_benchmark.to_excel("ovc_care_benchmark.xlsx")
unique_hh = ovc_care_benchmark['household_id'].str.contains('NaN')

# print(unique_hh.str.contains('NaN'))

In [30]:
unique_hh.describe()

count     292644
unique         1
top        False
freq      292644
Name: household_id, dtype: object

In [31]:
# Filter cpara question and pivots them or not.  // send to excel
#ovc_care_cpara = cpara_unpivot[cpara_unpivot["variable"].str.startswith('q')]
ovc_care_cpara = cpara_unpivot[cpara_unpivot["variable"].str.startswith('q')]#.pivot(index=['record_id', 'cpims_id','assessment_date','cpims_ovc_id', 'household'], columns='variable', values=['value'])
# ovc_care_cpara.to_csv("ovc_care_cpara_pivot.csv")

In [32]:
cpims_questions.columns

Index(['question_id', 'code', 'question', 'domain', 'question_text',
       'question_type', 'is_void', 'timestamp_created', 'timestamp_updated',
       'form_id', 'CURRENT cPARA', 'ind_cpara', 'cpara'],
      dtype='object')

In [33]:
ovc_cpara_with_q = pd.merge(ovc_care_cpara, cpims_questions, how='left', left_on='variable', right_on='cpara')

# ovc_cpara_with_q.columns

In [34]:
# cpara_individual.to_excel("cpara_ovc_questions.xlsx")
cpara_individual.head(2)

Unnamed: 0.1,record_id,cpara_record_id,cpara_date,question,response,cpims_id,caregiver_id,chv_id,Unnamed: 0,cpims_ovc_id,household
0,5,8859,2022-08-02,q3.1,Yes,1705501,1676623,2042693,43178.0,1687650.0,b540b485-7399-4407-bd14-066f5b79c133
1,5,8859,2022-08-02,q3.1,Yes,1705501,1676623,2042693,43184.0,1705501.0,b540b485-7399-4407-bd14-066f5b79c133


In [35]:
cpims_hhs.columns

Index(['Unnamed: 0', 'cpims_ovc_id', 'caregiver_id', 'household'], dtype='object')

In [36]:
cpara_data.to_excel("CPARA_MAPPING.xlsx")

In [37]:
cpara_data_output = ovc_cpara_with_q.rename(columns={"code": "question_code","value": "answer","question_type": "question_type","domain": "domain","question_id": "question_id"})


cpara_data_output.columns



Index(['record_id', 'date_of_event', 'case_manager', 'caregiver_id_ustc',
       'child_headed', 'has_hei', 'has_pbf', 'has_svac', 'healthy', 'stable',
       'safe', 'schooled', 'entry_time', 'action_by', 'designation', 'lip',
       'entry_by_name', 'timestamp_created_x', 'deleted', 'modified', 'gps',
       'device', 'cpimsId', 'names', 'caregiverhivstatus', 'cbo_id', 'cbo',
       'county', 'subcounty', 'ward', 'chv_id', 'chv', 'Unnamed: 0',
       'person_id', 'caregiver_id', 'household_id', 'event_id', 'cpara_id',
       'date_of_previous_event', 'is_void_x', 'timestamp_updated_x',
       'variable', 'answer', 'question_id', 'question_code', 'question',
       'domain', 'question_text', 'question_type', 'is_void_y',
       'timestamp_created_y', 'timestamp_updated_y', 'form_id',
       'CURRENT cPARA', 'ind_cpara', 'cpara'],
      dtype='object')

In [38]:
columns_to_out = ["cpara_id","question_code","answer","question_type","domain","date_of_event","date_of_previous_event","timestamp_created_x","is_void_x","timestamp_updated_x","caregiver_id","event_id","household_id","person_id","question_id"]

cpara_output_sql = cpara_data_output[columns_to_out]

# cpara_output_sql.to_csv('cpara_data.csv')

cpara_output_sql.iloc[0:20000].to_csv('sample_cpara_data.csv')

In [39]:
# unique_hh_ids = pd.DataFrame(cpara_output_sql['household_id'].unique())
# unique_hh_ids.groupby([])
cpara_output_sql.groupby(['caregiver_id'])['person_id'].count().to_csv('unique_hhs.csv')

In [40]:
cpara_data.head()

Unnamed: 0.1,record_id,date_of_event,caregiver_id_ustc,case_manager,child_headed,has_hei,has_pbf,has_svac,q1_1,q1_2,...,chv,Unnamed: 0,person_id,caregiver_id,household_id,event_id,cpara_id,date_of_previous_event,is_void,timestamp_updated
0,9325,2022-09-12,105151,,No,No,No,No,Yes,na,...,GILLIANN GILLIAN ACHIENGGG,99003,4282788,105151.0,29d199db-2f32-4025-bb9b-94357558550b,89e06cf9-8aff-4d00-8483-16d949ce21c5,,,,
1,9325,2022-09-12,105151,,No,No,No,No,Yes,na,...,GILLIANN GILLIAN ACHIENGGG,99026,4282807,105151.0,29d199db-2f32-4025-bb9b-94357558550b,22cefcf8-0452-4c21-b837-c12953bf2c60,,,,
2,9325,2022-09-12,105151,,No,No,No,No,Yes,na,...,GILLIANN GILLIAN ACHIENGGG,99031,4282821,105151.0,ae1ff881-bf28-471a-9a03-3efceed4d96a,8557c297-c195-4998-b2d6-d744e308bb59,,,,
3,9325,2022-09-12,105151,,No,No,No,No,Yes,na,...,GILLIANN GILLIAN ACHIENGGG,200681,4282788,105151.0,ae1ff881-bf28-471a-9a03-3efceed4d96a,c284dee3-3b5c-4267-8eff-40b1e04303da,,,,
4,9325,2022-09-12,105151,,No,No,No,No,Yes,na,...,GILLIANN GILLIAN ACHIENGGG,200705,4282807,105151.0,ae1ff881-bf28-471a-9a03-3efceed4d96a,46eef32e-baaf-4ea8-b0a6-d2a49b13eb32,,,,


In [41]:
event_columns = ['event_id', 'date_of_event', 'household_id', 'person_id']

ovc_care_events = cpara_data[event_columns]

ovc_care_events

additional_columns = ["event_type_id","event_counter","event_score","date_of_previous_event","created_by","timestamp_created","is_void"]

for i in additional_columns:
    ovc_care_events[i]= ' '

ovc_care_events.columns


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ovc_care_events[i]= ' '
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ovc_care_events[i]= ' '
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ovc_care_events[i]= ' '
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the c

Index(['event_id', 'date_of_event', 'household_id', 'person_id',
       'event_type_id', 'event_counter', 'event_score',
       'date_of_previous_event', 'created_by', 'timestamp_created', 'is_void'],
      dtype='object')

In [42]:
# get sync id
# events_count = ovc_care_events['event_id'].count()
sync_uid = pd.DataFrame(generate_uid(record_counts))
sync_uid

ovc_care_events_wi = ovc_care_events.reset_index()

ovc_care_events_wi

ovc_care_events_sync = pd.merge(ovc_care_events_wi, sync_uid,left_index=True,  right_index=True)

ovc_care_events_sync_out=ovc_care_events_sync.rename(columns={0: 'sync_id'})



In [43]:
ovc_care_events_sync_out['sync_id'].count()
cpara['record_id'].count()

32516

In [44]:
ovc_care_events_sync_out.iloc[0:1000].to_csv('ovc_care_events.csv')