# Working with ManyToMany field data

### SQL Query to get and group m2m data
<code>select V.subject_identifier, C.short_name from bcpp_subject_hypertensioncardiovascular as A   left join bcpp_subject_hypertensioncardiovascular_medication_taken as B on A.id = B.hypertensioncardiovascular_id
left join bcpp_subject_medication as C on C.id = B.medication_id
left join bcpp_subject_subjectvisit as V on V.id = A.subject_visit_id
where C.id is not null and B.medication_id is not null
group by V.subject_identifier, C.short_name</code>

48 missing subject consent

51 have subject consent visit / CRF

UUIDs in registered subject

In [5]:
import pandas as pd

from datetime import datetime
from django.apps import apps as django_apps
from edc_pdutils.model_to_dataframe import SubjectModelToDataframe, ModelToDataframe
from edc_pdutils.model_to_dataframe import Helper
from edc_pdutils.database import Database
from pprint import pprint

from bcpp_subject.models import Medication
delimiter = ','
export_date_format = '%Y-%m-%d'
helper = Helper()
visit_model = 'bcpp_subject.subjectvisit'
model = 'bcpp_subject.hypertensioncardiovascular'

In [6]:
def get_m2m_fields(model=None):
    m2m_fields = []
    model_cls = django_apps.get_model(model)
    related_fields = [f for f in model_cls._meta.get_fields() if f.is_relation]
    for field in related_fields:
        try:
            field.m2m_db_table()
            m2m_fields.append(field)
        except AttributeError:
            pass
    return m2m_fields

def get_m2m_df(field=None):
    db = Database()
    # get m2m interim data
    df_m2m = db.select_table(
        table_name=field.m2m_db_table(),
        uuid_columns=[field.m2m_column_name(), field.m2m_reverse_name()])
    # get medication list
    qs = field.related_model.objects.filter(medication_given__isnull=False)
    df_list = ModelToDataframe(
        queryset=qs,
        drop_sys_columns=True).dataframe
    # df_list = db.select_table(table_name=list_model, uuid_columns=['id'])
    df_m2m = df_m2m[[field.m2m_column_name(), field.m2m_reverse_name()]]
    df_m2m = pd.merge(df_m2m, df_list, left_on=field.m2m_reverse_name(),
                      right_on='id', how='inner')
    df_m2m = df_m2m[[field.m2m_column_name(), 'short_name']]
    df_m2m = df_m2m.rename(columns={'short_name': field.column})
    return df_m2m

def pivot_m2m_df(df_m2m=None, field=None):
    print(list(df_m2m.columns))
    df_m2m['count'] = 1
    df_m2m = df_m2m.pivot(index=field.m2m_column_name(), columns=field.column, values='count')
    df_m2m = df_m2m.reset_index()
    # print(list(df_m2m.columns))
    return df_m2m

def merge_m2m_df_with_crf(df_crf=None, df_m2m=None, field=None, crf_id_column=None):
    print(f'left_on={crf_id_column}')
    print(f'right_on={field.m2m_column_name()}')
    df = pd.merge(
        df_crf[[crf_id_column, 'subject_identifier', 'report_datetime', 'visit_code']],
        df_m2m, left_on=crf_id_column, right_on=field.m2m_column_name())
    # df = df.drop(field.m2m_column_name(), axis=1)
    return df
    

In [7]:
# print model name
model_cls = django_apps.get_model(model)
print(f'model=\'{model_cls._meta.label_lower}\'')

model='bcpp_subject.hypertensioncardiovascular'


In [8]:
qs = Medication.objects.filter(medication_given__isnull=False)
df_list = ModelToDataframe(queryset=qs, drop_sys_columns=True).dataframe
df_list.head()

df_crf = helper.get_crf_dataframe(model=model)
df_crf = df_crf.rename(columns={'id': 'crf_id'})
df_crf = df_crf.reset_index()
df_crf = df_crf[['crf_id', 'subject_visit_id']]
df_crf = df_crf.sort_values('crf_id')
print(df_crf.head())

db = Database()
field = get_m2m_fields(model=model)[0]
df_m2m = db.select_table(
    table_name=field.m2m_db_table(),
    uuid_columns=[field.m2m_column_name(), field.m2m_reverse_name()])
df_m2m = df_m2m.sort_values(field.m2m_column_name())
print(df_m2m.head())
# df = pd.merge(df_m2m, df_crf,
#               left_on='hypertensioncardiovascular_id', right_on='crf_id')
# df.head()


# #model_cls.objects.filter(medication_taken__isnull=False).count()
# #Medication.objects.filter(medication_given__isnull=False)
# # df.groupby('short_name').size()
# df.head()

                                 crf_id                      subject_visit_id
0  0003b18b-8cf6-4087-ae55-86c343e90e54  842c1725-2116-4f36-8d0e-3071e02cae72
1  001664a0-ebc6-4c86-ae22-1ddd1751e559  410bf124-0416-4e3b-8d5c-3f47744f7c7d
2  004883e7-3ea1-48d6-99c7-bd216cfaa367  0268d184-a4ca-4415-a18c-38adb516b672
3  005f9309-11f5-4654-968a-9e6981ec84df  aa5ca88a-0141-449b-bc1f-5d8a0e8ea0d7
4  0062e943-2638-47fc-8ea0-d8c48aff5808  d907d4f7-b524-44a8-9e72-76f8dd0fc6d2
    id         hypertensioncardiovascular_id  \
0   54  00d893b0-82e5-4f71-a950-1ddb2c3a3b43   
1  172  01997ec1-9dba-4c21-bc0e-9c2b1f163e70   
2  340  01c35b35-8064-47dd-8bdd-b4e8bebd840f   
3   21  02811301-53db-4158-854c-ecf81c645b61   
4   20  02811301-53db-4158-854c-ecf81c645b61   

                          medication_id  
0  703efa68-3b26-465b-b3b2-63501b39a380  
1  f1d66a5c-a538-4ce2-bf5c-7e07a8d38d30  
2  f70f0273-8245-49b4-9e72-43e0d9520e3f  
3  f1d66a5c-a538-4ce2-bf5c-7e07a8d38d30  
4  f70f0273-8245-49b4-9e72-43e0d9

In [9]:
# print m2m table name and relevant columns
f = [f for f in model_cls._meta.get_fields() if f.is_relation][2]
print(f'column=\'{f.column}\'')
print(f'm2m_db_table=\'{f.m2m_db_table()}\'')
print(f'm2m_column_name=\'{f.m2m_column_name()}\'')
print(f'm2m_reverse_name=\'{f.m2m_reverse_name()}\'')

column='medication_given'
m2m_db_table='bcpp_subject_hypertensioncardiovascular_medication_given'
m2m_column_name='hypertensioncardiovascular_id'
m2m_reverse_name='medication_id'


In [10]:
# get visit df
df_visit = ModelToDataframe(
    model=visit_model, drop_sys_columns=True).dataframe
df_visit = df_visit.rename(columns={'id': 'subject_visit_id', 'report_datetime': 'visit_datetime'})

In [15]:
# get CRF df
visit_cols = ['subject_visit_id', 'subject_identifier', 'visit_datetime', 'visit_code']
df_crf = helper.get_crf_dataframe(model=model)
df_crf = df_crf.rename(columns={'id': 'crf_id'})
df_crf = df_crf.reset_index()
df_crf = pd.merge(
    df_visit[visit_cols], df_crf, on='subject_visit_id')
df_crf.head()
# pprint(list(df_crf.columns))

Unnamed: 0,subject_visit_id,subject_identifier,visit_datetime,visit_code,crf_id,form_as_json,consent_version,report_datetime,hypertension_diagnosis,health_care_facility,...,right_arm_one,left_arm_one,right_arm_two,left_arm_two,bm,bm_refused_reason,waist_reading_one,waist_reading_two,hip_reading_one,hip_reading_two
0,d855caa3-fc62-4dbe-9791-455ed777c84c,066-15100001-6,2017-04-11 13:19:24,T2,ee2669ab-450a-4e1d-b692-f5c1c8549935,,?,2017-04-11 13:44:08.324,No,,...,,,,,No,DWTA,,,,
1,2f9b39c4-7123-4776-81ad-e937eb080bb4,066-15100002-0,2017-03-31 15:03:43,T2,61ce23ca-d7d3-489f-b33a-784223ae81a7,,?,2017-03-31 15:33:29.963,No,,...,,,,,No,no_time,,,,
2,4662fa69-b39b-428d-8fdb-a6cee3f04a2e,066-15100003-1,2017-04-28 08:59:59,T2,4e7e3704-5550-45c9-acbc-b1b1ce213f0d,,?,2017-04-28 09:12:02.782,No,,...,125/78,122/75,126/78,122/75,No,not_sure,,,,
3,cf3e01d7-33a9-43ba-9484-1cb2eba73f03,066-15100004-2,2017-04-23 08:05:55,T2,b1feae63-ed36-4b2a-b1ac-59433c152a0a,,?,2017-04-23 08:36:47.975,Yes,clinic,...,154/119,182/112,167/113,183/110,No,no_time,,,,
4,d9fea684-3d24-4bd6-b569-b7b138f3057e,066-15100006-4,2017-04-26 12:09:10,T2,5e01655c-ba07-4628-aa4e-6095df0f45be,,?,2017-04-26 12:23:41.334,No,,...,,,,,No,DWTA,,,,


In [11]:
dataframes = {}
for field in get_m2m_fields(model=model):
    df_m2m = get_m2m_df(field)
    df_m2m = pivot_m2m_df(df_m2m=df_m2m, field=field)
    df = merge_m2m_df_with_crf(
        df_crf=df_crf, df_m2m=df_m2m, field=field, crf_id_column='crf_id')
df_crf.info()
#    df.head()
    # dataframes.update({field.column: {'m2m': df_m2m, 'df': df}})
# for name, dfs in dataframes.items():
#     print(name, dfs.get('m2m').head())
    # print(df_m2m[pd.notnull(df_m2m['atenolol'])].head())

['hypertensioncardiovascular_id', 'medication_taken']
left_on=crf_id
right_on=hypertensioncardiovascular_id


KeyError: "['subject_identifier' 'report_datetime' 'visit_code'] not in index"

In [17]:
df.head()

Unnamed: 0,crf_id,subject_identifier,report_datetime,visit_code,hypertensioncardiovascular_id,N/A,OTHER,amlodipine,atenolol,bisoprolol,...,carvedilol,co_micardis,doxazosin,enalapril,furosemide,hydralazine,hydrochlrothiazide,nifedipine,propranolol,spirinolactone


In [9]:
df_crf.head()

Unnamed: 0,subject_visit_id,subject_identifier,visit_datetime,visit_code,crf_id,form_as_json,consent_version,report_datetime,hypertension_diagnosis,health_care_facility,...,right_arm_one,left_arm_one,right_arm_two,left_arm_two,bm,bm_refused_reason,waist_reading_one,waist_reading_two,hip_reading_one,hip_reading_two
0,d855caa3-fc62-4dbe-9791-455ed777c84c,066-15100001-6,2017-04-11 13:19:24,T2,ee2669ab-450a-4e1d-b692-f5c1c8549935,,?,2017-04-11 13:44:08.324,No,,...,,,,,No,DWTA,,,,
1,2f9b39c4-7123-4776-81ad-e937eb080bb4,066-15100002-0,2017-03-31 15:03:43,T2,61ce23ca-d7d3-489f-b33a-784223ae81a7,,?,2017-03-31 15:33:29.963,No,,...,,,,,No,no_time,,,,
2,4662fa69-b39b-428d-8fdb-a6cee3f04a2e,066-15100003-1,2017-04-28 08:59:59,T2,4e7e3704-5550-45c9-acbc-b1b1ce213f0d,,?,2017-04-28 09:12:02.782,No,,...,125/78,122/75,126/78,122/75,No,not_sure,,,,
3,cf3e01d7-33a9-43ba-9484-1cb2eba73f03,066-15100004-2,2017-04-23 08:05:55,T2,b1feae63-ed36-4b2a-b1ac-59433c152a0a,,?,2017-04-23 08:36:47.975,Yes,clinic,...,154/119,182/112,167/113,183/110,No,no_time,,,,
4,d9fea684-3d24-4bd6-b569-b7b138f3057e,066-15100006-4,2017-04-26 12:09:10,T2,5e01655c-ba07-4628-aa4e-6095df0f45be,,?,2017-04-26 12:23:41.334,No,,...,,,,,No,DWTA,,,,


In [129]:
timestamp = datetime.today().strftime('%Y%m%d%H%M%S')
for column, df_m2m in dfs.items():
    df_m2m.to_csv(
        f'~/bcpp_{column}_{timestamp}.csv',
        date_format=export_date_format,
        sep=delimiter)