In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [39]:
h_facility = pd.read_csv('synthetic_facility_v3.csv')
hdss = pd.read_csv('synthetic_hdss_v3.csv')

In [3]:
h_facility.head(5)

Unnamed: 0,recnr,firstname,lastname,petname,dob,sex,nationalid,patientid,visitdate
0,2,Fatuma,,Zaina,24/08/2017 00:00,2,N_ID_5000,2069,10/09/2018
1,3,Gloria,Rashida,,11/07/1993 00:00,2,N_ID_11861,2079,14/12/2022
2,4,Ali,Hakram,Igomu,17/05/2014 00:00,1,N_ID_11864,2080,09/06/2023
3,5,Nakalema,,Nkwanga,27/02/2026 00:00,2,N_ID_11867,2081,07/02/2019
4,6,Asuman,Sempa,Aguti,02/03/2002 00:00,1,N_ID_11870,2082,18/08/2020


In [4]:
hdss.head(5)

Unnamed: 0,recnr,firstname,lastname,petname,dob,sex,nationalid,hdssid,hdsshhid
0,1,Zaina,Hanifa,Ula,22/09/1930 00:00,2,,I20001,HH100001
1,2,Godfrey,Maganda,Mukama,15/07/1934 00:00,1,,I20002,HH100002
2,3,Kasim,Ngobi,Galabuzi,03/03/1983 00:00,1,,I20003,HH100003
3,4,Esther,,Inara,30/07/1968 00:00,2,,I20004,HH100004
4,5,Sumaya,Swabula,,13/12/1930 00:00,2,,I20005,HH100005


In [3]:
print(f"The health facility dataset has {h_facility.shape[0]} records, the hdss dataset has {hdss.shape[0]} records")

The health facility dataset has 2902 records, the hdss dataset has 4115 records


There are many more records in the hdss than the health facility dataset, makes sense because not everyone in the hdss catchment area may have visited the facility. 

In [9]:
#shared firsname, lastname
counts = h_facility.groupby(['firstname', 'lastname']).size()
shared_names = counts[counts>1]
shared_names.size

534

In [10]:
counts

firstname   lastname 
Ababu       Crispus      1
Abalyogera  Namulondo    1
Abasa       Bukosi       1
            Nathan       1
Abatuka     Nahiya       1
                        ..
Zubedah     Nakaziba     2
Zulaika     Naigaga      1
            Nakaziba     1
            Namumbya     1
Zura        Racheal      1
Length: 2207, dtype: int64

In [11]:
#shared firstname, lastname and petname
counts2 = h_facility.groupby(['firstname', 'lastname', 'petname']).size()
counts2[counts2>1].size

310

In [12]:
#shared firstname, lastname, petname and dob in health facility dataset
counts3 = h_facility.groupby(['firstname', 'lastname', 'petname', 'dob']).size()
counts3[counts3>1].size

259

In [13]:
#Data points with firstname, petname and lastname in both datasets 
#all duplicates in either dataframe are repeated 
common_names = pd.merge(hdss, h_facility, on = ['firstname', 'lastname', 'petname'], how = 'inner')
common_names.shape

(1347, 15)

In [82]:
common_names.head()

Unnamed: 0,recnr_x,firstname,lastname,petname,dob_x,sex_x,nationalid_x,hdssid,hdsshhid,recnr_y,dob_y,sex_y,nationalid_y,patientid,visitdate
0,8,Hassan,Kapisi,Ganyana,25/03/1981 00:00,1,,I20008,HH100008,764,25/03/1981 00:00,1,N_ID_7262,2834,27/03/2020
1,9,Zubairi,Adam,Lubega,01/10/1931 00:00,1,,I20009,HH100009,2093,01/10/1931 00:00,1,N_ID_11240,4161,12/05/2020
2,13,Kitimbo,Bahati,Labeja,04/10/1995 00:00,1,,I20013,HH100013,2806,04/10/1995 00:00,1,N_ID_13691,4966,13/03/2019
3,14,Nakaziba,Kamega,,28/07/1947 00:00,2,,I20014,HH100014,2800,28/07/1947 00:00,2,N_ID_13397,4868,21/01/2023
4,15,Nangobi,Shamimu,,12/08/2012 00:00,2,,I20015,HH100015,774,12/08/2012 00:00,2,N_ID_7292,2844,08/12/2022


In [65]:
#first removing the duplicates in any one dataset before finding shared datapoints 
h_facility_unique = h_facility.drop_duplicates(subset=['firstname', 'lastname'])
hdss_unique = hdss.drop_duplicates(subset=['firstname', 'lastname'])

In [66]:
common_names1 = pd.merge(h_facility_unique, hdss_unique, on = ['firstname', 'lastname'], how = 'inner')
common_names1.shape

(2124, 16)

## HDSS Dataset

In [34]:
from pandas_profiling import ProfileReport

  from pandas_profiling import ProfileReport


In [40]:
profile = ProfileReport(hdss, title='Pandas Profiling Report', explorative=True)

In [41]:
profile.to_file("hdss_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Health Facility dataset

In [42]:
hf_profile = ProfileReport(h_facility, title = "EDA report", explorative = True)

In [44]:
hf_profile.to_file("hf_report.html")

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

In [47]:
print(h_facility.dob.dtype)

object


In [49]:
h_facility.dtypes

recnr          int64
firstname     object
lastname      object
petname       object
dob           object
sex            int64
nationalid    object
patientid      int64
visitdate     object
dtype: object

In [51]:
combined_df = pd.concat([hdss, h_facility], ignore_index=True)

In [52]:
combined_df.head()

Unnamed: 0,recnr,firstname,lastname,petname,dob,sex,nationalid,hdssid,hdsshhid,patientid,visitdate
0,1,Zaina,Hanifa,Ula,22/09/1930 00:00,2,,I20001,HH100001,,
1,2,Godfrey,Maganda,Mukama,15/07/1934 00:00,1,,I20002,HH100002,,
2,3,Kasim,Ngobi,Galabuzi,03/03/1983 00:00,1,,I20003,HH100003,,
3,4,Esther,,Inara,30/07/1968 00:00,2,,I20004,HH100004,,
4,5,Sumaya,Swabula,,13/12/1930 00:00,2,,I20005,HH100005,,


In [53]:
combined_df.shape

(7017, 11)

In [66]:
#how many in the hdss dataset share firstname, lastname, petname and dob

counts = hdss.groupby(['firstname', 'lastname', 'petname', 'dob', 'sex']).size()
counts[counts>1].size

0

In [67]:
#how many in the health facility dataset share firstname, lastname, petname and dob

counts = h_facility.groupby(['firstname', 'lastname', 'petname', 'dob', 'sex']).size()
counts[counts>1].size

259

In [57]:
counts[counts>1]

firstname  lastname  petname    dob             
Abubakali  Lunkumu   Jjunju     07/08/1934 00:00    2
Abuneli    Lawrence  Asega      11/08/1959 00:00    2
Aidah      Fatuma    Aya        13/12/2011 00:00    2
Akello     Jesca     Barungi    20/12/1937 00:00    2
Alex       Idinda    Nakato     08/10/1967 00:00    2
                                                   ..
Zaina      Mbabazi   Nakayiza   12/12/2017 00:00    2
Zauma      Bridget   Nakalema   28/04/1993 00:00    2
Zawumina   Nakaziba  Kamau      21/09/1959 00:00    2
Ziriya     Kauma     Kato       11/11/1963 00:00    2
Zubedah    Nakaziba  Nakalanzi  03/03/1982 00:00    2
Length: 259, dtype: int64

In [68]:
hf_unique = h_facility.drop_duplicates(subset=['firstname', 'lastname', 'dob', 'petname', 'sex'])

hf_unique.shape


(2609, 9)

293

In [60]:
ss = hf_unique.groupby(['firstname', 'lastname', 'petname', 'dob']).size()
ss[ss>1].size

0

### Concatenate datasets

In [61]:
combined_df = pd.concat([hdss, hf_unique], ignore_index=True)

In [62]:
combined_df.head()

Unnamed: 0,recnr,firstname,lastname,petname,dob,sex,nationalid,hdssid,hdsshhid,patientid,visitdate
0,1,Zaina,Hanifa,Ula,22/09/1930 00:00,2,,I20001,HH100001,,
1,2,Godfrey,Maganda,Mukama,15/07/1934 00:00,1,,I20002,HH100002,,
2,3,Kasim,Ngobi,Galabuzi,03/03/1983 00:00,1,,I20003,HH100003,,
3,4,Esther,,Inara,30/07/1968 00:00,2,,I20004,HH100004,,
4,5,Sumaya,Swabula,,13/12/1930 00:00,2,,I20005,HH100005,,


In [69]:
duplicates = combined_df.duplicated(subset=['firstname', 'lastname', 'petname', 'dob', 'sex'], keep=False)

In [70]:
counts = combined_df.groupby(['firstname', 'lastname', 'petname', 'dob', 'sex']).size()
counts[counts>1].size

587

In [71]:
# Merge the datasets to find common records
common_records = pd.merge(hf_unique, hdss, on=['firstname', 'lastname', 'petname', 'dob', 'sex'], how='inner')
len(common_records)

870

In [81]:
common_records.head()

Unnamed: 0,recnr_x,firstname,lastname,petname,dob,sex,nationalid_x,patientid,visitdate,recnr_y,nationalid_y,hdssid,hdsshhid
0,3,Gloria,Rashida,,11/07/1993 00:00,2,N_ID_11861,2079,14/12/2022,2613,,I22613,HH102613
1,8,Namutamba,Sumaya,Bria,18/02/1997 00:00,2,N_ID_11876,2084,11/08/2023,2902,,I22902,HH102902
2,9,Madina,Shirati,Sira,10/07/1997 00:00,2,N_ID_5006,2085,11/06/2021,2951,,I22951,HH102951
3,12,Nasiri,Kirunda,Ebitu,02/06/1959 00:00,1,N_ID_5015,2088,28/11/2018,1721,,I21721,HH101721
4,16,Akiramu,Kigenyi,Malinga,18/08/2010 00:00,1,N_ID_5027,2091,07/04/2020,3060,,I23060,HH103060


## Permutations of duplications

In [74]:
counts = h_facility.groupby(['firstname', 'petname', 'dob', 'sex']).size()
counts[counts>1].size

264

In [76]:
counts = h_facility.groupby(['firstname','lastname', 'dob', 'sex']).size()
counts[counts>1].size

271

In [78]:
counts = hdss.groupby(['firstname','lastname', 'dob', 'sex', 'petname']).size()
counts[counts>1].size

0

In [79]:
counts = hdss.groupby(['firstname','lastname', 'dob', 'sex']).size()
counts[counts>1].size

0

In [80]:
counts = hdss.groupby(['firstname', 'dob', 'sex', 'petname']).size()
counts[counts>1].size

0