## Section 1.0: Importing the necessary libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib

### Section 1.1.1: Loading the datasets

In [2]:
# Read the EID/VL CSV file (path is relative to the notebook's working directory)
eid_vl_dataset = pd.read_csv(filepath_or_buffer='dataset/EID-VL-dataset.csv')

### Section 1.1.2: Viewing the dataset

In [3]:
# Show the loaded DataFrame as this cell's rich output
eid_vl_dataset

Unnamed: 0,pseudonymous_id,facility_id,sex,dob,initiation_date,datecollected,datereceived,datedispatched,lab_id
0,b32bd2a98e526373197f1725093d83d8,1463,2,1997-01-12,2004-12-09,2022-10-06,2022-10-07,2022-10-28,6
1,b39c3247661d069195f888641ecc76f7,3142,3,1980-03-03,2019-10-03,2021-06-14,2021-08-13,2022-02-02,5
2,b39c3247661d069195f888641ecc76f7,3142,3,1980-03-03,2019-10-03,2021-06-14,2021-08-13,2022-08-01,10
3,ec71bb536d907ebe241a80bd002186c0,3130,3,1984-06-15,2011-06-09,2021-06-17,2021-08-10,2022-01-28,5
4,ec71bb536d907ebe241a80bd002186c0,3130,3,1984-06-15,2011-06-09,2021-06-17,2021-08-10,2022-08-01,10
...,...,...,...,...,...,...,...,...,...
995,d4b52f2e40dd213c4c7d8e04ed19ff03,4899,2,1965-01-01,2004-06-01,2022-04-26,2022-04-28,2022-05-16,5
996,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2021-05-28,2021-06-02,2022-01-03,5
997,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2022-05-04,2022-05-04,2022-05-14,5
998,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2022-12-17,2022-12-20,2022-12-23,5


### Section 1.1.3: Viewing the eid-vl dataset columns

In [4]:
# List the column labels of the loaded DataFrame
eid_vl_dataset.columns

Index(['pseudonymous_id', 'facility_id', 'sex', 'dob', 'initiation_date',
       'datecollected', 'datereceived', 'datedispatched', 'lab_id'],
      dtype='object')

### Section 1.1.4: Creating a pipeline to flag duplicates

- Flagging rows with the same `pseudonymous_id` and `facility_id`

In [5]:
## Columns whose combined value identifies a potential duplicate
duplicate_key_columns = ['pseudonymous_id', 'facility_id']

## Count how many rows share each (pseudonymous_id, facility_id) pair
counts = eid_vl_dataset.groupby(duplicate_key_columns).size()

## A pair that occurs more than once is considered a duplicate
duplicates = counts[counts > 1]

## Flag every row whose key pair is repeated; keep=False marks all occurrences,
## not only the later ones. This uses pandas' vectorised `duplicated` instead of
## looking up a tuple per row with `apply`.
## groupby leaves out rows with a missing key, so those rows are excluded here
## as well to keep the flagged rows consistent with `duplicates`.
has_complete_key = eid_vl_dataset[duplicate_key_columns].notna().all(axis=1)
is_repeated_key = eid_vl_dataset.duplicated(subset=duplicate_key_columns, keep=False)

## Sort so that rows belonging to the same pair sit next to each other
eid_vl_dataset_duplicates = (
    eid_vl_dataset[has_complete_key & is_repeated_key]
    .sort_values(duplicate_key_columns)
)


In [6]:
# Show the rows flagged as duplicates (sorted by pseudonymous_id and facility_id)
print("Duplicate rows:")
eid_vl_dataset_duplicates

Duplicate rows:


Unnamed: 0,pseudonymous_id,facility_id,sex,dob,initiation_date,datecollected,datereceived,datedispatched,lab_id
851,01730c3209af0945ee4415bc28b7961f,2715,3,1953-06-15,2006-06-21,2021-06-15,2021-08-06,2022-01-27,5
852,01730c3209af0945ee4415bc28b7961f,2715,3,1953-06-15,2006-06-21,2021-06-15,2021-08-09,2022-08-01,10
267,03ac7615fbaa57604c55af18dac5dabc,4840,1,2005-05-14,2006-06-05,2022-04-25,2022-04-25,2022-04-28,5
268,03ac7615fbaa57604c55af18dac5dabc,4840,1,2005-05-14,2006-06-05,2022-09-22,2022-09-23,2022-09-25,5
732,0589a4882e17aaf8d4c346960e882220,4840,2,1964-01-01,2007-09-01,2022-04-14,2022-04-14,2022-04-15,5
...,...,...,...,...,...,...,...,...,...
859,f8ec85c28af510e23f77417976c69f53,4840,1,1970-01-01,2007-02-01,2022-09-07,2022-09-07,2022-09-16,5
404,fde34841225e1a98c96d1bdb8b6d349e,4104,2,1983-12-06,2008-12-16,2022-07-21,2022-07-22,2022-08-02,5
405,fde34841225e1a98c96d1bdb8b6d349e,4104,2,1983-12-06,2008-12-16,2022-10-21,2022-10-22,2022-11-10,5
389,fff01f795bfbc421055e8ad9cfd7afe0,4899,2,1978-01-01,,2022-04-19,2022-04-20,2022-04-22,5


- Flagging rows with the same `pseudonymous_id`, `facility_id` and `dob`
   