## Section 1.0: Importing the necessary libraries

In [6]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import re

### Section 1.1.1: Loading the datasets

In [7]:
# Read the EID-VL CSV (path is relative to the notebook's working directory)
# into a DataFrame that later cells inspect and clean.
eid_vl_dataset = pd.read_csv(filepath_or_buffer='dataset/EID-VL-dataset.csv')

### Section 1.1.2: Viewing the dataset

In [8]:
# Display the loaded DataFrame; the notebook renders a truncated preview
# (leading and trailing rows) rather than every row.
eid_vl_dataset

Unnamed: 0,pseudonymous_id,facility_id,sex,dob,initiation_date,datecollected,datereceived,datedispatched,lab_id
0,b32bd2a98e526373197f1725093d83d8,1463,2,1997-01-12,2004-12-09,2022-10-06,2022-10-07,2022-10-28,6
1,b39c3247661d069195f888641ecc76f7,3142,3,1980-03-03,2019-10-03,2021-06-14,2021-08-13,2022-02-02,5
2,b39c3247661d069195f888641ecc76f7,3142,3,1980-03-03,2019-10-03,2021-06-14,2021-08-13,2022-08-01,10
3,ec71bb536d907ebe241a80bd002186c0,3130,3,1984-06-15,2011-06-09,2021-06-17,2021-08-10,2022-01-28,5
4,ec71bb536d907ebe241a80bd002186c0,3130,3,1984-06-15,2011-06-09,2021-06-17,2021-08-10,2022-08-01,10
...,...,...,...,...,...,...,...,...,...
995,d4b52f2e40dd213c4c7d8e04ed19ff03,4899,2,1965-01-01,2004-06-01,2022-04-26,2022-04-28,2022-05-16,5
996,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2021-05-28,2021-06-02,2022-01-03,5
997,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2022-05-04,2022-05-04,2022-05-14,5
998,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2022-12-17,2022-12-20,2022-12-23,5


### Section 1.1.3: Viewing the eid-vl dataset columns

In [9]:
# List the column labels of the loaded DataFrame.
eid_vl_dataset.columns

Index(['pseudonymous_id', 'facility_id', 'sex', 'dob', 'initiation_date',
       'datecollected', 'datereceived', 'datedispatched', 'lab_id'],
      dtype='object')

### Section 1.1.4: Data cleansing

In [10]:
# Keep a single row per pseudonymous_id: when an ID appears more than once,
# only its first occurrence is retained. Rows that share an ID can differ in
# other columns (e.g. datedispatched, lab_id), so those later records are
# discarded by this step.
# The result is reassigned rather than using inplace=True, so the frame object
# produced by the load cell is not mutated in place.
eid_vl_dataset = eid_vl_dataset.drop_duplicates(subset=['pseudonymous_id'], keep='first')

### Section 1.1.5: Checking the first and last 10 rows in our dataset
- Viewing the first 10 rows
- Viewing the last 10 rows
- Reshuffling and resetting the index (covered in Section 1.1.6)

In [35]:
# Preview the top 10 rows of the de-duplicated DataFrame.
eid_vl_dataset.head(n=10)

Unnamed: 0,pseudonymous_id,facility_id,sex,dob,initiation_date,datecollected,datereceived,datedispatched,lab_id
0,b32bd2a98e526373197f1725093d83d8,1463,2,1997-01-12,2004-12-09,2022-10-06,2022-10-07,2022-10-28,6
1,b39c3247661d069195f888641ecc76f7,3142,3,1980-03-03,2019-10-03,2021-06-14,2021-08-13,2022-02-02,5
3,ec71bb536d907ebe241a80bd002186c0,3130,3,1984-06-15,2011-06-09,2021-06-17,2021-08-10,2022-01-28,5
5,c55a9eea0d9ef3759663390bac550259,3130,2,2010-05-15,2021-10-13,2022-03-04,2022-03-14,2022-11-08,7
6,8d9bf860df35615e20c969e839c58675,2715,1,1959-02-18,2004-03-25,2021-05-31,2021-08-16,2022-01-31,5
9,13a093409e038b8d2fe248a7fe460a05,3130,2,2004-06-15,2006-08-09,2021-05-24,2021-06-05,2022-02-16,5
12,1b2aa015f1dc8ec2c3ca7b8539a4dc35,4104,2,1980-01-01,2007-08-04,2022-09-06,2022-09-08,2022-09-19,5
13,cbbdd06e3b0da4eebedc423048b76e6b,4104,1,1972-09-01,2019-03-08,2022-10-21,2022-10-22,2022-11-10,5
14,44faf18106c75f5e831bd9d81ed4bda3,4104,2,1956-01-01,2010-02-01,2022-06-03,2022-06-06,2022-06-11,5
15,1b6b82f4f040a060082c2711faf7d4de,6429,2,1979-08-21,2012-01-01,2022-10-31,2022-11-02,2022-11-07,3


In [11]:
# Viewing the last 10 rows

eid_vl_dataset.tail(10)

Unnamed: 0,pseudonymous_id,facility_id,sex,dob,initiation_date,datecollected,datereceived,datedispatched,lab_id
986,96eed708d7e13951ff6bba0a475f5b66,6117,1,1971-07-01,2012-10-01,2022-09-13,2022-09-15,2022-09-17,5
987,0f3ff60a337e77ae3ea65baf05b7fa56,6117,2,1973-01-01,2012-04-11,2022-08-30,2022-08-31,2022-09-12,5
988,e43d4715e84b8ef1e133fb1338991e6c,4892,2,1968-01-01,2010-08-08,2022-04-26,2022-04-28,2022-05-14,5
989,10e5bcdb93d8309956d28cd3db1fc82b,4840,2,1973-01-01,2006-04-01,2022-06-21,2022-06-21,2022-06-22,5
990,a7166e9477fbb790c3ea96aa5f47b968,4104,1,1950-01-02,2010-11-23,2022-09-05,2022-09-08,2022-09-20,5
992,c4118e9a66eabf082d6fd88de1e071f1,4812,2,1976-01-01,2011-01-01,2022-04-21,2022-04-26,2022-04-28,5
993,bbb4ac86db2c4c396992bc2fcad0948b,4104,2,1978-09-25,2011-03-01,2022-09-08,2022-09-14,2022-09-20,5
994,da18dcd4ab7211dea8cf9ccd15c0fce0,4853,1,1960-01-01,2009-12-01,2022-05-04,2022-05-05,2022-05-12,5
995,d4b52f2e40dd213c4c7d8e04ed19ff03,4899,2,1965-01-01,2004-06-01,2022-04-26,2022-04-28,2022-05-16,5
996,ef5302343022ae6a8696b5c0beed75b1,5798,1,2002-01-01,2009-06-01,2021-05-28,2021-06-02,2022-01-03,5


### Section 1.1.6: Reshuffling and resetting the index 