In [1]:
#Loading relevant libraries
import pandas as pd
import numpy as np
import random
from faker import Faker
import string
import datetime as dt

In [2]:
#creating an instance Faker class

synthetic = Faker(locale='en_GB')

In [3]:
#using already existing LSOA codes from WIMD_2014
#these will be assigned to patients as their addresses

imd_2014 = pd.read_excel("IMD 2014.xlsx",sheet_name=1)
imd_2014.head(3)

Unnamed: 0,LSOA Code,r,Income,Employment,Health,Education,Access to Services,Community Safety,Physical Environment,Housing
0,W01000001,861,929,867,1227,797,219,804,1776,744
1,W01000002,1153,1097,998,1347,1255,445,1656,1653,378
2,W01000003,500,483,607,450,620,628,392,1387,219


In [4]:
gender = [1,0] #gender will be assigned as 0 or 1
lsoa_codes = imd_2014['LSOA Code'] #getting the lsoa_codes

In [5]:
#creating a function to generate anonymised linking field, start_date and end_date of registration

def WLGP_CLEAN_GP_REG_MEDIAN(number_of_records):
    Faker.seed(987) #setting a seed to generate the same set of data
    np.random.seed(987)
    
    
    history = {
        
        #Encrypted anonymised linking field (ALF): Int
        'ALF_PE': [synthetic.bothify('#####???') for _ in range(number_of_records)],
        
        #Start date of this follow-up period: date
        'START_DATE': [synthetic.date_between_dates(date_start = dt.date(2015,1,1))
                       for _ in range(number_of_records)],
        
        #End date of this follow up period: date
        'END_DATE': ['-' for _ in range(number_of_records)]
        
    }
    
    return pd.DataFrame(history)

In [6]:
#generating the data for 1500 patients
wlgp_clean_gp_reg_median = WLGP_CLEAN_GP_REG_MEDIAN(1500)
wlgp_clean_gp_reg_median.head()

Unnamed: 0,ALF_PE,START_DATE,END_DATE
0,88800brd,2020-08-02,-
1,31356idW,2020-10-21,-
2,82091DFn,2017-02-18,-
3,87107xlO,2024-06-20,-
4,02461YpK,2016-10-06,-


In [7]:
#checking for duplicated alf_pe
wlgp_clean_gp_reg_median[wlgp_clean_gp_reg_median.duplicated(subset='ALF_PE')]

Unnamed: 0,ALF_PE,START_DATE,END_DATE


In [8]:
#using same patient_id's generated from WLGP_CLEAN_GP_REG_MEDIAN
#newly generated id's might not be the same as the old id's so I wont generate new id's
gp_reg_alf_pe = wlgp_clean_gp_reg_median['ALF_PE']

In [9]:
#creating a function to generate personal details of patients

def AR_PERS(number_of_records: int):
    Faker.seed(987) #setting a seed to generate the same set of data
    np.random.seed(987)
    alf_pe = np.array(gp_reg_alf_pe)
    
    #using a dictionary to get features of patients and corresponding data
    ar_pers = {
        
        #Encrypted anonymised linking field (ALF): Int
        'ALF_PE': alf_pe,
        
        #Sex(gender): char
        'GNDR_CD': [np.random.choice(gender) for _ in range(number_of_records)],
        
        #The date of the monday that occurs prior, or on, the actual date of birth: date
        'WOB': [synthetic.date_of_birth(minimum_age = 1, maximum_age=101) for _ in range(number_of_records)],
        
        #The date which a person died: date
        'DOD': [np.random.choice(['-',synthetic.date_this_decade(before_today=True),],
                                 p=[0.985,0.015]) for _ in range(number_of_records)],
        
        #mbi of the patient
        'bmi': [np.abs(round(np.random.normal(40,15),1)) for _ in range(number_of_records)],
        
        #smoking status of the patient
        'smk_sta' : [np.random.choice(['Yes','No'],p=[0.10, 0.9]) for _ in range(number_of_records)],
        
        
    }
    
    #returning a dataframe of synthetic data
    return pd.DataFrame(ar_pers)

In [10]:
#generating the data for 1500 patients
df_ar_person = AR_PERS(1500)
df_ar_person.head()

Unnamed: 0,ALF_PE,GNDR_CD,WOB,DOD,bmi,smk_sta
0,88800brd,0,1995-02-01,-,39.1,Yes
1,31356idW,0,1994-07-20,-,53.1,No
2,82091DFn,0,2007-11-08,-,29.8,No
3,87107xlO,1,1994-08-23,-,47.9,No
4,02461YpK,0,1929-12-07,-,40.3,No


In [11]:
#will be used as the alf_pe

wlgp_alf_pe = wlgp_clean_gp_reg_median['ALF_PE']

In [12]:
def WDSD_SINGLE_CLEAN_GEO_CHAR_LSOA2011(number_of_records: int):
    Faker.seed(987) #setting a seed to generate the same set of data
    np.random.seed(987)
    
    #using a dictionary to get features of patients and corresponding data
    addresses = {
        
        #Encrypted anonymised linking field (ALF): Int
        'ALF_PE': wlgp_alf_pe,
        
        #Start date of this follow-up period: date
        'START_DATE': [synthetic.date_between_dates(date_start = dt.date(2000,2,2) ,date_end = dt.date(2010,2,2))
                       for _ in range(number_of_records)],
        
        #End date of this follow up period: date
        'END_DATE': [synthetic.date_between_dates(date_start = dt.date(2010,2,2) ,date_end = dt.date(2024,6,30))
                       for _ in range(number_of_records)],
        
        #WIMD: getting addresses for patients
        'WIMD_2019_LSOA_CODES': [np.random.choice(lsoa_codes) for _ in range(number_of_records)],
        
        
    }
    
    #returning a dataframe of synthetic data
    return pd.DataFrame(addresses)

In [13]:
#generating the data for 1500 patients
df_address = WDSD_SINGLE_CLEAN_GEO_CHAR_LSOA2011(1500)
df_address.head(3)

Unnamed: 0,ALF_PE,START_DATE,END_DATE,WIMD_2019_LSOA_CODES
0,88800brd,2009-02-24,2019-10-21,W01001553
1,31356idW,2009-01-30,2020-12-17,W01000042
2,82091DFn,2009-02-04,2014-11-03,W01001798


In [14]:
#no duplicates for ALF_PE
df_address[df_address.duplicated(subset=['ALF_PE'],keep=False)]['ALF_PE'].sort_values()

Series([], Name: ALF_PE, dtype: object)

In [15]:
#creating GP event codes 

def GP_EVENT_CODES():
    Faker.seed(345)
    np.random.seed(678)
    
    #using the read_code.csv to get read codes which will be event codes for patients
    readcodes = pd.read_csv('read_codes.csv')
    
    event_codes = {
        
        #links GP_EVENT_REFORMATTED table with GP_EVENT_CODES table: int
        'EVENT_CD_ID': [_ for _ in range(1,readcodes.shape[0]+1)],
        
        #Code relating to information recorded during visit: smallint
        'EVENT_CD':  list(readcodes['readcode']),
        
        #1 IF EVENT IS A v3 code, 0 otherwise
        'IS_READ_V3': [np.random.choice([1,0], p=[0.985,0.015]) for _ in range(readcodes.shape[0])],
                
    }
    
    return pd.DataFrame(event_codes)

In [16]:
#table for event_codes
event_codes = GP_EVENT_CODES()
event_codes.head()

Unnamed: 0,EVENT_CD_ID,EVENT_CD,IS_READ_V3
0,1,H33..00,1
1,2,663..11,1
2,3,H333.00,1
3,4,H33z100,1
4,5,H33z011,1


In [17]:
#creating function to generate patient's event dates and event code id

def GP_EVENT_REFORMATTED(number_of_records: int):
    Faker.seed(345)
    np.random.seed(678)
    readcodes = pd.read_csv('read_codes.csv')
    
    event_reformatted = {
        
        #encrypted anonymised linking field: bigint
        'ALF_PE': [np.random.choice(gp_reg_alf_pe) for _ in range(number_of_records)],
        
        #status code assigned when deriving the encrypted anonymised linking field: char
        'ALF_STS_CD': [synthetic.bothify('?###??',letters=string.ascii_uppercase) for _ in range(number_of_records)],
        
        #encrypted code of patient's registered GP: int
        'PRAC_CD_E': [synthetic.bothify('##') for _ in range(number_of_records)],
        
        #date of the event/incident/medical episode: date
        'EVENT_DT': [synthetic.date_between_dates(date_start = dt.date(2018,1,1), date_end = dt.date(2023,12,31)) 
                                    for _ in range(number_of_records)],
        
        #links GP_EVENT_REFORMATTED table with GP_EVENT_CODES table: int
        'EVENT_CD_ID': [np.random.choice(event_codes['EVENT_CD_ID']) for _ in range(number_of_records)],
        
        #Value associated with the EVENT_CD found in GP_EVENT_CODES table; decimal
        'EVENT_VAL': [np.random.randint(1,20) for _ in range(number_of_records)]
        
    }
    
    return pd.DataFrame(event_reformatted)

In [18]:
#this includes the event_code_id which will be linked to the event_code table to get specific event codes of patients.
#it also includes dates that events occured

gp_event_reformatted = GP_EVENT_REFORMATTED(10000)
gp_event_reformatted.head()

Unnamed: 0,ALF_PE,ALF_STS_CD,PRAC_CD_E,EVENT_DT,EVENT_CD_ID,EVENT_VAL
0,45016HiS,N402DG,97,2022-05-29,110,19
1,37300hWO,M524VJ,92,2018-06-26,68,4
2,72021hiS,C052ZT,33,2019-11-03,94,4
3,19801BKT,E505YJ,96,2020-12-04,121,15
4,49688IPX,U772FS,42,2021-08-06,73,13


#### GENERATE CSV FILES

In [None]:
df_ar_person.to_csv('AR_PERS.csv',index=False)
df_address.to_csv('WDSD_SINGLE_CLEAN_GEO_CHAR_LSOA2011.csv',index=False)
event_codes.to_csv('GP_EVENT_CODES.csv',index=False)
gp_event_reformatted.to_csv('GP_EVENT_REFORMATTED.csv',index=False)
wlgp_clean_gp_reg_median.to_csv('WLGP_CLEAN_GP_REG_MEDIAN.csv',index=False)