# Data cleaning for simple model

In [1]:
import cryptpandas as crp
import datetime as dt
import numpy as np
import pandas as pd

from IPython.display import clear_output
from utils.missing import replace_missing_values

## Load files

In [2]:
impute_method = pd.read_csv('./data/missing_values_method.csv')

In [3]:
data_info = pd.read_csv('./data/SMMIS_fields.csv')
data_info.head()

Unnamed: 0,Field #,Variable name,Base model,Description,"Used in preliminary data (Y=yes, N=no)",Field type,Criteria for inclusion in study,Missing value criteria,Discrepant values,Recoded variable name,Recoded variable values
0,1,RECORDNO,,Oracle record number,Y,Numeric,,,,,
1,2,HOSPITAL,,SMMIS hospital code number of birth,Y,Numeric,,,,,
2,3,HOSPNAME,1.0,Hospital name,N,Label,,,,,
3,4,YEAR,,Year of record,Y,Numeric,,,,,
4,5,ED91,,Enumeration district of residence,N,Label,,,,,


In [4]:
password = input('Password? ')
clear_output()

In [5]:
filename = './data/SMMIS data (1988-2000).crypt'
data = crp.read_encrypted(filename, password=password)
data.set_index('recordno', inplace=True)

In [6]:
# Remove data with no 5 min Apgar score
mask = data['apgar5'].notnull()
data = data[mask]

Filter data to fileds used in base model

In [7]:
mask = data_info['Base model'] == 1
fields = list(data_info['Variable name'][mask])
fields = [field.lower() for field in fields]
data = data[fields]
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 511888 entries, 3.0 to 585291.0
Data columns (total 19 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   hospname  511888 non-null  object 
 1   mdob      511700 non-null  object 
 2   ethnic    497690 non-null  object 
 3   parity    511798 non-null  float64
 4   prevcaes  511798 non-null  float64
 5   lperiod   488322 non-null  object 
 6   mheight   424598 non-null  float64
 7   mweight   445826 non-null  float64
 8   noscans   505133 non-null  float64
 9   onsetla   511276 non-null  object 
 10  analab    504702 non-null  object 
 11  analdel   484892 non-null  object 
 12  methind   193427 non-null  object 
 13  pyrexia   510281 non-null  object 
 14  ctg       511000 non-null  object 
 15  meconium  511285 non-null  object 
 16  birthdat  511888 non-null  object 
 17  methdel   511880 non-null  object 
 18  apgar5    511888 non-null  float64
dtypes: float64(6), object(13)
memory usage

In [8]:
# Format data values
cols = ['mdob','lperiod', 'birthdat']

for col in cols:
    data[col] = pd.to_datetime(data[col])

## Replace missing values for selected fields

In [9]:
# Replace missing induction method with 'NA'
data['methind'] = replace_missing_values(data['methind'], 'NA')

# Replace missing hight and weight with zero
data['mweight'] = replace_missing_values(data['mweight'], 'zero')
data['mheight'] = replace_missing_values(data['mheight'], 'zero')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s[missing] = 'NA'
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s[missing] = 0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s[missing] = 0


In [10]:
# Keep complete data only
mask = data.isnull().sum(axis=1) == 0
data = data[mask]
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 440300 entries, 4.0 to 585291.0
Data columns (total 19 columns):
 #   Column    Non-Null Count   Dtype         
---  ------    --------------   -----         
 0   hospname  440300 non-null  object        
 1   mdob      440300 non-null  datetime64[ns]
 2   ethnic    440300 non-null  object        
 3   parity    440300 non-null  float64       
 4   prevcaes  440300 non-null  float64       
 5   lperiod   440300 non-null  datetime64[ns]
 6   mheight   440300 non-null  float64       
 7   mweight   440300 non-null  float64       
 8   noscans   440300 non-null  float64       
 9   onsetla   440300 non-null  object        
 10  analab    440300 non-null  object        
 11  analdel   440300 non-null  object        
 12  methind   440300 non-null  object        
 13  pyrexia   440300 non-null  object        
 14  ctg       440300 non-null  object        
 15  meconium  440300 non-null  object        
 16  birthdat  440300 non-null  datet

## Calculate required fields

In [11]:
# Calculate BMI
bmi = data['mweight'] / ((data['mheight']/100) ** 2)
bmi =  bmi.values.astype(int)

# Replace any BMI < 10 with zero
mask = bmi < 10
bmi[mask]= 0

data['bmi'] = bmi
data.drop(['mweight', 'mheight'], axis=1, inplace=True)



In [12]:
# Calculate age (years)
age = data['birthdat'] - data['mdob']
age = age.dt.days / 365
data ['age'] = age.values.astype(int)

# Calculate gestantion age (weeks)
gest = data['birthdat'] - data['lperiod']
gest = gest.dt.days / 7
data ['gest'] = gest.values.astype(int)

# Remove dates
data.drop(['birthdat', 'mdob', 'lperiod'], axis=1, inplace=True)


In [13]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 440300 entries, 4.0 to 585291.0
Data columns (total 17 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   hospname  440300 non-null  object 
 1   ethnic    440300 non-null  object 
 2   parity    440300 non-null  float64
 3   prevcaes  440300 non-null  float64
 4   noscans   440300 non-null  float64
 5   onsetla   440300 non-null  object 
 6   analab    440300 non-null  object 
 7   analdel   440300 non-null  object 
 8   methind   440300 non-null  object 
 9   pyrexia   440300 non-null  object 
 10  ctg       440300 non-null  object 
 11  meconium  440300 non-null  object 
 12  methdel   440300 non-null  object 
 13  apgar5    440300 non-null  float64
 14  bmi       440300 non-null  int64  
 15  age       440300 non-null  int64  
 16  gest      440300 non-null  int64  
dtypes: float64(4), int64(3), object(10)
memory usage: 60.5+ MB


In [14]:
# Remove subzero age and gestational age
mask = (data['age'] > 0) & (data['gest'] > 0)
data = data[mask]

In [15]:
# Reduce ethnicity
data['ethnic_white'] = data['ethnic'] == 'C'
data.drop('ethnic', axis=1, inplace=True)

In [16]:
# Randomise
data = data.sample(frac=1.0)

In [17]:
data.describe()

Unnamed: 0,parity,prevcaes,noscans,apgar5,bmi,age,gest
count,440275.0,440275.0,440275.0,440275.0,440275.0,440275.0,440275.0
mean,0.878031,0.083911,2.112761,9.493519,18.949102,28.732263,39.534866
std,1.084955,0.326576,1.335164,0.835846,10.399782,5.317193,14.708855
min,0.0,0.0,0.0,0.0,0.0,12.0,1.0
25%,0.0,0.0,1.0,9.0,19.0,25.0,38.0
50%,1.0,0.0,2.0,10.0,22.0,29.0,40.0
75%,1.0,0.0,3.0,10.0,25.0,32.0,41.0
max,20.0,7.0,9.0,10.0,77.0,91.0,4737.0


In [18]:
data.head().T

recordno,360980.0,147183.0,555233.0,265680.0,243895.0
hospname,Luton and Dunstable Hospital,QEII Hospital,Hemel Hempstead Hospital,St. Mary's Hospital,Lister Hospital
parity,0.0,2.0,1.0,1.0,1.0
prevcaes,0.0,0.0,0.0,0.0,0.0
noscans,1.0,1.0,1.0,1.0,1.0
onsetla,S,I,S,S,S
analab,PI,I,I,PE,I
analdel,I,N,I,E,I
methind,,AP,,P,
pyrexia,N,N,N,N,N
ctg,N,N,N,N,Z


In [19]:
data.to_csv('./data/data_for_model.csv', index=False)