In [1]:
import cryptpandas as crp
import datetime as dt
import numpy as np
import pandas as pd

from IPython.display import clear_output
from utils.missing import replace_missing_values

## Load files

In [2]:
impute_method = pd.read_csv('./data/missing_values_method.csv')

In [3]:
data_info = pd.read_csv('./data/SMMIS_fields.csv')
data_info.head()

Unnamed: 0,Field #,Variable name,Base model,Description,"Used in preliminary data (Y=yes, N=no)",Field type,Criteria for inclusion in study,Missing value criteria,Discrepant values,Recoded variable name,Recoded variable values
0,1,RECORDNO,,Oracle record number,Y,Numeric,,,,,
1,2,HOSPITAL,,SMMIS hospital code number of birth,Y,Numeric,,,,,
2,3,HOSPNAME,1.0,Hospital name,N,Label,,,,,
3,4,YEAR,,Year of record,Y,Numeric,,,,,
4,5,ED91,,Enumeration district of residence,N,Label,,,,,


In [4]:
password = input('Password? ')
clear_output()

In [5]:
filename = './data/SMMIS data (1988-2000).crypt'
data = crp.read_encrypted(filename, password=password)
data.set_index('recordno', inplace=True)

In [6]:
# Remove data with no 5 min Apgar score
mask = data['apgar5'].notnull()
data = data[mask]

Filter data to fileds used in base model

In [7]:
mask = data_info['Base model'] == 1
fields = list(data_info['Variable name'][mask])
fields = [field.lower() for field in fields]
data = data[fields]
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 511888 entries, 3.0 to 585291.0
Data columns (total 18 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   hospname  511888 non-null  object 
 1   mdob      511700 non-null  object 
 2   parity    511798 non-null  float64
 3   prevcaes  511798 non-null  float64
 4   lperiod   488322 non-null  object 
 5   mheight   424598 non-null  float64
 6   mweight   445826 non-null  float64
 7   noscans   505133 non-null  float64
 8   onsetla   511276 non-null  object 
 9   analab    504702 non-null  object 
 10  analdel   484892 non-null  object 
 11  methind   193427 non-null  object 
 12  pyrexia   510281 non-null  object 
 13  ctg       511000 non-null  object 
 14  meconium  511285 non-null  object 
 15  birthdat  511888 non-null  object 
 16  methdel   511880 non-null  object 
 17  apgar5    511888 non-null  float64
dtypes: float64(6), object(12)
memory usage: 74.2+ MB


In [8]:
# Format data values

cols = ['mdob','lperiod', 'birthdat']

for col in cols:
    data[col] = pd.to_datetime(data[col])

Replace missing induction method with 'NA'

In [9]:
data['methind'] = replace_missing_values(data['methind'], 'NA')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  s[missing] = 'NA'


## Calculate required fields

In [10]:
# Calculate BMI
data['bmi'] = data['mweight'] / ((data['mheight']/100) ** 2)
data.drop(['mweight', 'mheight'], axis=1, inplace=True)

In [11]:
# Calculate age (years)
age = data['birthdat'] - data['mdob']
age = age.dt.days / 365
data ['age'] = age.values.astype(int)

# Calculate gestantion age (weeks)
gest = data['birthdat'] - data['lperiod']
gest = gest.dt.days / 7
data ['gest'] = gest.values.astype(int)

# Remove dates
data.drop(['birthdat', 'mdob', 'lperiod'], axis=1, inplace=True)


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 511888 entries, 3.0 to 585291.0
Data columns (total 16 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   hospname  511888 non-null  object 
 1   parity    511798 non-null  float64
 2   prevcaes  511798 non-null  float64
 3   noscans   505133 non-null  float64
 4   onsetla   511276 non-null  object 
 5   analab    504702 non-null  object 
 6   analdel   484892 non-null  object 
 7   methind   511888 non-null  object 
 8   pyrexia   510281 non-null  object 
 9   ctg       511000 non-null  object 
 10  meconium  511285 non-null  object 
 11  methdel   511880 non-null  object 
 12  apgar5    511888 non-null  float64
 13  bmi       405309 non-null  float64
 14  age       511888 non-null  int64  
 15  gest      511888 non-null  int64  
dtypes: float64(5), int64(2), object(9)
memory usage: 66.4+ MB


In [13]:
# Keep complete data only
mask = data.isnull().sum(axis=1) == 0
data = data[mask]
data.info()

<class 'pandas.core.frame.DataFrame'>
Float64Index: 376368 entries, 4.0 to 585291.0
Data columns (total 16 columns):
 #   Column    Non-Null Count   Dtype  
---  ------    --------------   -----  
 0   hospname  376368 non-null  object 
 1   parity    376368 non-null  float64
 2   prevcaes  376368 non-null  float64
 3   noscans   376368 non-null  float64
 4   onsetla   376368 non-null  object 
 5   analab    376368 non-null  object 
 6   analdel   376368 non-null  object 
 7   methind   376368 non-null  object 
 8   pyrexia   376368 non-null  object 
 9   ctg       376368 non-null  object 
 10  meconium  376368 non-null  object 
 11  methdel   376368 non-null  object 
 12  apgar5    376368 non-null  float64
 13  bmi       376368 non-null  float64
 14  age       376368 non-null  int64  
 15  gest      376368 non-null  int64  
dtypes: float64(5), int64(2), object(9)
memory usage: 48.8+ MB


In [14]:
# Remove subzero age and gestational age
mask = (data['age'] > 0) & (data['gest'] > 0)
data = data[mask]

# Randomise
data = data.sample(frac=1.0)

In [15]:
data.describe()

Unnamed: 0,parity,prevcaes,noscans,apgar5,bmi,age,gest
count,361174.0,361174.0,361174.0,361174.0,361174.0,361174.0,361174.0
mean,0.858578,0.082243,2.091646,9.487438,24.239302,28.686805,39.573142
std,1.066241,0.32216,1.309348,0.836593,4.634404,5.274806,14.886485
min,0.0,0.0,0.0,0.0,0.0,13.0,4.0
25%,0.0,0.0,1.0,9.0,21.339608,25.0,39.0
50%,1.0,0.0,2.0,10.0,23.458562,29.0,40.0
75%,1.0,0.0,3.0,10.0,26.304339,32.0,41.0
max,17.0,7.0,9.0,10.0,77.669095,91.0,4737.0


In [16]:
data.head().T

recordno,44356.0,213584.0,285942.0,204325.0,458433.0
hospname,Edgware General Hospital,Central Middlesex Hospital,Northwick Park Hospital,Watford General Hospital,Northwick Park Hospital
parity,0.0,3.0,1.0,0.0,3.0
prevcaes,0.0,0.0,0.0,0.0,0.0
noscans,2.0,1.0,2.0,1.0,2.0
onsetla,S,S,I,I,S
analab,PI,PEI,PI,E,N
analdel,L,G,N,G,N
methind,,O,PA,AO,
pyrexia,N,Y,N,N,N
ctg,N,A,N,A,N


In [17]:
data.to_csv('./data/data_for_model.csv', index=False)