In [36]:
from pathlib import Path
import pandas as pd
import numpy as np

In [37]:
# file path to the interim data to be cleaned
FILE_PATH = Path.cwd().resolve().parents[0] / "data" / "interim" / "raw_data.csv"

In [38]:
df = pd.read_csv(FILE_PATH)

In [39]:
df.head()

Unnamed: 0,month,time,day,place,m_age,m_nativity,m_edu,num_of_prenatals,WIC,smoker,...,eclampsia,preterm_births,infert_treatment,prev_c_section,no_infections,induction,steroids,antibiotics,anesthesia,delivery_method
0,1,501,2,1,30,1,3,8,N,N,...,N,N,N,N,1,N,N,N,N,1
1,1,509,3,1,28,1,6,13,N,N,...,N,N,N,N,1,N,N,N,N,1
2,1,525,3,1,41,1,6,11,N,N,...,N,Y,N,Y,1,N,N,Y,Y,1
3,1,1910,2,2,29,1,4,10,N,N,...,N,N,N,N,1,N,N,N,N,1
4,1,2241,2,1,28,1,6,13,N,N,...,N,Y,N,N,1,N,N,N,N,1


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3638436 entries, 0 to 3638435
Data columns (total 25 columns):
 #   Column             Dtype  
---  ------             -----  
 0   month              int64  
 1   time               int64  
 2   day                int64  
 3   place              int64  
 4   m_age              int64  
 5   m_nativity         int64  
 6   m_edu              int64  
 7   num_of_prenatals   int64  
 8   WIC                object 
 9   smoker             object 
 10  m_bmi              float64
 11  pre_diabetes       object 
 12  gest_diabetes      object 
 13  pre_hypertension   object 
 14  gest_hypertension  object 
 15  eclampsia          object 
 16  preterm_births     object 
 17  infert_treatment   object 
 18  prev_c_section     object 
 19  no_infections      int64  
 20  induction          object 
 21  steroids           object 
 22  antibiotics        object 
 23  anesthesia         object 
 24  delivery_method    int64  
dtypes: float64(1), int

# Cleaning each column

## 0: Month

In [41]:
# No cleaning needed
df['month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

## 1: Time

'time' is an integer n such that n//100 is the hours and n%100 is the minutes since 0:00 on a 24-hour clock. This represents time of birth. 9999 encodes an unknown time of birth.

We will replace 9999 with np.nan and convert all else into a float representing hours since 0:00.

In [42]:
df['time'] = df['time'].replace(9999, np.nan)

In [43]:
df['time'] = df['time'].apply(lambda x: (x//100) + (x%100)/60)

## 2: Day

In [44]:
# No cleaning needed
df['day'].unique()

array([2, 3, 4, 1, 7, 5, 6])

## 3: Place

In [45]:
# 1 encodes a hospital birth, 2 not in a hospital, 3 unknown.
# We will make this categorical, rename as 'hospital',
# and replace 1 with 'Y', 2 with 'N', and 3 with np.nan
df['place'].unique()

array([1, 2, 3])

In [46]:
df['place'] = df['place'].replace({
    1: 'Y',
    2: 'N',
    3: np.nan
})

df.rename(columns={'place': 'hospital'}, inplace=True)

## 4: Age

In [47]:
# No cleaning needed
np.sort(df['m_age'].unique())

array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
       29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
       46, 47, 48, 49, 50])

## 5: Nativity

In [48]:
# 1 encodes mother is U.S. born, 2 born outside U.S., 3 unknown.
# We will make this categorical, rename 'm_us_born',
# and replace 1 with 'Y', 2 with 'N', and 3 with np.nan
df['m_nativity'].unique()

array([1, 2, 3])

In [49]:
df['m_nativity'] = df['m_nativity'].replace({
    1: 'Y',
    2: 'N',
    3: np.nan
})

df.rename(columns={'m_nativity': 'm_us_born'}, inplace=True)

## 6: Education

In [50]:
# 1-8 encodes a mother's education, from 8th grade to doctorate.
# 9 encodes unknown education level. We will replace 9 with np.nan
df['m_edu'].unique()

array([3, 6, 4, 7, 8, 9, 2, 5, 1])

In [51]:
df['m_edu'] = df['m_edu'].replace(9, np.nan)

## 7: Number of prenatal visits

In [52]:
# 0-98 encodes the number of prenatal visits a mother had
# before giving birth, 99 encodes unknown. We will replace 99
# with np.nan
np.sort(df['num_of_prenatals'].unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 76, 77, 80, 88, 90, 99])

In [53]:
df['num_of_prenatals'] = df['num_of_prenatals'].replace(99, np.nan)

## 10: BMI

In [54]:
# 13.0-69.9 encode the mother's bmi, 99.9 encodes unknown.
# We will replace 99.9 with np.nan
df['m_bmi'].min(), df['m_bmi'].max()

(np.float64(13.0), np.float64(99.9))

In [55]:
df['m_bmi'] = df['m_bmi'].replace(99.9, np.nan)

## 19: No infections

In [56]:
# 1 encodes the presense of an STI, 0 no STI, and 9 unknown.
# We will rename this 'sti' and replace 1 with 'N', 0 with 'Y', and 9 with np.nan
df['no_infections']. unique()

array([1, 0, 9])

In [57]:
df['no_infections'] = df['no_infections'].replace({
    1: 'N',
    0: 'Y',
    9: np.nan
})

df.rename(columns={'no_infections': 'sti'}, inplace=True)

## 24: Delivery method

In [58]:
# 1 encodes a vaginal birth, 2 a c section, and 9 unknown.
# We will rename this 'c_section' and replace 1 with 'N', 2 with 'Y', and 9 with np.nan
df['delivery_method'].unique()

array([1, 2, 9])

In [59]:
df['delivery_method'] = df['delivery_method'].replace({
    1: 'N',
    2: 'Y',
    9: np.nan
})

df.rename(columns={'delivery_method': 'c_section'}, inplace=True)

## All other columns

In [60]:
# All remaining columns have unique values 'Y', 'N', and 'U'. We will replace all 'U' with np.nan
df = df.replace('U', np.nan)

# Drop np.nan and downcast dtypes

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3638436 entries, 0 to 3638435
Data columns (total 25 columns):
 #   Column             Dtype  
---  ------             -----  
 0   month              int64  
 1   time               float64
 2   day                int64  
 3   hospital           object 
 4   m_age              int64  
 5   m_us_born          object 
 6   m_edu              float64
 7   num_of_prenatals   float64
 8   WIC                object 
 9   smoker             object 
 10  m_bmi              float64
 11  pre_diabetes       object 
 12  gest_diabetes      object 
 13  pre_hypertension   object 
 14  gest_hypertension  object 
 15  eclampsia          object 
 16  preterm_births     object 
 17  infert_treatment   object 
 18  prev_c_section     object 
 19  sti                object 
 20  induction          object 
 21  steroids           object 
 22  antibiotics        object 
 23  anesthesia         object 
 24  c_section          object 
dtypes: float64(4), int

In [62]:
# Drop all incomplete records (~250000)
df.dropna(inplace=True)

In [63]:
# Recode the binary predictors:
# Y -> 1
# N -> 0
df.replace({'Y': 1, 'N': 0}, inplace=True)

  df.replace({'Y': 1, 'N': 0}, inplace=True)


In [64]:
# Recast m_edu and num_of_prenatals as ints
df['m_edu'] = df['m_edu'].astype(int)
df['num_of_prenatals'] = df['num_of_prenatals'].astype(int)

In [65]:
# Downcast dtypes to optimize memory footprint
fcols = df.select_dtypes('float').columns
icols = df.select_dtypes('integer').columns

df[fcols] = df[fcols].apply(pd.to_numeric, downcast='float')
df[icols] = df[icols].apply(pd.to_numeric, downcast='integer')

In [66]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3388302 entries, 0 to 3638435
Data columns (total 25 columns):
 #   Column             Dtype  
---  ------             -----  
 0   month              int8   
 1   time               float32
 2   day                int8   
 3   hospital           int8   
 4   m_age              int8   
 5   m_us_born          int8   
 6   m_edu              int8   
 7   num_of_prenatals   int8   
 8   WIC                int8   
 9   smoker             int8   
 10  m_bmi              float32
 11  pre_diabetes       int8   
 12  gest_diabetes      int8   
 13  pre_hypertension   int8   
 14  gest_hypertension  int8   
 15  eclampsia          int8   
 16  preterm_births     int8   
 17  infert_treatment   int8   
 18  prev_c_section     int8   
 19  sti                int8   
 20  induction          int8   
 21  steroids           int8   
 22  antibiotics        int8   
 23  anesthesia         int8   
 24  c_section          int8   
dtypes: float32(2), int8(23)

# Store processed data

In [67]:
# Save the processed data
DATA_PATH = Path.cwd().resolve().parents[0] / "data" / "processed" / "processed_data.csv"

In [68]:
df.to_csv(DATA_PATH, index=False)