In [1]:
from pathlib import Path
import pandas as pd
import numpy as np

In [2]:
# Location of the interim CSV this notebook cleans: <parent of the current
# working directory>/data/interim/raw_data.csv
FILE_PATH = Path.cwd().resolve().parents[0].joinpath("data", "interim", "raw_data.csv")

In [3]:
# Load the interim dataset into memory; the cleaning cells below modify df
df = pd.read_csv(FILE_PATH)

In [4]:
# Preview the first rows to confirm the file loaded with the expected columns
df.head()

Unnamed: 0,month,time,day,place,m_age,m_nativity,m_edu,num_of_prenatals,WIC,smoker,...,preterm_births,infert_treatment,fert_drugs,prev_c_section,no_infections,induction,steroids,antibiotics,anesthesia,delivery_method
0,1,501,2,1,30,1,3,8,N,N,...,N,N,X,N,1,N,N,N,N,1
1,1,509,3,1,28,1,6,13,N,N,...,N,N,X,N,1,N,N,N,N,1
2,1,525,3,1,41,1,6,11,N,N,...,Y,N,X,Y,1,N,N,Y,Y,1
3,1,1910,2,2,29,1,4,10,N,N,...,N,N,X,N,1,N,N,N,N,1
4,1,2241,2,1,28,1,6,13,N,N,...,Y,N,X,N,1,N,N,N,N,1


In [5]:
# Summarize column names and dtypes to see which fields need recoding
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3638436 entries, 0 to 3638435
Data columns (total 26 columns):
 #   Column             Dtype  
---  ------             -----  
 0   month              int64  
 1   time               int64  
 2   day                int64  
 3   place              int64  
 4   m_age              int64  
 5   m_nativity         int64  
 6   m_edu              int64  
 7   num_of_prenatals   int64  
 8   WIC                object 
 9   smoker             object 
 10  m_bmi              float64
 11  pre_diabetes       object 
 12  gest_diabetes      object 
 13  pre_hypertension   object 
 14  gest_hypertension  object 
 15  eclampsia          object 
 16  preterm_births     object 
 17  infert_treatment   object 
 18  fert_drugs         object 
 19  prev_c_section     object 
 20  no_infections      int64  
 21  induction          object 
 22  steroids           object 
 23  antibiotics        object 
 24  anesthesia         object 
 25  delivery_method   

# Cleaning each column

## 0: Month

In [6]:
# No cleaning needed: the observed values are exactly 1-12,
# so no unknown/sentinel code is present
df['month'].unique()

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12])

## 1: Time

'time' is the time of birth, encoded as an integer n in HHMM form on a 24-hour clock: n//100 is the hour and n%100 is the minutes past that hour (e.g. 1910 means 19:10). The value 9999 encodes an unknown time of birth.

We will replace 9999 with np.nan and convert all else into a float representing hours since 0:00.

In [7]:
# 9999 is the sentinel for an unknown time of birth; mark it as missing
df['time'] = df['time'].replace(to_replace=9999, value=np.nan)

In [8]:
# Convert HHMM-encoded birth times to fractional hours since midnight
# (e.g. 1910 -> 19 + 10/60). Vectorized arithmetic replaces the per-row
# Python lambda, which is needlessly slow on a multi-million-row column;
# NaN (unknown time) propagates unchanged through both operations.
# NOTE: not idempotent -- run exactly once per load, because re-running
# would re-interpret the already-converted hours as HHMM values.
df['time'] = (df['time'] // 100) + (df['time'] % 100) / 60

## 2: Day

In [9]:
# No cleaning needed: the observed values are exactly 1-7,
# so no unknown/sentinel code is present
df['day'].unique()

array([2, 3, 4, 1, 7, 5, 6])

## 3: Place

In [10]:
# 'place' codes: 1 = hospital birth, 2 = not in a hospital, 3 = unknown.
# Plan: rename the column to 'hospital' and recode it as a yes/no flag,
# replacing 1 with 'Y', 2 with 'N', and 3 with np.nan.
# First confirm that only these three codes occur.
df['place'].unique()

array([1, 2, 3])

In [11]:
# Rename first, then recode under the new name. In this order the cell is
# safe to re-run: renaming a column that is already gone is a no-op by
# default, and the replacement leaves existing 'Y'/'N'/NaN values untouched.
# (Recoding df['place'] and then renaming in place raised KeyError on re-run.)
df = df.rename(columns={'place': 'hospital'})

# 1 -> 'Y' (hospital birth), 2 -> 'N' (not in a hospital), 3 -> NaN (unknown)
df['hospital'] = df['hospital'].replace({
    1: 'Y',
    2: 'N',
    3: np.nan
})

## 4: Age

In [12]:
# No cleaning needed: every observed age lies in 12-50, so no
# out-of-range unknown code (such as 99) appears
np.sort(df['m_age'].unique())

array([12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
       29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
       46, 47, 48, 49, 50])

## 5: Nativity

In [13]:
# 'm_nativity' codes: 1 = mother born in the U.S., 2 = born outside the
# U.S., 3 = unknown.
# Plan: rename the column to 'm_us_born' and recode it as a yes/no flag,
# replacing 1 with 'Y', 2 with 'N', and 3 with np.nan.
# First confirm that only these three codes occur.
df['m_nativity'].unique()

array([1, 2, 3])

In [14]:
# Rename first, then recode under the new name. In this order the cell is
# safe to re-run: renaming a column that is already gone is a no-op by
# default, and the replacement leaves existing 'Y'/'N'/NaN values untouched.
# (Recoding df['m_nativity'] and then renaming in place raised KeyError on
# re-run.)
df = df.rename(columns={'m_nativity': 'm_us_born'})

# 1 -> 'Y' (U.S. born), 2 -> 'N' (born outside the U.S.), 3 -> NaN (unknown)
df['m_us_born'] = df['m_us_born'].replace({
    1: 'Y',
    2: 'N',
    3: np.nan
})

## 6: Education

In [15]:
# 'm_edu' codes 1-8 give the mother's education level, ordered from
# 8th grade up to doctorate; 9 encodes an unknown education level.
# We will replace 9 with np.nan. First list the codes that occur.
df['m_edu'].unique()

array([3, 6, 4, 7, 8, 9, 2, 5, 1])

In [18]:
# 9 is the sentinel for an unknown education level; mark it as missing
df['m_edu'] = df['m_edu'].replace(to_replace=9, value=np.nan)

## 7: Number of prenatal visits

In [20]:
# 'num_of_prenatals' values 0-98 give the number of prenatal visits the
# mother had before giving birth; 99 encodes an unknown count.
# We will replace 99 with np.nan. First list the values that occur.
np.sort(df['num_of_prenatals'].unique())

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 76, 77, 80, 88, 90, 99])

In [26]:
# 99 is the sentinel for an unknown number of prenatal visits; mark it as missing
df['num_of_prenatals'] = df['num_of_prenatals'].replace(to_replace=99, value=np.nan)

## 10: BMI

In [27]:
# 'm_bmi' values 13.0-69.9 give the mother's BMI; 99.9 encodes unknown.
# We will replace 99.9 with np.nan. First check the observed range
# (a maximum of 99.9 shows the sentinel is present).
df['m_bmi'].min(), df['m_bmi'].max()

(np.float64(13.0), np.float64(99.9))

In [29]:
# 99.9 is the sentinel for an unknown BMI; mark it as missing
df['m_bmi'] = df['m_bmi'].replace(to_replace=99.9, value=np.nan)