In [40]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.impute import KNNImputer
import seaborn as sns

In [62]:
df = pd.read_csv('Data/births_toy.csv', index_col=[0])
#df = pd.read_csv('Data/US_births(2018).csv')
col_dict = pd.read_excel('Data/Column_dictionary.xlsx', header=None)

In [63]:
df.shape

(10000, 55)

In [64]:
df.head()

Unnamed: 0,ATTEND,BFACIL,BMI,CIG_0,DBWT,DLMP_MM,DLMP_YY,DMAR,DOB_MM,DOB_TT,...,PRIORDEAD,PRIORLIVE,PRIORTERM,PWgt_R,RDMETH_REC,RESTATUS,RF_CESAR,RF_CESARN,SEX,WTGAIN
1,1,1,46.3,0,2183,6,2017,1.0,2,1341,...,0,0,1,270,1,1,N,0,M,40
2,1,1,25.5,0,3280,4,2017,2.0,1,621,...,0,0,0,153,1,2,N,0,F,26
3,1,1,21.8,0,2410,3,2018,2.0,12,741,...,0,0,0,123,1,1,N,0,M,31
4,3,1,22.7,0,3544,3,2018,2.0,12,750,...,0,1,1,124,2,1,Y,1,F,26
5,1,1,34.0,3,2778,10,2017,2.0,7,1423,...,0,1,1,192,1,1,N,0,M,0


In [65]:
col_dict.head()

Unnamed: 0,0,1
0,ATTEND,birth_attendant
1,BFACIL,birth_place
2,BMI,bmi
3,CIG_0,cigs_before_preg
4,DBWT,birthweight_g


In [66]:
df.columns = col_dict[1]
df.columns = map(str.lower, df.columns)

In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10000 entries, 1 to 10000
Data columns (total 55 columns):
birth_attendant          10000 non-null int64
birth_place              10000 non-null int64
bmi                      10000 non-null float64
cigs_before_preg         10000 non-null int64
birthweight_g            10000 non-null int64
last_norm_menses_mn      10000 non-null int64
last_norm_menses_yr      10000 non-null int64
marital_stat             8821 non-null float64
birth_mn                 10000 non-null int64
birth_time               10000 non-null int64
birth_dy                 10000 non-null int64
birth_yr                 10000 non-null int64
m_deliveryweight         10000 non-null int64
f_age                    10000 non-null int64
f_education              10000 non-null int64
f_hispanic               10000 non-null int64
f_race15                 10000 non-null int64
f_race31                 10000 non-null int64
f_race6                  10000 non-null int64
last_live_birt

* Cleaning data: removing useless columns, coding NAs, coding in binary, and removing entries with impossible values (high/low birthweight)

In [68]:
df = df.drop(['last_other_preg_mn', 
              'sex_imp', 
              'm_age_imp', 
              'm_marital_stat_imp', 
              'm_race_imp',
              'payment_det'], axis=1)

In [69]:
def replace_na(data, column_name, value):
    data[column_name] = data[column_name].mask(data[column_name] == value, np.nan)

In [70]:
replace_na(df, 'bmi', 99.9)
replace_na(df, 'm_nativity', 3)

In [71]:
columns_9 = ['birth_attendant','birth_place','f_education','f_hispanic','f_race6','m_education','m_hispanic',
             'infections','m_morbidity','riskf','delivery_method','payment']
              
for i in columns_9:
    replace_na(df, i, 9)

In [72]:
columns_99 = ['cigs_before_preg','last_norm_menses_mn','f_age','f_race15','f_race31','m_height_in',
             'mn_prenatalcare_began','num_prenatal_visits','prior_births_dead','prior_births_living',
             'prior_terminations','num_prev_cesareans','weight_gain']

for i in columns_99:
    replace_na(df, i, 99)

In [73]:
columns_888 = ['last_preg_mn','last_live_birth_mn']

for i in columns_888:
    replace_na(df, i, 888)

In [74]:
columns_999 = ['last_live_birth_mn','last_preg_mn','m_deliveryweight','prepreg_weight']

for i in columns_999:
    replace_na(df, i, 999)

In [75]:
columns_9999 = ['birthweight_g','last_norm_menses_yr','birth_time']

for i in columns_9999:
    replace_na(df, i, 9999)

In [76]:
columns_U = ['gonorrhea','labour_induced','admit_icu','m_transferred','prev_cesarean']

for i in columns_U:
    replace_na(df, i, 'U')

In [77]:
df['infections'] = df['infections'].mask(df['infections'] == 1, 'N')
df['infections'] = df['infections'].mask(df['infections'] == 0, 'Y')

df['m_morbidity'] = df['m_morbidity'].mask(df['m_morbidity'] == 1, 'N')
df['m_morbidity'] = df['m_morbidity'].mask(df['m_morbidity'] == 0, 'Y')

df['riskf'] = df['riskf'].mask(df['riskf'] == 1, 'N')
df['riskf'] = df['riskf'].mask(df['riskf'] == 0, 'Y')

In [78]:
df.describe()

Unnamed: 0,birth_attendant,birth_place,bmi,cigs_before_preg,birthweight_g,last_norm_menses_mn,last_norm_menses_yr,marital_stat,birth_mn,birth_time,...,mn_prenatalcare_began,num_prenatal_visits,prior_births_dead,prior_births_living,prior_terminations,prepreg_weight,delivery_method,res_status,num_prev_cesareans,weight_gain
count,9998.0,10000.0,9791.0,9964.0,9994.0,9536.0,9561.0,8821.0,10000.0,10000.0,...,9736.0,9749.0,9972.0,9976.0,9963.0,9822.0,9994.0,10000.0,9991.0,9719.0
mean,1.319164,1.0366,27.204882,1.15847,3269.019412,6.566904,2017.25102,1.388051,6.5605,1218.2194,...,2.918447,11.366704,0.020357,1.133721,0.426177,159.396864,1.795077,1.3329,0.218397,29.659739
std,0.741094,0.33507,6.85617,5.028263,588.318525,3.486964,0.435068,0.487334,3.411672,626.39526,...,1.518631,4.090332,0.243019,1.281353,0.893914,42.409744,1.159902,0.531701,0.586933,15.104574
min,1.0,1.0,15.0,0.0,277.0,1.0,2016.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,79.0,1.0,1.0,0.0,0.0
25%,1.0,1.0,22.1,0.0,2977.0,4.0,2017.0,1.0,4.0,758.0,...,2.0,9.0,0.0,0.0,0.0,130.0,1.0,1.0,0.0,20.0
50%,1.0,1.0,25.6,0.0,3310.0,7.0,2017.0,1.0,7.0,1230.0,...,3.0,12.0,0.0,1.0,0.0,150.0,1.0,1.0,0.0,29.0
75%,1.0,1.0,30.9,0.0,3630.0,10.0,2018.0,2.0,10.0,1723.0,...,3.0,13.0,0.0,2.0,1.0,180.0,3.0,2.0,0.0,39.0
max,5.0,7.0,66.4,98.0,6150.0,12.0,2018.0,2.0,12.0,2359.0,...,10.0,49.0,10.0,17.0,14.0,375.0,6.0,4.0,6.0,98.0


In [159]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 40 columns):
bmi                      10000 non-null float64
cigs_before_preg         10000 non-null float64
birthweight_g            10000 non-null float64
birth_time               10000 non-null float64
m_deliveryweight         10000 non-null float64
f_age                    10000 non-null float64
m_age                    10000 non-null float64
m_height_in              10000 non-null float64
num_prenatal_visits      10000 non-null float64
prior_births_dead        10000 non-null float64
prior_births_living      10000 non-null float64
prior_terminations       10000 non-null float64
prepreg_weight           10000 non-null float64
num_prev_cesareans       10000 non-null float64
time_since_menses        10000 non-null float64
birth_attendant          10000 non-null float64
birth_place              10000 non-null float64
birth_mn                 10000 non-null float64
birth_dy                 100

In [80]:
def convert_binary(data, col_name):
    data[col_name] = data[col_name].mask(data[col_name] == 'Y', 1)
    data[col_name] = data[col_name].mask(data[col_name] == 'N', 0)

cols = ['gonorrhea','labour_induced','admit_icu','m_transferred','infections','m_morbidity','prev_cesarean', 'riskf']

for i in cols:
    convert_binary(df, i)

In [81]:
df['infant_sex'] = df['infant_sex'].mask(df['infant_sex'] == 'M', 1)
df['infant_sex'] = df['infant_sex'].mask(df['infant_sex'] == 'F', 0)

In [82]:
#def convert_int(data, col_name):
 #   data[col_name] = data[col_name].astype(int)
    
#cols = ['gonorrhea','labour_induced','admit_icu','m_transferred','infections','m_morbidity','riskf','infant_sex']

#for i in cols:
 #   convert_int(df, i)

In [83]:
#drop recoded
df = df.drop(['f_race15','f_race31','m_race15','m_race31'], axis=1)

In [84]:
#drop those with too many missing values
df = df.drop(['last_live_birth_mn','last_preg_mn'], axis=1)

In [85]:
print('{:.2f}% of infants born in US in 2018 were LBW infants'.\
      format(1e2*len(df[df['birthweight_g'] < 2500])/len(df)))
print('{:.2f}% of infants born in US in 2018 were very-LBW infants'.\
      format(1e2*len(df[df['birthweight_g'] < 1500])/len(df)))

8.29% of infants born in US in 2018 were LBW infants
1.39% of infants born in US in 2018 were very-LBW infants


In [86]:
#idx = df[df['birthweight_g'] < 300].index
#df.drop(idx, inplace=True)

In [87]:
#idx = df[df['birthweight_g'] > 7000].index
#df.drop(idx, inplace=True)

In [88]:
#dropping marital_stat because something wrong with encoding
#dropping weight_gain because will encode weight_change later instead
df = df.drop(['marital_stat','weight_gain'], axis=1)

* Encoding time between birth and menses, instead of date of last menses

In [89]:
df['last_menses_day'] = 15
   
last_menses = pd.DataFrame({'year': df['last_norm_menses_yr'], 
                            'month': df['last_norm_menses_mn'], 
                            'day': df['last_menses_day']}) 
df['last_menses'] = pd.to_datetime(last_menses)

df = df.drop(['last_norm_menses_yr','last_norm_menses_mn','last_menses_day'], axis=1)

In [90]:
dob = pd.DataFrame({'year': df['birth_yr'],
                    'month': df['birth_mn'], 
                    'day': df['birth_dy']})  
df['dob'] = pd.to_datetime(dob)

df = df.drop(['birth_yr'], axis=1)

In [91]:
df['time_since_menses'] = (df['dob'] - df['last_menses'])

df = df.drop(['last_menses','dob'], axis=1)

In [92]:
df['time_since_menses']

1       236 days
2       263 days
3       263 days
4       264 days
5       265 days
          ...   
9996    261 days
9997    261 days
9998    263 days
9999    261 days
10000   263 days
Name: time_since_menses, Length: 10000, dtype: timedelta64[ns]

In [93]:
df['time_since_menses'] = df['time_since_menses'].dt.days

In [94]:
#df.to_csv('Data/Processed_data_full.csv')
df.to_csv('Data/Processed_data.csv')

#### Imputation

* Tried making a processing pipeline, for both numeric and categorical data - currently not working 

In [149]:
df = pd.read_csv('Data/Processed_data.csv', index_col=[0])

In [150]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [151]:
numeric_cols = ('bmi','cigs_before_preg','birthweight_g','birth_time','m_deliveryweight','f_age','m_age',
               'm_height_in','num_prenatal_visits','prior_births_dead','prior_births_living','prior_terminations',
               'prepreg_weight','num_prev_cesareans','time_since_menses')

cat_cols = ('birth_attendant','birth_place','birth_mn','birth_dy','f_education','f_hispanic','f_race6','gonorrhea',
           'labour_induced','m_nativity','m_education','m_hispanic','admit_icu','m_race6','m_transferred',
           'infections','m_morbidity','riskf','payment','mn_prenatalcare_began','delivery_method','res_status',
           'prev_cesarean','infant_sex')

In [152]:
numeric_transformer = Pipeline(steps = [('imputer', IterativeImputer(random_state=0))])
  
    #('scaler', StandardScaler())])

cat_transformer = Pipeline(steps = [('imputer', SimpleImputer(strategy='most_frequent'))])
  
    #('onehot', OneHotEncoder())])

preprocessor = ColumnTransformer(transformers=[
      ('num', numeric_transformer, numeric_cols),
      ('cat', cat_transformer, cat_cols)])

In [153]:
df_processed = preprocessor.fit_transform(df)



In [154]:
df_processed = pd.DataFrame(df_processed)
df_processed.columns = numeric_cols + cat_cols

In [155]:
df_processed['weight_change'] = df_processed['m_deliveryweight'] - df_processed['prepreg_weight']

In [157]:
df_bin = df_processed.copy()
df_bin['birthweight_bin'] = np.where(df_bin['birthweight_g'] < 2500, 1, 0)

df_cat = df_bin.copy()
df_cat['birthweight_cat'] = np.where(df_cat['birthweight_g'] < 1500, 2, df_cat['birthweight_bin'])

df_bin = df_bin.drop(['birthweight_g'], axis=1)
df_cat = df_cat.drop(['birthweight_g', 'birthweight_bin'], axis=1)

In [158]:
df_processed.to_csv('Data/Pipeline_data_toy.csv')
df_bin.to_csv('Data/Pipeline_data_toy_bin.csv')
df_cat.to_csv('Data/Pipeline_data_toy_cat.csv')

* Iterative imputer

In [None]:
imputer = IterativeImputer(random_state=0)
df_imputed = pd.DataFrame(imputer.fit_transform(df))

In [None]:
df_colnames = df.columns
df_imputed.columns = df_colnames

In [121]:
df_imputed.to_csv('Data/MICE_data_toy.csv')

In [28]:
df_imputed = pd.read_csv('Data/MICE_data2.csv', index_col=[0])

  mask |= (ar1 == a)


In [29]:
df_imputed.head()

Unnamed: 0,birth_attendant,birth_place,bmi,cigs_before_preg,birthweight_g,birth_mn,birth_time,birth_dy,m_deliveryweight,f_age,...,prior_births_dead,prior_births_living,prior_terminations,prepreg_weight,delivery_method,res_status,prev_cesarean,num_prev_cesareans,infant_sex,time_since_menses
0,1.0,1.0,30.7,0.0,3657.0,1.0,1227.0,2.0,231.0,31.0,...,0.0,1.0,2.0,190.0,1.0,2.0,0.0,0.0,1.0,262.0
1,1.0,1.0,33.3,2.0,3242.0,1.0,1704.0,2.0,185.0,35.0,...,0.0,2.0,0.0,188.0,4.0,2.0,1.0,2.0,0.0,257.233267
2,1.0,1.0,30.0,0.0,3470.0,1.0,336.0,2.0,273.0,31.0,...,0.0,1.0,0.0,215.0,1.0,1.0,0.0,0.0,1.0,262.0
3,3.0,1.0,23.7,0.0,3140.0,1.0,938.0,2.0,138.0,26.0,...,0.0,2.0,0.0,138.0,1.0,2.0,0.0,0.0,0.0,232.0
4,1.0,1.0,35.5,0.0,2125.0,1.0,830.0,3.0,219.0,35.0,...,0.0,1.0,4.0,220.0,3.0,1.0,0.0,0.0,1.0,240.35572


In [30]:
df_imputed['weight_change'] = df_imputed['m_deliveryweight'] - df_imputed['prepreg_weight']

In [32]:
cols = ['birth_attendant','birth_place','cigs_before_preg','birth_mn','birth_dy','f_education','f_hispanic',
        'f_race6','gonorrhea','labour_induced','m_nativity','m_education','m_hispanic','admit_icu','m_race6',
        'm_transferred','infections','m_morbidity','riskf','payment','mn_prenatalcare_began',
        'num_prenatal_visits','prior_births_dead','prior_births_living','prior_terminations','delivery_method',
        'res_status','prev_cesarean','num_prev_cesareans','infant_sex']

df_imputed[cols] = df_imputed[cols].round()

In [33]:
df_bin = df_imputed.copy()
df_bin['birthweight_bin'] = np.where(df_bin['birthweight_g'] < 2500, 1, 0)

df_cat = df_bin.copy()
df_cat['birthweight_cat'] = np.where(df_cat['birthweight_g'] < 1500, 2, df_cat['birthweight_bin'])

df_bin = df_bin.drop(['birthweight_g'], axis=1)
df_cat = df_cat.drop(['birthweight_g', 'birthweight_bin'], axis=1)

In [None]:
df_imputed.to_csv('Data/MICE_processed.csv')
df_bin.to_csv('Data/MICE_bin_processed.csv')
df_cat.to_csv('Data/MICE_cat_processed.csv')

* Trying KNN imputation to see if this is better for categorical numerical variables 

In [164]:
df = pd.read_csv('Data/Processed_data.csv', index_col=[0])

In [165]:
df.head()

Unnamed: 0,birth_attendant,birth_place,bmi,cigs_before_preg,birthweight_g,birth_mn,birth_time,birth_dy,m_deliveryweight,f_age,...,prior_births_dead,prior_births_living,prior_terminations,prepreg_weight,delivery_method,res_status,prev_cesarean,num_prev_cesareans,infant_sex,time_since_menses
1,1.0,1,46.3,0.0,2183.0,2,1341,6,310.0,,...,0.0,0.0,1.0,270.0,1.0,1,0.0,0.0,1,236.0
2,1.0,1,25.5,0.0,3280.0,1,621,3,179.0,,...,0.0,0.0,0.0,153.0,1.0,2,0.0,0.0,0,263.0
3,1.0,1,21.8,0.0,2410.0,12,741,3,154.0,27.0,...,0.0,0.0,0.0,123.0,1.0,1,0.0,0.0,1,263.0
4,3.0,1,22.7,0.0,3544.0,12,750,4,150.0,31.0,...,0.0,1.0,1.0,124.0,2.0,1,1.0,1.0,0,264.0
5,1.0,1,34.0,3.0,2778.0,7,1423,7,192.0,28.0,...,0.0,1.0,1.0,192.0,1.0,1,0.0,0.0,1,265.0


In [166]:
df_KNN = df.copy()

In [167]:
KNN_imputer = KNNImputer(n_neighbors=5)
df_KNN = KNN_imputer.fit_transform(df)
df_KNN = pd.DataFrame(df_KNN)

In [168]:
df_colnames = df.columns
df_KNN.columns = df_colnames

In [122]:
df_knn_bin = pd.DataFrame.copy(df_KNN)
df_knn_bin['birthweight_bin'] = np.where(df_knn_bin['birthweight_g'] < 2500, 1, 0)

In [123]:
df_knn_cat = pd.DataFrame.copy(df_knn_bin)
df_knn_cat['birthweight_cat'] = np.where(df_knn_cat['birthweight_g'] < 1500, 2, df_knn_cat['birthweight_bin'])

In [124]:
df_knn_bin = df_knn_bin.drop(['birthweight_g'], axis=1)
df_knn_cat = df_knn_cat.drop(['birthweight_g', 'birthweight_bin'], axis=1)

In [125]:
df_KNN.to_csv('Data/KNN_data_toy.csv')
df_knn_bin.to_csv('Data/KNN_data_bin_toy.csv')
df_knn_cat.to_csv('Data/KNN_data_cat_toy.csv')