In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn import metrics
from sklearn.preprocessing import PolynomialFeatures

%matplotlib inline

pd.set_option('display.max_columns',500)
pd.set_option('display.max_rows',500)

In [2]:
df = pd.read_csv('../US_births.csv', low_memory=False)
df = df.sample(n=250_000, random_state=42)
df.shape

(250000, 55)

In [3]:
def drop_rows(df):
    df.drop(df[df['DBWT'].eq(9999)].index, inplace=True)
#     df['DBWT'] = np.where(df['DBWT'].eq(9999), df[df['DBWT'].lt(9999)]['DBWT'].mean(), df['DBWT'])
    
    df.drop(df[df['BMI'].eq(99.9)].index, inplace=True)
    df.drop(df[df['DBWT'].eq(9999)].index, inplace=True)
    df.drop(df[df['WTGAIN'].eq(99)].index, inplace=True)
    df.drop(df[df['PWgt_R'].eq(999)].index, inplace=True)
#     df.drop(df[df['FAGECOMB'].eq(99)].index, inplace=True)
    df.drop(df[df['DLMP_MM'].eq(99)].index, inplace=True)
    df.drop(df[df['DLMP_YY'].eq(9999)].index, inplace=True)
    return df


In [4]:
df = drop_rows(df)
df.shape

(231493, 55)

In [5]:
def feature_engineer(df):
    df['binary_sex'] = np.where(df['SEX'] == 'M', 1, 0)
    df['first_birth'] = np.where(df['ILLB_R'].eq(888), 1, 0)

    df['binary_sex'] = np.where(df['SEX'] == 'M', 1, 0)
    df['first_birth'] = np.where(df['ILLB_R'].eq(888), 1, 0)

    conditions = [df['CIG_0'].eq(0),
                  df['CIG_0'].eq(99),
    ]
    choices = ['None',
               'Unknown',
    ]
    df['smoked'] = np.select(conditions, choices, 'Daily')

    
    conditions = [df['PRIORDEAD'].eq(0),
                  df['PRIORDEAD'].eq(99),
    ]
    choices = ['None',
               'Unknown',
    ]
    df['PRIORDEAD_cat'] = np.select(conditions, choices, 'Yes')

    
    conditions = [df['PRIORTERM'].eq(0),
                  df['PRIORTERM'].eq(99),
    ]
    choices = ['None',
               'Unknown',
    ]
    df['PRIORTERM_cat'] = np.select(conditions, choices, 'Yes')

    
    conditions = [df['PRIORLIVE'].eq(0),
                  df['PRIORLIVE'].eq(99),
    ]
    choices = ['None',
               'Unknown',
    ]
    df['PRIORLIVE_cat'] = np.select(conditions, choices, 'Yes')

    
    conditions = [(df['DOB_MM'] > df['DLMP_MM']) & (2018 == df['DLMP_YY']),
                  (df['DOB_MM'] > df['DLMP_MM']) & (2018 > df['DLMP_YY']),
                  (df['DOB_MM'] < df['DLMP_MM']) & (2018 > df['DLMP_YY'])
    ]
    choices = [df['DOB_MM'] - df['DLMP_MM'],
               ((df['DOB_YY'] - df['DLMP_YY'])* 12) + df['DOB_MM'] - df['DLMP_MM'],
               ((df['DOB_YY'] - df['DLMP_YY'])* 12) - df['DLMP_MM'] + df['DOB_MM']

    ]
    df['pregnancy_length'] = np.select(conditions,choices, 12)

    conditions = [df['MAGER'].lt(18)
    ]
    choices = ['Minor'
    ]
    df['MAGER_cat'] = np.select(conditions,choices,'Adult')

    
    condition =[df['pregnancy_length'].eq(9),
                df['pregnancy_length'].eq(8),
                df['pregnancy_length'].eq(10),
                df['pregnancy_length'].lt(7)
    ]
    choices = ['9',
              '8',
               '10',
               'Early'
    ]
    df['pregnancy_length_cat'] = np.select(condition,choices, 'Late')

    
    df['BMI_log'] = np.log(df['BMI'])
    df['first_birth'] = np.where(df['ILLB_R'].eq(888), 1, 0)
    df['plural_delivery'] = np.where(df['ILLB_R'].lt(4), 'Yes', 'No')
    df['first_pregnancy'] = np.where(df['ILP_R'].eq(888), 1, 0)
    df['first_natal'] = np.where(df['ILOP_R'].eq(888),1, 0)
    df['PRECARE'] = np.where(df['PRECARE'].eq(99), 0, df['PRECARE'])
    df['PREVIS'] = np.where(df['PREVIS'].eq(99), 0, df['PREVIS'])
    df['35AGE_older'] = np.where(df['MAGER'].gt(34), 1, 0)

    df['pregnancy_length_WTGAIN'] = df['pregnancy_length']/df['PWgt_R']
    df['MOM_weight'] = (df['M_Ht_In']**2)*df['BMI']/704
    df['WTGAIN_div_MOM_weight'] = df['WTGAIN']/df['MOM_weight']
    df['WTGAIN_div_length'] = df['WTGAIN'] / df['pregnancy_length']
    df['Weight_before'] = df['MOM_weight'] - df['WTGAIN']
    df['WT_percent_gain'] = df['WTGAIN'] / df['PWgt_R']
    return df


In [6]:
df = feature_engineer(df)
df.shape

(231493, 75)

In [7]:
X = df[['ATTEND','BFACIL', 'smoked', 'DOB_MM', 'DMAR','FHISPX','FEDUC', 'FRACE6', 'first_birth', 'plural_delivery', 'first_pregnancy', 'first_natal',
        'IP_GON', 'LD_INDL', 'MAGER', '35AGE_older','MAR_IMP', 'MBSTATE_REC', 'MEDUC', 'MHISPX', 'MRAVE6', 'MTRAN', 'pregnancy_length', 'WTGAIN_div_MOM_weight','WTGAIN_div_length',
        'NO_INFEC','NO_MMORB','NO_RISKS','PAY', 'PAY_REC','PRECARE','PREVIS', 'PRIORDEAD_cat', 'PRIORLIVE_cat', 'PRIORTERM_cat', 'PWgt_R', 'BMI_log','M_Ht_In', 'MOM_weight',
        'RDMETH_REC', 'RESTATUS', 'RF_CESAR', 'SEX', 'WTGAIN','WT_percent_gain','MAGER_cat','pregnancy_length_cat','BMI'
]]
_X = pd.get_dummies(X, columns=['ATTEND', 'BFACIL','smoked', 'DOB_MM','DMAR','FHISPX','FEDUC','FRACE6', 'plural_delivery',
                                'IP_GON','LD_INDL', '35AGE_older','MAR_IMP', 'MBSTATE_REC', 'MEDUC', 'MHISPX', 'MRAVE6', 'MTRAN',
                                'NO_INFEC','NO_MMORB','NO_RISKS', 'PAY', 'PAY_REC','PRIORDEAD_cat', 'PRIORLIVE_cat','PRIORTERM_cat',
                                'RDMETH_REC', 'RESTATUS', 'RF_CESAR', 'SEX','MAGER_cat','pregnancy_length_cat'])
y = df['DBWT']

In [8]:
def feat_eng_dummy(_X):
    _X['MAGER_smoked_Daily'] = _X['MAGER'] * _X['smoked_Daily']
    _X['NO_RISKS_1_length'] = _X['NO_RISKS_1'] * _X['pregnancy_length']
    _X['RDMETH_REC_3_length'] = _X['RDMETH_REC_3'] * _X['pregnancy_length']
    _X['RDMETH_REC_1_length'] = _X['RDMETH_REC_1'] * _X['pregnancy_length']
    _X['ATTEND_1_length'] = _X['pregnancy_length'] * _X['ATTEND_1']
    _X['MRAVE6_1_FRACE6_1'] = _X['MRAVE6_1'] * _X['FRACE6_1']
    _X['BFACIL_1_length'] = _X['pregnancy_length'] * _X['BFACIL_1']
    _X['BMI_log_length'] = _X['BMI_log'] * _X['pregnancy_length']
    _X['M_Ht_In_length'] = _X['DMAR_1'] * _X['pregnancy_length']
    _X['LD_INDL_N_length'] = _X['LD_INDL_N'] * _X['pregnancy_length']
    _X['MTRAN_Y_length'] = _X['MTRAN_Y'] * _X['pregnancy_length']
    _X['PRECARE_length'] = _X['PRECARE'] * _X['pregnancy_length']
    _X['PREVIS_length'] = _X['PREVIS'] * _X['pregnancy_length']
    _X['MOM_weight_length'] = _X['MOM_weight'] * _X['pregnancy_length']

    _X['RDMETH_REC_3_pregnancy_length_cat_9'] = _X['RDMETH_REC_3'] * _X['pregnancy_length_cat_9']
    _X['RF_CESAR_Y_pregnancy_length_cat_9'] = _X['RF_CESAR_Y'] * _X['pregnancy_length_cat_9']
    return _X

In [9]:
_X = feat_eng_dummy(_X)

In [10]:

_X['DBWT'] = y
from statsmodels.formula.api import ols
formula = 'DBWT~BMI_log+WTGAIN+M_Ht_In+SEX_M+BFACIL_3+ATTEND_1+DMAR_1+FRACE6_1+FEDUC_3+FRACE6_1+first_birth+plural_delivery_Yes+LD_INDL_N\
+MBSTATE_REC_1+MEDUC_6+MHISPX_2+MRAVE6_2+MRAVE6_1+MTRAN_Y+NO_RISKS_1+PAY_1+PRECARE+PREVIS+PRIORDEAD_cat_Yes+RDMETH_REC_3+RF_CESAR_Y+MAGER_smoked_Daily+pregnancy_length+MOM_weight\
+WTGAIN_div_MOM_weight+WTGAIN_div_length+NO_RISKS_1_length+RDMETH_REC_3_length+RDMETH_REC_1_length+ATTEND_1_length+BFACIL_1_length+BMI_log_length+M_Ht_In_length\
+LD_INDL_N_length+LD_INDL_N_length+MTRAN_Y_length+NO_RISKS_1_length+PREVIS_length+MOM_weight_length+pregnancy_length_cat_9+pregnancy_length_cat_8\
+pregnancy_length_cat_10+pregnancy_length_cat_Early+RDMETH_REC_3_pregnancy_length_cat_9+RF_CESAR_Y_pregnancy_length_cat_9'
model = ols(formula=formula, data=_X).fit()
model.summary()

0,1,2,3
Dep. Variable:,DBWT,R-squared:,0.365
Model:,OLS,Adj. R-squared:,0.365
Method:,Least Squares,F-statistic:,2831.0
Date:,"Thu, 30 Jul 2020",Prob (F-statistic):,0.0
Time:,01:13:37,Log-Likelihood:,-1750100.0
No. Observations:,231493,AIC:,3500000.0
Df Residuals:,231445,BIC:,3501000.0
Df Model:,47,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-5276.7282,329.974,-15.991,0.000,-5923.468,-4629.988
BMI_log,1491.3511,120.360,12.391,0.000,1255.449,1727.253
WTGAIN,-4.3468,0.481,-9.030,0.000,-5.290,-3.403
M_Ht_In,46.6253,1.018,45.813,0.000,44.631,48.620
SEX_M,115.5772,1.934,59.754,0.000,111.786,119.368
BFACIL_3,88.0371,15.358,5.732,0.000,57.935,118.139
ATTEND_1,-225.8802,28.986,-7.793,0.000,-282.691,-169.069
DMAR_1,-85.4211,22.375,-3.818,0.000,-129.276,-41.566
FRACE6_1,49.2113,2.679,18.367,0.000,43.960,54.463

0,1,2,3
Omnibus:,7072.005,Durbin-Watson:,2.001
Prob(Omnibus):,0.0,Jarque-Bera (JB):,19232.891
Skew:,-0.012,Prob(JB):,0.0
Kurtosis:,4.412,Cond. No.,543000.0
