In [1]:
############    Import all the packages that need in this code    ############
import pandas as pd # data science essentials
import matplotlib.pyplot as plt # essential graphical output
import seaborn as sns # enhanced graphical output
import numpy as np # mathematical essentials
import statsmodels.formula.api as smf # regression modeling
import sklearn.linear_model
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

############    Read the Excel dataset    ############
# Path is relative to the notebook's working directory.
file = "./birthweight_low.xlsx"

# Load the workbook into the DataFrame that the rest of the notebook works on.
bw = pd.read_excel(io=file)

# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() emitted a stray "None" line.
bw.info()
print('Missing valueinformation')
print(bw.isnull().sum())

########## Clean the data and fill the missing values with the mode #########
# Row 0 of DataFrame.mode() holds the (first) mode of each selected column.
mode_list = bw[['meduc','npvis','feduc']].mode()

# The filled column is assigned back to the frame instead of calling
# fillna(inplace=True) on the bw[...] selection: that chained in-place form
# raises a FutureWarning in pandas 2.x and leaves bw unchanged under
# Copy-on-Write.
meduct_fill = mode_list.iloc[0,0]
bw['meduc'] = bw['meduc'].fillna(meduct_fill)

npvis_fill = mode_list.iloc[0,1]
bw['npvis'] = bw['npvis'].fillna(npvis_fill)

feduc_fill = mode_list.iloc[0,2]
bw['feduc'] = bw['feduc'].fillna(feduc_fill)


###########   Features Engineering   ###########
# Each indicator is built with one vectorised comparison cast to int64, which
# yields the same 0/1 columns as the earlier row-by-row iterrows() loops
# (missing values compare as False and therefore map to 0).

###### Age
# Combined age of both parents.
bw['parents_age'] = bw['mage'] + bw['fage']
# 1 when the mother is younger than 35, else 0.
bw['mage_young'] = (bw['mage'] < 35).astype('int64')
# 1 when the father is younger than 55, else 0.
bw['fage_young'] = (bw['fage'] < 55).astype('int64')

###### Cigs & Drinks
# 1 when 'cigs' exceeds 20 (one pack), else 0.
bw['cig_too_much'] = (bw['cigs'] > 20).astype('int64')

###### Edu
# Absolute difference between the parents' education values.
bw['par_edugap'] = np.abs(bw['meduc'] - bw['feduc'])

###### Prenatal care and prenatal visits
# 1 when there were more than 13 prenatal visits, else 0.
bw['npvis_enough'] = (bw['npvis'] > 13).astype('int64')
        
##############################      Train and Test Data       ##################################
# With every engineered column in place, assemble the modelling inputs.
# Names of the columns used as explanatory variables in the models.
x_varaiables = [
    'parents_age', 'mage_young', 'fage_young',
    'cigs', 'drink', 'cig_too_much',
    'par_edugap',
    'monpre', 'npvis_enough',
]

# Feature matrix (x) and response column (y).
x = bw[x_varaiables]
y = bw["bwght"]

# Hold out 25% of the rows for testing; the fixed seed keeps the split repeatable.
train_x, test_x, train_y, test_y = train_test_split(
    x, y, test_size=0.25, random_state=219
)

# Re-attach the response to each partition so both are available as single frames.
train_set = pd.concat(objs=[train_x, train_y], axis="columns")
test_set = pd.concat(objs=[test_x, test_y], axis="columns")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 196 entries, 0 to 195
Data columns (total 18 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   mage    196 non-null    int64  
 1   meduc   193 non-null    float64
 2   monpre  196 non-null    int64  
 3   npvis   193 non-null    float64
 4   fage    196 non-null    int64  
 5   feduc   189 non-null    float64
 6   omaps   196 non-null    int64  
 7   fmaps   196 non-null    int64  
 8   cigs    196 non-null    int64  
 9   drink   196 non-null    int64  
 10  male    196 non-null    int64  
 11  mwhte   196 non-null    int64  
 12  mblck   196 non-null    int64  
 13  moth    196 non-null    int64  
 14  fwhte   196 non-null    int64  
 15  fblck   196 non-null    int64  
 16  foth    196 non-null    int64  
 17  bwght   196 non-null    int64  
dtypes: float64(3), int64(15)
memory usage: 27.7 KB
None
Missing valueinformation
mage      0
meduc     3
monpre    0
npvis     3
fage      0
feduc     7
o

The reasoning behind the feature engineering:
###### Age
Based on my research, birth weight is highly influenced by the age of the mother. However, the effect of the father cannot be ignored, because the quality of the sperm also affects birth weight. So I combined the ages of the mother and the father.
Medical research also shows that fertility begins to drop more quickly starting between the ages of 35 and 37. So I created a categorical variable to check whether the mother or the father is still young for a pregnancy.
###### Cigarettes & Drink
CDC reports show that one in every five babies born to mothers who smoke during pregnancy has a low birth weight. Secondhand smoke also has an influence. So I flagged those who smoke more than one pack (20 cigarettes) per day, on the assumption that this greatly increases the likelihood of low birth weight.
###### Education
The scatter plots of the mother's and father's years of education did not show much of a trend, which gave me the idea that the gap between the parents' education might affect birth weight. So I used the absolute education gap.
###### Race
Birth weight is affected by both the mother and the father, so I combined the race information from both sides. However, based on the coefficients in the Lasso model, race has no influence on birth weight, so I did not use race as a variable.
###### Prenatal care and prenatal visits
According to medical research, the adequacy of prenatal care influences the risk of low birth weight. However, adequacy of care does not perform well for high-risk women.
###### Reference:
1. https://www.ncbi.nlm.nih.gov/pmc/articles/PMC1470584/
2. https://health.mo.gov/data/mica/CDP_MICA/MICH_PreDefinitionofIndicators.html
3. https://www.healthline.com/health/womens-health/childbearing-age
4. https://www.cdc.gov/tobacco/basic_information/health_effects/pregnancy/index.htm

In [3]:
#####################     OLS Regression     #####################
# INSTANTIATING an OLS model
lr = LinearRegression()

# FITTING to the training data
lr_fit = lr.fit(train_x,train_y)

# PREDICTING on new data
lr_pred = lr_fit.predict(test_x)

# saving scoring data for future use
# The built-in round() is used instead of the .round() method because it works
# both for NumPy scalars and for plain Python floats; .round() exists only on
# the former, so the cell would fail if score() returns a plain float.
lr_train_score = round(lr.score(train_x,train_y), 4) # using R-square
lr_test_score  = round(lr.score(test_x,test_y), 4)   # using R-square

# gap between the training and testing scores of the OLS model
lr_test_gap = round(abs(lr_train_score - lr_test_score), 4)

#####################     Lasso     #####################
# INSTANTIATING a lasso model
# The `normalize` argument of Lasso was deprecated in scikit-learn 1.0 and
# removed in 1.2, so passing it raises a TypeError on current versions.
# This follows the migration recipe from the scikit-learn deprecation notice:
# scale the features with StandardScaler(with_mean=False) and multiply the
# original alpha (1.0) by sqrt(n_samples) to keep the same penalty strength.
# NOTE(review): lasso_model is now a Pipeline; the fitted Lasso step is
# lasso_model[-1], and its coef_ refer to the scaled features.
lasso_model = make_pipeline(
    StandardScaler(with_mean = False),
    sklearn.linear_model.Lasso(alpha = 1.0 * np.sqrt(len(train_x))),
)

# FITTING to the training data
lasso_fit = lasso_model.fit(train_x,train_y)

# PREDICTING on new data
lasso_pred = lasso_fit.predict(test_x)

# Lasso scores on the train and test sets
# Built-in round() works for both NumPy scalars and plain Python floats.
lasso_train_score = round(lasso_model.score(train_x,train_y), 4) # using R-square
lasso_test_score  = round(lasso_model.score(test_x,test_y), 4)   # using R-square


# gap between the training and testing scores of the Lasso model
lasso_test_gap = round(abs(lasso_train_score - lasso_test_score), 4)

#####################     ARD    #####################
# INSTANTIATING an ARD model
ard_model = sklearn.linear_model.ARDRegression()

# FITTING the training data
ard_fit = ard_model.fit(train_x,train_y)

# PREDICTING on new data
ard_pred = ard_fit.predict(test_x)

# ARD scores on the train and test sets
# The built-in round() is used instead of the .round() method because it works
# both for NumPy scalars and for plain Python floats; .round() exists only on
# the former, so the cell would fail if score() returns a plain float.
ard_train_score = round(ard_model.score(train_x,train_y), 4)
ard_test_score  = round(ard_model.score(test_x,test_y), 4)

# gap between the training and testing scores of the ARD model
ard_test_gap = round(abs(ard_train_score - ard_test_score), 4)

# displaying result
# Each score is rendered with a fixed width and four decimals so the values
# stay under their column headers; with hand-typed spacing, a score with fewer
# digits (e.g. 0.721) shifted every column to its right.
# Widths follow the header row: 16 (model), 17 (train), 16 (test) characters.
print(f"""
-------------       <Final Model I choose>       -------------
Model type      Train Score      Test Score      Train-Test Gap
----------      -----------      ----------      --------------
OLS             {lr_train_score:<17.4f}{lr_test_score:<16.4f}{lr_test_gap:.4f}

The other model performance
Model type      Train Score      Test Score      Train-Test Gap
----------      -----------      ----------      --------------
Lasso           {lasso_train_score:<17.4f}{lasso_test_score:<16.4f}{lasso_test_gap:.4f}
ARD             {ard_train_score:<17.4f}{ard_test_score:<16.4f}{ard_test_gap:.4f}
""")



-------------       <Final Model I choose>       -------------
Model type      Train Score      Test Score      Train-Test Gap
----------      -----------      ----------      --------------
OLS             0.721           0.7128           0.0082

The other model performance
Model type      Train Score      Test Score      Train-Test Gap
----------      -----------      ----------      --------------
Lasso           0.7186           0.7047          0.0139
ARD             0.7174           0.6998          0.0176

