<a href="https://colab.research.google.com/github/beckyeng226/beckyeng226/blob/main/Capstone_Project_Live_Birth_Data_Working_File_2_1_23.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#import standard libraries

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import matplotlib as mpl
from matplotlib import cm

from sklearn.impute import KNNImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import mean_squared_error

%matplotlib inline
# Global font sizes for every figure in this notebook: axis labels 14 pt, tick labels 12 pt.
mpl.rcParams.update({
    'axes.labelsize': 14,
    'xtick.labelsize': 12,
    'ytick.labelsize': 12,
})

In [None]:
#import data

# Fixed-width record layout: field name -> (start, stop) character offsets.
# pandas.read_fwf treats each pair as a half-open interval [start, stop).
field_layout = {
    'MAGER': (74, 76),
    'MRACE31': (104, 106),
    'MHISP_R': (114, 115),
    'MAR_P': (118, 119),
    'DMAR': (119, 120),
    'MEDUC': (123, 124),
    'FAGECOMB': (146, 148),
    'FRACE31': (150, 152),
    'FHISP_R': (159, 160),
    'FEDUC': (162, 163),
    'PREVIS': (237, 239),
    'WIC': (250, 251),
    'ME_TRIAL': (402, 403),
    'DMETH_REC': (407, 408),
    'OEGEST_COMB': (498, 500),
    'OEGEST_R3': (502, 503),
    'DBWT': (503, 507),
    'BFED': (568, 569),
}
colnames = list(field_layout.keys())
colspecs = list(field_layout.values())

# Only the fields listed above are read from each record.
data = pd.read_fwf('Nat2021US.txt', names=colnames, colspecs=colspecs)
data.head()

## Descriptive and Exploratory Analyses

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
# Pairwise correlations over the numeric columns only. The letter-coded fields
# (MAR_P, WIC, ME_TRIAL, BFED) are strings; pandas >= 2.0 raises on them in corr(),
# and selecting dtypes explicitly also works on pandas versions without `numeric_only`.
data.select_dtypes(include='number').corr()

In [None]:
data.isna().sum()

In [None]:
### marital status of mother: 1=married, 2=unmarried
data['DMAR'].value_counts()

In [None]:
### percentage marital status of mother: 1=married, 2=unmarried
data['DMAR'].value_counts(normalize=True)

In [None]:
### paternity acknowledged: Y, N, U=Unknown, X=Not applicable (because reported married, paternity assumed)
data['MAR_P'].value_counts()

### Maternal Demographic Characteristics

In [None]:
### mother education level
data['MEDUC'] = data.MEDUC.replace([9], [np.nan])
data['MEDUC'].value_counts(normalize=True)

In [None]:
# Unknown education (code 9) was recoded to NaN in the cell above, so the old
# `!= 99` filter never matched anything; drop the NaN rows explicitly instead.
plt.hist(data['MEDUC'].dropna(), bins=8)

In [None]:
### mother age
data['MAGER'].value_counts()

In [None]:
data['MAGER'].loc[data['MAGER'] != 99].mean()

In [None]:
plt.hist(data.MAGER, bins=10)

In [None]:
plt.boxplot(data.MAGER)

In [None]:
### mother race
data['MRACE31'].value_counts(normalize=True)

In [None]:
### mother Hispanic origin
data['MHISP_R'] = data.MHISP_R.replace([9], [np.nan])
data['MHISP_R'].value_counts(normalize=True)


### Paternal Demographic Characteristics

In [None]:
### father age
data['FAGECOMB'] = data.FAGECOMB.replace([99], [np.nan])
data['FAGECOMB'].value_counts(normalize=True)

In [None]:
data['FAGECOMB'].loc[data['FAGECOMB'] != 99].mean()

In [None]:
# FAGECOMB contains NaN after the 99 -> NaN recode; boxplot statistics computed on
# data with NaN come out as NaN and the box is drawn empty, so drop them first.
plt.boxplot(data['FAGECOMB'].dropna())

In [None]:
plt.hist(data.FAGECOMB, bins=9)

In [None]:
### father race
data['FRACE31'] = data.FRACE31.replace([99], [np.nan])
data['FRACE31'].value_counts(normalize=True)

In [None]:
### father Hispanic origin
data['FHISP_R'] = data.FHISP_R.replace([9], [np.nan])
data['FHISP_R'].value_counts()

In [None]:
### father education level
data['FEDUC'] = data.FEDUC.replace([9], [np.nan])
data['FEDUC'].value_counts(normalize=True)

In [None]:
# Unknown education (code 9) is already NaN here, so the old `!= 99` filter was a
# no-op; plot only the observed values.
plt.hist(data['FEDUC'].dropna(), bins=8)

### Pregnancy and Infant Outcomes

In [None]:
### number of prenatal visits
data['PREVIS'] = data.PREVIS.replace([99], [np.nan])
data['PREVIS'].value_counts()

In [None]:
#number of observations with 0 prenatal visits
data['PREVIS'].loc[data['PREVIS'] == 0].count()

In [None]:
# number of observations where number of prenatal visits is unknown
# The 99 "unknown" code was recoded to NaN above, so `== 99` alone would always
# count 0. Count NaN, and still count 99 in case this cell runs before the recode.
(data['PREVIS'].isna() | (data['PREVIS'] == 99)).sum()

In [None]:
data['PREVIS'].loc[(data['PREVIS'] != 0) & (data['PREVIS'] != 99)].count()

In [None]:
#mean number of visits among those who had at least 1 prenatal visit
data['PREVIS'].loc[(data['PREVIS'] != 0) & (data['PREVIS'] != 99)].mean()

In [None]:
#mean number of visits including those who had 0 prenatal visits
data['PREVIS'].loc[data['PREVIS'] != 99].mean()

In [None]:
# PREVIS holds NaN for unknown visit counts; drop them so the box is not drawn empty.
plt.boxplot(data['PREVIS'].dropna())

In [None]:
# received WIC during pregnancy
data['WIC'].value_counts(normalize=True)

In [None]:
# attempted a trial of labor prior to cesarean Y=yes, N=no, U=unknown, X=not applicable (delivered vaginally)
data['ME_TRIAL'].value_counts(normalize=True)

In [None]:
# method of delivery 1=vaginal, 2=cesarean; code 9 is treated as missing and recoded to NaN
data['DMETH_REC'] = data.DMETH_REC.replace([9], [np.nan])
data['DMETH_REC'].value_counts(normalize=True)

In [None]:
# weeks of gestation, range= 17-47, 99=missing
data['OEGEST_COMB'] = data.OEGEST_COMB.replace([99], [np.nan])
data['OEGEST_COMB'].value_counts()

In [None]:
# Missing gestation (code 99) was recoded to NaN above, which made the `!= 99`
# filter a no-op; drop the NaN rows explicitly before binning.
plt.hist(data['OEGEST_COMB'].dropna(), bins=5)

In [None]:
# OEGEST_COMB contains NaN for missing gestation; drop them so the box is not drawn empty.
plt.boxplot(data['OEGEST_COMB'].dropna())

In [None]:
# infant weight in grams at birth; code 9999 is treated as missing and recoded to NaN
data['DBWT'] = data.DBWT.replace([9999], [np.nan])
data['DBWT'].value_counts(normalize=True)

In [None]:
# DBWT contains NaN for missing birth weights; drop them so the box is not drawn empty.
plt.boxplot(data['DBWT'].dropna())

In [None]:
# infant being breastfed at discharge Y=yes, N=no, U=unknown
data['BFED'].value_counts(normalize=True)

## Recoding of Data to Numeric

In [None]:
# WIC receipt as a numeric flag: Y -> 1, N -> 0, U (unknown) -> NaN
wic_codes = {'Y': 1, 'N': 0, 'U': np.nan}
data['WIC_R'] = data['WIC'].replace(wic_codes)

In [None]:
# Recode trial of labor to numeric: Y -> 1, N -> 0, U (unknown) -> NaN.
# X (not applicable, delivered vaginally) is also coded 1, i.e. grouped with 'Y'.
# NOTE(review): presumably because a vaginal delivery implies labor occurred - confirm this is intended.
data['TRIAL_R'] = data.ME_TRIAL.replace(['Y', 'N', 'U', 'X'], [1, 0, np.nan, 1])
data['TRIAL_R'].value_counts(normalize=True)

In [None]:
# Breastfed-at-discharge as a numeric flag: Y -> 1, N -> 0, U (unknown) -> NaN
bfed_codes = {'Y': 1, 'N': 0, 'U': np.nan}
data['BFED_R'] = data['BFED'].replace(bfed_codes)
data['BFED_R'].value_counts(normalize=True)

In [None]:
# recode paternity acknowledged to numeric: Y -> 1, N -> 0, U (unknown) -> NaN.
# X (not applicable because the mother is reported married, paternity assumed) is
# coded 1, i.e. counted as acknowledged.
data['PATERNITY_R'] = data.MAR_P.replace(['Y', 'N', 'U', 'X'], [1, 0, np.nan, 1])
data.head()

In [None]:
data.PATERNITY_R.value_counts()

In [None]:
data.isnull().sum()

In [None]:
data.info()

## Taking a Random Sample for Analyses

In [None]:
# Simple random sample of 1,000,000 records, drawn uniformly without replacement
# (DataFrame.sample does not stratify). random_state pins the draw so the sample,
# and everything computed from it downstream, is the same on every re-run.
sample_data = data.sample(n=1000000, random_state=42)
sample_data.info()

In [None]:
sample_data['WIC'].value_counts(normalize=True)

In [None]:
sample_data['ME_TRIAL'].value_counts(normalize=True)

In [None]:
sample_data['DMETH_REC'].value_counts(normalize=True)

In [None]:
sample_data['BFED'].value_counts(normalize=True)

In [None]:
### mother education level
sample_data['MEDUC'] = sample_data.MEDUC.replace([9], [np.nan])
sample_data['MEDUC'].value_counts(normalize=True)

In [None]:
### mother Hispanic origin
sample_data['MHISP_R'] = sample_data.MHISP_R.replace([9], [np.nan])
sample_data['MHISP_R'].value_counts(normalize=True)

In [None]:
### father age
sample_data['FAGECOMB'] = sample_data.FAGECOMB.replace([99], [np.nan])
sample_data['FAGECOMB'].value_counts(normalize=True)

In [None]:
### father race
sample_data['FRACE31'] = sample_data.FRACE31.replace([99], [np.nan])
sample_data['FRACE31'].value_counts(normalize=True)

In [None]:
### father Hispanic origin
sample_data['FHISP_R'] = sample_data.FHISP_R.replace([9], [np.nan])
sample_data['FHISP_R'].value_counts()

In [None]:
### father education level
sample_data['FEDUC'] = sample_data.FEDUC.replace([9], [np.nan])
sample_data['FEDUC'].value_counts(normalize=True)

In [None]:
### number of prenatal visits
sample_data['PREVIS'] = sample_data.PREVIS.replace([99], [np.nan])
sample_data['PREVIS'].value_counts()

In [None]:
# method of delivery 1=vaginal, 2=cesarean; code 9 is treated as missing and recoded to NaN
sample_data['DMETH_REC'] = sample_data.DMETH_REC.replace([9], [np.nan])
sample_data['DMETH_REC'].value_counts(normalize=True)

In [None]:
# weeks of gestation, range= 17-47, 99=missing
sample_data['OEGEST_COMB'] = sample_data.OEGEST_COMB.replace([99], [np.nan])
sample_data['OEGEST_COMB'].value_counts()

In [None]:
# infant weight in grams at birth; code 9999 is treated as missing and recoded to NaN
sample_data['DBWT'] = sample_data.DBWT.replace([9999], [np.nan])
sample_data['DBWT'].value_counts(normalize=True)

In [None]:
# infant being breastfed at discharge Y=yes, N=no, U=unknown
sample_data['BFED'].value_counts(normalize=True)

In [None]:
#change letters to numbers for WIC data
sample_data['WIC_R'] = sample_data.WIC.replace(['Y', 'N', 'U'], [1, 0, np.nan])

In [None]:
#change letters to numbers for labor trial data 
sample_data['TRIAL_R'] = sample_data.ME_TRIAL.replace(['Y', 'N', 'U', 'X'], [1, 0, np.nan, 1])

In [None]:
#change letters to numbers for breastfeeding data
sample_data['BFED_R'] = sample_data.BFED.replace(['Y', 'N', 'U'], [1, 0, np.nan])

In [None]:
# recode paternity acknowledged to numeric
sample_data['PATERNITY_R'] = sample_data.MAR_P.replace(['Y', 'N', 'U', 'X'], [1, 0, np.nan, 1])

## Imputing Missing Values using KNNImputer()

In [None]:
impute_data=sample_data[['DMAR', 'PATERNITY_R', 'MEDUC', 'MAGER', 'MRACE31', 'MHISP_R', 'FAGECOMB', 'FRACE31', 'FHISP_R', 
                         'FEDUC', 'PREVIS', 'WIC_R', 'TRIAL_R', 'BFED_R', 'DMETH_REC', 'DBWT', 'OEGEST_COMB']]

In [None]:
impute_data.info()

In [None]:
from sklearn.impute import KNNImputer

# Fill every NaN in impute_data from its nearest neighbour: with n_neighbors=1 the
# imputed value is copied from the single closest row, so coded/categorical fields
# keep valid category codes rather than receiving averaged values.
# fit_transform returns a NumPy array (column labels are lost), and this step is
# slow on the full sample - see the export/import sections below.
imputer = KNNImputer(n_neighbors=1)
imputed_data = imputer.fit_transform(impute_data)

In [None]:
# Make df of imputed data
# The imputer returns a bare array whose columns are in the same order as the frame
# it was fitted on, so reuse that frame's labels. The previous hand-typed list had
# 14 names for 17 columns (WIC_R, TRIAL_R and BFED_R were missing), which makes the
# DataFrame constructor raise a shape-mismatch ValueError.
column_values = list(impute_data.columns)
imputed_data = pd.DataFrame(data=imputed_data, columns=column_values)
imputed_data.head()

## Export of Imputed Data 
### Once all data has been imputed, the exported file is used from here on so that KNNImputer, which is very time-intensive, does not have to be re-run.

In [None]:
# Write the imputed sample under the file name that the "Import Imputed Data"
# section reads back ('full_sample.csv'); the export previously used a different
# name, so a fresh run could not find (or silently reused a stale) import file.
# The index is written too and comes back as the 'Unnamed: 0' column dropped below.
imputed_data.to_csv('full_sample.csv')

## Import Imputed Data

In [None]:
# Because running the imputer takes multiple hours, the dataset was exported so can be used without having to run imputer
imputed_data = pd.read_csv('full_sample.csv')

In [None]:
imputed_data.head()

In [None]:
imputed_data = imputed_data.drop(labels='Unnamed: 0', axis=1)

In [None]:
# Turn the two 1/2-coded fields into 1/0 indicators by mapping code 2 to 0
# (DMAR: 1=married, 2=unmarried; DMETH_REC: 1=vaginal, 2=cesarean).
for binary_col in ('DMAR', 'DMETH_REC'):
    imputed_data[binary_col] = imputed_data[binary_col].replace([2.0], [0])

In [None]:
imputed_data.info()

## One Hot Encoding of Categorical Features

In [None]:
from sklearn.preprocessing import OneHotEncoder

categorical_cols = ['MEDUC', 'MRACE31', 'MHISP_R', 'FRACE31', 'FHISP_R', 'FEDUC']

# drop='first' omits the first category of each field to avoid redundant columns.
# The `sparse=False` keyword was renamed to `sparse_output` and later removed from
# scikit-learn, so keep the default sparse result and densify it here; this runs
# unchanged on both old and new releases.
encoder = OneHotEncoder(drop='first')
encoded_matrix = encoder.fit_transform(imputed_data[categorical_cols])

# Reuse imputed_data's index so the later column-wise concat aligns row for row.
OHE_encoded_data = pd.DataFrame(
    encoded_matrix.toarray(),
    index=imputed_data.index,
    columns=encoder.get_feature_names_out(input_features=categorical_cols),
)

In [None]:
OHE_encoded_data.info()

In [None]:
OHE_encoded_data.columns

In [None]:
OHE_encoded_data.isna().sum()

In [None]:
imputed_encoded_data = pd.concat([imputed_data, OHE_encoded_data], axis=1)

## Treatment of Outliers

### Paternal Age

In [None]:
plt.boxplot(imputed_data.FAGECOMB)

In [None]:
q_FAGECOMB = imputed_data.FAGECOMB.quantile(0.997)

In [None]:
q_FAGECOMB

### Prenatal Visits

In [None]:
plt.boxplot(imputed_data.PREVIS)

In [None]:
q_PREVIS = imputed_data.PREVIS.quantile(0.997)
q_PREVIS

### Infant Birth Weight

In [None]:
plt.boxplot(imputed_data.DBWT)

In [None]:
q_DBWT_high = imputed_data.DBWT.quantile(0.997)
q_DBWT_low = imputed_data.DBWT.quantile(0.003)
q_DBWT_high

In [None]:
q_DBWT_low

In [None]:
# Number of birth weights strictly inside the 0.3% / 99.7% cut-offs. Uses the
# quantiles computed above rather than hard-coded gram values, so the count stays
# in step with the filter applied to build final_data if the sample changes.
weight_in_range = (imputed_data['DBWT'] < q_DBWT_high) & (imputed_data['DBWT'] > q_DBWT_low)
imputed_data.loc[weight_in_range, 'DBWT'].count()

## Final Cleaned and Encoded Data Set

In [None]:
# Keep only rows strictly inside the outlier cut-offs: paternal age and prenatal
# visits below their upper quantile, birth weight between its lower and upper quantile.
father_age_ok = imputed_encoded_data['FAGECOMB'].lt(q_FAGECOMB)
visits_ok = imputed_encoded_data['PREVIS'].lt(q_PREVIS)
weight_ok = (imputed_encoded_data['DBWT'].gt(q_DBWT_low)
             & imputed_encoded_data['DBWT'].lt(q_DBWT_high))

final_data = imputed_encoded_data[father_age_ok & visits_ok & weight_ok]

In [None]:
final_data.info()

In [None]:
final_data.to_csv('final_data.csv')

In [3]:
final_data = pd.read_csv('final_data.csv')

In [4]:
final_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 987629 entries, 0 to 987628
Columns: 102 entries, Unnamed: 0 to FEDUC_8.0
dtypes: float64(101), int64(1)
memory usage: 768.6 MB


## Split Data into Train and Test Sets

In [5]:
from sklearn.model_selection import train_test_split

train_set, test_set = train_test_split(final_data, test_size=0.2, random_state=0)

In [6]:
#features training sets

def _ohe_columns(field, first_code, last_code):
    """Return the one-hot column names '<field>_<code>.0' for first_code..last_code inclusive."""
    return [f'{field}_{code}.0' for code in range(first_code, last_code + 1)]


# Encoded father / mother columns, in the order the models are trained on.
_father_ohe = (_ohe_columns('FRACE31', 2, 31)
               + _ohe_columns('FHISP_R', 1, 5)
               + _ohe_columns('FEDUC', 2, 8))
_mother_ohe = (_ohe_columns('MEDUC', 2, 8)
               + _ohe_columns('MRACE31', 2, 31)
               + _ohe_columns('MHISP_R', 1, 5))

# Marital status plus the parent's age lead each feature set.
X_paternal_train = train_set[['DMAR', 'FAGECOMB'] + _father_ohe]
X_maternal_train = train_set[['DMAR', 'MAGER'] + _mother_ohe]
X_combined_train = train_set[['DMAR', 'FAGECOMB', 'MAGER'] + _mother_ohe + _father_ohe]

# responses training sets: binary outcomes for the classifiers, continuous for the regressors
y_clf_train = train_set[['WIC_R', 'TRIAL_R', 'DMETH_REC', 'BFED_R']]
y_reg_train = train_set[['PREVIS', 'DBWT', 'OEGEST_COMB']]

In [7]:
# features test sets
# Select exactly the columns (and column order) of the matching training frame
# instead of repeating the long hand-typed lists, so the train and test feature
# sets can never drift apart through a typo or a one-sided edit.
X_paternal_test = test_set[X_paternal_train.columns]
X_maternal_test = test_set[X_maternal_train.columns]
X_combined_test = test_set[X_combined_train.columns]

# responses test sets
y_clf_test = test_set[y_clf_train.columns]
y_reg_test = test_set[y_reg_train.columns]

In [8]:
train_set.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 790103 entries, 88550 to 985772
Columns: 102 entries, Unnamed: 0 to FEDUC_8.0
dtypes: float64(101), int64(1)
memory usage: 620.9 MB


In [None]:
test_set.info()

## Hyperparameter Tuning for RandomForestClassifier() Models

In [9]:
rf_clf = RandomForestClassifier(n_estimators=100, max_depth=6, min_samples_split=2, random_state=42)
clf_model = MultiOutputClassifier(estimator=rf_clf)

### Paternal Features

In [None]:


# Coarse RandomForestClassifier GridSearch - paternal
# Parameter names carry the 'estimator__' prefix because the forest is wrapped in
# MultiOutputClassifier (clf_model). 27 combinations x 3 CV folds are fitted.
param_grid_coarse_paternal = {'estimator__max_depth':[2, 4, 5],
              'estimator__n_estimators':[250, 500, 1000],
              'estimator__min_samples_split':[3, 5, 7]}

gscv_paternal_clf_coarse = GridSearchCV(clf_model, param_grid_coarse_paternal, n_jobs=-1, cv=3)
# Only the first 200,000 training rows are used for this coarse pass.
gscv_paternal_clf_coarse.fit(X_paternal_train.iloc[:200000], y_clf_train.iloc[:200000])

print("The best coarse parameters are: ", gscv_paternal_clf_coarse.best_params_)

The best coarse parameters are:  {'estimator__max_depth': 5, 'estimator__min_samples_split': 5, 'estimator__n_estimators': 1000}


In [None]:
# Refined RandomForestClassifier GridSearch - paternal

param_grid_refined_paternal = {'estimator__max_depth':[4,5,6,7,8], 
              'estimator__n_estimators': [900, 950, 1000, 1050, 1100], 
              'estimator__min_samples_split': [5,6,7,8,9,10,11,12]}

gscv_paternal_clf_refined = GridSearchCV(clf_model, param_grid_refined_paternal, n_jobs=-1, cv=3)
gscv_paternal_clf_refined.fit(X_paternal_train, y_clf_train)

print("The best refined parameters are: ", gscv_paternal_clf_refined.best_params_)

In [10]:
# Final RandomForestClassifier GridSearch - paternal
# Only max_depth is varied here; n_estimators and min_samples_split are held fixed.
param_grid_final_paternal = {'estimator__max_depth':[12, 14, 16], 
              'estimator__n_estimators': [250], 
              'estimator__min_samples_split': [2]}

gscv_paternal_clf_final = GridSearchCV(clf_model, param_grid_final_paternal, n_jobs=-1, cv=3)
gscv_paternal_clf_final.fit(X_paternal_train, y_clf_train)

# Label the output as the *final* search (it previously said "refined", which made
# it indistinguishable from the refined search's printout).
print("The best final parameters are: ", gscv_paternal_clf_final.best_params_)

The best refined parameters are:  {'estimator__max_depth': 14, 'estimator__min_samples_split': 2, 'estimator__n_estimators': 250}


### Maternal Features 

In [None]:
# Coarse RandomClassifier GridSearch - maternal
param_grid_coarse_maternal = {'estimator__max_depth':[2,4,5,8],
              'estimator__n_estimators':[250, 500, 750, 1000],
              'estimator__min_samples_split':[3, 5, 7, 9]}

gscv_maternal_clf_coarse = GridSearchCV(clf_model, param_grid_coarse_maternal, n_jobs=-1, cv=3)
gscv_maternal_clf_coarse.fit(X_maternal_train, y_clf_train)

print("The best coarse parameters are: ", gscv_maternal_clf_coarse.best_params_)

In [None]:
# Refined RandomForestClassifier GridSearch - maternal

param_grid_refined_maternal = {'estimator__max_depth':[4,5,6, 8], 
              'estimator__n_estimators': [900, 950, 1000, 1050, 1100], 
              'estimator__min_samples_split': [5,6,7,8,9,10,11,12]}

gscv_maternal_clf_refined = GridSearchCV(clf_model, param_grid_refined_maternal, n_jobs=-1, cv=3)
gscv_maternal_clf_refined.fit(X_maternal_train, y_clf_train)

print("The best refined parameters are: ", gscv_maternal_clf_refined.best_params_)

In [None]:
# Final RandomForestClassifier GridSearch - maternal
# NOTE(review): this grid is almost the same as the refined maternal grid (it only
# adds max_depth=7) - confirm a narrower final grid was not intended.
param_grid_final_maternal = {'estimator__max_depth':[4,5,6,7,8], 
              'estimator__n_estimators': [900, 950, 1000, 1050, 1100], 
              'estimator__min_samples_split': [5,6,7,8,9,10,11,12]}

gscv_maternal_clf_final = GridSearchCV(clf_model, param_grid_final_maternal, n_jobs=-1, cv=3)
gscv_maternal_clf_final.fit(X_maternal_train, y_clf_train)

# Label the output as the *final* search (it previously said "refined").
print("The best final parameters are: ", gscv_maternal_clf_final.best_params_)

### Combined Paternal and Maternal Features

In [None]:
# Coarse RandomClassifier GridSearch - combined
param_grid_coarse_combined = {'estimator__max_depth':[2, 4, 5],
              'estimator__n_estimators':[250, 500, 1000],
              'estimator__min_samples_split':[3, 5, 7]}

gscv_combined_clf_coarse = GridSearchCV(clf_model, param_grid_coarse_combined, n_jobs=-1, cv=3)
gscv_combined_clf_coarse.fit(X_combined_train, y_clf_train)

print("The best coarse parameters are: ", gscv_combined_clf_coarse.best_params_)

In [None]:
# Refined RandomForestClassifier GridSearch - combined

param_grid_refined_combined = {'estimator__max_depth':[4,5,6,7,8], 
              'estimator__n_estimators': [900, 950, 1000, 1050, 1100], 
              'estimator__min_samples_split': [5,6,7,8,9,10,11,12]}

gscv_combined_clf_refined = GridSearchCV(clf_model, param_grid_refined_combined, n_jobs=-1, cv=3)
gscv_combined_clf_refined.fit(X_combined_train, y_clf_train)

print("The best refined parameters are: ", gscv_combined_clf_refined.best_params_)

In [None]:
# Final RandomForestClassifier GridSearch - combined
# NOTE(review): this grid is identical to the refined combined grid, so it repeats
# the same search - confirm a narrower final grid was not intended.
param_grid_final_combined = {'estimator__max_depth':[4,5,6,7,8], 
              'estimator__n_estimators': [900, 950, 1000, 1050, 1100], 
              'estimator__min_samples_split': [5,6,7,8,9,10,11,12]}

gscv_combined_clf_final = GridSearchCV(clf_model, param_grid_final_combined, n_jobs=-1, cv=3)
gscv_combined_clf_final.fit(X_combined_train, y_clf_train)

# Label the output as the *final* search (it previously said "refined").
print("The best final parameters are: ", gscv_combined_clf_final.best_params_)

## RandomForestClassifier() Models

In [11]:
# Baseline RandomForestClassifier model on paternal features: library defaults
# except max_depth=20 (not taken from the grid search). MultiOutputClassifier
# fits one independent forest per response column of y_clf_train.
rf_clf_paternal = MultiOutputClassifier(RandomForestClassifier(max_depth=20, random_state=42))
rf_clf_paternal.fit(X_paternal_train, y_clf_train)

MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=20,
                                                       random_state=42))

In [12]:
# predict responses based on the default paternal RandomForestClassifier model
y_pred_rf_clf_paternal = rf_clf_paternal.predict(X_paternal_test)
y_pred_rf_clf_paternal

array([[0., 1., 1., 1.],
       [0., 1., 1., 1.],
       [0., 1., 1., 1.],
       ...,
       [1., 1., 1., 1.],
       [0., 1., 1., 1.],
       [0., 1., 1., 1.]])

In [None]:
# Optimal RandomForestClassifier Model based on GridSearch results for paternal features
# The grid search tuned a MultiOutputClassifier (one forest per response), so the
# tuned model is wrapped the same way; fitting a bare RandomForestClassifier on the
# 4-column target would train a single joint multi-output forest, which is not the
# model the hyperparameters were selected for.
best_paternal_params = gscv_paternal_clf_final.best_params_
optimal_rf_clf_paternal = MultiOutputClassifier(
    RandomForestClassifier(max_depth=best_paternal_params['estimator__max_depth'],
                           n_estimators=best_paternal_params['estimator__n_estimators'],
                           min_samples_split=best_paternal_params['estimator__min_samples_split'],
                           random_state=42))
optimal_rf_clf_paternal.fit(X_paternal_train, y_clf_train)

In [None]:
#predict responses based on the optimal paternal RandomForestClassifier model
y_pred_rf_clf_paternal_optimal = optimal_rf_clf_paternal.predict(X_paternal_test)
y_pred_rf_clf_paternal_optimal

In [13]:
# Baseline RandomForestClassifier model on maternal features: library defaults
# except max_depth=20 (not taken from the grid search). MultiOutputClassifier
# fits one independent forest per response column of y_clf_train.
rf_clf_maternal = MultiOutputClassifier(RandomForestClassifier(max_depth=20, random_state=42))
rf_clf_maternal.fit(X_maternal_train, y_clf_train)

MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=20,
                                                       random_state=42))

In [14]:
# predict responses based on the default maternal RandomForestClassifier model
y_pred_rf_clf_maternal = rf_clf_maternal.predict(X_maternal_test)
y_pred_rf_clf_maternal

array([[0., 1., 1., 1.],
       [0., 1., 1., 1.],
       [0., 1., 1., 1.],
       ...,
       [0., 1., 1., 1.],
       [0., 1., 1., 1.],
       [0., 1., 1., 1.]])

In [None]:
# Optimal RandomForestClassifier Model based on GridSearch results for maternal features
# Wrapped in MultiOutputClassifier to match the estimator the grid search tuned
# (one forest per response) rather than a single joint multi-output forest.
best_maternal_params = gscv_maternal_clf_final.best_params_
optimal_rf_clf_maternal = MultiOutputClassifier(
    RandomForestClassifier(max_depth=best_maternal_params['estimator__max_depth'],
                           n_estimators=best_maternal_params['estimator__n_estimators'],
                           min_samples_split=best_maternal_params['estimator__min_samples_split'],
                           random_state=42))
optimal_rf_clf_maternal.fit(X_maternal_train, y_clf_train)

In [None]:
#predict responses based on the optimal maternal RandomForestClassifier model
y_pred_rf_clf_maternal_optimal = optimal_rf_clf_maternal.predict(X_maternal_test)
y_pred_rf_clf_maternal_optimal

In [25]:
# Baseline RandomForestClassifier model on combined paternal and maternal features:
# library defaults except max_depth=20 (not taken from the grid search).
# MultiOutputClassifier fits one independent forest per response column of y_clf_train.
rf_clf_combined = MultiOutputClassifier(RandomForestClassifier(max_depth=20, random_state=42))
rf_clf_combined.fit(X_combined_train, y_clf_train)

MultiOutputClassifier(estimator=RandomForestClassifier(max_depth=20,
                                                       random_state=42))

In [26]:
# predict responses based on the default combined RandomForestClassifier model
y_pred_rf_clf_combined = rf_clf_combined.predict(X_combined_test)
y_pred_rf_clf_combined

array([[0., 1., 1., 1.],
       [0., 1., 1., 1.],
       [0., 1., 1., 1.],
       ...,
       [1., 1., 1., 1.],
       [0., 1., 1., 1.],
       [0., 1., 1., 1.]])

In [None]:
# Optimal RandomForestClassifier Model based on GridSearch results for combined paternal and maternal features
# Wrapped in MultiOutputClassifier to match the estimator the grid search tuned
# (one forest per response) rather than a single joint multi-output forest.
best_combined_params = gscv_combined_clf_final.best_params_
optimal_rf_clf_combined = MultiOutputClassifier(
    RandomForestClassifier(max_depth=best_combined_params['estimator__max_depth'],
                           n_estimators=best_combined_params['estimator__n_estimators'],
                           min_samples_split=best_combined_params['estimator__min_samples_split'],
                           random_state=42))
optimal_rf_clf_combined.fit(X_combined_train, y_clf_train)

In [None]:
#predict responses based on the optimal combined paternal and maternal RandomForestClassifier model
# Stored under its own `_optimal` name (as for the paternal and maternal models);
# assigning to y_pred_rf_clf_combined would overwrite the baseline model's
# predictions, which the metric cells below score.
y_pred_rf_clf_combined_optimal = optimal_rf_clf_combined.predict(X_combined_test)
y_pred_rf_clf_combined_optimal

## Permutation Importance for RandomForestClassifier() Models

In [None]:
from sklearn.inspection import permutation_importance

# permutation importance for paternal RandomForestClassifier Model
r_clf_paternal = permutation_importance(rf_clf_paternal, X_paternal_test, y_clf_test, n_repeats=10, random_state=42)

In [None]:
r_clf_paternal.importances_mean

In [None]:
r_clf_paternal.importances

In [None]:
# permutation importance for maternal RandomForestClassifier Model
r_clf_maternal = permutation_importance(rf_clf_maternal, X_maternal_test, y_clf_test, n_repeats=10, random_state=42)

In [None]:
r_clf_maternal.importances_mean

In [None]:
r_clf_maternal.importances

In [None]:
# permutation importance for combined RandomForestClassifier Model
r_clf_combined = permutation_importance(rf_clf_combined, X_combined_test, y_clf_test, n_repeats=10, random_state=42)

In [None]:
r_clf_combined.importances_mean

In [None]:
r_clf_combined.importances

### Accuracy, Precision, Recall, and F1 Scores for RandomForestClassifier() Models

In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

#RandomForestClassifier accuracy of default paternal model
# With a multi-column (multilabel) target, accuracy_score is *subset* accuracy:
# a row counts as correct only if all four responses are predicted correctly,
# which is why this figure is much lower than the micro-averaged scores below.
rf_paternal_accuracy = accuracy_score(y_clf_test, y_pred_rf_clf_paternal)
rf_paternal_accuracy

0.4312141186476717

In [None]:
#RandomForestClassifier accuracy of optimal paternal model
# Score the *optimal* model's predictions and display that result; the cell
# previously scored the baseline predictions and echoed the baseline accuracy.
optimal_rf_paternal_accuracy = accuracy_score(y_clf_test, y_pred_rf_clf_paternal_optimal)
optimal_rf_paternal_accuracy

In [16]:
#RandomForestClassifier accuracy of maternal model
rf_maternal_accuracy = accuracy_score(y_clf_test, y_pred_rf_clf_maternal)
rf_maternal_accuracy

0.43215576683575835

In [27]:
#RandomForestClassifier accuracy of combined model
rf_combined_accuracy = accuracy_score(y_clf_test, y_pred_rf_clf_combined)
rf_combined_accuracy

0.4365754381701649

In [18]:
#RandomForestClassifier precision of paternal model
rf_paternal_precision = precision_score(y_clf_test, y_pred_rf_clf_paternal, average='micro')
rf_paternal_precision

0.7469327383567577

In [19]:
# Micro-averaged precision of the maternal RandomForestClassifier on the test labels.
rf_maternal_precision = precision_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_maternal,
    average='micro',
)
rf_maternal_precision

0.7489995487158844

In [28]:
# Micro-averaged precision of the combined RandomForestClassifier on the test labels.
rf_combined_precision = precision_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_combined,
    average='micro',
)
rf_combined_precision

0.7489207271976747

In [20]:
# Micro-averaged recall of the paternal RandomForestClassifier on the test labels.
rf_paternal_recall = recall_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_paternal,
    average='micro',
)
rf_paternal_recall

0.9366509094545233

In [21]:
# Micro-averaged recall of the maternal RandomForestClassifier on the test labels.
rf_maternal_recall = recall_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_maternal,
    average='micro',
)
rf_maternal_recall

0.9339353882411069

In [29]:
# Micro-averaged recall of the combined RandomForestClassifier on the test labels.
rf_combined_recall = recall_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_combined,
    average='micro',
)
rf_combined_recall

0.9391056622048464

In [23]:
# Micro-averaged F1 score of the paternal RandomForestClassifier on the test labels.
rf_paternal_f1 = f1_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_paternal,
    average='micro',
)
rf_paternal_f1

0.8311024279580517

In [24]:
# Micro-averaged F1 score of the maternal RandomForestClassifier on the test labels.
rf_maternal_f1 = f1_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_maternal,
    average='micro',
)
rf_maternal_f1

0.831306272109627

In [31]:
# Micro-averaged F1 score of the combined RandomForestClassifier on the test labels.
rf_combined_f1 = f1_score(
    y_true=y_clf_test,
    y_pred=y_pred_rf_clf_combined,
    average='micro',
)
rf_combined_f1

0.8332994079587189

## RandomForestRegressor() Models

In [32]:
# Default paternal regressor: a depth-20 random forest wrapped in MultiOutputRegressor,
# fitted on the paternal training features against y_reg_train.
rf_reg_paternal = MultiOutputRegressor(
    estimator=RandomForestRegressor(max_depth=20, random_state=42)
).fit(X_paternal_train, y_reg_train)

In [33]:
# Paternal RandomForestRegressor predictions for the test features; echoed as the cell output.
y_pred_rf_reg_paternal = rf_reg_paternal.predict(X=X_paternal_test)
y_pred_rf_reg_paternal

array([[  11.39048315, 3307.77177097,   38.25474835],
       [  11.45855102, 3288.47827941,   38.27528491],
       [  11.05104166, 3132.30925627,   37.74003829],
       ...,
       [  10.50712116, 3293.6082583 ,   38.12360387],
       [  11.09820264, 3236.99167276,   38.62276032],
       [  10.54311823, 3213.17291354,   38.27903393]])

In [37]:
# Default maternal regressor: a depth-20 random forest wrapped in MultiOutputRegressor,
# fitted on the maternal training features against y_reg_train.
rf_reg_maternal = MultiOutputRegressor(
    estimator=RandomForestRegressor(max_depth=20, random_state=42)
).fit(X_maternal_train, y_reg_train)

In [38]:
# Maternal RandomForestRegressor predictions for the test features; echoed as the cell output.
y_pred_rf_reg_maternal = rf_reg_maternal.predict(X=X_maternal_test)
y_pred_rf_reg_maternal

array([[  11.1099296 , 3294.59810081,   38.30883496],
       [  11.48794209, 3267.48933515,   38.03821731],
       [  11.16630066, 3029.67878902,   37.77649994],
       ...,
       [  10.95386638, 3284.23851661,   38.11784988],
       [  11.16602987, 3297.32887096,   38.53938926],
       [  10.4337081 , 3225.99920299,   38.22980417]])

In [None]:
# Default combined regressor: a depth-20 random forest wrapped in MultiOutputRegressor,
# fitted on the combined training features against y_reg_train.
rf_reg_combined = MultiOutputRegressor(
    estimator=RandomForestRegressor(max_depth=20, random_state=42)
).fit(X_combined_train, y_reg_train)

In [None]:
# Combined RandomForestRegressor predictions for the test features; echoed as the cell output.
y_pred_rf_reg_combined = rf_reg_combined.predict(X=X_combined_test)
y_pred_rf_reg_combined

In [36]:
# Test-set mean squared error of the default paternal RandomForestRegressor model
# (a single figure, averaged over the target columns by mean_squared_error's default).
rf_reg_paternal_y_pred = rf_reg_paternal.predict(X=X_paternal_test)
mse_rf_reg_paternal = mean_squared_error(
    y_true=y_reg_test,
    y_pred=rf_reg_paternal_y_pred,
)
mse_rf_reg_paternal

102731.99931139528

In [39]:
# Test-set mean squared error of the default maternal RandomForestRegressor model
# (a single figure, averaged over the target columns by mean_squared_error's default).
rf_reg_maternal_y_pred = rf_reg_maternal.predict(X=X_maternal_test)
mse_rf_reg_maternal = mean_squared_error(
    y_true=y_reg_test,
    y_pred=rf_reg_maternal_y_pred,
)
mse_rf_reg_maternal

102429.51807068492

In [None]:
# Test-set mean squared error of the default combined RandomForestRegressor model.
rf_reg_combined_y_pred = rf_reg_combined.predict(X_combined_test)
mse_rf_reg_combined = mean_squared_error(y_reg_test, rf_reg_combined_y_pred)
# Echo the value so this cell reports its result like the paternal and maternal cells do.
mse_rf_reg_combined

### Hyperparameter Tuning for RandomForestRegressor() Models

In [None]:
# Base estimator for the regressor grid searches. The searches vary max_depth, n_estimators
# and min_samples_split, so the values set here only act as starting defaults.
rf_reg = RandomForestRegressor(n_estimators=100, max_depth=6, min_samples_split=2, random_state=42)
# A regressor must be wrapped in MultiOutputRegressor. The original used MultiOutputClassifier,
# which validates targets as class labels and scores by exact-match accuracy; neither is
# meaningful for a RandomForestRegressor's continuous predictions.
reg_model = MultiOutputRegressor(estimator=rf_reg)

#### Paternal Features Model

In [None]:
# Coarse RandomForestRegressor GridSearch - paternal
# The 'estimator__' prefix routes each parameter to the forest inside the multi-output wrapper.
param_grid_coarse_reg_paternal = {'estimator__max_depth':[2, 4, 5, 8],
              'estimator__n_estimators':[250, 500, 750, 1000],
              'estimator__min_samples_split':[3, 5, 7, 9]}

# Wrap rf_reg in MultiOutputRegressor directly so this search is always a regression search,
# whatever wrapper reg_model currently holds.
gscv_paternal_reg_coarse = GridSearchCV(MultiOutputRegressor(estimator=rf_reg), param_grid_coarse_reg_paternal,
                                        n_jobs=-1, cv=3)
# Tune against the regression targets; the original fitted the regressor search on y_clf_train.
gscv_paternal_reg_coarse.fit(X_paternal_train, y_reg_train)

print("The best coarse parameters are: ", gscv_paternal_reg_coarse.best_params_)

In [None]:
# Refined RandomForestRegressor GridSearch - paternal
# NOTE(review): this grid is identical to the coarse paternal grid, so as written the refined
# stage repeats the coarse search. Narrow the ranges around the coarse best_params_.
param_grid_refined_reg_paternal = {'estimator__max_depth':[2, 4, 5, 8],
              'estimator__n_estimators':[250, 500, 750, 1000],
              'estimator__min_samples_split':[3, 5, 7, 9]}

# Wrap rf_reg in MultiOutputRegressor directly so this search is always a regression search,
# whatever wrapper reg_model currently holds.
gscv_paternal_reg_refined = GridSearchCV(MultiOutputRegressor(estimator=rf_reg), param_grid_refined_reg_paternal,
                                         n_jobs=-1, cv=3)
# Tune against the regression targets; the original fitted the regressor search on y_clf_train.
gscv_paternal_reg_refined.fit(X_paternal_train, y_reg_train)

# Message corrected: this cell reports the refined search, not the coarse one.
print("The best refined parameters are: ", gscv_paternal_reg_refined.best_params_)

In [None]:
# Final RandomForestRegressor GridSearch - paternal
# NOTE(review): this grid is identical to the coarse paternal grid, so as written the final
# stage repeats the coarse search. Narrow the ranges around the refined best_params_.
param_grid_final_reg_paternal = {'estimator__max_depth':[2, 4, 5, 8],
              'estimator__n_estimators':[250, 500, 750, 1000],
              'estimator__min_samples_split':[3, 5, 7, 9]}

# Wrap rf_reg in MultiOutputRegressor directly so this search is always a regression search,
# whatever wrapper reg_model currently holds.
gscv_paternal_reg_final = GridSearchCV(MultiOutputRegressor(estimator=rf_reg), param_grid_final_reg_paternal,
                                       n_jobs=-1, cv=3)
# Tune against the regression targets; the original fitted the regressor search on y_clf_train.
gscv_paternal_reg_final.fit(X_paternal_train, y_reg_train)

# Message corrected: this cell reports the final search, not the coarse one.
print("The best final parameters are: ", gscv_paternal_reg_final.best_params_)

In [None]:
# Coarse-Grained RandomForestRegressor GridSearch - paternal

# param_grid_coarse_rf_reg_paternal = {'max_depth':[1,2,4,5,8,20], 
#               'n_estimators': [250, 500, 1000], 
#               'min_samples_split': [2,4,5,8,12,20]}

# grid_search_cv_coarse_rf_reg_paternal = GridSearchCV(RandomForestRegressor(random_state=42), 
#                                                      param_grid_coarse_rf_reg_paternal, verbose=1, cv=3)
# grid_search_cv_coarse_rf_reg_paternal.fit(X_paternal_train, y_reg_train)

# print("The best parameters are: ", grid_search_cv_coarse_rf_reg_paternal.best_params_)

In [None]:
# Refined RandomForestRegressor GridSearch - paternal

# param_grid_refined_rf_reg_paternal = {'max_depth':[4,5,6,7,8], 
#               'n_estimators': [900, 950, 1000, 1050, 1100], 
#               'min_samples_split': [5,6,7,8,9,10,11,12]}

# grid_search_cv_refined_rf_reg_paternal = GridSearchCV(RandomForestRegressor(random_state=42), 
#                                                       param_grid_refined_rf_reg_paternal, verbose=1, cv=3)
# grid_search_cv_refined_rf_reg_paternal.fit(X_paternal_train, y_reg_train)

# print("The best parameters are: ", grid_search_cv_refined_rf_reg_paternal.best_params_)

In [None]:
# Final RandomForestRegressor GridSearch - paternal

# param_grid_final_rf_reg_paternal = {'max_depth':[5], 
#               'n_estimators': [750, 800, 850, 900, 950, 1000], 
#               'min_samples_split': [8]}

# grid_search_cv_final_rf_reg_paternal = GridSearchCV(RandomForestRegressor(random_state=42), 
#                                                     param_grid_final_rf_reg_paternal, verbose=1, cv=3)
# grid_search_cv_final_rf_reg_paternal.fit(X_paternal_train, y_reg_train)

# print("The best parameters are: ", grid_search_cv_final_rf_reg_paternal.best_params_)

In [None]:
# Optimal RandomForestRegressor Model based on GridSearch results for paternal features
# This definition was commented out, but the generalization-error cell later in the notebook
# calls optimal_rf_reg_paternal.predict(...), which fails with NameError on a fresh run.
# The model is rebuilt from the active final paternal search (gscv_paternal_reg_final), whose
# best_params_ keys carry the 'estimator__' prefix used by the multi-output wrapper.
optimal_rf_reg_paternal = MultiOutputRegressor(
    RandomForestRegressor(max_depth=gscv_paternal_reg_final.best_params_['estimator__max_depth'],
                          n_estimators=gscv_paternal_reg_final.best_params_['estimator__n_estimators'],
                          min_samples_split=gscv_paternal_reg_final.best_params_['estimator__min_samples_split'],
                          random_state=42))
optimal_rf_reg_paternal.fit(X_paternal_train, y_reg_train)

#### Maternal Features Model

In [None]:
# Coarse RandomForestRegressor GridSearch - maternal
# The 'estimator__' prefix routes each parameter to the forest inside the multi-output wrapper.
param_grid_coarse_reg_maternal = {'estimator__max_depth':[2, 4, 5, 8],
              'estimator__n_estimators':[250, 500, 750, 1000],
              'estimator__min_samples_split':[3, 5, 7, 9]}

# Wrap rf_reg in MultiOutputRegressor directly so this search is always a regression search,
# whatever wrapper reg_model currently holds.
gscv_maternal_reg_coarse = GridSearchCV(MultiOutputRegressor(estimator=rf_reg), param_grid_coarse_reg_maternal,
                                        n_jobs=-1, cv=3)
# Tune against the regression targets; the original fitted the regressor search on y_clf_train.
gscv_maternal_reg_coarse.fit(X_maternal_train, y_reg_train)

print("The best coarse parameters are: ", gscv_maternal_reg_coarse.best_params_)

In [None]:
# Coarse RandomForestRegressor GridSearch - maternal
# param_grid_coarse_maternal = {'estimator__max_depth':[2, 4, 5, 8],
#               'estimator__n_estimators':[250, 500, 750, 1000],
#               'estimator__min_samples_split':[3, 5, 7, 9]}

# gscv_maternal_reg_coarse = GridSearchCV(reg_model, param_grid_coarse_maternal, n_jobs=-1, cv=3)
# gscv_maternal_reg_coarse.fit(X_maternal_train, y_clf_train)

# print("The best coarse parameters are: ", gscv_maternal_reg_coarse.best_params_)

In [None]:
# Second-pass (refined) hyperparameter search for the maternal RandomForestRegressor.

param_grid_refined_rf_reg_maternal = dict(
    max_depth=[4, 5, 6, 7, 8],
    n_estimators=[900, 950, 1000, 1050, 1100],
    min_samples_split=[5, 6, 7, 8, 9, 10, 11, 12],
)

# 3-fold cross-validated search over the grid above, fitted on the maternal training data.
grid_search_cv_refined_rf_reg_maternal = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_refined_rf_reg_maternal,
    cv=3,
    verbose=1,
)
grid_search_cv_refined_rf_reg_maternal.fit(X_maternal_train, y_reg_train)

print("The best parameters are: ", grid_search_cv_refined_rf_reg_maternal.best_params_)

In [None]:
# Final hyperparameter search for the maternal RandomForestRegressor: max_depth and
# min_samples_split are pinned to one value each, so only n_estimators varies.

param_grid_final_rf_reg_maternal = dict(
    max_depth=[5],
    n_estimators=[750, 800, 850, 900, 950, 1000],
    min_samples_split=[8],
)

# 3-fold cross-validated search over the grid above, fitted on the maternal training data.
grid_search_cv_final_rf_reg_maternal = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_final_rf_reg_maternal,
    cv=3,
    verbose=1,
)
grid_search_cv_final_rf_reg_maternal.fit(X_maternal_train, y_reg_train)

print("The best parameters are: ", grid_search_cv_final_rf_reg_maternal.best_params_)

In [None]:
# Maternal RandomForestRegressor rebuilt with the winning settings of the final maternal search.
# best_params_ holds exactly the three searched keys (max_depth, n_estimators,
# min_samples_split), so unpacking it is the same as passing each one by name.
optimal_rf_reg_maternal = RandomForestRegressor(
    random_state=42,
    **grid_search_cv_final_rf_reg_maternal.best_params_
)
optimal_rf_reg_maternal.fit(X_maternal_train, y_reg_train)

#### Combined Paternal and Maternal Features Model

In [None]:
# First-pass (coarse) hyperparameter search for the combined-features RandomForestRegressor.

param_grid_coarse_rf_reg_combined = dict(
    max_depth=[1, 2, 4, 5, 8, 20],
    n_estimators=[250, 500, 1000],
    min_samples_split=[2, 4, 5, 8, 12, 20],
)

# 3-fold cross-validated search over the grid above, fitted on the combined training data.
grid_search_cv_coarse_rf_reg_combined = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_coarse_rf_reg_combined,
    cv=3,
    verbose=1,
)
grid_search_cv_coarse_rf_reg_combined.fit(X_combined_train, y_reg_train)

print("The best parameters are: ", grid_search_cv_coarse_rf_reg_combined.best_params_)

In [None]:
# Second-pass (refined) hyperparameter search for the combined-features RandomForestRegressor.

param_grid_refined_rf_reg_combined = dict(
    max_depth=[4, 5, 6, 7, 8],
    n_estimators=[900, 950, 1000, 1050, 1100],
    min_samples_split=[5, 6, 7, 8, 9, 10, 11, 12],
)

# 3-fold cross-validated search over the grid above, fitted on the combined training data.
grid_search_cv_refined_rf_reg_combined = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_refined_rf_reg_combined,
    cv=3,
    verbose=1,
)
grid_search_cv_refined_rf_reg_combined.fit(X_combined_train, y_reg_train)

print("The best parameters are: ", grid_search_cv_refined_rf_reg_combined.best_params_)

In [None]:
# Final hyperparameter search for the combined-features RandomForestRegressor: max_depth and
# min_samples_split are pinned to one value each, so only n_estimators varies.

param_grid_final_rf_reg_combined = dict(
    max_depth=[5],
    n_estimators=[750, 800, 850, 900, 950, 1000],
    min_samples_split=[8],
)

# 3-fold cross-validated search over the grid above, fitted on the combined training data.
grid_search_cv_final_rf_reg_combined = GridSearchCV(
    estimator=RandomForestRegressor(random_state=42),
    param_grid=param_grid_final_rf_reg_combined,
    cv=3,
    verbose=1,
)
grid_search_cv_final_rf_reg_combined.fit(X_combined_train, y_reg_train)

print("The best parameters are: ", grid_search_cv_final_rf_reg_combined.best_params_)

In [None]:
# Combined-features RandomForestRegressor rebuilt with the winning settings of the final
# combined search. best_params_ holds exactly the three searched keys (max_depth,
# n_estimators, min_samples_split), so unpacking it is the same as passing each one by name.
optimal_rf_reg_combined = RandomForestRegressor(
    random_state=42,
    **grid_search_cv_final_rf_reg_combined.best_params_
)
optimal_rf_reg_combined.fit(X_combined_train, y_reg_train)

### Permutation Importance for RandomForestRegressor() Models

In [None]:
# Paternal RandomForestRegressor: permutation importance on the paternal test features
# (10 shuffles per feature, seeded for reproducibility).
r_reg_paternal = permutation_importance(
    estimator=rf_reg_paternal,
    X=X_paternal_test,
    y=y_reg_test,
    n_repeats=10,
    random_state=42,
)

In [None]:
# Mean importance of each paternal feature for the regressor, averaged over the permutation repeats.
r_reg_paternal.importances_mean

In [None]:
# Raw per-repeat importances for the paternal regressor. The original cell repeated
# importances_mean here; the maternal and combined sections show .importances at this point.
r_reg_paternal.importances

In [None]:
# Maternal RandomForestRegressor: permutation importance on the maternal test features
# (10 shuffles per feature, seeded for reproducibility).
r_reg_maternal = permutation_importance(
    estimator=rf_reg_maternal,
    X=X_maternal_test,
    y=y_reg_test,
    n_repeats=10,
    random_state=42,
)

In [None]:
# Mean importance of each maternal feature for the regressor, averaged over the permutation repeats.
r_reg_maternal.importances_mean

In [None]:
# Raw importances for the maternal regressor: one value per feature per permutation repeat.
r_reg_maternal.importances

In [None]:
# Combined RandomForestRegressor: permutation importance on the combined test features
# (10 shuffles per feature, seeded for reproducibility).
r_reg_combined = permutation_importance(
    estimator=rf_reg_combined,
    X=X_combined_test,
    y=y_reg_test,
    n_repeats=10,
    random_state=42,
)

In [None]:
# Mean importance of each combined-model feature for the regressor, averaged over the permutation repeats.
r_reg_combined.importances_mean

In [None]:
# Raw importances for the combined regressor: one value per feature per permutation repeat.
r_reg_combined.importances

### Generalization Error for RandomForestRegressor() Models

In [None]:
# Test-set mean squared error of the three tuned ("optimal") RandomForestRegressor models.
# mean_squared_error is already imported in the notebook's first cell, so it is not
# re-imported here.
# Requires optimal_rf_reg_paternal, optimal_rf_reg_maternal and optimal_rf_reg_combined to
# have been defined by their "Optimal RandomForestRegressor Model" cells.
# NOTE(review): each MSE is one figure averaged uniformly over all target columns. The
# predictions printed earlier show one column in the thousands and the others in the tens,
# so the large-scale target likely dominates; consider multioutput='raw_values' for
# per-target errors.

#https://scikit-learn.org/stable/auto_examples/ensemble/plot_random_forest_regression_multioutput.html#sphx-glr-auto-examples-ensemble-plot-random-forest-regression-multioutput-py
# rf_reg_paternal.score()

#generalization error paternal RandomForestRegressor model
optimal_rf_reg_paternal_y_pred = optimal_rf_reg_paternal.predict(X_paternal_test)
optimal_mse_rf_reg_paternal = mean_squared_error(y_reg_test, optimal_rf_reg_paternal_y_pred)

#generalization error maternal RandomForestRegressor model
optimal_rf_reg_maternal_y_pred = optimal_rf_reg_maternal.predict(X_maternal_test)
optimal_mse_rf_reg_maternal = mean_squared_error(y_reg_test, optimal_rf_reg_maternal_y_pred)

#generalization error combined RandomForestRegressor model
optimal_rf_reg_combined_y_pred = optimal_rf_reg_combined.predict(X_combined_test)
optimal_mse_rf_reg_combined = mean_squared_error(y_reg_test, optimal_rf_reg_combined_y_pred)


print("Generalization error for Optimal Paternal Random Forest Regressor Model: ", round(optimal_mse_rf_reg_paternal, 4))
print("Generalization error for Optimal Maternal Random Forest Regressor Model: ", round(optimal_mse_rf_reg_maternal, 4))
print("Generalization error for Optimal Combined Random Forest Regressor Model: ", round(optimal_mse_rf_reg_combined, 4))