In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import xgboost as xgb
import matplotlib.pyplot as plt
import warnings                               
warnings.filterwarnings('ignore')
from sklearn.impute import KNNImputer
from sklearn.preprocessing import LabelEncoder, StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [2]:
test_gen1 = pd.read_csv('adv-ml-2025/gen1_test_comp_final.csv') # ages 0 to 20 
test_gen2 = pd.read_csv('adv-ml-2025/gen2_test_upto9_comp_final.csv') # ages from 0 to 9

In [3]:
train_gen1 = pd.read_csv('adv-ml-2025/gen1_train_comp_final.csv') # ages 0 to 20
train_gen2 = pd.read_csv('adv-ml-2025/gen2_train_comp_final.csv') # ages 0 to 18

**Examining the data**

Training data:

In [4]:
train_gen1.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth,age,SHgt_cm
0,774,F,0.1,56.961812
1,774,F,0.25,64.82619
2,774,F,0.5,74.340764
3,774,F,0.75,79.747338
4,774,F,1.0,84.092569


In [5]:
train_gen1['age'].unique()

array([ 0.1 ,  0.25,  0.5 ,  0.75,  1.  ,  1.5 ,  2.  ,  3.  ,  4.  ,
        5.  ,  6.  ,  7.  ,  8.  ,  9.  ,  9.5 , 10.  , 10.5 , 11.  ,
       11.5 , 12.  , 12.5 , 13.  , 13.5 , 14.  , 14.5 , 15.  , 15.5 ,
       16.  , 16.5 , 17.  , 17.5 , 18.  , 18.5 , 19.  , 19.5 , 20.  ])

In [6]:
train_gen2.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,3012,M,mother,636,0.1,56.251625,4.636903
1,3012,M,mother,636,0.25,64.491579,
2,3012,M,mother,636,0.5,70.465927,
3,3012,M,mother,636,0.75,73.992677,
4,3012,M,mother,636,1.0,79.343537,


In [7]:
train_gen2['AgeGr'].unique()

array([ 0.1 ,  0.25,  0.5 ,  0.75,  1.  ,  1.5 ,  2.  ,  3.  ,  4.  ,
        5.  ,  6.  ,  7.  ,  8.  ,  9.  , 10.  , 11.  , 12.  , 13.  ,
       14.  , 15.  , 16.  , 18.  ])

Test data:

In [8]:
test_gen1.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth,age,SHgt_cm
0,768,F,0.1,53.822825
1,768,F,0.25,61.455579
2,768,F,0.5,69.757527
3,768,F,0.75,73.385477
4,768,F,1.0,78.129137


In [9]:
test_gen1['age'].unique()

array([ 0.1 ,  0.25,  0.5 ,  0.75,  1.  ,  1.5 ,  2.  ,  3.  ,  4.  ,
        5.  ,  6.  ,  7.  ,  8.  ,  9.  ,  9.5 , 10.  , 10.5 , 11.  ,
       11.5 , 12.  , 12.5 , 13.  , 13.5 , 14.  , 14.5 , 15.  , 15.5 ,
       16.  , 16.5 , 17.  , 17.5 , 18.  , 18.5 , 19.  , 19.5 , 20.  ])

In [10]:
test_gen2.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,2831,F,mother,455,0.1,52.912025,
1,2831,F,mother,455,0.25,59.532779,
2,2831,F,mother,455,0.5,67.733527,
3,2831,F,mother,455,0.75,70.450677,
4,2831,F,mother,455,1.0,74.991937,


In [11]:
test_gen2['AgeGr'].unique()

array([0.1 , 0.25, 0.5 , 0.75, 1.  , 1.5 , 2.  , 3.  , 4.  , 5.  , 6.  ,
       7.  , 8.  , 9.  ])

**Data Preprocessing:**

Mismatch in parent sex for a few gen_2 subjects:

In [12]:
gen2_train_ids = list(train_gen2['gen2_id'].unique())

for gen2_id in gen2_train_ids: # checking if study_parent_sex switches for a given child in the the gen_2 training data
    gen2_id_df = train_gen2[train_gen2['gen2_id'] == gen2_id]
    if len(gen2_id_df['study_parent_sex'].unique()) > 1: # there is a switch in parent
            parent = gen2_id_df['study_parent_id_new'].unique()[0]
            if parent in list(train_gen1['gen1_id'].unique()): # there are children in the gen_2 training data whose parent does not appear in the gen 1 training data
                correct_parent = train_gen1[train_gen1['gen1_id'] == parent]['sex_assigned_at_birth'].unique()[0] # looking up the parent in the gen 1 dataset to find their assigned sex
                if correct_parent == 'F':
                    study_parent_sex = 'mother'
                else:
                    study_parent_sex = 'father'
                train_gen2['study_parent_sex'] = np.where(train_gen2['gen2_id'] == gen2_id, study_parent_sex, train_gen2['study_parent_sex'])

train_gen2.head(30)

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,3012,M,father,636,0.1,56.251625,4.636903
1,3012,M,father,636,0.25,64.491579,
2,3012,M,father,636,0.5,70.465927,
3,3012,M,father,636,0.75,73.992677,
4,3012,M,father,636,1.0,79.343537,
5,3012,M,father,636,1.5,86.061664,
6,3012,M,father,636,2.0,92.497582,
7,3012,M,father,636,3.0,101.012604,
8,3012,M,father,636,4.0,111.528387,
9,3012,M,father,636,5.0,118.245241,26.368025


Handling missing values: We'll check for missing values in each dataset. If any exist, we'll fill them in using linear interpolation

In [13]:
train_gen1.isna().sum()

gen1_id                    0
sex_assigned_at_birth      0
age                        0
SHgt_cm                  310
dtype: int64

In [14]:
train_gen1 = train_gen1.fillna(method='ffill')
train_gen1.isna().sum()

gen1_id                  0
sex_assigned_at_birth    0
age                      0
SHgt_cm                  0
dtype: int64

In [15]:
train_gen2.isna().sum()

gen2_id                     0
sex_assigned_at_birth       0
study_parent_sex            0
study_parent_id_new         0
AgeGr                       0
SHgt_cm                   512
Wgt_kg                   2045
dtype: int64

In [16]:
train_gen2 = train_gen2.fillna(method='ffill')
train_gen2.isna().sum()

gen2_id                  0
sex_assigned_at_birth    0
study_parent_sex         0
study_parent_id_new      0
AgeGr                    0
SHgt_cm                  0
Wgt_kg                   0
dtype: int64

In [17]:
test_gen1.isna().sum()

gen1_id                    0
sex_assigned_at_birth      0
age                        0
SHgt_cm                  216
dtype: int64

In [18]:
test_gen1 = test_gen1.fillna(method='ffill')
test_gen1.isna().sum()

gen1_id                  0
sex_assigned_at_birth    0
age                      0
SHgt_cm                  0
dtype: int64

In [19]:
test_gen2.isna().sum()

gen2_id                    0
sex_assigned_at_birth      0
study_parent_sex           0
study_parent_id_new        0
AgeGr                      0
SHgt_cm                  132
Wgt_kg                   823
dtype: int64

In [20]:
test_gen2 = test_gen2.fillna(method='ffill')
test_gen2.isna().sum()

gen2_id                  0
sex_assigned_at_birth    0
study_parent_sex         0
study_parent_id_new      0
AgeGr                    0
SHgt_cm                  0
Wgt_kg                   9
dtype: int64

### Part B: What features of a parent’s growth curve are most predictive (if at all) of the magnitude (amount grown) of a child’s pubertal growth spurt, typically occurring between the ages of 9 and 15)? ###

We will fit a linear regression model on our data to determine which features of a parent's growth curve are the most predictive of the magnitude of a child's pubertal growth spurt. 

**More Data Preprocessing and Feature Engineering:**

First, we've calculated the growth spurt magnitude in Gen 2 by creating a new datafram called gen2_growth_spurt and subtracting their heights at age 9 from their heights at age 15. 

In [21]:
# creating two seperate dataframes for gen_2 at age 9 vs. age 15
gen2_age_9 = train_gen2[train_gen2['AgeGr'] == 9]
gen2_age_15 = train_gen2[train_gen2['AgeGr'] == 15]

# dropping 'Wgt_kg' as weight is a variable that changes with height and probably won't impact growth spurt. also dropping age as well
gen2_age_9 = gen2_age_9.drop(['Wgt_kg', 'AgeGr'], axis=1)
gen2_age_15 = gen2_age_15.drop(['Wgt_kg', 'AgeGr', 'sex_assigned_at_birth', 'study_parent_sex','study_parent_id_new'], axis=1)

gen2_age_9 = gen2_age_9[['gen2_id', 'sex_assigned_at_birth', 'study_parent_sex','study_parent_id_new', 'SHgt_cm']].rename(columns={'study_parent_id_new' : 'gen1_id', 'SHgt_cm' : 'SHgt_cm_9'})
gen2_age_15 = gen2_age_15[['gen2_id', 'SHgt_cm']].rename(columns={'sex_assigned_at_birth':'sex_assigned_at_birth_15', 'study_parent_sex' : 'study_parent_sex_15','study_parent_id_new' : 'study_parent_id_new_15', 'SHgt_cm' : 'SHgt_cm_15'})

gen2_growth_spurt = pd.merge(gen2_age_9, gen2_age_15, on='gen2_id')

gen2_growth_spurt['gen_2_height_diff'] = gen2_growth_spurt['SHgt_cm_15'] - gen2_growth_spurt['SHgt_cm_9'] # calculating gen_2's growth spurt magnitude by subtracting their heights at age 9 from their heights at age 15

gen2_growth_spurt.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,gen1_id,SHgt_cm_9,SHgt_cm_15,gen_2_height_diff
0,3012,M,father,636,143.319816,181.650013,38.330196
1,2830,F,mother,712,138.648385,159.748067,21.099682
2,2829,F,father,662,134.597483,167.438765,32.841282
3,2827,M,mother,744,130.375568,170.25579,39.880222
4,2825,M,mother,570,135.400949,135.400949,0.0


Applying feature engineering on the parent's growth curves to help identify features that might predict their children's growth spurt. We first created a dataframe identifying each parent's height at ages 9, 15, and 20

In [22]:
list(train_gen1.columns) # getting a list of columns in train_gen1

['gen1_id', 'sex_assigned_at_birth', 'age', 'SHgt_cm']

In [23]:
train_gen1_cols = ['gen1_id', 'SHgt_cm']

# creating seperate dataframes for gen_1 at ages 9,15, and 20
gen1_age_9 = train_gen1[train_gen1['age'] == 9]
gen1_age_12 = train_gen1[train_gen1['age'] == 12]
gen1_age_15 = train_gen1[train_gen1['age'] == 15]

# dropping 'Wgt_kg' as weight is a variable that changes with height and probably won't impact growth spurt. also dropping age as well
gen1_age_9 = gen1_age_9.drop(['age'], axis=1)
gen1_age_12 = gen1_age_12.drop(['sex_assigned_at_birth','age'], axis=1)
gen1_age_15 = gen1_age_15.drop(['sex_assigned_at_birth','age'], axis=1)

# renaming columns to easily identify which columns correspond to a specific age group in the merged parent dataframe
gen1_age_9 = gen1_age_9[['gen1_id', 'sex_assigned_at_birth', 'SHgt_cm']].rename(columns={'SHgt_cm' : 'SHgt_cm_9'})
gen1_age_12 = gen1_age_12[train_gen1_cols].rename(columns={'SHgt_cm' : 'SHgt_cm_12'})
gen1_age_15 = gen1_age_15[train_gen1_cols].rename(columns={'SHgt_cm' : 'SHgt_cm_15'})

# merging all 3 dataframes
temp_merged = pd.merge(gen1_age_9, gen1_age_12, on='gen1_id')
gen1_growth_spurt = pd.merge(temp_merged, gen1_age_15, on='gen1_id')

# calculating the parent's growth rate (ages 9-12 and 12-15)
gen1_growth_spurt['growth_rate_9_12'] = (gen1_growth_spurt['SHgt_cm_12'] - gen1_growth_spurt['SHgt_cm_9']) / (12 - 9)
gen1_growth_spurt['growth_rate_12_15'] = (gen1_growth_spurt['SHgt_cm_15'] - gen1_growth_spurt['SHgt_cm_12']) / (15 - 12)

# moving 'growth_rate_9_12' for readability

# moving_first_growth_rate = gen1_growth_spurt.pop('growth_rate_9_12')
# gen1_growth_spurt.insert(5, 'growth_rate_9_12', moving_first_growth_rate)

# # adding parent's final adult height:
gen1_age_20 = train_gen1[train_gen1['age'] == 20][['gen1_id', 'SHgt_cm']]
gen1_age_20 = gen1_age_20[['gen1_id', 'SHgt_cm']].rename(columns={'SHgt_cm' : 'SHgt_cm_20'})
gen1_growth = pd.merge(gen1_growth_spurt, gen1_age_20, on='gen1_id')

# moving 'SHgt_cm_20'
col_to_move = gen1_growth.pop('SHgt_cm_20')
gen1_growth.insert(5, 'SHgt_cm_20', col_to_move)
gen1_growth.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth,SHgt_cm_9,SHgt_cm_12,SHgt_cm_15,SHgt_cm_20,growth_rate_9_12,growth_rate_12_15
0,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866
1,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481
2,768,F,148.220095,163.692965,174.375385,175.410151,5.157623,3.560807
3,764,F,132.564578,147.255938,162.684385,167.446567,4.89712,5.142816
4,753,F,139.175094,157.206488,171.691764,172.525586,6.010465,4.828425


Concatenating the gen_1 and gen_2 growth spurt columns:

In [24]:
gen1_growth.shape

(101, 8)

In [25]:
gen2_growth_spurt.shape

(192, 7)

In [26]:
gen2_growth_spurt.isna().sum()

gen2_id                  0
sex_assigned_at_birth    0
study_parent_sex         0
gen1_id                  0
SHgt_cm_9                0
SHgt_cm_15               0
gen_2_height_diff        0
dtype: int64

* We need to perform further processing on gen2_growth_spurt since there may be children that appear in this dataset whose parent ids do not appear in the gen1_growth dataframe:

In [27]:
gen1_ids = list(gen1_growth['gen1_id'].unique())
gen2_parent_ids = list(gen2_growth_spurt['gen1_id'].unique())

for i in gen2_parent_ids:
    if i not in gen1_ids:
        gen2_growth_spurt = gen2_growth_spurt.loc[gen2_growth_spurt['gen1_id'] != i]

gen2_growth_spurt.shape

(151, 7)

In [28]:
# concatenating gen1_growth and gen2_growth_spurt 

growth_spurt_df = gen1_growth.merge(gen2_growth_spurt, on='gen1_id', how='inner')

growth_spurt_df = growth_spurt_df.drop(['study_parent_sex', 'gen2_id'], axis=1)

growth_spurt_df.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth_x,SHgt_cm_9_x,SHgt_cm_12,SHgt_cm_15_x,SHgt_cm_20,growth_rate_9_12,growth_rate_12_15,sex_assigned_at_birth_y,SHgt_cm_9_y,SHgt_cm_15_y,gen_2_height_diff
0,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,M,140.200001,175.987663,35.787661
1,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,M,142.552073,175.519183,32.96711
2,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,134.59377,165.546779,30.953009
3,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,138.972537,171.72764,32.755102
4,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,134.192305,171.857849,37.665544


We processed growth_spurt_df even further by renaming columns for additional readability and encoding the assigned sexes columns

In [29]:
# renaming columns
growth_spurt_cols = list(growth_spurt_df.columns)
renamed_cols = {'sex_assigned_at_birth_x' : 'sex_assigned_at_birth_1', 'SHgt_cm_9_x' : 'SHgt_cm_9_1', 'SHgt_cm_15_x' : 'SHgt_cm_15_1', 'sex_assigned_at_birth_y' : 'sex_assigned_at_birth_2', 'SHgt_cm_9_y' : 'SHgt_cm_9_2', 'SHgt_cm_15_y' : 'SHgt_cm_15_2'}

growth_spurt_df = growth_spurt_df[growth_spurt_cols].rename(columns=renamed_cols)

growth_spurt_df.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth_1,SHgt_cm_9_1,SHgt_cm_12,SHgt_cm_15_1,SHgt_cm_20,growth_rate_9_12,growth_rate_12_15,sex_assigned_at_birth_2,SHgt_cm_9_2,SHgt_cm_15_2,gen_2_height_diff
0,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,M,140.200001,175.987663,35.787661
1,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,M,142.552073,175.519183,32.96711
2,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,134.59377,165.546779,30.953009
3,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,138.972537,171.72764,32.755102
4,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,134.192305,171.857849,37.665544


In [30]:
# encoding the assigned sexes column using one-hot encoding

growth_spurt_final_df = growth_spurt_df.copy()
growth_spurt_final_df = pd.get_dummies(growth_spurt_final_df, columns=['sex_assigned_at_birth_1', 'sex_assigned_at_birth_2'], drop_first=True)

growth_spurt_final_df.head()

Unnamed: 0,gen1_id,SHgt_cm_9_1,SHgt_cm_12,SHgt_cm_15_1,SHgt_cm_20,growth_rate_9_12,growth_rate_12_15,SHgt_cm_9_2,SHgt_cm_15_2,gen_2_height_diff,sex_assigned_at_birth_1_M,sex_assigned_at_birth_2_M
0,774,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,140.200001,175.987663,35.787661,False,True
1,774,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,142.552073,175.519183,32.96711,False,True
2,771,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,134.59377,165.546779,30.953009,False,True
3,771,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,138.972537,171.72764,32.755102,False,True
4,771,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,134.192305,171.857849,37.665544,False,True


**Multiple Linear Regression:**

Model preparation and splitting the data:

In [31]:
X = growth_spurt_final_df.drop(['gen1_id', 'SHgt_cm_9_1', 'SHgt_cm_12', 'SHgt_cm_15_1', 'SHgt_cm_20', 'SHgt_cm_9_2', 'SHgt_cm_15_2', 'gen_2_height_diff', 'sex_assigned_at_birth_2_M'], axis=1)
y = growth_spurt_final_df['gen_2_height_diff'] # target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) # creating an 80:20 train-test ration

Training the model:

In [32]:
lr = LinearRegression()
lr.fit(X_train, y_train)

Model evaluation:

In [33]:
y_pred = lr.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = lr.score(X_test, y_test)

print(f"RMSE: {round(rmse, 3)}")
print(f"R-squared: {round(r2, 3)}")

RMSE: 8.794
R-squared: 0.036


Interpreting our coefficients:

In [34]:
coefficients_b = pd.DataFrame({'feature': X.columns, 'Coefficient': lr.coef_})
print(coefficients_b)

                     feature  Coefficient
0           growth_rate_9_12    -1.751403
1          growth_rate_12_15     0.743782
2  sex_assigned_at_birth_1_M    -2.133467


### d. For questions b. and c., does the strength of heredity change depending on the relationship between parent/child sexes assigned at birth? (E.g. parent/child combinations of matched sexes – M/M, F/F, or opposite sexes – M/F, F/M.) If yes, how do these associations change in each case? ###

**Splitting data into various parent/child combinations:**

In [35]:
growth_spurt_df.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth_1,SHgt_cm_9_1,SHgt_cm_12,SHgt_cm_15_1,SHgt_cm_20,growth_rate_9_12,growth_rate_12_15,sex_assigned_at_birth_2,SHgt_cm_9_2,SHgt_cm_15_2,gen_2_height_diff
0,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,M,140.200001,175.987663,35.787661
1,774,F,145.226308,162.215606,182.591204,184.787392,5.663099,6.791866,M,142.552073,175.519183,32.96711
2,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,134.59377,165.546779,30.953009
3,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,138.972537,171.72764,32.755102
4,771,F,135.672697,152.436453,162.046895,162.602366,5.587919,3.203481,M,134.192305,171.857849,37.665544


In [36]:
mom_son = growth_spurt_df[(growth_spurt_df["sex_assigned_at_birth_1"] == 'F') & (growth_spurt_df["sex_assigned_at_birth_2"] == 'M')]
mom_daughter = growth_spurt_df[(growth_spurt_df["sex_assigned_at_birth_1"] == 'F') & (growth_spurt_df["sex_assigned_at_birth_2"] == 'F')]
dad_daughter = growth_spurt_df[(growth_spurt_df["sex_assigned_at_birth_1"] == 'M') & (growth_spurt_df["sex_assigned_at_birth_2"] == 'F')]
dad_son = growth_spurt_df[(growth_spurt_df["sex_assigned_at_birth_1"] == 'M') & (growth_spurt_df["sex_assigned_at_birth_2"] == 'M')]

In [37]:
mom_son.columns

Index(['gen1_id', 'sex_assigned_at_birth_1', 'SHgt_cm_9_1', 'SHgt_cm_12',
       'SHgt_cm_15_1', 'SHgt_cm_20', 'growth_rate_9_12', 'growth_rate_12_15',
       'sex_assigned_at_birth_2', 'SHgt_cm_9_2', 'SHgt_cm_15_2',
       'gen_2_height_diff'],
      dtype='object')

Applying one-hot encoding to our new dataframes:

In [38]:
def binary_encoding(df):
    new_df = df.copy()
    encoded_df = pd.get_dummies(new_df, columns=['sex_assigned_at_birth_1', 'sex_assigned_at_birth_2'])
    return encoded_df

In [39]:
mom_son_final = binary_encoding(mom_son)
mom_daughter_final = binary_encoding(mom_daughter)
dad_daughter_final = binary_encoding(dad_daughter)
dad_son_final = binary_encoding(dad_son)

Linear regression:

In [40]:
def lin_reg(df):
    X = df.drop(['gen1_id', 'SHgt_cm_9_1', 'SHgt_cm_12', 'SHgt_cm_15_1', 'SHgt_cm_20', 'SHgt_cm_9_2', 'SHgt_cm_15_2', 'gen_2_height_diff'], axis=1)
    y = df['gen_2_height_diff'] # target

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    lr = LinearRegression()
    lr.fit(X_train, y_train)

    y_pred = lr.predict(X_test)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = lr.score(X_test, y_test)

    print(f"RMSE: {round(rmse, 3)}")
    print(f"R-squared: {round(r2, 3)}")    
    print()

    coefficients_d = pd.DataFrame({'feature': X.columns, 'Coefficient': lr.coef_})
    print(coefficients_d)    

In [41]:
print(f"Mother -> Son Results:")
print()
mom_son_lin_reg = lin_reg(mom_son_final)
mom_son_lin_reg

Mother -> Son Results:

RMSE: 13.588
R-squared: -13.26

                     feature  Coefficient
0           growth_rate_9_12    -7.241511
1          growth_rate_12_15     0.228800
2  sex_assigned_at_birth_1_F     0.000000
3  sex_assigned_at_birth_2_M     0.000000


In [42]:
print(f"Mother -> Daughter Results:")
print()
mom_daughter_lin_reg = lin_reg(mom_daughter_final)
mom_daughter_lin_reg

Mother -> Daughter Results:

RMSE: 10.783
R-squared: -0.067

                     feature  Coefficient
0           growth_rate_9_12     0.174657
1          growth_rate_12_15    -0.969565
2  sex_assigned_at_birth_1_F     0.000000
3  sex_assigned_at_birth_2_F     0.000000


In [43]:
print(f"Father -> Daughter Results:")
print()
dad_daughter_lin_reg = lin_reg(dad_daughter_final)
dad_daughter_lin_reg

Father -> Daughter Results:

RMSE: 4.579
R-squared: -0.97

                     feature  Coefficient
0           growth_rate_9_12    -1.484265
1          growth_rate_12_15     1.777718
2  sex_assigned_at_birth_1_M     0.000000
3  sex_assigned_at_birth_2_F     0.000000


In [44]:
print(f"Father -> Son Results:")
print()
dad_son_lin_reg = lin_reg(dad_son_final)
dad_son_lin_reg

Father -> Son Results:

RMSE: 18.131
R-squared: -0.187

                     feature  Coefficient
0           growth_rate_9_12    -6.629484
1          growth_rate_12_15     3.758739
2  sex_assigned_at_birth_1_M     0.000000
3  sex_assigned_at_birth_2_M     0.000000


Both the mother/son and the father/son relationships show that the growth rate of the parents between the ages of 9 and 12 has strong negative impact on the growth magnitude of their son's growth spurts. However, the mother/daughter and father/daughter relationships doesn't impact the growth magnitude for gen 2. 