In [644]:
import pandas as pd
import matplotlib.pyplot as plt
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression

In [645]:
gen1_train = pd.read_csv("gen1_train_comp_final.csv")
gen2_train = pd.read_csv("gen2_train_comp_final.csv")
gen1_test = pd.read_csv("gen1_test_comp_final.csv")
gen2_test = pd.read_csv("gen2_test_upto9_comp_final.csv")

In [646]:
gen1_train.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth,age,SHgt_cm
0,774,F,0.1,56.961812
1,774,F,0.25,64.82619
2,774,F,0.5,74.340764
3,774,F,0.75,79.747338
4,774,F,1.0,84.092569


In [647]:
gen1_test.head()

Unnamed: 0,gen1_id,sex_assigned_at_birth,age,SHgt_cm
0,768,F,0.1,53.822825
1,768,F,0.25,61.455579
2,768,F,0.5,69.757527
3,768,F,0.75,73.385477
4,768,F,1.0,78.129137


In [648]:
gen2_train.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,3012,M,mother,636,0.1,56.251625,4.636903
1,3012,M,mother,636,0.25,64.491579,
2,3012,M,mother,636,0.5,70.465927,
3,3012,M,mother,636,0.75,73.992677,
4,3012,M,mother,636,1.0,79.343537,


In [649]:
gen2_test.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm,Wgt_kg
0,2831,F,mother,455,0.1,52.912025,
1,2831,F,mother,455,0.25,59.532779,
2,2831,F,mother,455,0.5,67.733527,
3,2831,F,mother,455,0.75,70.450677,
4,2831,F,mother,455,1.0,74.991937,


In [650]:
gen2_test.isna().sum()

gen2_id                    0
sex_assigned_at_birth      0
study_parent_sex           0
study_parent_id_new        0
AgeGr                      0
SHgt_cm                  132
Wgt_kg                   823
dtype: int64

In [651]:
# Imputing height and weight with linear interpolation
gen1_train['SHgt_cm'] = gen1_train.groupby('gen1_id')['SHgt_cm'].transform(lambda x: x.interpolate(method='linear'))
gen1_test['SHgt_cm'] = gen1_test.groupby('gen1_id')['SHgt_cm'].transform(lambda x: x.interpolate(method='linear'))

gen2_train['SHgt_cm'] = gen2_train.groupby('gen2_id')['SHgt_cm'].transform(lambda x: x.interpolate(method='linear'))
gen2_test['SHgt_cm'] = gen2_test.groupby('gen2_id')['SHgt_cm'].transform(lambda x: x.interpolate(method='linear'))

gen2_train['Wgt_kg'] = gen2_train.groupby('gen2_id')['Wgt_kg'].transform(lambda x: x.interpolate(method='linear'))
gen2_test['Wgt_kg'] = gen2_test.groupby('gen2_id')['Wgt_kg'].transform(lambda x: x.interpolate(method='linear'))

In [652]:
# Imputing height  and weight with mean of same sex and age for remaining missing values
gen1_train['SHgt_cm'] = gen1_train.groupby(['sex_assigned_at_birth', 'age'])['SHgt_cm'].transform(lambda x: x.fillna(x.mean()))
gen1_test['SHgt_cm'] = gen1_test.groupby(['sex_assigned_at_birth', 'age'])['SHgt_cm'].transform(lambda x: x.fillna(x.mean()))

gen2_train['SHgt_cm'] = gen2_train.groupby(['sex_assigned_at_birth', 'AgeGr'])['SHgt_cm'].transform(lambda x: x.fillna(x.mean()))
gen2_test['SHgt_cm'] = gen2_test.groupby(['sex_assigned_at_birth', 'AgeGr'])['SHgt_cm'].transform(lambda x: x.fillna(x.mean()))

gen2_train['Wgt_kg'] = gen2_train.groupby(['sex_assigned_at_birth', 'AgeGr'])['Wgt_kg'].transform(lambda x: x.fillna(x.mean()))
gen2_test['Wgt_kg'] = gen2_test.groupby(['sex_assigned_at_birth', 'AgeGr'])['Wgt_kg'].transform(lambda x: x.fillna(x.mean()))


In [653]:
gen2_train.isna().sum()

gen2_id                    0
sex_assigned_at_birth      0
study_parent_sex           0
study_parent_id_new        0
AgeGr                      0
SHgt_cm                    0
Wgt_kg                   864
dtype: int64

In [654]:
# Merge parent and children training sets (dropping weight for now)

merged_train = gen2_train.merge(gen1_train, how='left', left_on=['study_parent_id_new','AgeGr'], right_on=['gen1_id', 'age'])

# merged_train = merged_train.drop(['study_parent_sex', 'study_parent_id_new', 'Wgt_kg'], axis=1)

# merged_train = merged_train.rename({'gen1_id':'parent_id', 'sex_assigned_at_birth_x':'parent_sex', 
#                      'age':'parent_age', 'SHgt_cm_x':'parent_height', 'gen2_id':'child_id',
#                      'sex_assigned_at_birth_y':'child_sex', 'AgeGr':'child_age', 
#                      'SHgt_cm_y':'child_height'}, axis=1)

merged_train.head(5)

Unnamed: 0,parent_id,parent_sex,parent_age,parent_height,child_id,child_sex,child_age,child_height
0,774,F,0.1,56.961812,2774,M,0.1,60.676532
1,774,F,0.1,56.961812,2692,M,0.1,58.068694
2,774,F,0.25,64.82619,2774,M,0.25,65.592071
3,774,F,0.25,64.82619,2692,M,0.25,64.25622
4,774,F,0.5,74.340764,2774,M,0.5,72.894888


In [655]:
# Merge parent and children test sets (dropping weight for now)

merged_test = gen2_test.merge(gen1_test, how='left', left_on=['study_parent_id_new','AgeGr'], right_on=['gen1_id', 'age'])

# merged_test = merged_test.drop(['study_parent_sex', 'study_parent_id_new', 'Wgt_kg'], axis=1)

# merged_test = merged_test.rename({'gen1_id':'parent_id', 'sex_assigned_at_birth_x':'parent_sex', 
#                      'age':'parent_age', 'SHgt_cm_x':'parent_height', 'gen2_id':'child_id',
#                      'sex_assigned_at_birth_y':'child_sex', 'AgeGr':'child_age', 
#                      'SHgt_cm_y':'child_height'}, axis=1)

merged_test.head()

Unnamed: 0,gen2_id,sex_assigned_at_birth_x,study_parent_sex,study_parent_id_new,AgeGr,SHgt_cm_x,Wgt_kg,gen1_id,sex_assigned_at_birth_y,age,SHgt_cm_y
0,2831,F,mother,455,0.1,52.912025,,,,,
1,2831,F,mother,455,0.25,59.532779,,,,,
2,2831,F,mother,455,0.5,67.733527,,,,,
3,2831,F,mother,455,0.75,70.450677,,,,,
4,2831,F,mother,455,1.0,74.991937,,,,,


In [656]:
le = LabelEncoder()
columns_to_encode = ['parent_sex', 'child_sex']

for col in columns_to_encode:
    merged_train[col] = le.fit_transform(merged_train[col])
    merged_test[col] = le.fit_transform(merged_test[col])

KeyError: 'parent_sex'

In [536]:
X_train = (
    merged_train[merged_train["child_age"] <= 9]
    .pivot(index="child_id", columns="child_age", values="child_height")
)

# Rename columns to reflect age-based features
X_train.columns = [f"height_at_age_{age}" for age in X_train.columns]

# Add parent-related features (aggregated)
parent_features = merged_train.groupby("child_id").agg(
    parent_max_height=("parent_height", "max"),
    parent_min_height=("parent_height", "min"),
    parent_mean_height=("parent_height", "mean"),
    parent_growth_slope=("parent_height", lambda x: (x.iloc[-1] - x.iloc[0]) / (x.index[-1] - x.index[0] + 1))
)
X_train = X_train.merge(parent_features, left_index=True, right_index=True)


In [537]:
y_train = (
    merged_train[merged_train["child_age"] > 9]
    .pivot(index="child_id", columns="child_age", values="child_height")
)

# Rename target columns
y_train.columns = [f"target_height_at_age_{int(age)}" for age in y_train.columns]


In [538]:
X_train.head()

Unnamed: 0_level_0,height_at_age_0.1,height_at_age_0.25,height_at_age_0.5,height_at_age_0.75,height_at_age_1.0,height_at_age_1.5,height_at_age_2.0,height_at_age_3.0,height_at_age_4.0,height_at_age_5.0,height_at_age_6.0,height_at_age_7.0,height_at_age_8.0,height_at_age_9.0,parent_max_height,parent_min_height,parent_mean_height,parent_growth_slope
child_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2509,58.340287,64.892308,69.87229,74.645393,77.889181,85.099498,90.794792,100.800558,109.603019,114.060892,122.412227,129.654912,135.335036,142.289936,187.320811,55.826458,123.964153,5.977016
2510,60.23063,66.481798,72.892351,75.839962,79.379582,85.720324,94.149611,102.995835,111.842058,117.804483,124.126552,132.424397,138.862341,145.5689,194.970999,57.149109,129.424931,2.153467
2513,52.931457,60.6703,67.65772,72.429313,77.864908,84.589526,88.423925,96.628471,103.67685,108.88313,116.009475,119.823838,124.445479,129.62211,182.991174,53.93581,120.347412,5.866153
2514,59.747495,67.777098,73.293031,78.117079,81.205328,89.44596,96.569437,105.724538,114.28734,120.079372,125.193908,134.636486,139.077291,144.910506,188.761179,56.888647,122.74048,5.994206
2515,57.874021,66.271674,71.975038,74.358996,81.091844,87.847264,97.178483,106.509702,113.283472,119.793981,113.630269,132.768035,138.650925,144.985123,189.772037,58.514945,125.008654,5.966231


In [539]:
y_train.head()

Unnamed: 0_level_0,target_height_at_age_10,target_height_at_age_11,target_height_at_age_12,target_height_at_age_13,target_height_at_age_14,target_height_at_age_15,target_height_at_age_16,target_height_at_age_18
child_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2509,147.28922,155.924073,164.750772,170.20808,170.20808,170.20808,170.20808,170.20808
2510,151.265047,155.998662,161.689019,167.173962,176.680109,183.780534,183.780534,183.780534
2513,137.312339,143.764537,147.231769,152.734873,161.902362,167.298785,167.298785,167.298785
2514,147.781505,154.520668,157.834,162.072648,168.498437,181.159295,181.159295,181.159295
2515,150.670882,157.367574,163.94036,168.924856,171.338386,173.033062,173.033062,173.033062


In [540]:
X_test = (
    merged_test.pivot(index="child_id", columns="child_age", values="child_height")
)
X_test.columns = [f"height_at_age_{age}" for age in X_test.columns]

parent_features = merged_test.groupby("child_id").agg(
    parent_max_height=("parent_height", "max"),
    parent_min_height=("parent_height", "min"),
    parent_mean_height=("parent_height", "mean"),
    parent_growth_slope=("parent_height", lambda x: (x.iloc[-1] - x.iloc[0]) / (x.index[-1] - x.index[0] + 1))
)

X_test = X_test.merge(parent_features, left_index=True, right_index=True)

In [541]:
X_test.head()

Unnamed: 0_level_0,height_at_age_0.1,height_at_age_0.25,height_at_age_0.5,height_at_age_0.75,height_at_age_1.0,height_at_age_1.5,height_at_age_2.0,height_at_age_3.0,height_at_age_4.0,height_at_age_5.0,height_at_age_6.0,height_at_age_7.0,height_at_age_8.0,height_at_age_9.0,parent_max_height,parent_min_height,parent_mean_height,parent_growth_slope
child_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
2332,,,,,,,,,,,,,,,127.085434,55.389265,89.637828,5.121155
2503,,,,,,,,,,,,,,,135.072529,54.471286,96.420927,5.757232
2504,55.40568,62.317808,69.192296,72.323885,75.885971,81.266292,86.87537,93.737598,102.832376,108.878133,113.749173,119.769676,125.057073,130.702941,129.174593,54.471286,91.648391,5.33595
2506,58.508684,64.940758,73.205793,79.522286,84.027084,88.531882,93.923465,103.697442,110.711241,118.652808,124.819605,132.336902,136.977551,142.997282,139.459044,57.50581,98.171809,5.853802
2508,,,,,,,,,,,,,,,142.476229,54.471286,97.529157,6.286067


In [542]:
models = {}
for age in [10, 11, 12, 13, 14, 15, 16, 18]:
    target_col = f"target_height_at_age_{age}"
    model = XGBRegressor(objective="reg:squarederror", n_estimators=500, learning_rate=0.05)
    model.fit(X_train, y_train[target_col])
    models[age] = model

In [543]:
predictions = {}
for age, model in models.items():
    predictions[f"predicted_height_at_age_{age}"] = model.predict(X_test)

# Convert predictions to a DataFrame
pred_df = pd.DataFrame(predictions, index=X_test.index).reset_index()

In [544]:
pred_df

Unnamed: 0,child_id,predicted_height_at_age_10,predicted_height_at_age_11,predicted_height_at_age_12,predicted_height_at_age_13,predicted_height_at_age_14,predicted_height_at_age_15,predicted_height_at_age_16,predicted_height_at_age_18
0,2332,154.495911,161.138840,162.855652,160.820007,160.335785,160.838470,162.609299,165.177277
1,2503,154.491608,161.096512,162.569031,160.145065,160.348862,161.126617,162.705704,165.752228
2,2504,135.234406,140.406876,146.358276,153.013992,159.297104,163.557693,166.612854,168.638229
3,2506,148.279434,154.494324,159.147461,164.586288,169.409927,172.248322,177.779633,176.509308
4,2508,154.552826,160.931137,161.969879,159.940231,160.358170,161.229553,162.707047,166.494263
...,...,...,...,...,...,...,...,...,...
83,2820,133.935501,140.436218,146.686691,151.142883,159.780518,164.481949,168.789719,171.324341
84,2821,135.701569,142.864334,148.396774,153.551819,159.766830,164.339127,166.978714,165.471451
85,2822,154.538116,161.278336,162.876831,160.833649,160.352127,160.886322,163.001389,165.752106
86,2823,134.834793,140.613998,145.950775,152.626083,158.500259,162.936050,168.467896,168.754868
