Instead of repeating the process again, we will use the `eid` in `eligible_data_imputed_missForest.csv` to merge the imputed data with the original data for the split.

In [28]:
import pandas as pd
from tqdm import tqdm
from IPython.display import display

total_data_unimputed = pd.read_csv("../impute_data/eligible_data_imputed_missForest.csv")
train_data_unimputed = pd.read_csv("train_data_unimputed.csv")
test_data_unimputed = pd.read_csv("test_data_unimputed.csv")
print(total_data_unimputed.shape, train_data_unimputed.shape, test_data_unimputed.shape)

(35159, 15) (28127, 101) (7032, 101)


In [8]:
total_data_imputed = pd.read_csv("../impute_data/eligible_data_imputed_missForest.csv")
print(total_data_imputed.shape)
total_data_imputed.head()

(35159, 15)


Unnamed: 0,eid,age,sex,ethnicity,BMI,smoking,diabetes,systolic_bp,hypertension_treatment,total_chol,hdl_chol,education,activity,max_workload,max_heart_rate
0,1000205,40,1,1,21.5595,0,0,149.0,0,4.569,1.228,2,0,130,139.0
1,1000239,65,0,1,22.9214,1,0,137.0,0,5.78,2.221,1,1,60,126.0
2,1000677,42,0,1,37.892,2,0,124.0,0,5.874,1.323,3,1,80,109.0
3,1000737,52,1,1,22.8374,0,0,148.0,0,4.429,1.361138,4,2,110,112.0
4,1000779,56,1,1,25.0194,0,0,144.0,0,6.258,1.406,3,2,110,112.0


**Instead of merging, we will create a dictionary for direct imputation**. This will be much faster than merging.

In [11]:
imputed_dict = {col: dict(zip(total_data_imputed['eid'], total_data_imputed[col])) 
                for col in total_data_imputed.columns if col != 'eid'}
imputed_dict.keys()

dict_keys(['age', 'sex', 'ethnicity', 'BMI', 'smoking', 'diabetes', 'systolic_bp', 'hypertension_treatment', 'total_chol', 'hdl_chol', 'education', 'activity', 'max_workload', 'max_heart_rate'])

In [12]:
# Here is an example of what the dictionary looks like:
for col in imputed_dict.keys():
    print(col)
    print(imputed_dict[col])
    break

age
{1000205: 40, 1000239: 65, 1000677: 42, 1000737: 52, 1000779: 56, 1000928: 63, 1001125: 64, 1001233: 68, 1001319: 62, 1001324: 48, 1001346: 63, 1001978: 52, 1002007: 54, 1002252: 65, 1002485: 66, 1002908: 41, 1002950: 42, 1003094: 45, 1003262: 62, 1003448: 40, 1003534: 67, 1003555: 40, 1003590: 68, 1003727: 53, 1003792: 48, 1003821: 55, 1003952: 42, 1004090: 64, 1004371: 53, 1004429: 58, 1004447: 51, 1004709: 68, 1004744: 66, 1004869: 62, 1004902: 55, 1005437: 63, 1005625: 62, 1005769: 51, 1005916: 41, 1005957: 66, 1006097: 62, 1006256: 63, 1006347: 65, 1006465: 50, 1006564: 65, 1006634: 60, 1006667: 62, 1006787: 53, 1007251: 52, 1007264: 61, 1007322: 61, 1007426: 42, 1007563: 61, 1007633: 66, 1007682: 60, 1007948: 56, 1008258: 51, 1008656: 45, 1008754: 44, 1008867: 66, 1008923: 49, 1008985: 59, 1009018: 61, 1009419: 54, 1009936: 46, 1009945: 64, 1010220: 62, 1010251: 47, 1010264: 64, 1010866: 54, 1010884: 58, 1010925: 66, 1011145: 49, 1011407: 44, 1011688: 57, 1011705: 57, 1011753

In [14]:
train_data_imputed = train_data_unimputed.copy()
test_data_imputed = test_data_unimputed.copy()

for col in tqdm(train_data_unimputed.columns):
    if col != 'eid' and col in imputed_dict:
        # * map() will read the dictionary and retrieve the corresponding value based on eid!
        train_data_imputed[col] = train_data_unimputed['eid'].map(imputed_dict[col])

for col in tqdm(test_data_unimputed.columns):
    if col != 'eid' and col in imputed_dict:
        test_data_imputed[col] = test_data_unimputed['eid'].map(imputed_dict[col])

100%|██████████| 101/101 [00:00<00:00, 986.97it/s]
100%|██████████| 101/101 [00:00<00:00, 1036.98it/s]


In [31]:
train_data_imputed["total_chol"] = round(train_data_imputed["total_chol"], 3)
train_data_imputed["hdl_chol"] = round(train_data_imputed["hdl_chol"], 3)

test_data_imputed["total_chol"] = round(test_data_imputed["total_chol"], 3)
test_data_imputed["hdl_chol"] = round(test_data_imputed["hdl_chol"], 3)

In [33]:
missing_columns = ["total_chol", "hdl_chol", "activity"]  # these columns are missing
compare_df = pd.concat([test_data_unimputed[missing_columns], test_data_imputed[missing_columns]], axis=1).head(100)

pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', None)

display(compare_df.head(20))

Unnamed: 0,total_chol,hdl_chol,activity,total_chol.1,hdl_chol.1,activity.1
0,,,0.0,6.469,1.619,0
1,6.646,1.221,1.0,6.646,1.221,1
2,6.292,1.225,1.0,6.292,1.225,1
3,5.436,1.186,1.0,5.436,1.186,1
4,5.162,0.902,2.0,5.162,0.902,2
5,6.365,1.618,0.0,6.365,1.618,0
6,6.896,1.339,0.0,6.896,1.339,0
7,4.977,1.354,2.0,4.977,1.354,2
8,5.531,1.105,1.0,5.531,1.105,1
9,,,1.0,5.913,1.29,1


Note the `Unnamed: 0` is created by `sklearn` when we split the data.

In [34]:
# save to csv
train_data_imputed.to_csv("train_data_imputed.csv", index=False)
test_data_imputed.to_csv("test_data_imputed.csv", index=False)