In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree

In [29]:
# Read the first dataset plus the table stored in Best_Parameters_DataSet1.csv;
# missing cells in the latter are replaced with empty strings.
df1 = pd.read_csv("DataSet1.csv")
best_params = pd.read_csv("Best_Parameters_DataSet1.csv").fillna("")

# The three columns predicted jointly by the models below.
targets = ["SHLT", "MSTOT", "COGTOT"]

# Every column that is neither the group label nor one of the targets is
# treated as a feature (Index.difference returns them sorted).
features = df1.columns.difference(["group"] + targets)

# Distinct group labels, in order of first appearance in df1.
groups = df1["group"].unique()

In [30]:
# Groups whose trees are fitted on DataSet1 (df1); the DataSet2 cell skips
# exactly these labels.
dataset1_groups = ["0,0,1,1,1", "0,1,1,1,1"]

for group in groups:
    if group not in dataset1_groups:
        continue

    # One boolean mask with .loc replaces the chained df1[cols][mask] lookup.
    in_group = df1['group'] == group
    X = df1.loc[in_group, features]
    y = df1.loc[in_group, targets]

    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Visualize the first tree in the forest (only the top 5 levels are drawn).
    # An explicit figure/axes pair avoids relying on pyplot's "current figure",
    # and closing it each iteration keeps memory flat for these very large canvases.
    fig, ax = plt.subplots(figsize=(100, 30))
    plot_tree(
        rf.estimators_[0],
        # Passed as a plain list: some scikit-learn releases validate
        # feature_names strictly and reject a pandas Index.
        feature_names=list(features),
        max_depth=5,
        label='root',
        fontsize=20,
        filled=True,
        rounded=True,
        proportion=True,
        ax=ax,
    )
    fig.savefig(f"Tree_splition/Tree_{group}.png")
    plt.close(fig)


In [31]:
df2 = pd.read_csv("DataSet2.csv")
best_params = pd.read_csv('Best_Parameters_DataSet2.csv').fillna("")

# Groups already modelled on DataSet1; they are skipped here.
dataset1_groups = ["0,0,1,1,1", "0,1,1,1,1"]
# Groups whose forest is depth-limited; every other group grows unrestricted trees.
depth_limited_groups = ["1,0,0,0,0"]

# Derive feature columns and group labels from df2 itself. The previous version
# loaded df2 but then selected rows and groups from df1, so the "DataSet2" trees
# were actually fitted on DataSet1.
# NOTE(review): assumes df2 has the same 'group' and target columns as df1 - confirm.
features_2 = df2.columns.difference(['group', *targets])
groups_2 = df2['group'].unique()

for group in groups_2:
    if group in dataset1_groups:
        continue

    # One boolean mask with .loc replaces the chained df[cols][mask] lookup.
    in_group = df2['group'] == group
    X = df2.loc[in_group, features_2]
    y = df2.loc[in_group, targets]

    depth = 20 if group in depth_limited_groups else None

    rf = RandomForestRegressor(n_estimators=300, max_depth=depth, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Visualize the first tree in the forest (only the top 5 levels are drawn).
    # An explicit figure/axes pair avoids relying on pyplot's "current figure",
    # and closing it each iteration keeps memory flat for these very large canvases.
    fig, ax = plt.subplots(figsize=(100, 30))
    plot_tree(
        rf.estimators_[0],
        # Passed as a plain list: some scikit-learn releases validate
        # feature_names strictly and reject a pandas Index.
        feature_names=list(features_2),
        max_depth=5,
        label='root',
        fontsize=20,
        filled=True,
        rounded=True,
        proportion=True,
        ax=ax,
    )
    fig.savefig(f"Tree_splition/Tree_{group}.png")
    plt.close(fig)


In [2]:
# Groups whose importances come from the Dataset1 run; all remaining groups
# are taken from the Dataset2 run.
dataset1_groups = ["0,0,1,1,1", "0,1,1,1,1"]

# NOTE(review): the displayed result carries an "Unnamed: 0" column, which suggests
# these CSVs were written with their index; consider index_col=0 if that column is
# not wanted downstream.
feature_importance_1 = pd.read_csv("output_feature_importances_all_targets_Dataset1.csv")
feature_importance_1 = feature_importance_1[feature_importance_1["group"].isin(dataset1_groups)]

feature_importance_2 = pd.read_csv("output_feature_importances_all_targets_Dataset2.csv")
# `~` negates the boolean mask; comparing a Series with `== False` is non-idiomatic.
feature_importance_2 = feature_importance_2[~feature_importance_2["group"].isin(dataset1_groups)]

# Stack the Dataset1 rows on top of the Dataset2 rows and persist without the index.
feature_importance_combained = pd.concat([feature_importance_1, feature_importance_2])
feature_importance_combained.to_csv("output_feature_importances_combained.csv", index=False)

In [3]:
# Show the combined per-group feature-importance table built in the previous cell.
feature_importance_combained

Unnamed: 0,group,targets,BMI,HAIRA,HATOTB,HCHILD,HHHRES,HITOT,IEARN,INHPFN,LIVSIB,PRPCNT
5,00111,"SHLT, MSTOT, COGTOT",0.206846,0.102273,0.211099,0.089465,0.027824,0.197395,0.041733,0.009942,0.085224,0.0282
6,01111,"SHLT, MSTOT, COGTOT",0.205366,0.165408,0.164281,0.086882,0.018887,0.190729,0.041454,0.011675,0.090251,0.025067
0,00000,"SHLT, MSTOT, COGTOT",0.186807,0.063827,0.18522,0.068439,0.048396,0.197449,0.139575,0.004504,0.080669,0.025113
1,00001,"SHLT, MSTOT, COGTOT",0.206882,0.103278,0.204477,0.07382,0.045222,0.179372,0.06601,0.001703,0.094715,0.024522
2,00100,"SHLT, MSTOT, COGTOT",0.177118,0.080345,0.194356,0.078711,0.0331,0.186548,0.132384,0.004482,0.081249,0.031707
3,"0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0","SHLT, MSTOT, COGTOT",0.182868,0.100472,0.204485,0.07752,0.025929,0.192284,0.102273,0.002776,0.084104,0.027289
4,00110,"SHLT, MSTOT, COGTOT",0.165806,0.093817,0.215859,0.081915,0.020001,0.181478,0.122888,0.005638,0.085034,0.027564
7,10000,"SHLT, MSTOT, COGTOT",0.190242,0.00816,0.187186,0.123934,0.07556,0.154372,0.08602,0.001495,0.146708,0.026324
