In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree

In [29]:
# Load dataset 1 together with its parameter table (blank strings replace NaN).
df1 = pd.read_csv("DataSet1.csv")
best_params = pd.read_csv('Best_Parameters_DataSet1.csv').fillna("")

# The three outcome columns that are predicted jointly.
targets = ['SHLT', 'MSTOT', 'COGTOT']

# Every remaining column except the group label is used as a predictor.
non_feature_columns = ['group'] + targets
features = df1.columns.difference(non_feature_columns)

# Distinct group labels, in order of first appearance in df1.
groups = pd.unique(df1['group'])

In [30]:
# Fit one multi-output random forest per selected dataset-1 group and save a
# picture of the first tree of each forest.

# Only these two groups are modelled from dataset 1.
dataset1_groups = ["0,0,1,1,1", "0,1,1,1,1"]

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("Tree_splition", exist_ok=True)

for group in groups:
    if group not in dataset1_groups:
        continue

    # Build the row mask once and reuse it for predictors and targets.
    group_rows = df1.loc[df1['group'] == group]
    X = group_rows[features]
    y = group_rows[targets]

    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Visualize the first tree in the forest (only the top 5 levels are drawn).
    fig = plt.figure(figsize=(100, 30))
    plot_tree(
        rf.estimators_[0],
        # `features` is a pandas Index; plot_tree documents a list of names and
        # some scikit-learn releases reject other types, so pass a plain list.
        feature_names=list(features),
        max_depth=5,
        label='root',
        fontsize=20,
        filled=True,
        rounded=True,
        proportion=True,
    )
    fig.savefig(f"Tree_splition/Tree_{group}.png")
    # Close explicitly so the very large figures do not accumulate in memory.
    plt.close(fig)


In [31]:
# Fit one multi-output random forest per dataset-2 group and save a picture of
# the first tree of each forest. The two groups handled from dataset 1 are skipped.
df2 = pd.read_csv("DataSet2.csv")
# NOTE(review): best_params is loaded but not read in this cell; the
# hyper-parameters below are hard-coded.
best_params = pd.read_csv('Best_Parameters_DataSet2.csv').fillna("")

# Groups that are modelled from dataset 1 and therefore skipped here.
dataset1_groups = ["0,0,1,1,1", "0,1,1,1,1"]

# savefig does not create missing directories, so make sure the target exists.
os.makedirs("Tree_splition", exist_ok=True)

# Iterate over the groups actually present in df2 so every subset is non-empty.
for group in df2['group'].unique():
    if group in dataset1_groups:
        continue

    # Bug fix: the rows were previously sliced from df1 although df2 had just
    # been loaded, so the "dataset 2" trees were really trained on dataset 1.
    group_rows = df2.loc[df2['group'] == group]
    # assumes DataSet2 has the same feature/target columns as DataSet1 — TODO confirm
    X = group_rows[features]
    y = group_rows[targets]

    # Only group "1,0,0,0,0" gets a depth cap; every other forest grows unrestricted.
    # NOTE(review): presumably the value 20 comes from the tuned parameters — verify.
    depth = 20 if group == "1,0,0,0,0" else None

    rf = RandomForestRegressor(n_estimators=300, max_depth=depth, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Visualize the first tree in the forest (only the top 5 levels are drawn).
    fig = plt.figure(figsize=(100, 30))
    plot_tree(
        rf.estimators_[0],
        # `features` is a pandas Index; plot_tree documents a list of names and
        # some scikit-learn releases reject other types, so pass a plain list.
        feature_names=list(features),
        max_depth=5,
        label='root',
        fontsize=20,
        filled=True,
        rounded=True,
        proportion=True,
    )
    fig.savefig(f"Tree_splition/Tree_{group}.png")
    # Close explicitly so the very large figures do not accumulate in memory.
    plt.close(fig)


In [7]:
# Merge the two feature-importance tables: the groups listed below are taken
# from the dataset-1 file, every other group from the dataset-2 file.
dataset1_groups = ["0,0,1,1,1", "0,1,1,1,1"]

feature_importance_1 = pd.read_csv("output_feature_importances_dataset1.csv")
feature_importance_1 = feature_importance_1[feature_importance_1["group"].isin(dataset1_groups)]

feature_importance_2 = pd.read_csv("output_feature_importances_dataset2.csv")
# `~` negates the boolean mask (idiomatic replacement for `== False`).
feature_importance_2 = feature_importance_2[~feature_importance_2["group"].isin(dataset1_groups)]

# NOTE(review): the displayed result carries an "Unnamed: 0" column, i.e. the
# input files appear to contain a saved index that is propagated to the
# combined file — confirm whether that column is wanted.
feature_importance_combained = pd.concat([feature_importance_1, feature_importance_2])
feature_importance_combained.to_csv("output_feature_importances_combained.csv", index=False)

In [9]:
# Show the combined table (one row per group/target pair, one column per feature).
feature_importance_combained

Unnamed: 0,group,target,BMI,HAIRA,HATOTB,HCHILD,HHHRES,HITOT,IEARN,INHPFN,LIVSIB,PRPCNT
15,00111,SHLT,0.214927,0.106904,0.218513,0.090942,0.026093,0.185862,0.039775,0.010203,0.080507,0.026274
16,00111,MSTOT,0.180268,0.086689,0.230536,0.079972,0.02798,0.213145,0.046133,0.009208,0.099905,0.026164
17,00111,COGTOT,0.173121,0.106367,0.20603,0.069856,0.026741,0.250736,0.047438,0.009362,0.083278,0.027072
18,01111,SHLT,0.217305,0.166958,0.165876,0.080337,0.019631,0.190739,0.04542,0.013792,0.078545,0.021398
19,01111,MSTOT,0.165215,0.138908,0.194458,0.112301,0.013292,0.195822,0.02952,0.010216,0.11134,0.028926
20,01111,COGTOT,0.169942,0.151332,0.18793,0.084775,0.01809,0.213458,0.037654,0.008542,0.097844,0.030433
0,00000,SHLT,0.194969,0.06541,0.186716,0.065231,0.045716,0.196229,0.139151,0.004886,0.077498,0.024194
1,00000,MSTOT,0.168243,0.064744,0.174128,0.06879,0.049524,0.210657,0.154777,0.002657,0.081374,0.025108
2,00000,COGTOT,0.164057,0.07146,0.179723,0.066568,0.045428,0.222978,0.143488,0.004149,0.076916,0.025233
3,00001,SHLT,0.219363,0.103757,0.205169,0.070899,0.045004,0.17399,0.067219,0.002056,0.089202,0.02334
