In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.tree import plot_tree

In [2]:
# Load DataSet1 together with its parameter table; missing cells in the
# parameter table are replaced by empty strings.
df1 = pd.read_csv("DataSet1.csv")
best_params = pd.read_csv("Best_Parameters_DataSet1.csv").fillna("")

# Target columns. The feature set is every other column of df1 except the
# group label (Index.difference returns the remaining names sorted).
targets = ["SHLT", "MSTOT", "COGTOT"]
features = df1.columns.difference(["group"] + targets)

# Distinct group labels, in order of first appearance in df1.
groups = pd.unique(df1["group"])

In [30]:
# For the two groups handled with DataSet1, fit a multi-output random forest
# on that group's rows and save a drawing of the forest's first tree.

# plt.savefig does not create missing directories, so make sure it exists.
os.makedirs("Tree_splition", exist_ok=True)

for group in groups:
    if group not in ["0,0,1,1,1", "0,1,1,1,1"]:
        continue

    # Build the row mask once and select rows + columns in a single .loc
    # call instead of chained indexing.
    in_group = df1['group'] == group
    X = df1.loc[in_group, features]
    y = df1.loc[in_group, targets]

    rf = RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Visualize the first tree in the forest (only the top 5 levels are drawn).
    fig = plt.figure(figsize=(100, 30))
    plot_tree(
        rf.estimators_[0],
        # Pass a plain list: some scikit-learn releases validate this
        # argument as a list and reject a pandas Index.
        feature_names=list(features),
        max_depth=5,
        label='root',
        fontsize=20,
        filled=True,
        rounded=True,
        proportion=True,
    )
    fig.savefig(f"Tree_splition/Tree_{group}.png")
    # Close by handle so the very large figure is released on every iteration.
    plt.close(fig)


In [31]:
# Load DataSet2 and, for every group NOT handled with DataSet1, fit a
# multi-output random forest and save a drawing of the forest's first tree.
df2 = pd.read_csv("DataSet2.csv")
best_params = pd.read_csv('Best_Parameters_DataSet2.csv').fillna("")

# plt.savefig does not create missing directories, so make sure it exists.
os.makedirs("Tree_splition", exist_ok=True)

# Iterate over the groups actually present in df2, so a group missing from
# this dataset can never produce an empty training set.
for group in df2['group'].unique():
    if group in ["0,0,1,1,1", "0,1,1,1,1"]:
        continue

    # FIX: the rows must come from df2. The previous version loaded df2 but
    # still sliced df1, so these trees were trained on DataSet1 data.
    # `features` / `targets` are the column lists defined for df1;
    # NOTE(review): assumes df2 has the same columns - confirm.
    in_group = df2['group'] == group
    X = df2.loc[in_group, features]
    y = df2.loc[in_group, targets]

    # Only group "1,0,0,0,0" gets a depth cap; all other groups grow
    # unrestricted trees. Presumably this mirrors the tuned parameters in
    # best_params - TODO confirm.
    depth = 20 if group == "1,0,0,0,0" else None

    rf = RandomForestRegressor(n_estimators=300, max_depth=depth, random_state=42, n_jobs=-1)
    rf.fit(X, y)

    # Visualize the first tree in the forest (only the top 5 levels are drawn).
    fig = plt.figure(figsize=(100, 30))
    plot_tree(
        rf.estimators_[0],
        # Pass a plain list: some scikit-learn releases validate this
        # argument as a list and reject a pandas Index.
        feature_names=list(features),
        max_depth=5,
        label='root',
        fontsize=20,
        filled=True,
        rounded=True,
        proportion=True,
    )
    fig.savefig(f"Tree_splition/Tree_{group}.png")
    # Close by handle so the very large figure is released on every iteration.
    plt.close(fig)


In [2]:
# Dataset1 contributes the importances of the two listed groups ...
feature_importance_1 = (
    pd.read_csv("output_feature_importances_all_targets_Dataset1.csv")
    .loc[lambda fi: fi["group"].isin(["0,0,1,1,1", "0,1,1,1,1"])]
)

# ... and Dataset2 contributes the importances of every other group.
feature_importance_2 = (
    pd.read_csv("output_feature_importances_all_targets_Dataset2.csv")
    .loc[lambda fi: ~fi["group"].isin(["0,0,1,1,1", "0,1,1,1,1"])]
)

# Stack the two selections (Dataset1 rows first) and write them to one CSV
# without the index column.
feature_importance_combained = pd.concat((feature_importance_1, feature_importance_2))
feature_importance_combained.to_csv("output_feature_importances_combained.csv", index=False)

In [5]:
# Display summary statistics (count, mean, std, min, quartiles, max) for the
# numeric columns of df1 as this cell's output.
df1.describe()

Unnamed: 0,SHLT,BMI,MSTOT,COGTOT,INHPFN,HHHRES,HCHILD,LIVSIB,HAIRA,HATOTB,IEARN,HITOT,PRPCNT
count,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0,32145.0
mean,0.855707,3.325061,2.600543,3.174848,-2.261322,0.940872,0.98035,0.628872,4.143813,12.133309,6.810352,11.127463,-0.516234
std,0.407729,0.175121,0.139179,0.170786,0.315842,0.304001,0.8099,1.19165,6.685256,2.00646,5.491602,1.052774,1.099296
min,0.09531,2.282382,1.410987,1.808289,-2.302585,0.741937,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585,-2.302585
25%,0.741937,3.202746,2.572612,3.095578,-2.302585,0.741937,0.741937,0.09531,-2.302585,11.429545,-2.302585,10.671263,-2.302585
50%,0.741937,3.314186,2.646175,3.182212,-2.302585,0.741937,1.131402,0.741937,-2.302585,12.380026,9.615812,11.170493,0.09531
75%,1.131402,3.433987,2.714695,3.299534,-2.302585,1.131402,1.410987,1.410987,11.002102,13.24281,10.596637,11.661346,0.09531
max,1.629241,4.11578,2.714695,3.558201,1.629241,2.493205,2.406945,2.895912,14.615268,16.410392,12.89922,14.10069,3.139833
