In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer, mean_squared_error
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Hyper-parameter values explored for the random forest; every combination
# of the four lists below is one candidate (4 * 4 * 3 * 3 = 144 in total).
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [3]:
# Base estimator whose hyper-parameters are tuned; fixed seed for repeatability
rf = RandomForestRegressor(random_state=42)

# Search over every combination in `param_grid` with cross-validation
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='neg_mean_squared_error',  # alternative: 'r2' for the R² score
    cv=10,       # number of cross-validation folds
    n_jobs=-1,   # use all available CPUs
    verbose=2,
)


In [4]:
# Read the input table from the working directory
# (judging by the file name, outliers were already removed per group — TODO confirm)
csv_path = "Removed_outliers_byGroup_data.csv"
df = pd.read_csv(csv_path)

# Bare last expression so the notebook renders the loaded frame
df

Unnamed: 0,SHLT,BMI,MSTOT,COGTOT,INHPFN,HHHRES,HCHILD,LIVSIB,HINPOV,HAIRA,HATOTB,IEARN,HITOT,PENINC,HIGOV,PRPCNT,SLFEMP,RETMON
0,5.0,33.0,14.0,17.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,20000.0,22400.0,0.0,0.0,0.0,0.0,0
1,4.0,23.8,8.0,14.0,0.0,2.0,6.0,2.0,0.0,0.0,15000.0,25000.0,107000.0,0.0,0.0,1.0,0.0,0
2,3.0,26.0,15.0,27.0,0.0,2.0,2.0,1.0,0.0,40000.0,290000.0,103000.0,134384.0,0.0,0.0,1.0,0.0,0
3,4.0,40.7,11.0,16.0,0.0,3.0,4.0,7.0,0.0,0.0,16477.0,62000.0,72157.0,0.0,0.0,0.0,0.0,0
4,3.0,22.8,15.0,31.0,0.0,4.0,4.0,4.0,0.0,4000.0,138300.0,15000.0,95660.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,4.0,39.6,14.0,23.0,0.0,2.0,5.0,6.0,1.0,0.0,27500.0,27000.0,29112.0,0.0,0.0,0.0,0.0,0
33534,1.0,18.4,14.0,27.0,0.0,5.0,6.0,5.0,1.0,0.0,90000.0,0.0,0.0,0.0,0.0,1.0,0.0,0
33535,1.0,26.9,9.0,25.0,0.0,2.0,7.0,7.0,1.0,0.0,2341.0,3295.0,3295.0,0.0,0.0,1.0,0.0,0
33536,4.0,29.3,13.0,23.0,0.0,3.0,3.0,7.0,1.0,0.0,0.0,18000.0,18000.0,0.0,0.0,0.0,0.0,0


In [None]:
# Columns whose integer values, in this fixed order, form each row's group label
group_columns = ['HINPOV', 'PENINC', 'HIGOV', 'RETMON', 'SLFEMP']

# Truncate every flag to an integer and render it as text, column by column.
# This replaces a row-wise `df.apply(..., axis=1)`, which called a Python
# lambda once per row; the resulting labels are the same.
flag_text = df[group_columns].astype('int64').astype(str)

# Join the five text columns into labels such as "0, 0, 1, 0, 0"
df['group'] = flag_text[group_columns[0]].str.cat(
    [flag_text[col] for col in group_columns[1:]],
    sep=', ',
)

# Distinct labels, in order of first appearance
groups = df['group'].unique()

# Containers filled by the training loop, keyed by (group, target)
models = {}
performance = {}

# Show the distinct group labels instead of re-rendering the whole frame
groups

array(['0, 0, 0, 0, 0', '0, 0, 0, 0, 1', '0, 0, 1, 0, 0', '0, 0, 1, 0, 1',
       '0, 0, 1, 1, 0', '0, 0, 1, 1, 1', '0, 1, 1, 0, 0', '0, 1, 1, 1, 0',
       '0, 1, 1, 1, 1', '1, 0, 0, 0, 0'], dtype=object)

In [39]:
# Columns to predict; one model is trained per entry
target_features = [
    'SHLT',
    'COGTOT',
    'MSTOT',
]

# Columns used as model inputs
continuous_features = [
    'BMI', 'INHPFN', 'HHHRES', 'HCHILD', 'LIVSIB',
    'HAIRA', 'HATOTB', 'IEARN', 'HITOT', 'PRPCNT',
]

In [40]:
# Tune and evaluate one random-forest regressor per (group, target) pair.
# Results are stored in the notebook-level `models` and `performance` dicts,
# both keyed by the (group, target) tuple.
#
# NOTE(review): `grid_search` is configured with cv=10 where it is created, so
# a group with very few rows will make fit() raise — confirm every group is
# large enough before running the full search.
for group in groups:
    # Rows belonging to the current group only
    group_data = df[df['group'] == group]

    # Inputs and targets come from the shared column lists, so the selection
    # cannot drift from the definitions in `continuous_features` / `target_features`
    X = group_data[continuous_features]
    y = group_data[target_features]

    # Hold out 10% of the group's rows for the final evaluation
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

    # Fit a separate model for each target column
    for target in target_features:
        # `grid_search` already wraps its own RandomForestRegressor, so no new
        # estimator is created here; the search is simply re-run on this data
        grid_search.fit(X_train, y_train[target])

        # Model refit with the best parameters found (GridSearchCV default refit=True)
        best_model = grid_search.best_estimator_

        # Evaluate on the held-out rows, which the search never saw
        y_pred = best_model.predict(X_test)
        mse = mean_squared_error(y_test[target], y_pred)

        # Store the best model and its performance
        models[(group, target)] = best_model
        performance[(group, target)] = mse

        # Report the winning parameters and held-out error for this pair
        print(f"Best parameters for group {group} and target {target}: {grid_search.best_params_}")
        print(f"MSE for group {group} and target {target}: {mse}")

# Now 'models' contains the best model for each target within each group,
# and 'performance' contains the MSE for these models.

Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Best parameters for group 0.00.00.00.00.0 and target SHLT: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
MSE for group 0.00.00.00.00.0 and target SHLT: 0.28471086284429453
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Best parameters for group 0.00.00.00.00.0 and target COGTOT: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
MSE for group 0.00.00.00.00.0 and target COGTOT: 4.922834664839798
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Best parameters for group 0.00.00.00.00.0 and target MSTOT: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 300}
MSE for group 0.00.00.00.00.0 and target MSTOT: 0.9969194022859285
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Best parameters for group 0.00.00.00.01.0 and target SHLT: {'max_depth': 30, 'min_samples_leaf'