In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [2]:
# Hyper-parameter values to explore for the random forest. Every combination
# of the four lists is one candidate: 4 * 4 * 3 * 3 = 144 in total.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [3]:
# Base model: a random forest regressor with a fixed seed (random_state=42).
rf = RandomForestRegressor(random_state=42)

# Exhaustive search over every combination in `param_grid`.
grid_search = GridSearchCV(
    rf,
    param_grid,
    cv=10,         # 10 cross-validation folds per candidate
    scoring='r2',  # candidates are compared by their R^2 score
    n_jobs=-1,     # use all available CPUs
    verbose=2,     # emit progress messages while fitting
)


In [4]:
# Load the input table (path is relative to the notebook's working directory)
# and leave the frame as the last expression so the notebook renders it.
df = pd.read_csv(filepath_or_buffer="Removed_outliers_byGroup_data.csv")
df

Unnamed: 0,SHLT,BMI,MSTOT,COGTOT,INHPFN,HHHRES,HCHILD,LIVSIB,HINPOV,HAIRA,HATOTB,IEARN,HITOT,PENINC,HIGOV,PRPCNT,SLFEMP,RETMON
0,5.0,33.0,14.0,17.0,0.0,2.0,4.0,0.0,0.0,0.0,0.0,20000.0,22400.0,0.0,0.0,0.0,0.0,0
1,4.0,23.8,8.0,14.0,0.0,2.0,6.0,2.0,0.0,0.0,15000.0,25000.0,107000.0,0.0,0.0,1.0,0.0,0
2,3.0,26.0,15.0,27.0,0.0,2.0,2.0,1.0,0.0,40000.0,290000.0,103000.0,134384.0,0.0,0.0,1.0,0.0,0
3,4.0,40.7,11.0,16.0,0.0,3.0,4.0,7.0,0.0,0.0,16477.0,62000.0,72157.0,0.0,0.0,0.0,0.0,0
4,3.0,22.8,15.0,31.0,0.0,4.0,4.0,4.0,0.0,4000.0,138300.0,15000.0,95660.0,0.0,0.0,1.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,4.0,39.6,14.0,23.0,0.0,2.0,5.0,6.0,1.0,0.0,27500.0,27000.0,29112.0,0.0,0.0,0.0,0.0,0
33534,1.0,18.4,14.0,27.0,0.0,5.0,6.0,5.0,1.0,0.0,90000.0,0.0,0.0,0.0,0.0,1.0,0.0,0
33535,1.0,26.9,9.0,25.0,0.0,2.0,7.0,7.0,1.0,0.0,2341.0,3295.0,3295.0,0.0,0.0,1.0,0.0,0
33536,4.0,29.3,13.0,23.0,0.0,3.0,3.0,7.0,1.0,0.0,0.0,18000.0,18000.0,0.0,0.0,0.0,0.0,0


In [5]:
# Columns whose combined values identify a group. The order here fixes the
# order of the comma-separated fields inside each label.
group_key_columns = ['HINPOV', 'PENINC', 'HIGOV', 'RETMON', 'SLFEMP']

# Build one label per row from the key columns only.
# A row-wise `df.apply(..., axis=1)` over the whole frame constructs a Series
# per row and formats values using a dtype derived from *all* columns. Once
# the string 'group' column exists, re-running that version hands the values
# over as plain objects, so the integer RETMON is rendered as "0" instead of
# "0.0" and the labels change between runs. Restricting the conversion to the
# key columns keeps the labels stable and avoids the per-row Series overhead.
group_key_values = df[group_key_columns].to_numpy()
df['group'] = [
    ','.join(f"{value}" for value in key_row)
    for key_row in group_key_values
]

# Distinct group labels, in order of first appearance in `df`.
groups = df['group'].unique()

# Per-group result records are collected in this list by the grid-search cell.
results = []

In [6]:
# Columns to be predicted; the same three columns are selected as `y` in the
# training cell.
target_features = [
    'SHLT',
    'COGTOT',
    'MSTOT',
]

# Columns used as model inputs; the same ten columns are selected as `X` in
# the training cell.
continuous_features = [
    'BMI', 'INHPFN', 'HHHRES', 'HCHILD', 'LIVSIB',
    'HAIRA', 'HATOTB', 'IEARN', 'HITOT', 'PRPCNT',
]

In [7]:
def evaluate_group(group, group_data):
    """Tune the random forest on one group's rows and summarise the fit.

    Parameters
    ----------
    group : str
        Label of the group; stored unchanged in the returned record.
    group_data : pandas.DataFrame
        Rows belonging to that group. Must contain the columns listed in
        `continuous_features` (inputs) and `target_features` (targets).

    Returns
    -------
    dict
        One result row with the group label, the best parameter set (as a
        string), the R^2 scores, and RMSE / MAPE on the train and test splits.

    Notes
    -----
    Uses the module-level `grid_search` object, which is re-fitted on every
    call, so its fitted attributes always describe the most recent group.
    """
    # Use the shared column lists rather than repeating the literals here.
    X = group_data[continuous_features]
    y = group_data[target_features]

    # Hold out 10% of the group's rows for the final evaluation.
    # NOTE(review): `grid_search` runs 10-fold CV, so a group whose training
    # split has fewer than 10 rows would make the fit fail - confirm that all
    # groups are large enough.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.1, random_state=42
    )

    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    y_train_pred = best_model.predict(X_train)
    y_test_pred = best_model.predict(X_test)

    # NOTE(review): MAPE divides by the true values; if any target can be 0
    # the reported percentages become extremely large - verify against the
    # data before relying on these two columns.
    return {
        'Group': group,
        # Stored as a string so the dict survives the CSV round trip.
        'Best Parameters': str(grid_search.best_params_),
        # `best_score_` is the mean cross-validated R^2 of the best candidate,
        # not a score computed on the training rows. The key is kept as-is so
        # existing readers of the CSV keep working.
        'Train R^2 (avg)': grid_search.best_score_,
        # R^2 of the refitted best estimator on the held-out rows.
        'Test R^2 (avg)': grid_search.score(X_test, y_test),
        'Train RMSE': np.sqrt(mean_squared_error(y_train, y_train_pred)),
        'Test RMSE': np.sqrt(mean_squared_error(y_test, y_test_pred)),
        'Train MAPE': mean_absolute_percentage_error(y_train, y_train_pred),
        'Test MAPE': mean_absolute_percentage_error(y_test, y_test_pred),
    }


# Rebuild the list from scratch on every run of this cell, so re-running it
# cannot append a second copy of each group's row to the CSV.
# `sort=False` keeps the groups in order of first appearance in `df`.
results = [
    evaluate_group(group, group_data)
    for group, group_data in df.groupby('group', sort=False)
]

# Convert results to DataFrame
results_df = pd.DataFrame(results)

# Save results to CSV
results_df.to_csv('grid_search_results.csv', index=False)

print("grid search completed. Results saved to 'grid_search_results.csv'.")


Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
grid search completed. Results saved to 'grid_search_results.csv'.
