In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import make_scorer
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error

In [2]:
# Hyperparameter search space for the random forest.
# Every combination is tried: 4 * 4 * 3 * 3 = 144 candidates.
param_grid = dict(
    n_estimators=[50, 100, 200, 300],   # number of trees in the forest
    max_depth=[None, 10, 20, 30],       # None leaves tree depth unrestricted
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)


In [3]:
# Base model: a random forest whose seed is pinned for repeatable fits
rf = RandomForestRegressor(random_state=42)

# Exhaustive search over every combination in `param_grid` (defined in an
# earlier cell), scoring each candidate by R^2 under 10-fold cross-validation.
grid_search = GridSearchCV(
    rf,
    param_grid,
    scoring='r2',   # candidates are ranked by R^2
    cv=10,          # number of cross-validation folds
    n_jobs=-1,      # run on all available CPUs
    verbose=2,      # print progress messages while fitting
)


In [4]:
# Load the dataset; the path is relative, so the CSV must be in the
# notebook's working directory.
# NOTE(review): the rendered frame has an 'Unnamed: 0' column, which looks like
# a row index saved into the CSV -- consider `index_col=0` after confirming
# nothing else relies on that column.
df = pd.read_csv("ThreeForthSemester_standard_MS+COG.csv")
# Bare expression so the notebook renders the frame as the cell output
df

Unnamed: 0,BMI,INHPFN,HHHRES,HCHILD,LIVSIB,HAIRA,HATOTB,IEARN,HITOT,PRPCNT,SHLT,group,MS+COG
0,0.920647,-0.126169,-0.600292,0.426414,-1.217448,-0.418911,-0.598980,-0.227751,-0.860736,-1.340820,2.704447,00000,-0.761523
1,-0.851875,-0.126169,-0.600292,1.504356,-0.375403,-0.418911,-0.579889,-0.090159,0.189366,0.358474,1.641170,00000,-2.900428
2,-0.428011,-0.126169,-0.600292,-0.651528,-0.796425,-0.152079,-0.229887,2.056276,0.529271,0.358474,0.577894,00000,0.816949
3,2.404172,-0.126169,0.366366,0.426414,1.729710,-0.418911,-0.578009,0.928022,-0.243124,-1.340820,1.641170,00000,-1.766660
4,-1.044541,-0.126169,1.333024,0.426414,0.466642,-0.392228,-0.422961,-0.365343,0.048608,0.358474,0.577894,00000,1.331470
...,...,...,...,...,...,...,...,...,...,...,...,...,...
33533,2.192240,-0.126169,-0.600292,0.965385,1.308688,-0.418911,-0.563979,-0.035122,-0.777423,-1.340820,1.641170,10000,0.010259
33534,-1.892269,-0.126169,2.299682,1.504356,0.887665,-0.418911,-0.484434,-0.778119,-1.138777,0.358474,-1.548659,10000,0.524780
33535,-0.254612,-0.126169,-0.600292,2.043327,1.729710,-0.418911,-0.596000,-0.687446,-1.097877,0.358474,-1.548659,10000,-1.193326
33536,0.207785,-0.126169,0.366366,-0.112557,1.729710,-0.418911,-0.598980,-0.282788,-0.915351,-1.340820,1.641170,10000,-0.281910


In [5]:
# Empty accumulator for result records
results = list()

# Distinct labels present in the 'group' column of `df`
groups = df['group'].unique()

In [6]:
# Display the distinct group labels collected from df['group']
groups

array(['0,0,0,0,0', '0,0,0,0,1', '0,0,1,0,0',
       '0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0', '0,0,1,1,0', '0,0,1,1,1',
       '0,1,1,1,1', '1,0,0,0,0'], dtype=object)

In [7]:
# Predictor columns supplied to the model
feature_columns = [
    'BMI', 'INHPFN', 'HHHRES', 'HCHILD', 'LIVSIB',
    'HAIRA', 'HATOTB', 'IEARN', 'HITOT', 'PRPCNT',
]

# Outcome columns; each one is modelled separately
target_features = ['SHLT', 'MS+COG']


def _rmse_and_mape(y_true, y_pred):
    """Return the pair (RMSE, MAPE) of `y_pred` measured against `y_true`."""
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mape = mean_absolute_percentage_error(y_true, y_pred)
    return rmse, mape


# This cell relies on `df` (the loaded DataFrame) and `grid_search` (the
# GridSearchCV object) created in earlier cells.
# One record is collected per (target, group) combination.
results = []
groups = df['group'].unique()

for target in target_features:
    for group in groups:
        # Restrict to the rows of this group, then separate predictors from
        # the outcome currently being modelled
        group_data = df[df['group'] == group]
        X, y = group_data[feature_columns], group_data[target]

        # Hold out 20% of the group's rows; the fixed seed keeps the split
        # repeatable
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # Re-fit the shared search object on this group's training rows
        grid_search.fit(X_train, y_train)

        # Winning hyperparameters and headline scores.
        # NOTE(review): `best_score_` is the cross-validation score of the
        # winning candidate, not a score on the full training split, and the
        # test score is a single value rather than an average -- confirm the
        # 'Train R^2 (avg)' / 'Test R^2 (avg)' labels below are intended.
        best_params = grid_search.best_params_
        best_train_score = grid_search.best_score_
        best_test_score = grid_search.score(X_test, y_test)

        # Predictions from the refitted best model on both splits
        y_train_pred = grid_search.best_estimator_.predict(X_train)
        y_test_pred = grid_search.best_estimator_.predict(X_test)

        # Error metrics on both splits.
        # NOTE(review): MAPE divides by the true values; if the targets are
        # standardized (the file name suggests so) values near zero can
        # inflate it -- confirm this metric is meaningful here.
        rmse_train, mape_train = _rmse_and_mape(y_train, y_train_pred)
        rmse_test, mape_test = _rmse_and_mape(y_test, y_test_pred)

        # Key order below determines the column order of the output CSV
        record = {
            'Target': target,
            'Group': group,
            # Stored as text so the dict fits in a single CSV cell
            'Best Parameters': str(best_params),
            'Train R^2 (avg)': best_train_score,
            'Test R^2 (avg)': best_test_score,
            'Train RMSE': rmse_train,
            'Test RMSE': rmse_test,
            'Train MAPE': mape_train,
            'Test MAPE': mape_test,
        }
        results.append(record)

# Assemble all records into one table and write it to disk (without the index)
results_df = pd.DataFrame(results)
results_df.to_csv('grid_search_results_SHLT_MS+COG.csv', index=False)


Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each of 144 candidates, totalling 1440 fits
Fitting 10 folds for each