In [13]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error, mean_squared_error, r2_score
from sklearn.model_selection import KFold

In [25]:
def _stratified_group_folds(df, group_column='group', n_splits=10, random_state=42):
    """
    Build `n_splits` (train, test) index pairs that are stratified by group.

    Every group is split on its own with a shuffled KFold, and the k-th split of
    each group is merged into the k-th overall fold. The returned arrays hold row
    *positions* in `df` (suitable for `df.iloc`), not positions inside a group's
    sub-frame.

    Raises:
    - ValueError: if no groups are found, or (raised by KFold) if a group has
      fewer rows than `n_splits`.
    """
    # GroupBy.indices maps each group key to the integer row positions of that group in df.
    positions_by_group = df.groupby(group_column).indices
    if not positions_by_group:
        raise ValueError(f"No groups found in column {group_column!r}; cannot build folds.")

    train_parts = [[] for _ in range(n_splits)]
    test_parts = [[] for _ in range(n_splits)]
    for positions in positions_by_group.values():
        positions = np.asarray(positions)
        kf = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)
        # kf.split yields indices relative to `positions`; translate them back
        # to row positions in df before storing them.
        for fold, (rel_train, rel_test) in enumerate(kf.split(positions)):
            train_parts[fold].append(positions[rel_train])
            test_parts[fold].append(positions[rel_test])

    return [
        (np.sort(np.concatenate(train)), np.sort(np.concatenate(test)))
        for train, test in zip(train_parts, test_parts)
    ]


def rf_cross_validation(df, groups, feature_columns, target_features, n_estimators, max_depth,
                        min_samples_split, min_samples_leaf, n_splits=10, group_column='group',
                        output_path='group_kfold_results.csv'):
    """
    Perform a k-fold cross-validation of a Random Forest Regressor in which every
    fold is stratified by the group column (each fold draws a share of every group).

    Per-fold train/test metrics are written to `output_path` as CSV.

    Parameters:
    - df: pandas DataFrame containing the features, the targets and the group column.
    - groups: list of unique groups. Currently unused (the groups are derived from
      `df[group_column]`); kept so existing callers keep working.
    - feature_columns: list of column names to be used as features.
    - target_features: list or string of the target feature name(s).
    - n_estimators: number of trees in the forest.
    - max_depth: maximum depth of the tree.
    - min_samples_split: minimum number of samples required to split an internal node.
    - min_samples_leaf: minimum number of samples required to be at a leaf node.
    - n_splits: number of folds (default 10).
    - group_column: name of the column to stratify by (default 'group').
    - output_path: CSV file the per-fold metrics are written to.

    Returns:
    - mean absolute percentage error on the test split, averaged over all folds.
      NOTE(review): MAPE can become extremely large when a true target value is
      zero or close to it; check the targets before relying on this number.

    Raises:
    - ValueError: if no groups are found or a group has fewer rows than `n_splits`.
    """
    rf = RandomForestRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        random_state=42,
        n_jobs=-1
    )

    # Folds hold row positions in df, so they can be used with .iloc below.
    folds = _stratified_group_folds(df, group_column=group_column, n_splits=n_splits)

    X = df[feature_columns]
    y = df[target_features]

    # Perform cross-validation
    results = []
    for fold_number, (train_index, test_index) in enumerate(folds, start=1):
        # Create the training and test sets
        X_train, y_train = X.iloc[train_index], y.iloc[train_index]
        X_test, y_test = X.iloc[test_index], y.iloc[test_index]

        # fit() retrains the forest from scratch on each call, so reusing `rf` is safe.
        rf.fit(X_train, y_train)

        pred_train = rf.predict(X_train)
        pred_test = rf.predict(X_test)

        results.append({
            'Fold': fold_number,
            'Train_MSE': mean_squared_error(y_train, pred_train),
            'Train_R2': r2_score(y_train, pred_train),
            'Test_MSE': mean_squared_error(y_test, pred_test),
            'Test_R2': r2_score(y_test, pred_test),
            'Test_MAPE': mean_absolute_percentage_error(y_test, pred_test),
        })

    results_df = pd.DataFrame(results)
    results_df.to_csv(output_path, index=False)

    return float(results_df['Test_MAPE'].mean())

In [26]:
# --- Columns used by the model -------------------------------------------
feature_columns = [
    'BMI', 'INHPFN', 'HHHRES', 'HCHILD', 'LIVSIB',
    'HATOTB', 'HAIRA', 'IEARN', 'HITOT', 'PRPCNT',
]
target_features = ["SHLT", "MSTOT", "COGTOT"]

# --- Random forest hyperparameters (forwarded to rf_cross_validation) -----
n_estimators = 100
max_depth = None
min_samples_split = 2
min_samples_leaf = 1

# --- Data ------------------------------------------------------------------
df = pd.read_csv("ThreeForthSemester.csv")
groups = df['group'].unique()  # distinct values of the 'group' column

In [27]:

# Run the cross-validation with the data and hyperparameters defined in the previous cell.
rf_cross_validation(
    df=df,
    groups=groups,
    feature_columns=feature_columns,
    target_features=target_features,
    n_estimators=n_estimators,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
)

In [11]:
# Build 10 cross-validation folds stratified by 'group': every group is split on
# its own, and the k-th split of each group is merged into the k-th overall fold,
# so each fold contains a share of every group.
n_folds = 10
train_parts = [[] for _ in range(n_folds)]
test_parts = [[] for _ in range(n_folds)]

# GroupBy.indices maps each group key to the integer row positions of that group in df.
for group_positions in df.groupby('group').indices.values():
    kf = KFold(n_splits=n_folds, shuffle=True, random_state=42)
    # kf.split yields indices relative to the group; map them back to positions
    # in df so the folds can be used directly with df.iloc.
    for fold, (rel_train, rel_test) in enumerate(kf.split(group_positions)):
        train_parts[fold].append(group_positions[rel_train])
        test_parts[fold].append(group_positions[rel_test])

# One (train, test) pair per fold, each holding sorted row positions in df.
group_kfolds = [
    (np.sort(np.concatenate(train)), np.sort(np.concatenate(test)))
    for train, test in zip(train_parts, test_parts)
]

In [12]:
# Print a detailed breakdown of how each group is split across folds
def print_group_details(df, groups, kf):
    """
    Print the overall group sizes and, for each group, the test indices of every fold.

    Parameters:
    - df: pandas DataFrame with a 'group' column.
    - groups: iterable of group values to report on.
    - kf: splitter whose `split(data)` yields (train_indices, test_indices) pairs.

    Each group's rows are split on their own, so the printed indices are the ones
    `kf.split` produced for that group's sub-frame, not for `df` as a whole.
    """
    # Overall group distribution
    print("Group distribution in the entire dataset:")
    print(df['group'].value_counts())

    # Per-group distribution across the folds
    for group_value in groups:
        print(f"\nGroup: {group_value}")
        members = df[df['group'] == group_value]
        for fold_number, (_, held_out) in enumerate(kf.split(members), start=1):
            print(f" Fold {fold_number}: Test indices -> {held_out} - count: {len(held_out)}")

# Distinct group values to report on
unique_groups = df['group'].unique()

# Set up a 10-fold KFold splitter (shuffled, fixed seed)
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Print the detailed per-group fold breakdown
print_group_details(df=df, groups=unique_groups, kf=kf)

Group distribution in the entire dataset:
group
0,0,0,0,0                            11859
0,0,1,0,1 + 0,1,1,0,0 + 0,1,1,1,0     6138
0,0,1,0,0                             4554
0,0,1,1,0                             4067
0,0,1,1,1                             2967
0,0,0,0,1                             1969
0,1,1,1,1                             1418
1,0,0,0,0                              566
Name: count, dtype: int64

Group: 0,0,0,0,0
 Fold 1: Test indices -> [   14    19    33 ... 11819 11823 11857] - count: 1186
 Fold 2: Test indices -> [    0     3     8 ... 11847 11851 11858] - count: 1186
 Fold 3: Test indices -> [   20    23    32 ... 11827 11832 11844] - count: 1186
 Fold 4: Test indices -> [   12    17    30 ... 11839 11845 11853] - count: 1186
 Fold 5: Test indices -> [   26    27    28 ... 11766 11792 11808] - count: 1186
 Fold 6: Test indices -> [   15    44    52 ... 11849 11852 11854] - count: 1186
 Fold 7: Test indices -> [    2    18    43 ... 11833 11834 11855] - count: 11