In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer

In [4]:
# Load the cleaned extract and drop the HINPOVA and INHPE columns up front.
df = (
    pd.read_csv('cleaned_extracted_data.csv')
    .drop(columns=["HINPOVA", "INHPE"])
)

In [5]:
# Columns used as model inputs (X) when fitting the regressors.
continuous_features = [
    'BMI',
    'INHPFN',
    'HHHRES',
    'HCHILD',
    'LIVSIB',
    'HAIRA',
    'HATOTB',
    'IEARN',
    'HITOT',
    'PRPCNT',
]

# Columns whose value combinations define the groups the data is split into.
cate_features = [
    'HINPOV',
    'PENINC',
    'HIGOV',
    'RETMON',
    'SLFEMP',
]

In [6]:
# One row per observed combination of categorical values, with its row count.
grouped_df = (
    df.groupby(cate_features)
    .size()
    .reset_index(name='Count')
)
grouped_df

Unnamed: 0,HINPOV,PENINC,HIGOV,RETMON,SLFEMP,Count
0,0.0,0.0,0.0,0,0.0,12669
1,0.0,0.0,0.0,0,1.0,2080
2,0.0,0.0,0.0,1,0.0,441
3,0.0,0.0,0.0,1,1.0,272
4,0.0,0.0,1.0,0,0.0,4868
5,0.0,0.0,1.0,0,1.0,2737
6,0.0,0.0,1.0,1,0.0,4429
7,0.0,0.0,1.0,1,1.0,3178
8,0.0,1.0,0.0,0,0.0,215
9,0.0,1.0,0.0,0,1.0,48


In [7]:
# Remove outliers separately within each combination of categorical values:
# a fresh IsolationForest is fitted on every group and only the rows it labels
# as inliers are kept.
kept_groups = []

for _group_key, group in df.groupby(cate_features):
    isol_forest = IsolationForest(random_state=42, n_estimators=500, n_jobs=-1)
    # fit_predict labels inliers as 1 and outliers as -1.
    outliers = isol_forest.fit_predict(group)
    kept_groups.append(group[outliers == 1])

# Concatenate once at the end instead of growing a DataFrame inside the loop
# (repeated concat re-copies all accumulated rows on every iteration).
# pd.concat raises on an empty list, so fall back to an empty frame.
filtered_df = pd.concat(kept_groups, ignore_index=True) if kept_groups else pd.DataFrame()

# Group sizes after outlier removal, largest first.
filtered_df.groupby(cate_features).size().reset_index(name='Count').sort_values("Count", ascending = False)

Unnamed: 0,HINPOV,PENINC,HIGOV,RETMON,SLFEMP,Count
0,0.0,0.0,0.0,0,0.0,11981
4,0.0,0.0,1.0,0,0.0,4578
6,0.0,0.0,1.0,1,0.0,4122
7,0.0,0.0,1.0,1,1.0,2983
14,0.0,1.0,1.0,1,0.0,2705
5,0.0,0.0,1.0,0,1.0,2563
1,0.0,0.0,0.0,0,1.0,1981
15,0.0,1.0,1.0,1,1.0,1422
12,0.0,1.0,1.0,0,0.0,896
16,1.0,0.0,0.0,0,0.0,596


In [8]:
# Persist the outlier-filtered rows; the DataFrame index is not written out.
filtered_df.to_csv(
    path_or_buf="Removed_outliers_byGroup_data.csv",
    index=False,
)

In [9]:
def kf_RandomForestRegressor(X, y, n_estimators =100 ,k=5, random_state=None):
    """Cross-validate a random forest and return fold-averaged feature importances.

    The data is split with a shuffled ``KFold`` (fixed seed 42). For every
    fold a ``RandomForestRegressor`` is fitted on the training part, and R-squared
    and MSE are computed on both the training and the held-out part. The means
    of those four metrics across folds are printed.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. Converted with ``np.asarray`` so positional fold
        indices can be applied directly.
    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values, converted the same way as ``X``.
    n_estimators : int, default 100
        Number of trees in each fold's forest.
    k : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default None
        Seed forwarded to ``RandomForestRegressor``. ``None`` keeps the
        previous (unseeded) behaviour; pass an int for reproducible forests.

    Returns
    -------
    numpy.ndarray of shape (n_features,)
        ``feature_importances_`` averaged over the ``k`` folds.
    """
    # Fold indices are positional, so work on plain arrays.
    X = np.asarray(X)
    y = np.asarray(y)

    # Set up k-fold parameters
    kf = KFold(n_splits=k, shuffle=True, random_state=42)

    # Per-fold metrics
    train_r_squared = []
    test_r_squared = []
    train_mse = []
    test_mse = []

    # Size the accumulator from the data instead of assuming 10 features.
    importance = np.zeros(X.shape[1])

    # Iterate through the cross-validation folds
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Fit the random forest on this fold's training split
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            n_jobs=-1,
            random_state=random_state,
        ).fit(X_train, y_train)

        # Performance on the data the model was fitted on
        train_pred = model.predict(X_train)
        train_r_squared.append(r2_score(y_train, train_pred))
        train_mse.append(mean_squared_error(y_train, train_pred))

        # Performance on the held-out split
        test_pred = model.predict(X_test)
        test_r_squared.append(r2_score(y_test, test_pred))
        test_mse.append(mean_squared_error(y_test, test_pred))

        importance = importance + model.feature_importances_

    print(f"Train: R-squared = {np.mean(train_r_squared)}, MSE = {np.mean(train_mse)}\nTest: R-squared = {np.mean(test_r_squared)}, MSE = {np.mean(test_mse)}")
    # Average over the actual number of folds rather than a hard-coded 5.
    return importance / k

In [10]:
target_features = ['SHLT', 'COGTOT', 'MSTOT']

# Only groups with more than this many rows are modelled.
MIN_GROUP_SIZE = 10
filtered_groups = grouped_df[grouped_df['Count'] > MIN_GROUP_SIZE]

# Maps each categorical-value combination to its averaged feature importances.
group_importance_results = {}

# NOTE(review): rows are taken from the raw `df`, not from the outlier-filtered
# `filtered_df` built earlier in the notebook - confirm this is intended.
for _row_idx, row in filtered_groups.iterrows():
    # The combination of categorical values identifying this group
    cat_values = row[cate_features]
    cat_combination = tuple(cat_values)

    # Select the group's rows by comparing every categorical column with the
    # group's value. Building the mask from the whole list keeps it correct
    # when `cate_features` gains or loses entries.
    in_group = df[cate_features].eq(cat_values, axis='columns').all(axis=1)
    group_data = df[in_group]

    # Split into X and y
    X = group_data[continuous_features].values
    # y is 2-D here, with one column per entry in target_features. To model a
    # single target, select that one column and flatten it with .ravel().
    y = group_data[target_features].values

    # Cross-validate on this group and collect its averaged feature importances
    feature_importance = kf_RandomForestRegressor(X, y)

    # Store the feature importance with the group's categorical combination
    group_importance_results[cat_combination] = feature_importance

# Output the results
for cat_combination, importance in group_importance_results.items():
    print(f"Category Combination: {cat_combination}, Feature Importance: {importance}")


Train: R-squared = 0.9557212007254121, MSE = 0.3006976883215783
Test: R-squared = 0.6854474573595385, MSE = 2.159986556423899
Train: R-squared = 0.9523206177393888, MSE = 0.3081173878205132
Test: R-squared = 0.6486305701463643, MSE = 2.273892307692308
Train: R-squared = 0.9531564504741035, MSE = 0.31089297101682556
Test: R-squared = 0.7168556246981975, MSE = 1.7983800595846104
Train: R-squared = 0.9471487539676791, MSE = 0.335487626516721
Test: R-squared = 0.5565267811986547, MSE = 2.3252617845117847
Train: R-squared = 0.9566729814758202, MSE = 0.3390471963581789
Test: R-squared = 0.7080547988598156, MSE = 2.3213226425887736
Train: R-squared = 0.956689250856013, MSE = 0.3434249682047989
Test: R-squared = 0.6693182591983484, MSE = 2.6114211064332316
Train: R-squared = 0.9560058448072102, MSE = 0.3045056478952463
Test: R-squared = 0.6911216481329887, MSE = 2.1518752774143066
Train: R-squared = 0.9546842823907138, MSE = 0.3252622243357089
Test: R-squared = 0.6452426145904296, MSE = 2.5755

In [9]:
# Reference link: scikit-learn's "plot_forest_importances" example page, kept
# as a pointer for visualizing forest feature importances. The bare string is
# the cell's last expression, so it is echoed as the cell output.
'https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html'

'\nhttps://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html\n见此网址以可视化\n'