In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer

In [2]:
df = pd.read_csv('cleaned_extracted_data.csv').drop(["HINPOVA", "INHPE"], axis = 1)

In [3]:
continuous_features = ['BMI', 'INHPFN',  'HHHRES', 'HCHILD','LIVSIB',  'HAIRA', 'HATOTB', 'IEARN','HITOT', 'PRPCNT']
cate_features = ['HINPOV','PENINC', 'HIGOV','RETMON', 'SLFEMP']

In [4]:
grouped_df = df.groupby(cate_features).size().reset_index(name='Count')
grouped_df.sort_values("Count", ascending = False)

Unnamed: 0,HINPOV,PENINC,HIGOV,RETMON,SLFEMP,Count
0,0.0,0.0,0.0,0,0.0,12669
4,0.0,0.0,1.0,0,0.0,4868
6,0.0,0.0,1.0,1,0.0,4429
7,0.0,0.0,1.0,1,1.0,3178
14,0.0,1.0,1.0,1,0.0,2918
5,0.0,0.0,1.0,0,1.0,2737
1,0.0,0.0,0.0,0,1.0,2080
15,0.0,1.0,1.0,1,1.0,1512
12,0.0,1.0,1.0,0,0.0,985
16,1.0,0.0,0.0,0,0.0,647


In [5]:
filtered_df = pd.DataFrame()

for name, group in df.groupby(cate_features):
    isol_forest = IsolationForest(random_state=42, n_estimators=100 ,n_jobs=-1, bootstrap = True)
    outliers = isol_forest.fit_predict(group)

    filtered_group = group[outliers == 1]

    filtered_df = pd.concat([filtered_df, filtered_group], ignore_index=True)
    
filtered_df.groupby(cate_features).size().reset_index(name='Count').sort_values("Count", ascending = False)

Unnamed: 0,HINPOV,PENINC,HIGOV,RETMON,SLFEMP,Count
0,0.0,0.0,0.0,0,0.0,11859
4,0.0,0.0,1.0,0,0.0,4554
6,0.0,0.0,1.0,1,0.0,4067
7,0.0,0.0,1.0,1,1.0,2967
14,0.0,1.0,1.0,1,0.0,2723
5,0.0,0.0,1.0,0,1.0,2532
1,0.0,0.0,0.0,0,1.0,1969
15,0.0,1.0,1.0,1,1.0,1418
12,0.0,1.0,1.0,0,0.0,883
16,1.0,0.0,0.0,0,0.0,566


In [6]:

# Group the data by categorical features and count the number of samples in each group
grouped = filtered_df.groupby(cate_features).size()

# Filter the groups with more than 500 samples
filtered_groups = grouped[grouped > 500]

# Get the index of these groups
indices_of_large_groups = filtered_groups.index

# Filter the original dataframe to keep only the data belonging to the large groups
filtered_df = filtered_df[filtered_df.set_index(cate_features).index.isin(indices_of_large_groups)]
filtered_df.groupby(cate_features).size().reset_index(name='Count').sort_values("Count", ascending = False)

Unnamed: 0,HINPOV,PENINC,HIGOV,RETMON,SLFEMP,Count
0,0.0,0.0,0.0,0,0.0,11859
2,0.0,0.0,1.0,0,0.0,4554
4,0.0,0.0,1.0,1,0.0,4067
5,0.0,0.0,1.0,1,1.0,2967
7,0.0,1.0,1.0,1,0.0,2723
3,0.0,0.0,1.0,0,1.0,2532
1,0.0,0.0,0.0,0,1.0,1969
8,0.0,1.0,1.0,1,1.0,1418
6,0.0,1.0,1.0,0,0.0,883
9,1.0,0.0,0.0,0,0.0,566


In [17]:
# filtered_df.groupby(cate_features).size().reset_index(name='Count').sort_values("Count", ascending = False).to_csv("Table4.csv")

In [13]:
# filtered_df.to_csv("Removed_outliers_byGroup_data.csv", index=False)

In [14]:
def kf_RandomForestRegressor(X, y, n_estimators =100 ,k=5):
    # Set up k-fold parameters
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    
    #Store the results of each validation
    train_r_squared = []
    test_r_squared = []
    train_mse = []
    test_mse = []
    
    importance = np.zeros(10)

    # Iterate through the cross-validation
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        # set up the RF model
        model = RandomForestRegressor(n_estimators = n_estimators, n_jobs=-1).fit(X_train, y_train)
        
        # Predict and calculate performance and errors
        train_pred = model.predict(X_train)        
        train_r_squared.append(r2_score(y_train, train_pred))
        train_mse.append(mean_squared_error(y_train, train_pred))
        
        test_pred = model.predict(X_test)
        test_r_squared.append(r2_score(y_test, test_pred))
        test_mse.append(mean_squared_error(y_test, test_pred))
        
        importance = importance + model.feature_importances_
        
    print(f"Train: R-squared = {np.mean(train_r_squared)}, MSE = {np.mean(train_mse)}\nTest: R-squared = {np.mean(test_r_squared)}, MSE = {np.mean(test_mse)}")
    return (np.mean(train_r_squared),np.mean(train_mse), np.mean(test_r_squared),np.mean(test_mse), importance/5)

In [27]:
target_features = ['SHLT', 'COGTOT', 'MSTOT']

filtered_groups = filtered_df.groupby(cate_features).size().reset_index(name='Count')

# Initialize a dictionary to hold the results
group_importance_results = {}
group_performance = {}

# Iterate over each group
for index, row in filtered_groups.iterrows():
    # Get the combination of categorical features for the group
    cat_combination = tuple(row[cate_features])
    
    # Filter the original df for the group's data
    group_data = df[(df[cate_features[0]] == row[cate_features[0]]) &
                    (df[cate_features[1]] == row[cate_features[1]]) &
                    (df[cate_features[2]] == row[cate_features[2]]) &
                    (df[cate_features[3]] == row[cate_features[3]]) &
                    (df[cate_features[4]] == row[cate_features[4]])
                    ]
    
    # Split into X and y
    X = group_data[continuous_features].values
    y = group_data[target_features].values
    
    
    # Run the kf_RandomForestRegressor function
    train_r_squared, train_mse, test_r_squared, test_mse, feature_importance = kf_RandomForestRegressor(X, y)
    
    # Store the feature importance with the group's categorical combination
    group_importance_results[str(cat_combination)] = feature_importance
    group_performance[str(cat_combination)] = [train_r_squared, train_mse, test_r_squared, test_mse]

# Output the results
for cat_combination, importance in group_importance_results.items():
    print(f"Category Combination: {cat_combination}, Feature Importance: {importance}")


Train: R-squared = 0.9561150195247998, MSE = 0.29749988135701055
Test: R-squared = 0.6868551048689838, MSE = 2.151600491032709
Train: R-squared = 0.9522066494912325, MSE = 0.30881876602564123
Test: R-squared = 0.6469541764707849, MSE = 2.287411041666668
Train: R-squared = 0.9562375670380201, MSE = 0.341656529817223
Test: R-squared = 0.706917373207714, MSE = 2.3275703293721715
Train: R-squared = 0.9578589916153801, MSE = 0.33188701519497366
Test: R-squared = 0.6737440328074829, MSE = 2.575684491831135
Train: R-squared = 0.9564207307445214, MSE = 0.3013374947946841
Test: R-squared = 0.6891038781921012, MSE = 2.1664729736814143
Train: R-squared = 0.9545536402055929, MSE = 0.32576011064451516
Test: R-squared = 0.6459429306013543, MSE = 2.581822896086103
Train: R-squared = 0.951703051071774, MSE = 0.33824199661590526
Test: R-squared = 0.6447970561543688, MSE = 2.4502000676818954
Train: R-squared = 0.9527560079673222, MSE = 0.28420249987308605
Test: R-squared = 0.7021837093251355, MSE = 1.75

In [34]:
pd.DataFrame(group_importance_results).T

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
"(0.0, 0.0, 0.0, 0.0, 0.0)",0.157229,0.007073,0.049031,0.070496,0.082168,0.075788,0.167739,0.137938,0.225561,0.026977
"(0.0, 0.0, 0.0, 0.0, 1.0)",0.164511,0.006241,0.042286,0.06759,0.093524,0.131016,0.19371,0.05708,0.21697,0.027073
"(0.0, 0.0, 1.0, 0.0, 0.0)",0.152753,0.010076,0.033836,0.072654,0.090966,0.067521,0.177161,0.129137,0.24062,0.025277
"(0.0, 0.0, 1.0, 0.0, 1.0)",0.163886,0.020602,0.038283,0.07594,0.090522,0.095057,0.191355,0.041644,0.252441,0.030271
"(0.0, 0.0, 1.0, 1.0, 0.0)",0.158321,0.011141,0.029545,0.08103,0.088005,0.130219,0.180676,0.119359,0.178985,0.022719
"(0.0, 0.0, 1.0, 1.0, 1.0)",0.170709,0.018707,0.028815,0.079297,0.082847,0.100033,0.192775,0.053231,0.245705,0.027882
"(0.0, 1.0, 1.0, 0.0, 0.0)",0.14427,0.022784,0.04303,0.062231,0.078851,0.080786,0.167368,0.108573,0.258849,0.033257
"(0.0, 1.0, 1.0, 1.0, 0.0)",0.151578,0.005859,0.026634,0.081984,0.081558,0.089419,0.214576,0.12267,0.19427,0.031452
"(0.0, 1.0, 1.0, 1.0, 1.0)",0.178003,0.008791,0.022992,0.102555,0.08448,0.161896,0.167934,0.044092,0.197251,0.032006
"(1.0, 0.0, 0.0, 0.0, 0.0)",0.177398,0.021175,0.074165,0.101541,0.117623,0.06439,0.161335,0.087519,0.165821,0.029032


In [36]:
pd.DataFrame(group_performance).T.to_csv("groupPerformance.csv")

In [19]:
group_performance

{(0.0, 0.0, 0.0, 0.0, 0.0): [0.9562030431866744,
  0.29627537532446657,
  0.6865716697556015,
  2.1462549485948856],
 (0.0, 0.0, 0.0, 0.0, 1.0): [0.9516392525568573,
  0.31564933092948744,
  0.6487382575977653,
  2.280056282051282],
 (0.0, 0.0, 1.0, 0.0, 0.0): [0.9568560439641741,
  0.33586876343602046,
  0.7059020704314596,
  2.3293189478900898],
 (0.0, 0.0, 1.0, 0.0, 1.0): [0.9574276301119176,
  0.3384815811769518,
  0.6721494056118096,
  2.5721138237099517],
 (0.0, 0.0, 1.0, 1.0, 0.0): [0.9560009221509095,
  0.3036594898093885,
  0.6893021614688709,
  2.147946642078281],
 (0.0, 0.0, 1.0, 1.0, 1.0): [0.9545320255104286,
  0.32655319167749847,
  0.6435919972343435,
  2.5863732113273583],
 (0.0, 1.0, 1.0, 0.0, 0.0): [0.9516412619728152,
  0.33816107445008464,
  0.644611574234213,
  2.442480439932319],
 (0.0, 1.0, 1.0, 1.0, 0.0): [0.9527309771688603,
  0.2834105251151864,
  0.7051310405208995,
  1.7407489665718976],
 (0.0, 1.0, 1.0, 1.0, 1.0): [0.9506748882342132,
  0.2998028822126065,
