In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer
import math

In [2]:
# Load the cleaned extract and discard the two columns that are not used downstream.
df = pd.read_csv('cleaned_extracted_data.csv').drop(columns=["HINPOVA", "INHPE"])

In [3]:
# Column names listed as continuous-valued features.
continuous_features = [
    'BMI',
    'INHPFN',
    'HHHRES',
    'HCHILD',
    'LIVSIB',
    'HAIRA',
    'HATOTB',
    'IEARN',
    'HITOT',
    'PRPCNT',
]

# Column names listed as categorical features; these serve as the grouping keys.
cate_features = [
    'HINPOV',
    'PENINC',
    'HIGOV',
    'RETMON',
    'SLFEMP',
]

In [4]:
# Row count for every observed combination of the categorical feature values.
grouped_df = (
    df.groupby(by=cate_features)
    .size()
    .reset_index(name='Count')
)

In [5]:
# Detect and drop outliers separately within each combination of the
# categorical feature values, so every row is scored only against rows that
# share the same combination.
inlier_groups = []

for _, group in df.groupby(cate_features):
    # A fresh forest per group; the fixed random_state keeps each fit reproducible.
    isol_forest = IsolationForest(random_state=42, n_estimators=500, n_jobs=-1)
    # fit_predict returns 1 for inliers and -1 for outliers.
    outliers = isol_forest.fit_predict(group)
    inlier_groups.append(group[outliers == 1])

# Concatenate once at the end: calling pd.concat inside the loop re-copies all
# accumulated rows on every iteration, and seeding the accumulator with an
# empty DataFrame() makes recent pandas versions emit a deprecation warning
# about empty entries in concat.
if inlier_groups:
    filtered_df = pd.concat(inlier_groups, ignore_index=True)
else:
    # No groups at all (empty input): keep the columns so the summary below still works.
    filtered_df = df.iloc[0:0].copy()

# Surviving row count per combination, largest groups first.
filtered_df.groupby(cate_features).size().reset_index(name='Count').sort_values("Count", ascending = False)

Unnamed: 0,HINPOV,PENINC,HIGOV,RETMON,SLFEMP,Count
0,0.0,0.0,0.0,0,0.0,11981
4,0.0,0.0,1.0,0,0.0,4578
6,0.0,0.0,1.0,1,0.0,4122
7,0.0,0.0,1.0,1,1.0,2983
14,0.0,1.0,1.0,1,0.0,2705
5,0.0,0.0,1.0,0,1.0,2563
1,0.0,0.0,0.0,0,1.0,1981
15,0.0,1.0,1.0,1,1.0,1422
12,0.0,1.0,1.0,0,0.0,896
16,1.0,0.0,0.0,0,0.0,596


In [6]:
# Persist the outlier-filtered rows; the positional index is not written out.
filtered_df.to_csv(path_or_buf="Removed_outliers_byGroup_data.csv", index=False)

In [7]:
# Reload the outlier-filtered rows from disk.
data = pd.read_csv('Removed_outliers_byGroup_data.csv')

# Collapse the five binary columns into one comma-separated label per row
# (e.g. "0,1,1,0,0"): cast to int so 1.0 renders as "1", then to str so the
# values can be joined.
data['group'] = (
    data[['HINPOV', 'PENINC', 'HIGOV', 'RETMON', 'SLFEMP']]
    .astype(int)
    .astype(str)
    .agg(','.join, axis=1)
)

# The individual binary columns are redundant once 'group' exists.
data = data.drop(columns=['HINPOV', 'PENINC', 'HIGOV', 'RETMON', 'SLFEMP'])



In [10]:
# Number of rows carrying each combined 'group' label.
group_counts = data['group'].value_counts()

# Labels that occur more than 500 times are treated as overrepresented.
groups_to_remove = group_counts.loc[group_counts.gt(500)].index

# Keep only the rows whose label is not one of the overrepresented ones.
filtered_data = data.loc[~data['group'].isin(groups_to_remove)]

# Write the remaining rows to a new CSV file, omitting the index.
filtered_data.to_csv(path_or_buf='test_data.csv', index=False)