In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_transformer

In [3]:
df = pd.read_csv('cleaned_extracted_data.csv')
#Separate the input features and target values
X = df.drop(['SHLT', 'COGTOT', 'MSTOT'], axis=1)  # Input features
y = df[['SHLT', 'COGTOT', 'MSTOT']]  # Target variables
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Initialize a dictionary to store the models and their metrics
models_metrics = {}

# Train separate models for each target variable
for target in y.columns:
    # Train the model
    model = RandomForestRegressor(n_estimators = 100)
    model.fit(X_train, y_train[target])
    
    # Predict on the testing set
    y_pred = model.predict(X_test)
    
    # Evaluate the model
    r2 = r2_score(y_test[target], y_pred)
    mse = mean_squared_error(y_test[target], y_pred)
    
    # Store the model and its metrics
    models_metrics[target] = {'R2 Score': r2, 'MSE': mse}

# Display the models' metrics
for target, metrics in models_metrics.items():
    print(f"Model for {target}: R2 Score = {metrics['R2 Score']}, MSE = {metrics['MSE']}")

Model for SHLT: R2 Score = 0.7156605313851188, MSE = 0.2684337100545596
Model for COGTOT: R2 Score = 0.7234275599545532, MSE = 4.811628059236166
Model for MSTOT: R2 Score = 0.7104828882414229, MSE = 1.047859288126786


In [6]:
model.feature_importances_

array([0.14997832, 0.01032019, 0.00043302, 0.03649614, 0.07105073,
       0.07855028, 0.00216959, 0.00375603, 0.09167484, 0.16577136,
       0.09988969, 0.21394835, 0.01065083, 0.01475316, 0.02359434,
       0.01297686, 0.01398627])

In [21]:
model2 = RandomForestRegressor(n_estimators = 100)
model2.fit(X_train, y_train[target])
    
    # Predict on the testing set
y_pred = model2.predict(X_test)
    
    # Evaluate the model
r2 = r2_score(y_test[target], y_pred)
mse = mean_squared_error(y_test[target], y_pred)

print(r2, mse)
model2.feature_importances_

0.7095477745009725 1.0512437776045727


array([0.14921797, 0.01012278, 0.00055709, 0.03570439, 0.07124558,
       0.0786982 , 0.0023367 , 0.00365143, 0.08987982, 0.16546021,
       0.10041599, 0.2157286 , 0.01056998, 0.01511196, 0.02399549,
       0.01303242, 0.01427139])

In [7]:
continuous_features = ['SHLT', 'BMI', 'MSTOT', 'COGTOT', 'INHPFN',  'HHHRES', 'HCHILD','LIVSIB',  'HAIRA', 'HATOTB', 'IEARN','HITOT', 'PRPCNT', 'SLFEMP', 'RETMON']
cate_features = ['INHPE','HINPOV', 'HINPOVA','PENINC', 'HIGOV']

In [8]:
# detect and remove outliers
print(df.shape)
clf = IsolationForest(random_state=42, contamination="auto")
outliers = clf.fit_predict(df)
cleaned_df = df[outliers == 1]
print(cleaned_df.shape)

(38487, 20)
(34921, 20)


In [9]:
# transfer data
trans = make_column_transformer((StandardScaler(), continuous_features),
                               (OneHotEncoder(), cate_features))
df2 = trans.fit(cleaned_df)

(38487, 25)

In [None]:
df2