In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.compose import make_column_transformer

In [21]:
def kf_RandomFroestRegressor(X, y, n_estimators =150 ,k=5, random_state=42):
    """Cross-validate a RandomForestRegressor with shuffled k-fold splits.

    A fresh forest is fitted on the training part of every fold and scored on
    both the training part and the held-out part. The per-fold scores are
    averaged, printed, and returned.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Feature matrix. DataFrames are accepted; the input is converted with
        ``np.asarray`` so that rows can be selected positionally.
    y : array-like of shape (n_samples,) or (n_samples, n_targets)
        Target values. With several target columns, ``r2_score`` and
        ``mean_squared_error`` are called with their defaults, which per the
        scikit-learn documentation average the score uniformly over outputs.
    n_estimators : int, default 150
        Number of trees in each forest.
    k : int, default 5
        Number of folds.
    random_state : int or None, default 42
        Seed used for both the fold shuffling and the forest, so repeated
        calls give the same numbers. Pass ``None`` for unseeded behaviour.

    Returns
    -------
    tuple
        ``(mean train R^2, mean train MSE, mean test R^2, mean test MSE)``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` do not contain the same number of samples.
    """
    # Positional fancy indexing below only works on arrays; X[train_index] on
    # a DataFrame would be interpreted as a column lookup.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(
            f"X and y must have the same number of samples, got {len(X)} and {len(y)}"
        )

    kf = KFold(n_splits=k, shuffle=True, random_state=random_state)
    train_r_squared = []
    test_r_squared = []
    train_mse = []
    test_mse = []

    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Seed the forest as well: without it the bootstrap samples and the
        # feature sub-sampling differ on every run.
        model = RandomForestRegressor(
            n_estimators=n_estimators,
            n_jobs=-1,
            random_state=random_state,
        ).fit(X_train, y_train)

        train_pred = model.predict(X_train)
        train_r_squared.append(r2_score(y_train, train_pred))
        train_mse.append(mean_squared_error(y_train, train_pred))

        test_pred = model.predict(X_test)
        test_r_squared.append(r2_score(y_test, test_pred))
        test_mse.append(mean_squared_error(y_test, test_pred))

    # Average once and reuse the values for both the report and the return.
    mean_train_r2 = np.mean(train_r_squared)
    mean_train_mse = np.mean(train_mse)
    mean_test_r2 = np.mean(test_r_squared)
    mean_test_mse = np.mean(test_mse)

    print(f"Train: R-squared = {mean_train_r2}, MSE = {mean_train_mse}\nTest: R-squared = {mean_test_r2}, MSE = {mean_test_mse}")
    return (mean_train_r2, mean_train_mse, mean_test_r2, mean_test_mse)


In [3]:
df = pd.read_csv('cleaned_extracted_data.csv')

# Raw data run: the three targets are predicted from every remaining column
# except BMI, which is dropped together with the targets.
y = df[['SHLT', 'COGTOT', 'MSTOT']].to_numpy()
X = df.drop(columns=['SHLT', 'COGTOT', 'MSTOT', 'BMI']).to_numpy()
kf_RandomFroestRegressor(X, y)

Train: R-squared = 0.9568241590327465, MSE = 0.30830550320460487
Test: R-squared = 0.7027791905827089, MSE = 2.132358532458506


(0.9568241590327465,
 0.30830550320460487,
 0.7027791905827089,
 2.132358532458506)

Run a random forest on raw data

In [4]:

# NOTE(review): this whole cell is disabled code kept for reference. It fits
# one RandomForestRegressor (200 trees) per target column on a single
# train_test_split of `df`, rather than going through kf_RandomFroestRegressor.
# #Separate the input features and target values
# X = df.drop(['SHLT', 'COGTOT', 'MSTOT','BMI'], axis=1)  # Input features
# y = df[['SHLT', 'COGTOT', 'MSTOT']]  # Target variables
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# # Initialize a dictionary to store the models and their metrics
# models_metrics = {}

# # Train separate models for each target variable
# for target in y.columns:
#     # Train the model
#     model = RandomForestRegressor(n_estimators = 200, n_jobs=-1)
#     model.fit(X_train, y_train[target])
    
#     # Predict on the testing set
#     y_pred = model.predict(X_test)
    
#     # Evaluate the model
#     r2 = r2_score(y_test[target], y_pred)
#     mse = mean_squared_error(y_test[target], y_pred)
    
#     # Store the model and its metrics
#     models_metrics[target] = {'R2 Score': r2, 'MSE': mse}

# # Display the models' metrics
# for target, metrics in models_metrics.items():
#     print(f"Model for {target}: R2 Score = {metrics['R2 Score']}, MSE = {metrics['MSE']}")

In [5]:
# Columns that are standardized by the column transformer later on.
continuous_features = [
    'BMI', 'INHPFN', 'HHHRES', 'HCHILD', 'LIVSIB', 'HAIRA',
    'HATOTB', 'IEARN', 'HITOT', 'PRPCNT', 'SLFEMP', 'RETMON',
]
# Remaining non-target feature columns, treated as categorical.
cate_features = ['INHPE', 'HINPOV', 'HINPOVA', 'PENINC', 'HIGOV']

Use an Isolation Forest to remove outliers

In [6]:
# Flag outliers with an Isolation Forest fitted on every column of df,
# then keep only the rows labelled as inliers.
print(df.shape)
clf = IsolationForest(
    n_estimators=500,
    contamination=0.05,
    random_state=42,
    n_jobs=-1,
)
outliers = clf.fit_predict(df)  # label 1 marks the rows that are kept
cleaned_df = df.loc[outliers == 1]
print(cleaned_df.shape)

(38487, 20)
(36563, 20)


Run the model on the dataset with fewer outliers

In [7]:
# Same feature/target split as the raw-data run, applied to the rows that
# survived the outlier filter.
y = cleaned_df[['SHLT', 'COGTOT', 'MSTOT']].to_numpy()
X = cleaned_df.drop(columns=['SHLT', 'COGTOT', 'MSTOT', 'BMI']).to_numpy()
kf_RandomFroestRegressor(X, y)

Train: R-squared = 0.9554202667874069, MSE = 0.29801639873079855
Test: R-squared = 0.6850458067701425, MSE = 2.0927710965838893


(0.9554202667874069,
 0.29801639873079855,
 0.6850458067701425,
 2.0927710965838893)

In [8]:
# NOTE(review): this whole cell is disabled code kept for reference. It fits
# one RandomForestRegressor (100 trees) per target column on a single
# train_test_split of `cleaned_df`, rather than going through
# kf_RandomFroestRegressor.
# X = cleaned_df.drop(['SHLT', 'COGTOT', 'MSTOT','BMI'], axis=1)  # Input features
# y = cleaned_df[['SHLT', 'COGTOT', 'MSTOT']]  # Target variables
# # Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# # Initialize a dictionary to store the models and their metrics
# models_metrics = {}

# # Train separate models for each target variable
# for target in y.columns:
#     # Train the model
#     model = RandomForestRegressor(n_estimators = 100, n_jobs=-1)
#     model.fit(X_train, y_train[target])
    
#     # Predict on the testing set
#     y_pred = model.predict(X_test)
    
#     # Evaluate the model
#     r2 = r2_score(y_test[target], y_pred)
#     mse = mean_squared_error(y_test[target], y_pred)
    
#     # Store the model and its metrics
#     models_metrics[target] = {'R2 Score': r2, 'MSE': mse}

# # Display the models' metrics
# for target, metrics in models_metrics.items():
#     print(f"Model for {target}: R2 Score = {metrics['R2 Score']}, MSE = {metrics['MSE']}")

Run the model on normalized and encoded data, or on data with polynomial features

In [18]:
# Transform the data: standardize the continuous features and pass every
# other column through unchanged.
trans = make_column_transformer(
    (StandardScaler(), continuous_features),
    remainder="passthrough",
)

# get_feature_names_out() prefixes each output column with the name of the
# step that produced it, which is why the targets below carry "remainder__".
df2 = pd.DataFrame(trans.fit_transform(df), columns=trans.get_feature_names_out())
target_columns = ['remainder__SHLT', 'remainder__MSTOT','remainder__COGTOT']

# Fail here, not in a later cell, if the targets did not come through under
# the expected prefixed names.
missing_targets = [col for col in target_columns if col not in df2.columns]
assert not missing_targets, f"missing target columns: {missing_targets}"

In [22]:
# Targets are the columns named in target_columns; every other column of the
# transformed frame is used as a feature.
y = df2[target_columns].to_numpy()
X = df2.drop(columns=target_columns).to_numpy()
kf_RandomFroestRegressor(X, y, n_estimators=1000)

Train: R-squared = 0.9608627852177316, MSE = 0.28216372024156233
Test: R-squared = 0.7160203960296545, MSE = 2.0584321537889716


(0.9608627852177316,
 0.28216372024156233,
 0.7160203960296545,
 2.0584321537889716)

In [15]:
# NOTE(review): this whole cell is disabled code kept for reference. It fits
# one RandomForestRegressor (150 trees) per target column on a single seeded
# train_test_split of `df2`. The "R2 Score: ... MSE: ..." lines shown beneath
# this cell match its print format and presumably come from a run made before
# the code was commented out — confirm before relying on them.
# target_columns = ['remainder__SHLT', 'remainder__MSTOT','remainder__COGTOT']
# X = df2.drop(target_columns, axis=1)  # Input features
# y = df2[target_columns]  # Target variables
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# for target in target_columns:
#     model = RandomForestRegressor(n_estimators = 150, n_jobs=-1)
#     model.fit(X_train, y_train[target])

#         # Predict on the testing set
#     y_pred = model.predict(X_test)

#         # Evaluate the model
#     r2 = r2_score(y_test[target], y_pred)
#     mse = mean_squared_error(y_test[target], y_pred)

#         # Print the metrics for this target
#     print(f"R2 Score: {r2} MSE: {mse}")

R2 Score: 0.7045574957808984 MSE: 0.283838751768136
R2 Score: 0.7120359573402713 MSE: 1.030715724141913
R2 Score: 0.7047157638645751 MSE: 5.107293813688981
