In [None]:
# title: "Health & Lifestyle Data Analysis and ML Pipeline"
# output: html_document
# date: "`r Sys.Date()`"
knitr::opts_chunk$set(echo = TRUE, warning = FALSE, message = FALSE)
# 1. Library Imports
# We use the tidyverse for data manipulation and plotting, aws.s3 for loading
# data from AWS S3, and the tidymodels ecosystem (including themis for SMOTE)
# for machine learning.

# Install packages if not already installed
# Bootstrap pacman, then let it install (when missing) and attach every
# package used below. requireNamespace() only tests availability, whereas
# require() attaches the package and returns FALSE instead of erroring.
if (!requireNamespace("pacman", quietly = TRUE)) {
  install.packages("pacman")
}
pacman::p_load(
  tidyverse,    # Data manipulation and visualization
  aws.s3,       # AWS S3 connectivity (replaces boto3)
  tidymodels,   # Machine Learning framework (replaces sklearn)
  themis,       # For SMOTE (replaces imblearn)
  corrplot,     # For correlation heatmaps
  dotenv,       # For environment variables
  ranger        # Engine named by the random forest model specification
)

# Read variables from a .env file into the session. If loading raises an
# error, report it as a message and carry on.
tryCatch(
  load_dot_env(),
  error = function(err) message("No .env file found or error loading it.")
)
# 2. Data Loading
# This section connects to AWS S3 to retrieve the dataset, mirroring the boto3
# logic in the original code.

# Specify bucket and file key
bucket_name <- "dw-health-lifestyle-dataset"
file_key <- "health_lifestyle_dataset.csv"

# Retrieve keys from environment variables.
# Note: In R, use Sys.getenv()
# The lower-case names are read first (presumably the names used in the
# project's .env file -- confirm). When they are unset, fall back to the
# standard upper-case AWS names so conventionally exported credentials are
# picked up as well. Either lookup yields "" when nothing is set.
aws_access_key <- Sys.getenv(
  "aws_access_key_id",
  unset = Sys.getenv("AWS_ACCESS_KEY_ID")
)
aws_secret_key <- Sys.getenv(
  "aws_secret_access_key",
  unset = Sys.getenv("AWS_SECRET_ACCESS_KEY")
)

#' Load the dataset from S3 as a data frame
#'
#' Reads the globals `aws_access_key`, `aws_secret_key`, `bucket_name` and
#' `file_key`. When both keys are non-empty they are exported as the AWS_*
#' environment variables for the session (region "us-east-2"), the object is
#' fetched with get_object() and its bytes are parsed as CSV.
#'
#' @return A data.frame parsed from the CSV object, or `NULL` when either
#'   credential is empty or any step raises an error (the error message is
#'   printed).
load_data_from_s3 <- function() {
  tryCatch({
    # Check credentials before touching the environment: exporting empty
    # strings would overwrite AWS_* values that are already set.
    if (!nzchar(aws_access_key) || !nzchar(aws_secret_key)) {
      print("AWS Credentials not found. Loading dummy data or checking local path.")
      # Fallback or placeholder if keys aren't present in this run
      return(NULL)
    }

    # Set AWS environment variables for the session
    Sys.setenv(
      "AWS_ACCESS_KEY_ID" = aws_access_key,
      "AWS_SECRET_ACCESS_KEY" = aws_secret_key,
      "AWS_DEFAULT_REGION" = "us-east-2"
    )

    print(paste("Successful S3 connection. Loading", file_key, "..."))

    # Read object from S3 and parse its content as CSV text
    obj <- get_object(object = file_key, bucket = bucket_name)
    read.csv(text = rawToChar(obj))
  }, error = function(e) {
    print(paste("Error:", conditionMessage(e)))
    NULL
  })
}

# Run the loader once; `df` is NULL when the data could not be fetched.
df <- load_data_from_s3()

# Preview the first rows on success; otherwise raise a warning. Later
# sections test is.null(df) before using the data.
if (!is.null(df)) {
  print(head(df))
} else {
  warning("Data could not be loaded from S3. Please ensure credentials are set.")
}
# 3. Exploratory Data Analysis (EDA)
# Checking structure, missing values, and statistics.
if (!is.null(df)) {
  # Check structure (equivalent to df.info())
  str(df)

  # Check missing values
  print("Missing Values:")
  print(colSums(is.na(df)))

  # Basic statistics (equivalent to df.describe()).
  # Explicit print(): inside a braced block only the final value can
  # auto-print, so a bare summary(df) here is silently discarded.
  print("Summary Statistics:")
  print(summary(df))

  # Remove duplicates
  initial_rows <- nrow(df)
  df <- df %>% distinct()
  print(paste("Duplicates removed:", initial_rows - nrow(df)))

  # Drop ID column if it exists
  if ("id" %in% names(df)) {
    df <- df %>% select(-id)
  }
}
# Visualization Functions
# Defining R equivalents for the Python plotting functions using ggplot2.

# Function to plot distribution
#' Plot a density-scaled histogram with an overlaid density curve
#'
#' @param data A data frame containing `column`.
#' @param column Name of the column to plot, as a string. It is looked up as
#'   a column name (not parsed as an expression).
#' @return Called for its side effect: the plot is printed.
plot_distribution <- function(data, column) {
  # .data[[column]] replaces the deprecated aes_string(); after_stat(density)
  # replaces the deprecated ..density.. notation.
  p <- ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      fill = "skyblue", color = "black", bins = 30
    ) +
    geom_density(color = "red", linewidth = 1) +
    # The y axis is density-scaled, so it is labelled as such.
    labs(title = paste("Distribution of", column), x = column, y = "Density") +
    theme_minimal()
  print(p)
}

#' Plot a correlation heatmap of the numeric columns
#'
#' @param data A data frame; non-numeric columns are ignored.
#' @return Called for its side effect: the heatmap is drawn with corrplot().
plot_correlation_heatmap <- function(data) {
  # vapply() guarantees a logical vector whatever the input looks like.
  is_num <- vapply(data, is.numeric, logical(1))
  # drop = FALSE keeps a data frame even when only one column is numeric;
  # without it a plain data.frame collapses to a vector and cor() fails.
  numeric_data <- data[, is_num, drop = FALSE]

  # Correlations computed on rows with no missing values
  corr_matrix <- cor(numeric_data, use = "complete.obs")

  corrplot(corr_matrix, method = "color", type = "upper",
           addCoef.col = "black", tl.col = "black", tl.srt = 45,
           title = "Correlation Heatmap", mar = c(0, 0, 1, 0))
}

#' Plot a boxplot of a single column
#'
#' @param data A data frame containing `column`.
#' @param column Name of the column to plot, as a string. It is looked up as
#'   a column name (not parsed as an expression).
#' @return Called for its side effect: the plot is printed.
plot_boxplot <- function(data, column) {
  # .data[[column]] replaces the deprecated aes_string().
  p <- ggplot(data, aes(y = .data[[column]])) +
    geom_boxplot(fill = "lightgreen") +
    labs(title = paste("Boxplot of", column), y = column) +
    theme_minimal()
  print(p)
}

#' Plot the mean of `disease_risk` against another column
#'
#' Uses stat_summary to mimic a seaborn pointplot: a point range from
#' mean_se at each x value, joined by a line through the means.
#'
#' @param data A data frame containing `column` and `disease_risk`.
#' @param column Name of the x-axis column, as a string. It is looked up as
#'   a column name (not parsed as an expression).
#' @return Called for its side effect: the plot is printed.
plot_disease_risk_relation <- function(data, column) {
  # .data[[...]] replaces the deprecated aes_string().
  p <- ggplot(data, aes(x = .data[[column]], y = .data[["disease_risk"]])) +
    stat_summary(fun.data = "mean_se", geom = "pointrange", color = "blue") +
    # group = 1 draws one line across all x values
    stat_summary(fun = "mean", geom = "line", group = 1, color = "blue") +
    labs(title = paste("Relation of", column, "with Disease Risk"),
         x = column, y = "Disease Risk (Mean)") +
    theme_minimal()
  print(p)
}

#' Plot a bar chart of counts per value of a column
#'
#' @param data A data frame containing `column`.
#' @param column Name of the column to count, as a string. It is looked up as
#'   a column name (not parsed as an expression).
#' @return Called for its side effect: the plot is printed.
plot_categorical_count <- function(data, column) {
  # .data[[column]] replaces the deprecated aes_string().
  p <- ggplot(data, aes(x = .data[[column]])) +
    geom_bar(fill = "orange", color = "black") +
    labs(title = paste("Count of", column), x = column, y = "Count") +
    theme_minimal() +
    # Rotate the x labels by 45 degrees
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
  print(p)
}
# Visualizing the Data
if (!is.null(df)) {
  # Univariate views and the correlation overview
  plot_distribution(df, "age")
  plot_correlation_heatmap(df)
  plot_boxplot(df, "age")
  plot_categorical_count(df, "gender")

  # Mean disease risk against each predictor of interest, in this order
  walk(
    c("resting_hr", "smoker", "alcohol", "age"),
    function(predictor) plot_disease_risk_relation(df, predictor)
  )

  # Class balance of the target
  print("Target Variable Counts:")
  print(table(df$disease_risk))
}
# 4. Machine Learning Pipeline
# We will use tidymodels to replicate the sklearn pipeline. This handles
# splitting, preprocessing (scaling/encoding), SMOTE oversampling, and model
# training.
#
# Data Splitting
if (!is.null(df)) {
  # The outcome is converted to a factor for classification
  df[["disease_risk"]] <- as.factor(df[["disease_risk"]])

  # Seeded 80/20 partition, stratified on the outcome
  set.seed(42)
  split <- initial_split(df, prop = 0.8, strata = disease_risk)
  train_data <- training(split)
  test_data <- testing(split)

  print(paste("Training set size:", nrow(train_data)))
  print(paste("Testing set size:", nrow(test_data)))
}
# Feature Engineering & Pipeline Definition
# We define a recipe that performs:
#   - One-Hot Encoding (Dummy variables)
#   - Normalization (StandardScaler)
#   - SMOTE (Oversampling)
if (!is.null(df)) {
  # Start from the training data with disease_risk as the outcome
  ml_recipe <- recipe(disease_risk ~ ., data = train_data)
  # Step order is significant: encode, then normalize, then oversample
  ml_recipe <- step_dummy(ml_recipe, all_nominal_predictors())
  ml_recipe <- step_normalize(ml_recipe, all_numeric_predictors())
  ml_recipe <- step_smote(ml_recipe, disease_risk, seed = 42)
}
# Model Specifications
# Defining Logistic Regression and Random Forest.

# Logistic regression classifier on the "glm" engine
log_spec <- set_mode(set_engine(logistic_reg(), "glm"), "classification")

# Random forest classifier: 100 trees on the "ranger" engine, with impurity
# importance requested from the engine
rf_spec <- set_mode(
  set_engine(rand_forest(trees = 100), "ranger", importance = "impurity"),
  "classification"
)
# Cross Validation and Training
# We perform 5-fold cross-validation on the training set.
if (!is.null(df)) {
  print("Starting Model Training with SMOTE...")

  set.seed(42)
  folds <- vfold_cv(train_data, v = 5, strata = disease_risk)

  # yardstick has no f1(); its F-measure metric is f_meas(), which is the F1
  # score at the default beta = 1.
  # NOTE(review): yardstick scores the first factor level as the event by
  # default -- confirm that is the intended positive class for disease_risk.
  cv_metrics <- metric_set(f_meas, accuracy, recall, roc_auc)

  # 1. Logistic Regression Workflow
  log_wf <- workflow() %>%
    add_recipe(ml_recipe) %>%
    add_model(log_spec)

  log_res <- fit_resamples(
    log_wf,
    resamples = folds,
    metrics = cv_metrics,
    control = control_resamples(save_pred = TRUE)
  )

  print("--- Logistic Regression CV Results ---")
  print(collect_metrics(log_res) %>% select(.metric, mean))

  # 2. Random Forest Workflow
  rf_wf <- workflow() %>%
    add_recipe(ml_recipe) %>%
    add_model(rf_spec)

  rf_res <- fit_resamples(
    rf_wf,
    resamples = folds,
    metrics = cv_metrics
  )

  print("--- Random Forest CV Results ---")
  print(collect_metrics(rf_res) %>% select(.metric, mean))
}
# 5. Final Evaluation on Test Set
# Training on the full training data and predicting on the held-out test set.
if (!is.null(df)) {

  #' Fit a workflow with last_fit() and print its metrics
  #'
  #' Uses the global `split` object for the train/test partition.
  #'
  #' @param workflow A workflow to fit.
  #' @param name Label used in the printed results header.
  #' @return The object returned by last_fit().
  evaluate_model <- function(workflow, name) {
    # yardstick has no f1(); f_meas() is the F1 score at its default beta = 1.
    final_fit <- last_fit(
      workflow,
      split,
      metrics = metric_set(accuracy, recall, f_meas, roc_auc)
    )

    # Extract metrics
    metrics <- collect_metrics(final_fit)

    print(paste("--- Final Test Results:", name, "---"))
    print(metrics)

    final_fit
  }

  log_final <- evaluate_model(log_wf, "Logistic Regression")
  rf_final <- evaluate_model(rf_wf, "Random Forest")

  # Feature Importance for Random Forest (Best Model analysis equivalent)
  # Extract the fitted model object
  fitted_rf <- extract_fit_parsnip(rf_final)

  print("Variable Importance (Random Forest):")
  # vip is optional: requireNamespace() tests for it without attaching it;
  # otherwise fall back to the importance stored on the ranger fit.
  if (requireNamespace("vip", quietly = TRUE)) {
    print(vip::vip(fitted_rf))
  } else {
    print(fitted_rf$fit$variable.importance)
  }

  # Coefficients for Logistic Regression
  fitted_log <- extract_fit_parsnip(log_final)
  print("Coefficients (Logistic Regression):")
  print(tidy(fitted_log))
}
