# R experiments

## Titanic

### Rules:

In [None]:
library(sirus)

In [None]:
# Relative location of the raw Titanic CSV used in the rule-extraction cells.
file_name <- file.path("..", "data", "externals", "titanic.csv")

In [None]:
# Read the Titanic CSV (its first row supplies the column names), then show
# the resulting table as the cell output.
train_data <- read.csv(file = file_name, header = TRUE)
train_data

In [None]:
# Preview the columns at positions 2 and 4 to 8 of the loaded table.
# NOTE(review): positional indices; confirm they match the intended features.
train_data[, c(2, 4:8)]

In [None]:
# Feature table for the rule fit: columns 2 and 4 to 8 of the loaded data.
# NOTE(review): positional indices; confirm they match the intended features.
X_titanic <- train_data[, c(2, 4:8)]

In [None]:
# Response vector for the rule fit. `[[` matches the column name exactly,
# whereas `$` would silently accept a partially matching column name.
y_titanic <- train_data[["Survived"]]

In [None]:
# Display the selected feature table as the cell output.
X_titanic

In [None]:
# Fit the SIRUS classifier on the Titanic features; its rules are printed in
# the next cell. alpha and p0 are left unset here; the commented-out values in
# the earlier version of this call were alpha = 0.05 and p0 = 0.0.
set.seed(8)
library(sirus)
sirus.m <- sirus.fit(
  data = X_titanic,
  y = y_titanic,
  type = "classif",
  q = 10,
  mtry = 6,
  num.trees = 2000,
  num.rule = 25,
  max.depth = 2,
  num.threads = 1,
  replace = TRUE,
  sample.fraction = 1.0,
  verbose = FALSE,
  seed = NULL
)

In [None]:
# Print the rule listing produced by sirus.print() for the fitted model,
# bracketed by two marker strings (presumably so it is easy to find in the
# notebook output).
print("RUUULES")
print(sirus.print(sirus.m))
print("END RUUULES")

### Performance:

In [None]:
# Seed R's random number generator and attach the sirus package before the
# evaluation helpers below are defined.
set.seed(0)
library(sirus)

In [None]:
# Train a SIRUS classification model on features `X` and labels `y`.
#
# The RNG is reseeded with 0 on every call, so repeated calls start from the
# same random state. Returns the object produced by sirus.fit().
#
# NOTE(review): the sirus documentation states that num.rule is ignored when
# p0 is supplied; confirm that passing p0 = 0.0 together with num.rule = 25
# is intended.
model_train <- function(X, y) {
  set.seed(0)
  fitted_model <- sirus.fit(
    data = X,
    y = y,
    type = "classif",
    p0 = 0.0,
    q = 10,
    mtry = 6,
    num.trees = 1000,
    num.rule = 25,
    max.depth = 2,
    num.threads = 1,
    replace = TRUE,
    sample.fraction = 1.0,
    verbose = FALSE,
    seed = NULL
  )
  fitted_model
}

In [None]:
# Predict with a fitted SIRUS model.
#
# `model` is passed as the first argument and `X_test` as the second argument
# of sirus.predict(); whatever that call returns is handed back unchanged.
model_pred <- function(model, X_test) {
  predictions <- sirus.predict(model, X_test)
  predictions
}


In [None]:
# Align the categorical columns of a train/test split and dummy-encode both.
#
# Character columns of `train_x` are converted to factors, and the matching
# columns of `test_x` are re-levelled with the training levels, so both design
# matrices get the same columns in the same order. Rows are never dropped:
# missing values (including test categories that were not seen in training,
# for which a warning is raised) are passed through as NA.
# Returns list(train = <matrix>, test = <matrix>).
.encode_fold_features <- function(train_x, test_x) {
  for (col in names(train_x)) {
    if (is.character(train_x[[col]])) {
      train_x[[col]] <- factor(train_x[[col]])
    }
    if (is.factor(train_x[[col]])) {
      known <- levels(train_x[[col]])
      unseen <- setdiff(as.character(test_x[[col]]), c(known, NA))
      if (length(unseen) > 0) {
        warning("Column '", col, "' has test values unseen in training (",
                paste(unseen, collapse = ", "), "); they are encoded as NA.",
                call. = FALSE)
      }
      test_x[[col]] <- factor(test_x[[col]], levels = known)
    }
  }

  encode <- function(x) {
    # na.pass keeps every row so the encoded matrix stays aligned with y.
    frame <- model.frame(~ . - 1, data = x, na.action = na.pass)
    model.matrix(attr(frame, "terms"), data = frame)
  }

  list(train = encode(train_x), test = encode(test_x))
}

# Evaluate a model with the pre-assigned folds stored in a CSV file.
#
# For every distinct value of `fold_col`, the rows with that value form the
# test set and all other rows the training set. Features are every column
# except `target_col` and `fold_col`; they are dummy-encoded with
# model.matrix(~ . - 1) using the training levels for both parts.
#
# Arguments:
#   data_path         Path of the CSV file to read.
#   model_train       function(X, y) returning a fitted model.
#   model_prediction  function(model, X_test) returning the predictions.
#   is_clf            TRUE: predictions are stored in column "class_1" and the
#                     fold metric is accuracy (numeric predictions are
#                     rounded first). FALSE: column "pred" and RMSE.
#   output_path_preds CSV file that receives the fold label and prediction of
#                     every test row.
#   target_col        Name of the response column.
#   fold_col          Name of the fold-label column (any sortable type).
#   seed              Not used by this function; kept so existing calls that
#                     pass it keep working.
#
# Returns a data frame with one row per fold (columns `fold` and `metric`).
# Progress and the summary are also written to the console.
cross_validate <- function(data_path, model_train, model_prediction,
                           is_clf = TRUE,
                           output_path_preds = "predictions.csv",
                           target_col = "target", fold_col = "fold",
                           seed = 0) {
  df <- read.csv(data_path)

  absent <- setdiff(c(target_col, fold_col), names(df))
  if (length(absent) > 0) {
    stop("Column(s) not found in ", data_path, ": ",
         paste(absent, collapse = ", "), call. = FALSE)
  }

  folds <- sort(unique(df[[fold_col]]))
  results <- data.frame(fold = folds, metric = NA_real_)
  # One slot per fold, filled by position so fold labels need not be
  # zero-based integers.
  all_preds <- vector("list", length(folds))
  feature_cols <- setdiff(names(df), c(target_col, fold_col))

  for (i in seq_along(folds)) {
    f <- folds[[i]]
    cat("\n--- Fold", f, "---\n")

    # which() drops rows whose fold label is NA, as subset() would.
    in_test <- df[[fold_col]] == f
    train_data <- df[which(!in_test), , drop = FALSE]
    test_data <- df[which(in_test), , drop = FALSE]

    y_train <- train_data[[target_col]]
    y_test <- test_data[[target_col]]

    # drop = FALSE keeps a data frame even with a single feature column.
    encoded <- .encode_fold_features(
      train_data[, feature_cols, drop = FALSE],
      test_data[, feature_cols, drop = FALSE]
    )

    model <- model_train(as.data.frame(encoded$train), y_train)
    preds <- model_prediction(model, as.data.frame(encoded$test))

    # Per-row predictions of this fold, tagged with the fold label.
    res_preds <- data.frame(prediction = preds)
    colnames(res_preds) <- if (is_clf) "class_1" else "pred"
    all_preds[[i]] <- cbind(fold = f, res_preds)

    # The metric follows is_clf so that every fold reports the same quantity.
    if (is_clf) {
      preds_class <- if (is.numeric(preds)) round(preds) else preds
      acc <- mean(preds_class == y_test)
      results$metric[i] <- acc
      cat("Accuracy:", round(acc, 4), "\n")
    } else {
      rmse <- sqrt(mean((preds - y_test)^2))
      results$metric[i] <- rmse
      cat("RMSE:", round(rmse, 4), "\n")
    }
  }

  pred_df <- as.data.frame(do.call(rbind, all_preds))
  write.csv(pred_df, file = output_path_preds, row.names = FALSE)

  cat("\n--- Overall ---\n")
  print(results)
  cat("Mean metric:", mean(results$metric), "\n")

  results
}


In [None]:
# Fold-wise evaluation of the classification model on the Titanic folds file;
# the per-row predictions are written to the given CSV path.
results <- cross_validate(
  data_path = "../reproduce-exp/titanic-folds.csv",
  model_train = model_train,
  model_prediction = model_pred,
  is_clf = TRUE,
  output_path_preds = "../reproduce-exp/r-predictions-titanic.csv",
  target_col = "target",
  fold_col = "fold"
)

## House Sales:

### Rules:

In [None]:
# Folds file for the house-sales data, also used for the rule extraction below.
file_name <- "../reproduce-exp/house_sales-folds.csv"

In [None]:
# Read the house-sales CSV (its first row supplies the column names), then
# show the resulting table as the cell output.
train_data <- read.csv(file = file_name, header = TRUE)
train_data

In [None]:
# Feature table: the first 15 columns of the loaded file, displayed afterwards.
# NOTE(review): this file is also passed to cross_validate with "target" and
# "fold" columns; confirm neither of them sits in the first 15 positions.
X_house <- train_data[, 1:15]
X_house

In [None]:
# Response vector for the rule fit. `[[` matches the column name exactly,
# whereas `$` would silently accept a partially matching column name.
y_house <- train_data[["target"]]

In [None]:
# Fit the SIRUS regression model on the house-sales features; its rules are
# printed in the next cell. No seed is set in this cell, so the fit depends on
# the RNG state left by whatever ran before it.
# NOTE(review): the sirus documentation states that num.rule is ignored when
# p0 is supplied; confirm that passing p0 = 0.0 together with num.rule = 25
# is intended.
sirus.m <- sirus.fit(
  data = X_house,
  y = y_house,
  type = "reg",
  p0 = 0.0,
  q = 10,
  mtry = 15,
  num.trees = 2000,
  num.rule = 25,
  max.depth = 2,
  num.threads = 1,
  replace = TRUE,
  sample.fraction = 1.0,
  verbose = FALSE,
  seed = NULL
)

In [None]:
# Print the rule listing produced by sirus.print() for the fitted model,
# bracketed by two marker strings (presumably so it is easy to find in the
# notebook output).
print("RUUULES")
print(sirus.print(sirus.m))
print("END RUUULES")

### Performance:

In [None]:
# Train a SIRUS regression model on features `X` and response `y`.
#
# The RNG is reseeded with 0 on every call, so repeated calls start from the
# same random state. Returns the object produced by sirus.fit().
#
# NOTE(review): the sirus documentation states that num.rule is ignored when
# p0 is supplied; confirm that passing p0 = 0.0 together with num.rule = 25
# is intended.
model_train_reg <- function(X, y) {
  set.seed(0)
  fitted_model <- sirus.fit(
    data = X,
    y = y,
    type = "reg",
    p0 = 0.0,
    q = 10,
    mtry = 15,
    num.trees = 1000,
    num.rule = 25,
    max.depth = 2,
    num.threads = 1,
    replace = TRUE,
    sample.fraction = 1.0,
    verbose = FALSE,
    seed = NULL
  )
  fitted_model
}

In [None]:
# Fold-wise evaluation of the regression model on the house-sales folds file;
# the per-row predictions are written to the given CSV path.
results <- cross_validate(
  data_path = "../reproduce-exp/house_sales-folds.csv",
  model_train = model_train_reg,
  model_prediction = model_pred,
  is_clf = FALSE,
  output_path_preds = "../reproduce-exp/r-predictions-house_sales.csv",
  target_col = "target",
  fold_col = "fold"
)