<a href="https://colab.research.google.com/github/c-walls/Essay-Score-Predictions/blob/main/RandomForest_w_CV.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:

# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES
# TO THE CORRECT LOCATION (/kaggle/input) IN YOUR NOTEBOOK,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S R
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.

# Comma-separated list of '<directory>:<url-encoded download URL>' entries; the
# loop below splits each entry on ':' and URL-decodes the second part.
# NOTE(review): the signed URL embeds X-Goog-Date=20240327T170124Z and
# X-Goog-Expires=259200, so it has presumably expired -- regenerate before use.
DATA_SOURCE_MAPPING = 'linking-writing-processes-to-writing-quality:https%3A%2F%2Fstorage.googleapis.com%2Fkaggle-competitions-data%2Fkaggle-v2%2F59291%2F6678907%2Fbundle%2Farchive.zip%3FX-Goog-Algorithm%3DGOOG4-RSA-SHA256%26X-Goog-Credential%3Dgcp-kaggle-com%2540kaggle-161607.iam.gserviceaccount.com%252F20240327%252Fauto%252Fstorage%252Fgoog4_request%26X-Goog-Date%3D20240327T170124Z%26X-Goog-Expires%3D259200%26X-Goog-SignedHeaders%3Dhost%26X-Goog-Signature%3D7c049acaa9e334a59332a46b1f01e7bcf6c51dcb6469585e3be311976153159af67a3306ee5ddac3c498ed5ee01e9294575cd53fbdaaa2075addab89c5a653c10f35ec963f6d9d36894814cc8f5efa8099fdca407f900b8f48a9a78a91e860ed1c6447f0b248ba8dff5c12409eedcffa6cdf2e3a97134c62226f1812a134801939b747d341952931e85b1fc478a516688e3d0ae9e04dca812525f746353ae8cffd2976b50b0228e1ef75d9943440c14361913193614022856130aea59871c740c21efe915e21f0e187661c7312864f1ce0a1f891f028504d0e993b68a68e615ca5974b06d2520d4dfe828562125426424615f8b5bfee5902023f622347ed9fb2'

# Directories that mirror Kaggle's layout inside this notebook environment.
KAGGLE_INPUT_PATH = '/kaggle/input'
KAGGLE_WORKING_PATH = '/kaggle/working'

# Unmount and remove any existing /kaggle/input, then recreate it
# world-writable and symlink it as ../input. The exit status of the first two
# calls is not checked.
system(paste0('sudo umount ', '/kaggle/input'))
system(paste0('sudo rmdir ', '/kaggle/input'))
system(paste0('sudo mkdir -p -- ', KAGGLE_INPUT_PATH), intern=TRUE)
system(paste0('sudo chmod 777 ', KAGGLE_INPUT_PATH), intern=TRUE)
system(
  paste0('sudo ln -sfn ', KAGGLE_INPUT_PATH,' ',file.path('..', 'input')),
  intern=TRUE)

# Same treatment for the working directory: create, open permissions, and
# symlink it as ../working.
system(paste0('sudo mkdir -p -- ', KAGGLE_WORKING_PATH), intern=TRUE)
system(paste0('sudo chmod 777 ', KAGGLE_WORKING_PATH), intern=TRUE)
system(
  paste0('sudo ln -sfn ', KAGGLE_WORKING_PATH, ' ', file.path('..', 'working')),
  intern=TRUE)

# Download every mapped data source and unpack it under
# KAGGLE_INPUT_PATH/<directory>.
data_source_mappings = strsplit(DATA_SOURCE_MAPPING, ',')[[1]]
for (data_source_mapping in data_source_mappings) {
    # Entry layout: '<directory>:<encoded url>'. The URL is still encoded here,
    # so splitting on ':' does not cut through the URL scheme.
    path_and_url = strsplit(data_source_mapping, ':')
    directory = path_and_url[[1]][1]
    download_url = URLdecode(path_and_url[[1]][2])
    # Drop the query string so the archive type can be read from the URL suffix.
    filename = sub("\\?.+", "", download_url)
    destination_path = file.path(KAGGLE_INPUT_PATH, directory)
    print(paste0('Downloading and uncompressing: ', directory))
    # Archives whose name ends in '.zip' are unzipped; anything else is
    # handed to untar().
    if (endsWith(filename, '.zip')){
      temp = tempfile(fileext = '.zip')
      download.file(download_url, temp)
      unzip(temp, overwrite = TRUE, exdir = destination_path)
      unlink(temp)
    }
    else{
      temp = tempfile(fileext = '.tar')
      download.file(download_url, temp)
      untar(temp, exdir = destination_path)
      unlink(temp)
    }
    print(paste0('Downloaded and uncompressed: ', directory))
}

print(paste0('Data source import complete'))


In [1]:
library(moments)
library(ggplot2)
library(caTools)
library(randomForest)
library(readr)
library(caret)

# Load the competition CSVs from the Kaggle input directory into data frames.
train_scores <- read.csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_scores.csv")
train_logs <- read.csv("/kaggle/input/linking-writing-processes-to-writing-quality/train_logs.csv")
sample_submission <- read.csv("/kaggle/input/linking-writing-processes-to-writing-quality/sample_submission.csv")
test_logs <- read.csv("/kaggle/input/linking-writing-processes-to-writing-quality/test_logs.csv")
# Seed the test feature table with one row per distinct id found in test_logs;
# the feature functions later add their columns to it.
test_scores <- data.frame(id = unique(test_logs$id))

ERROR: Error in library(moments): there is no package called ‘moments’


# DATA CLEANING

In [None]:
#Combine Event Columns
# Drop up_event and keep down_event under the shorter name "event".
train_logs <- train_logs[, names(train_logs) != "up_event"]
names(train_logs)[names(train_logs) == "down_event"] <- "event"


#Simplify Move Activities
# A "NoChange" Leftclick that is immediately followed by a row whose activity
# starts with "M" is treated as the highlight step of that move: the first
# bracketed range of the move text is copied into the click's text_change and
# the move row's activity is collapsed to "Move".
# This is done on adjacent row pairs (k, k + 1) in one vectorised pass. Each
# pair only reads values no other pair writes, so the result matches a
# row-by-row scan, but it never looks past the last row (which made the scalar
# `if` see NA) and avoids millions of single-cell data-frame assignments.
n_rows <- nrow(train_logs)
if (n_rows > 1) {
  highlight_rows <- which(
    substr(train_logs$activity[-1], 1, 1) == "M" &
      train_logs$event[-n_rows] == "Leftclick" &
      train_logs$text_change[-n_rows] == "NoChange"
  )
  if (length(highlight_rows) > 0) {
    moved_range <- sub(".*?(\\[.*?\\]).*", "\\1", train_logs$activity[highlight_rows + 1])
    train_logs$text_change[highlight_rows] <- paste0("Highlighted ", moved_range)
    train_logs$activity[highlight_rows + 1] <- "Move"
  }
}

#Remove extra movement info from 'undo' rows & convert to factor
# Index the column directly so that zero matching rows is a no-op instead of
# a "replacement has 1 row, data has 0" error.
undo_moves <- which(substr(train_logs$activity, 1, 1) == "M" & train_logs$event == "z")
train_logs$activity[undo_moves] <- "Move"
train_logs$activity <- as.factor(train_logs$activity)
str(train_logs$activity)


#Simplify Event Values
# Collapse families of key names into a single label each. The order of these
# substitutions is significant because later patterns see earlier results.
train_logs$event <- gsub("(?i)(F[0-9]{1,2}|.*Media.*|.*Audio.*|.*Pause.*).*", "DeviceFeatureChangeKey", train_logs$event)
train_logs$event <- gsub("(?i)(Scroll.*|Meta|Dead|Process|OS|AltGraph|Mode.*)", "SpecialProcessKey", train_logs$event)
train_logs$event <- gsub("(?i)(Clear|Cancel)", "Escape", train_logs$event)
train_logs$event <- gsub("^[0-9]$", "DigitKey", train_logs$event)
train_logs$event <- gsub("(?i).*Arrow.*", "ArrowKey", train_logs$event)
train_logs$event <- gsub("(?i)(Home|End|^Page.*)", "TextJumpKey", train_logs$event)

#Compile list of allowed event values
# An event is valid when it matches at least one of these patterns.
event_labels <- c("^[A-Za-z]$", "^[[:punct:]]$", "click", "Shift", "Space", "Backspace", "Enter", "Tab", "Caps", "Control", "Delete", "Insert", "Escape", "Print", "RareKey", "NumLock", "Alt", "ContextMenu", "ArrowKey", "TextJumpKey", "DeviceFeatureChangeKey", "SpecialProcessKey", "DigitKey", "Unidentified")
# OR the per-pattern matches together with Reduce(); unlike sapply() + apply()
# this also works when there are fewer than two distinct events.
unique_events <- unique(train_logs$event)
valid_events <- unique_events[Reduce(`|`, lapply(event_labels, function(x) grepl(x, unique_events)))]

#Replace remaining event values with "Unidentified"
cat("\n", "\n", "INVALID VALUES: ")
print(table(train_logs$event, exclude = valid_events))
train_logs$event <- ifelse(train_logs$event %in% valid_events, train_logs$event, "Unidentified")

# Events that occur fewer than 25 times are pooled into "RareKey".
cat("\n", "RARE VALUES: ")
rare_events <- names(which(table(train_logs$event) < 25))
print(rare_events)
train_logs$event[train_logs$event %in% rare_events] <- "RareKey"

cat("\n", "REMAINING VALUES: ")
print(unique(train_logs$event))
cat("Total 'Event' Values: ", length(unique(train_logs$event)))

#Convert event variable to factor and check
train_logs$event <- as.factor(train_logs$event)
str(train_logs$event)

# FEATURE ENGINEERING

In [None]:
#' Attach each essay's event total to the score table.
#'
#' @param logs Data frame with `id` and `event_id` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id maximum `event_id`, left-joined with `scores` on `id`
#'   (rows come from the ids present in `logs`), with the aggregate column
#'   named `total_events`.
total_events <- function(logs, scores) {
  per_essay <- aggregate(event_id ~ id, data = logs, FUN = max)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "event_id"] <- "total_events"
  names(joined) <- column_names
  joined
}

#' Attach the number of distinct event values per essay to the score table.
#'
#' @param logs Data frame with `id` and `event` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id count of unique `event` values, left-joined with
#'   `scores` on `id`, with the count column named `event_diversity`.
event_diversity <- function(logs, scores) {
  count_distinct <- function(values) {
    length(unique(values))
  }
  per_essay <- aggregate(event ~ id, data = logs, FUN = count_distinct)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "event"] <- "event_diversity"
  names(joined) <- column_names
  joined
}

#' Add distinct-event count divided by total events as a new column.
#'
#' @param scores Data frame with `event_diversity` and `total_events` columns.
#' @return `scores` with an added `normalized_event_diversity` column.
normalized_event_diversity <- function(scores) {
  diversity_share <- scores$event_diversity / scores$total_events
  scores$normalized_event_diversity <- diversity_share
  scores
}

#' Attach the last logged word count of each essay to the score table.
#'
#' "Last" means the final `word_count` value in row order within each id.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id last `word_count`, left-joined with `scores` on `id`,
#'   with the aggregate column named `submitted_words`.
submitted_words <- function(logs, scores) {
  last_value <- function(values) {
    tail(values, 1)
  }
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = last_value)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "submitted_words"
  names(joined) <- column_names
  joined
}

#' Attach the largest word count reached in each essay to the score table.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id maximum `word_count`, left-joined with `scores` on
#'   `id`, with the aggregate column named `max_words`.
max_words <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = max)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "max_words"
  names(joined) <- column_names
  joined
}

#' Add the gap between the peak and the final word count as a new column.
#'
#' @param scores Data frame with `max_words` and `submitted_words` columns.
#' @return `scores` with an added `word_reduction` column
#'   (`max_words - submitted_words`).
word_reduction <- function(scores) {
  words_removed <- scores$max_words - scores$submitted_words
  scores$word_reduction <- words_removed
  scores
}

#' Attach the mean logged word count of each essay to the score table.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id mean of `word_count`, left-joined with `scores` on
#'   `id`, with the aggregate column named `word_count_mean`.
word_count_mean <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = mean)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "word_count_mean"
  names(joined) <- column_names
  joined
}

#' Attach the standard deviation of logged word counts per essay.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `sd()` of `word_count`, left-joined with `scores` on
#'   `id`, with the aggregate column named `word_count_sd`.
word_count_sd <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = sd)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "word_count_sd"
  names(joined) <- column_names
  joined
}

#' Attach the skewness of logged word counts per essay.
#'
#' Relies on a `skewness()` function being in scope; the notebook attaches
#' the moments package at the top.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `skewness()` of `word_count`, left-joined with `scores`
#'   on `id`, with the aggregate column named `word_count_skew`.
word_count_skew <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = skewness)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "word_count_skew"
  names(joined) <- column_names
  joined
}

#' Attach the kurtosis of logged word counts per essay.
#'
#' Relies on a `kurtosis()` function being in scope; the notebook attaches
#' the moments package at the top.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `kurtosis()` of `word_count`, left-joined with `scores`
#'   on `id`, with the aggregate column named `word_count_kurt`.
word_count_kurt <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = kurtosis)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "word_count_kurt"
  names(joined) <- column_names
  joined
}

#' Attach the median logged word count of each essay to the score table.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id median of `word_count`, left-joined with `scores` on
#'   `id`, with the aggregate column named `word_count_median`.
word_count_median <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = median)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "word_count_median"
  names(joined) <- column_names
  joined
}

#' Attach the interquartile range of logged word counts per essay.
#'
#' @param logs Data frame with `id` and `word_count` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `IQR()` of `word_count`, left-joined with `scores` on
#'   `id`, with the aggregate column named `word_count_IQR`.
word_count_IQR <- function(logs, scores) {
  per_essay <- aggregate(word_count ~ id, data = logs, FUN = IQR)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "word_count"] <- "word_count_IQR"
  names(joined) <- column_names
  joined
}

#' Attach the latest `up_time` of each essay to the score table.
#'
#' @param logs Data frame with `id` and `up_time` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id maximum `up_time`, left-joined with `scores` on `id`,
#'   with the aggregate column named `submission_time`.
submission_time <- function(logs, scores) {
  per_essay <- aggregate(up_time ~ id, data = logs, FUN = max)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "up_time"] <- "submission_time"
  names(joined) <- column_names
  joined
}

#' Attach the time of each essay's first "Input" activity to the score table.
#'
#' Only rows whose `activity` equals "Input" are considered. Ids in `scores`
#' that have no such rows are kept with `first_input = NA`; a one-sided join
#' on the aggregate would silently drop those essays from every later feature
#' and from the final predictions.
#'
#' @param logs Data frame with `id`, `activity` and `down_time` columns.
#' @param scores Data frame keyed by `id`.
#' @return Data frame with columns `id`, `first_input`, then the remaining
#'   columns of `scores`; one row per id found in either input.
first_input <- function(logs, scores) {
  # which() drops NA comparisons instead of producing all-NA rows.
  input_rows <- logs[which(logs$activity == "Input"), , drop = FALSE]
  if (nrow(input_rows) > 0) {
    result <- aggregate(down_time ~ id, data = input_rows, FUN = min)
  } else {
    # aggregate() errors on zero rows, so build an empty table of the same
    # shape and let the join fill first_input with NA.
    result <- data.frame(id = scores$id[0], down_time = numeric(0))
  }
  # Full join: keeps ids from the logs (as before) and also ids that exist
  # only in scores.
  result <- merge(result, scores, by = "id", all = TRUE)
  names(result)[names(result) == "down_time"] <- "first_input"
  result
}

#' Add the span between first input and submission as a new column.
#'
#' @param scores Data frame with `submission_time` and `first_input` columns.
#' @return `scores` with an added `writing_time` column
#'   (`submission_time - first_input`).
writing_time <- function(scores) {
  elapsed <- scores$submission_time - scores$first_input
  scores$writing_time <- elapsed
  scores
}

#' Add submitted words per unit of writing time as a new column.
#'
#' @param scores Data frame with `submitted_words` and `writing_time` columns.
#' @return `scores` with an added `submit_word_rate` column
#'   (`submitted_words / writing_time`).
submit_word_rate <- function(scores) {
  words_per_time <- scores$submitted_words / scores$writing_time
  scores$submit_word_rate <- words_per_time
  scores
}

#' Add peak word count per unit of writing time as a new column.
#'
#' @param scores Data frame with `max_words` and `writing_time` columns.
#' @return `scores` with an added `max_word_rate` column
#'   (`max_words / writing_time`).
max_word_rate <- function(scores) {
  peak_words_per_time <- scores$max_words / scores$writing_time
  scores$max_word_rate <- peak_words_per_time
  scores
}

#' Add a cursor position scaled by the essay's submitted word count to the logs.
#'
#' Unlike the other feature helpers this returns the *logs*, not the scores.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame with `id` and `submitted_words` columns.
#' @return The `id`/`submitted_words` columns of `scores` left-joined with
#'   `logs` on `id`, plus a `normalized_cursor_position` column
#'   (`cursor_position / submitted_words`).
normalized_cursor_position <- function(logs, scores) {
  word_totals <- scores[, c("id", "submitted_words")]
  logs <- merge(word_totals, logs, by = "id", all.x = TRUE)
  scaled_position <- logs$cursor_position / logs$submitted_words
  logs$normalized_cursor_position <- scaled_position
  logs
}

#' Attach the mean cursor position of each essay to the score table.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id mean of `cursor_position`, left-joined with `scores`
#'   on `id`, with the aggregate column named `cursor_position_mean`.
cursor_position_mean <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = mean)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_mean"
  names(joined) <- column_names
  joined
}

#' Attach the standard deviation of cursor positions per essay.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `sd()` of `cursor_position`, left-joined with `scores`
#'   on `id`, with the aggregate column named `cursor_position_sd`.
cursor_position_sd <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = sd)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_sd"
  names(joined) <- column_names
  joined
}

#' Attach the skewness of cursor positions per essay.
#'
#' Relies on a `skewness()` function being in scope; the notebook attaches
#' the moments package at the top.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `skewness()` of `cursor_position`, left-joined with
#'   `scores` on `id`, with the aggregate column named `cursor_position_skew`.
cursor_position_skew <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = skewness)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_skew"
  names(joined) <- column_names
  joined
}

#' Attach the kurtosis of cursor positions per essay.
#'
#' Relies on a `kurtosis()` function being in scope; the notebook attaches
#' the moments package at the top.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `kurtosis()` of `cursor_position`, left-joined with
#'   `scores` on `id`, with the aggregate column named `cursor_position_kurt`.
cursor_position_kurt <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = kurtosis)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_kurt"
  names(joined) <- column_names
  joined
}

#' Attach the median cursor position of each essay to the score table.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id median of `cursor_position`, left-joined with `scores`
#'   on `id`, with the aggregate column named `cursor_position_median`.
cursor_position_median <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = median)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_median"
  names(joined) <- column_names
  joined
}

#' Attach the interquartile range of cursor positions per essay.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id `IQR()` of `cursor_position`, left-joined with `scores`
#'   on `id`, with the aggregate column named `cursor_position_IQR`.
cursor_position_IQR <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = IQR)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_IQR"
  names(joined) <- column_names
  joined
}

#' Attach the largest cursor position reached in each essay.
#'
#' @param logs Data frame with `id` and `cursor_position` columns.
#' @param scores Data frame keyed by `id`.
#' @return The per-id maximum `cursor_position`, left-joined with `scores`
#'   on `id`, with the aggregate column named `cursor_position_max`.
cursor_position_max <- function(logs, scores) {
  per_essay <- aggregate(cursor_position ~ id, data = logs, FUN = max)
  joined <- merge(per_essay, scores, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "cursor_position"] <- "cursor_position_max"
  names(joined) <- column_names
  joined
}

# NOTE: this re-declares word_reduction with the same computation as the
# definition earlier in this cell; being the later one, it is the definition
# in effect once the cell has run.
#' Add the gap between the peak and the final word count as a new column.
#'
#' @param scores Data frame with `max_words` and `submitted_words` columns.
#' @return `scores` with an added `word_reduction` column
#'   (`max_words - submitted_words`).
word_reduction <- function(scores) {
  peak_words <- scores$max_words
  final_words <- scores$submitted_words
  scores$word_reduction <- peak_words - final_words
  scores
}

#' Attach the number of "Input" activity rows per essay to the score table.
#'
#' @param logs Data frame with `id` and `activity` columns.
#' @param scores Data frame keyed by `id`.
#' @return `scores` left-joined with the per-id count of rows whose
#'   `activity` equals "Input", in a column named `input_count`.
input_count <- function(logs, scores) {
  count_matches <- function(activities) {
    sum(activities == "Input")
  }
  per_essay <- aggregate(activity ~ id, data = logs, FUN = count_matches)
  joined <- merge(scores, per_essay, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "activity"] <- "input_count"
  names(joined) <- column_names
  joined
}

#' Attach the number of "Remove/Cut" activity rows per essay.
#'
#' @param logs Data frame with `id` and `activity` columns.
#' @param scores Data frame keyed by `id`.
#' @return `scores` left-joined with the per-id count of rows whose
#'   `activity` equals "Remove/Cut", in a column named `cut_count`.
cut_count <- function(logs, scores) {
  count_matches <- function(activities) {
    sum(activities == "Remove/Cut")
  }
  per_essay <- aggregate(activity ~ id, data = logs, FUN = count_matches)
  joined <- merge(scores, per_essay, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "activity"] <- "cut_count"
  names(joined) <- column_names
  joined
}

#' Attach the number of "Paste" activity rows per essay.
#'
#' @param logs Data frame with `id` and `activity` columns.
#' @param scores Data frame keyed by `id`.
#' @return `scores` left-joined with the per-id count of rows whose
#'   `activity` equals "Paste", in a column named `paste_count`.
paste_count <- function(logs, scores) {
  count_matches <- function(activities) {
    sum(activities == "Paste")
  }
  per_essay <- aggregate(activity ~ id, data = logs, FUN = count_matches)
  joined <- merge(scores, per_essay, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "activity"] <- "paste_count"
  names(joined) <- column_names
  joined
}

#' Attach the number of "Move" activity rows per essay.
#'
#' Only rows whose `activity` is exactly "Move" are counted.
#'
#' @param logs Data frame with `id` and `activity` columns.
#' @param scores Data frame keyed by `id`.
#' @return `scores` left-joined with the per-id count of rows whose
#'   `activity` equals "Move", in a column named `move_count`.
move_count <- function(logs, scores) {
  count_matches <- function(activities) {
    sum(activities == "Move")
  }
  per_essay <- aggregate(activity ~ id, data = logs, FUN = count_matches)
  joined <- merge(scores, per_essay, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "activity"] <- "move_count"
  names(joined) <- column_names
  joined
}

#' Attach the number of "Replace" activity rows per essay.
#'
#' @param logs Data frame with `id` and `activity` columns.
#' @param scores Data frame keyed by `id`.
#' @return `scores` left-joined with the per-id count of rows whose
#'   `activity` equals "Replace", in a column named `replace_count`.
replace_count <- function(logs, scores) {
  count_matches <- function(activities) {
    sum(activities == "Replace")
  }
  per_essay <- aggregate(activity ~ id, data = logs, FUN = count_matches)
  joined <- merge(scores, per_essay, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "activity"] <- "replace_count"
  names(joined) <- column_names
  joined
}

#' Attach the number of "Nonproduction" activity rows per essay.
#'
#' @param logs Data frame with `id` and `activity` columns.
#' @param scores Data frame keyed by `id`.
#' @return `scores` left-joined with the per-id count of rows whose
#'   `activity` equals "Nonproduction", in a column named
#'   `nonproduction_count`.
nonproduction_count <- function(logs, scores) {
  count_matches <- function(activities) {
    sum(activities == "Nonproduction")
  }
  per_essay <- aggregate(activity ~ id, data = logs, FUN = count_matches)
  joined <- merge(scores, per_essay, by = "id", all.x = TRUE)
  column_names <- names(joined)
  column_names[column_names == "activity"] <- "nonproduction_count"
  names(joined) <- column_names
  joined
}

#' Add the share of events that are inputs as a new column.
#'
#' @param scores Data frame with `input_count` and `total_events` columns.
#' @return `scores` with an added `input_frequency` column
#'   (`input_count / total_events`).
input_frequency <- function(scores) {
  input_share <- scores$input_count / scores$total_events
  scores$input_frequency <- input_share
  scores
}

#' Add the share of events that are nonproduction as a new column.
#'
#' @param scores Data frame with `nonproduction_count` and `total_events`
#'   columns.
#' @return `scores` with an added `nonproduction_frequency` column
#'   (`nonproduction_count / total_events`).
nonproduction_frequency <- function(scores) {
  nonproduction_share <- scores$nonproduction_count / scores$total_events
  scores$nonproduction_frequency <- nonproduction_share
  scores
}

#' Add the share of events that are cuts as a new column.
#'
#' @param scores Data frame with `cut_count` and `total_events` columns.
#' @return `scores` with an added `cut_frequency` column
#'   (`cut_count / total_events`).
cut_frequency <- function(scores) {
  cut_share <- scores$cut_count / scores$total_events
  scores$cut_frequency <- cut_share
  scores
}

#' Add the ratio of cuts to inputs as a new column.
#'
#' @param scores Data frame with `cut_count` and `input_count` columns.
#' @return `scores` with an added `cut_input_ratio` column
#'   (`cut_count / input_count`).
cut_input_ratio <- function(scores) {
  cuts_per_input <- scores$cut_count / scores$input_count
  scores$cut_input_ratio <- cuts_per_input
  scores
}

#' Add inputs per unit of writing time as a new column.
#'
#' @param scores Data frame with `input_count` and `writing_time` columns.
#' @return `scores` with an added `input_rate` column
#'   (`input_count / writing_time`).
input_rate <- function(scores) {
  inputs_per_time <- scores$input_count / scores$writing_time
  scores$input_rate <- inputs_per_time
  scores
}

#' Add inputs per submitted word as a new column.
#'
#' @param scores Data frame with `input_count` and `submitted_words` columns.
#' @return `scores` with an added `input_productivity` column
#'   (`input_count / submitted_words`).
input_productivity <- function(scores) {
  inputs_per_word <- scores$input_count / scores$submitted_words
  scores$input_productivity <- inputs_per_word
  scores
}

In [None]:
## Feature Calls ##
# Build the training feature table by threading train_scores through each
# feature helper. Order matters: derived features (ratios, differences) must
# come after the columns they read.

# Event volume and variety.
train_scores <- total_events(train_logs, train_scores)
train_scores <- event_diversity(train_logs, train_scores)
train_scores <- normalized_event_diversity(train_scores)
# Word-count summaries.
train_scores <- submitted_words(train_logs, train_scores)
train_scores <- max_words(train_logs, train_scores)
train_scores <- word_reduction(train_scores)
train_scores <- word_count_mean(train_logs, train_scores)
train_scores <- word_count_sd(train_logs, train_scores)
train_scores <- word_count_skew(train_logs, train_scores)
train_scores <- word_count_kurt(train_logs, train_scores)
train_scores <- word_count_median(train_logs, train_scores)
train_scores <- word_count_IQR(train_logs, train_scores)
# Timing and word rates.
train_scores <- submission_time(train_logs, train_scores)
train_scores <- first_input(train_logs, train_scores)
train_scores <- writing_time(train_scores)
train_scores <- submit_word_rate(train_scores)
train_scores <- max_word_rate(train_scores)
# This call replaces train_logs (not train_scores): the logs gain
# submitted_words and normalized_cursor_position columns.
train_logs <- normalized_cursor_position(train_logs, train_scores)
# Cursor-position summaries (computed on the raw cursor_position column).
train_scores <- cursor_position_mean(train_logs, train_scores)
train_scores <- cursor_position_sd(train_logs, train_scores)
train_scores <- cursor_position_skew(train_logs, train_scores)
train_scores <- cursor_position_kurt(train_logs, train_scores)
train_scores <- cursor_position_median(train_logs, train_scores)
train_scores <- cursor_position_IQR(train_logs, train_scores)
train_scores <- cursor_position_max(train_logs, train_scores)
# Activity counts and their shares of all events.
train_scores <- input_count(train_logs, train_scores)
train_scores <- cut_count(train_logs, train_scores)
train_scores <- paste_count(train_logs, train_scores)
train_scores <- move_count(train_logs, train_scores)
train_scores <- replace_count(train_logs, train_scores)
train_scores <- nonproduction_count(train_logs, train_scores)
train_scores <- input_frequency(train_scores)
train_scores <- nonproduction_frequency(train_scores)
train_scores <- cut_frequency(train_scores)
# NOTE(review): cut_input_ratio, input_rate and input_productivity are defined
# above but not called here -- confirm that is intentional.


# Show the resulting column names of both tables.
cat("\n", "LOG VARIABLES:", "\n")
print(names(train_logs))
cat("\n", "\n", "SCORE VARIABLES:", "\n")
print(names(train_scores))

# PREDICTIVE MODELING

In [None]:
# Create a training control object for cross-validation
set.seed(123)
ctrl <- trainControl(method = "cv", number = 5)

# Build the predictor table. Besides the target, the essay `id` is excluded:
# it is an identifier, not a feature, and passing it in lets the forest split
# on an arbitrary encoding of the id strings.
predictors <- train_scores[, !(names(train_scores) %in% c("id", "score")), drop = FALSE]

# Replace NA/NaN/Inf produced by the ratio and moment features with 0, the
# same treatment the test features receive before prediction; randomForest
# rejects non-finite predictor values.
predictors[] <- lapply(predictors, function(column) {
  if (is.numeric(column)) {
    column[!is.finite(column)] <- 0
  }
  column
})

# Specify the parameter grid for tuning
# mtry cannot exceed the number of predictors, so cap the candidates and drop
# any duplicates the cap creates.
grid <- expand.grid(.mtry = unique(pmin(c(2, 5, 10, 15, 20, 25, 30, 35), ncol(predictors))))

# Train the model using cross-validation
# `score` is numeric at this point, so caret fits a regression forest.
set.seed(1325)
classifier_RF <- train(
  x = predictors,
  y = train_scores$score,
  method = "rf",
  trControl = ctrl,
  tuneGrid = grid
)

# View the tuned parameters
print(classifier_RF)

# View Feature Importance
feature_imp <- varImp(classifier_RF)
dotPlot(feature_imp)

# PREPPING THE TEST SET

In [None]:
#Combine Event Columns
# Drop up_event and keep down_event under the shorter name "event".
test_logs <- test_logs[, names(test_logs) != "up_event"]
names(test_logs)[names(test_logs) == "down_event"] <- "event"

#Simplify Move Activities
# A "NoChange" Leftclick that is immediately followed by a row whose activity
# starts with "M" is treated as the highlight step of that move: the first
# bracketed range of the move text is copied into the click's text_change and
# the move row's activity is collapsed to "Move".
# This is done on adjacent row pairs (k, k + 1) in one vectorised pass. Each
# pair only reads values no other pair writes, so the result matches a
# row-by-row scan, but it never looks past the last row (which made the scalar
# `if` see NA).
n_test_rows <- nrow(test_logs)
if (n_test_rows > 1) {
  highlight_rows <- which(
    substr(test_logs$activity[-1], 1, 1) == "M" &
      test_logs$event[-n_test_rows] == "Leftclick" &
      test_logs$text_change[-n_test_rows] == "NoChange"
  )
  if (length(highlight_rows) > 0) {
    moved_range <- sub(".*?(\\[.*?\\]).*", "\\1", test_logs$activity[highlight_rows + 1])
    test_logs$text_change[highlight_rows] <- paste0("Highlighted ", moved_range)
    test_logs$activity[highlight_rows + 1] <- "Move"
  }
}

#Remove extra movement info from 'undo' rows
# Assign through a row index: this is a no-op when nothing matches, and it
# avoids feeding whole vectors to `&&` inside `if`, which is an error on
# R >= 4.3.
undo_moves <- which(substr(test_logs$activity, 1, 1) == "M" & test_logs$event == "z")
test_logs$activity[undo_moves] <- "Move"
test_logs$activity <- as.factor(test_logs$activity)
str(test_logs$activity)

#Simplify Event Values
# Collapse families of key names into a single label each. The order of these
# substitutions is significant because later patterns see earlier results.
test_logs$event <- gsub("(?i)(F[0-9]{1,2}|.*Media.*|.*Audio.*|.*Pause.*).*", "DeviceFeatureChangeKey", test_logs$event)
test_logs$event <- gsub("(?i)(Scroll.*|Meta|Dead|Process|OS|AltGraph|Mode.*)", "SpecialProcessKey", test_logs$event)
test_logs$event <- gsub("(?i)(Clear|Cancel)", "Escape", test_logs$event)
test_logs$event <- gsub("^[0-9]$", "DigitKey", test_logs$event)
test_logs$event <- gsub("(?i).*Arrow.*", "ArrowKey", test_logs$event)
test_logs$event <- gsub("(?i)(Home|End|^Page.*)", "TextJumpKey", test_logs$event)

#Compile list of allowed event values
# An event is valid when it matches at least one of these patterns.
new_event_labels <- c("^[A-Za-z]$", "^[[:punct:]]$", "click", "Shift", "Space", "Backspace", "Enter", "Tab", "Caps", "Control", "Delete", "Insert", "Escape", "Print", "RareKey", "NumLock", "Alt", "ContextMenu", "ArrowKey", "TextJumpKey", "DeviceFeatureChangeKey", "SpecialProcessKey", "DigitKey", "Unidentified")
# OR the per-pattern matches together with Reduce(); unlike sapply() + apply()
# this also works when the test logs contain fewer than two distinct events.
new_unique_events <- unique(test_logs$event)
new_valid_events <- new_unique_events[Reduce(`|`, lapply(new_event_labels, function(x) grepl(x, new_unique_events)))]

# NOTE(review): validity and the "< 25 occurrences" rarity threshold are
# recomputed from test_logs alone, so the resulting labels can differ from the
# ones derived for train_logs (a small test set turns most events into
# "RareKey") -- confirm this is intended.
test_logs$event <- ifelse(test_logs$event %in% new_valid_events, test_logs$event, "Unidentified")
test_logs$event[test_logs$event %in% names(which(table(test_logs$event) < 25))] <- "RareKey"

test_logs$event <- as.factor(test_logs$event)
str(test_logs$event)

# NOTE(review): this conversion sits after the modeling section, so it does
# not change the classifier_RF that was already fit there; re-running the
# training cell afterwards would presumably make caret treat the problem as
# classification -- confirm the intent.
train_scores$score <- as.factor(train_scores$score)
str(train_scores)

# ADDING TEST FEATURES

In [None]:
#Add Features to Test Set
# Same helper sequence, in the same order, as used for the training table, so
# both tables end up with the same feature columns.

# Event volume and variety.
test_scores <- total_events(test_logs, test_scores)
test_scores <- event_diversity(test_logs, test_scores)
test_scores <- normalized_event_diversity(test_scores)
# Word-count summaries.
test_scores <- submitted_words(test_logs, test_scores)
test_scores <- max_words(test_logs, test_scores)
test_scores <- word_reduction(test_scores)
test_scores <- word_count_mean(test_logs, test_scores)
test_scores <- word_count_sd(test_logs, test_scores)
test_scores <- word_count_skew(test_logs, test_scores)
test_scores <- word_count_kurt(test_logs, test_scores)
test_scores <- word_count_median(test_logs, test_scores)
test_scores <- word_count_IQR(test_logs, test_scores)
# Timing and word rates.
test_scores <- submission_time(test_logs, test_scores)
test_scores <- first_input(test_logs, test_scores)
test_scores <- writing_time(test_scores)
test_scores <- submit_word_rate(test_scores)
test_scores <- max_word_rate(test_scores)
# This call replaces test_logs (not test_scores): the logs gain
# submitted_words and normalized_cursor_position columns.
test_logs <- normalized_cursor_position(test_logs, test_scores)
# Cursor-position summaries (computed on the raw cursor_position column).
test_scores <- cursor_position_mean(test_logs, test_scores)
test_scores <- cursor_position_sd(test_logs, test_scores)
test_scores <- cursor_position_skew(test_logs, test_scores)
test_scores <- cursor_position_kurt(test_logs, test_scores)
test_scores <- cursor_position_median(test_logs, test_scores)
test_scores <- cursor_position_IQR(test_logs, test_scores)
test_scores <- cursor_position_max(test_logs, test_scores)
# Activity counts and their shares of all events.
test_scores <- input_count(test_logs, test_scores)
test_scores <- cut_count(test_logs, test_scores)
test_scores <- paste_count(test_logs, test_scores)
test_scores <- move_count(test_logs, test_scores)
test_scores <- replace_count(test_logs, test_scores)
test_scores <- nonproduction_count(test_logs, test_scores)
test_scores <- input_frequency(test_scores)
test_scores <- nonproduction_frequency(test_scores)
test_scores <- cut_frequency(test_scores)

# Print both column lists so the train and test feature sets can be compared.
cat("\n", "\n", "TRAIN_SCORES VARIABLES:", "\n")
print(names(train_scores))

cat("\n", "\n", "TEST_SCORES VARIABLES:", "\n")
print(names(test_scores))

# Re-display the fitted model summary.
print(classifier_RF)

# Replace every non-finite numeric feature value (NA, NaN, Inf and -Inf) with
# 0 before prediction. is.finite() covers all four cases, whereas comparing
# the table against the strings "NaN"/"Inf" misses -Inf and relies on
# number-to-text coercion.
test_scores[] <- lapply(test_scores, function(column) {
  if (is.numeric(column)) {
    column[!is.finite(column)] <- 0
  }
  column
})
# Any NA left in a non-numeric column is still replaced, as before.
test_scores[is.na(test_scores)] <- 0

# MAKE PREDICTIONS

In [None]:
# Score the test feature table with the cross-validated random forest.
test_predictions <- predict(classifier_RF, newdata = test_scores)
# Pair each test id with its prediction; this overwrites the sample_submission
# table that was read from CSV earlier.
sample_submission <- data.frame(id = test_scores$id, score = test_predictions)
# Write the id/score pairs to "submission.csv" (relative path) via readr.
write_csv(sample_submission, "submission.csv")