# Functions

In [1]:
# Plot the relative frequency of each class for the first three data sets.
#
# Arguments:
#   dataset: list-like object whose first three elements can be piped into
#            dplyr's group_by(Labels); names(dataset) supplies the panel
#            titles.
#
# Draws three bar plots side by side (one per data set).
# Side effects: sets the `repr.plot.*` options and `par(mfrow)` and does not
# restore them. Reads the global `label_list` to size the colour palette.
plot_label_freq <- function(dataset) {

    options(repr.plot.width = 15, repr.plot.height = 5)
    par(mfrow = c(1, 3))

    col <- brewer.pal(max(c(8, length(label_list))), 'Dark2')

    for (i in 1:3) {

        # number of documents per label
        label_freq <- dataset[[i]] %>% 
            group_by(Labels) %>%
            summarise(counts = length(Labels))

        # the y axis is labelled in percent, so scale the class shares to 0-100
        percentages <- 100*label_freq[[2]]/sum(label_freq[[2]])

        barplot(percentages, names.arg = label_freq[[1]], col = col,
            main = names(dataset)[i], xlab = 'label', ylab = 'frequency [%]',
            cex.lab = 1.5, cex.axis = 1.5, cex.names = 1.5, cex.main = 2)

    }
}

In [2]:
# Split one document into lowercase word tokens (bag of words model).
#
# Arguments:
#   document: string to tokenize (only the first split result is returned)
#
# Returns a character vector of tokens made of the letters a-z only.
tokenize <- function(document) {

    # lowercase, then blank out every character that is neither a-z nor whitespace
    letters_only <- str_replace_all(tolower(document), '[^a-z\\s]', ' ')

    # squeeze each run of whitespace into a single blank
    single_spaced <- str_replace_all(letters_only, '[\\s]+', ' ')

    # cut at the blanks
    pieces <- str_split(single_spaced, ' ')[[1]]

    # blanks at the edges of the string leave empty pieces behind: drop them
    is_empty <- pieces %in% ''
    pieces[!is_empty]

}

In [3]:
# Remove stop words and single-letter tokens from a vector of tokens.
#
# Arguments:
#   document: character vector of tokens
#
# Returns the tokens that are neither stop words nor single lowercase letters.
cleaning <- function(document) {

    # the stop-word list is joined and re-split on whitespace, '|' and
    # apostrophes, so each part of an entry containing an apostrophe becomes
    # a stop word on its own
    joined_stop_words <- paste0(stopwords(), collapse = ' ')
    stop_words <- unique(str_split(joined_stop_words, '[\\s|\']')[[1]])

    # one membership test covers both the stop words and the single letters
    rejected <- document %in% c(stop_words, letters)
    document[!rejected]

}

In [4]:
# Turn a raw document into its final vector of tokens.
#
# Arguments:
#   document: string to process
#   CLEAN:    if TRUE, pass the tokens through cleaning()
#   LEM:      if TRUE, pass the tokens through lemmatize_words()
#
# Returns the vector of tokens produced by tokenize() and the optional steps.
pre_processing <- function(document, CLEAN = TRUE, LEM = TRUE) {

    # transform string in tokens
    document <- tokenize(document)

    # clean tokens
    if (CLEAN) { document <- cleaning(document) }

    # lemmatize tokens
    if (LEM) { document <- lemmatize_words(document) }

    return(document)
}

# Version of pre_processing() that accepts a vector of documents.
# SIMPLIFY = FALSE guarantees a list with one token vector per document;
# with the default (TRUE) documents that happen to yield the same number of
# tokens would be collapsed into a vector or a matrix.
v_pre_processing <- Vectorize(pre_processing, vectorize.args = c('document'),
                              SIMPLIFY = FALSE, USE.NAMES = FALSE)

In [5]:
# Build the vocabulary of the training set.
#
# Arguments:
#   train: object whose `Tokens` element is a list of token vectors
#
# Returns the distinct tokens in sorted order.
make_vocabulary <- function(train) {

    # concatenate the token vectors of all documents
    all_tokens <- do.call(c, train$Tokens)

    # keep one copy of each token, then order them
    distinct_tokens <- unique(all_tokens)
    sort(distinct_tokens)

}

In [6]:
# Mutual information between every vocabulary token and every label.
#
# Arguments:
#   train:      data frame with a `Labels` column and a `Tokens` list column
#               (one token vector per document)
#   vocabulary: character vector of tokens to score
#
# Returns a list with one element per entry of the global `label_list`,
# named 'Label_<label>'; each element is a numeric vector named by
# `vocabulary`.
#
# For a token t and a label l the documents are split into four groups:
#   N_11 contain t and have l       N_01 do not contain t and have l
#   N_10 contain t and lack l       N_00 do not contain t and lack l
# and MI = sum over the groups of (N_xy/N) * log2(N*N_xy / (row * column)),
# where an empty group contributes 0.
get_mutual_information <- function(train, vocabulary) {

    # mutual information
    mutual_information <- list()

    # counts are computed on the distinct tokens and mapped back, so a
    # vocabulary with repeated entries still gets one count per entry
    unique_vocabulary <- unique(vocabulary)
    vocabulary_index <- match(vocabulary, unique_vocabulary)

    # number of documents containing each vocabulary token, in one pass:
    # every document contributes each of its distinct tokens once; tokens
    # outside the vocabulary map to NA, which tabulate() ignores.
    # The result is double so that the products below cannot overflow.
    count_documents <- function(documents) {

        tokens_seen <- unlist(lapply(documents, unique), use.names = FALSE)
        counts <- tabulate(match(tokens_seen, unique_vocabulary),
                           nbins = length(unique_vocabulary))
        return(as.numeric(counts[vocabulary_index]))

    }

    for (label in label_list) {

        has_label <- train$Labels == label
        documents_in <- train$Tokens[has_label]
        documents_out <- train$Tokens[!has_label]

        # number of documents that contain token t and have label l
        N_11 <- count_documents(documents_in)
        # number of documents that do not contain token t and have label l
        N_01 <- length(documents_in) - N_11

        # number of documents that contain token t and do not have label l
        N_10 <- count_documents(documents_out)
        # number of documents that do not contain token t and do not have label l
        N_00 <- length(documents_out) - N_10

        # total number of documents
        N <- N_11 + N_01 + N_10 + N_00

        # log terms (an empty group contributes nothing)
        log_loss_11 <- ifelse(N_11 > 0, log2(N*N_11/((N_10 + N_11)*(N_01 + N_11))), 0)
        log_loss_01 <- ifelse(N_01 > 0, log2(N*N_01/((N_00 + N_01)*(N_01 + N_11))), 0)
        log_loss_10 <- ifelse(N_10 > 0, log2(N*N_10/((N_10 + N_11)*(N_00 + N_10))), 0)
        log_loss_00 <- ifelse(N_00 > 0, log2(N*N_00/((N_00 + N_01)*(N_00 + N_10))), 0)

        # mutual information
        MI <- (N_11/N)*log_loss_11 +
            (N_01/N)*log_loss_01 +
            (N_10/N)*log_loss_10 +
            (N_00/N)*log_loss_00

        # add names
        names(MI) <- vocabulary

        # add mutual information to the list
        mutual_information[[paste0('Label_', label)]] <- MI

    }

    return(mutual_information)

}

In [7]:
# Grid search over pre-processing options and feature-selection fractions.
#
# Arguments:
#   train:   training set with `Text` and `Labels` columns; its `Tokens`
#            column is recomputed here for every pre-processing option
#   val:     validation set with `Text` and `Labels` columns
#   pp_grid: list of pre-processing options; element i holds the CLEAN flag
#            in position 1 and the LEM flag in position 2
#   k_grid:  vector of values passed as `k` to features_selection()
#
# Returns a length(pp_grid) x length(k_grid) matrix holding the validation
# accuracy of each combination.
grid_search <- function(train, val, pp_grid, k_grid) {

    # evaluation grid
    evaluation_grid <- matrix(0, nrow = length(pp_grid), ncol = length(k_grid))

    # seq_along() makes an empty grid skip the loop instead of iterating
    # over the indices 1 and 0
    for (i in seq_along(pp_grid)) {

        clean <- pp_grid[[i]][1]
        lem <- pp_grid[[i]][2]

        # pre-process documents in training set
        train$Tokens <- v_pre_processing(train$Text, CLEAN = clean, LEM = lem)

        # vocabulary and mutual information of the full training set
        vocabulary <- make_vocabulary(train)
        mutual_information <- get_mutual_information(train, vocabulary)

        for (j in seq_along(k_grid)) {

            # feature selection; the reduced vocabulary gets its own name so
            # the full one above is not overwritten
            selection <- features_selection(mutual_information, k_grid[j])
            selected_vocabulary <- selection$vocabulary

            # learning model
            model <- learning_model(train, selected_vocabulary)

            # predict
            predicted_labels <- v_predict(val$Text,
                                vocabulary = selected_vocabulary,
                                conditional_probs = model$conditional_probs,
                                priors = model$priors,
                                tot_tokens_per_label = model$tot_tokens_per_label,
                                CLEAN = clean, LEM = lem)

            # evaluate
            evaluation_grid[i,j] <- evaluate(val$Labels, predicted_labels)$accuracy

        }
    }

    return(evaluation_grid)
}

In [8]:
# Plot the accuracies found by the grid search, one curve per
# pre-processing option, as a function of k.
#
# Arguments:
#   pp_grid:         pre-processing options (only its length is used)
#   k_grid:          x coordinates of the points
#   evaluation_grid: matrix with one row per pre-processing option and one
#                    column per entry of k_grid
#
# Side effect: sets the `repr.plot.*` options. Reads the global `label_list`
# to size the colour palette. The legend texts are fixed to two entries.
plot_grid_search <- function(pp_grid, k_grid, evaluation_grid) {
    
    options(repr.plot.width = 14, repr.plot.height = 7)

    line_colors <- brewer.pal(max(c(8, length(label_list))), 'Dark2')

    for (row in 1:length(pp_grid)) {

        # every row after the first is drawn on top of the existing axes
        if (row != 1) {
            points(k_grid, evaluation_grid[row,], type = 'b', pch = 16, lwd = 2,
                col = line_colors[row])
            next
        }

        # the first row opens the plot; the y range covers the whole grid
        # with a margin of 0.1 on each side
        plot(k_grid, evaluation_grid[1,], type = 'b', pch = 16, lwd = 2,
            col = line_colors[row],
            xlim = c(0,1),
            ylim = c(min(evaluation_grid)-0.1, max(evaluation_grid)+0.1),
            main = 'Grid search', xlab = 'k', ylab = 'accuracy',
            cex.lab = 1.25, cex.axis = 1, cex.main = 1.5)

    }

    grid(lty = 2)

    legend(x = 'bottomright',
        inset = 0.02,
        legend = c('no pre-processing', 'cleaning and lemmatizing'),
        lty = c(1, 1),
        lwd = c(1, 1),
        col = line_colors[1:length(pp_grid)],
        cex = 1)

}

In [9]:
# Keep, for every label, the fraction k of tokens with the highest mutual
# information.
#
# Arguments:
#   mutual_information: list with one named numeric vector per label, under
#                       the names 'Label_<label>'
#   k:                  fraction of the ranked tokens to keep for each label
#
# Returns a list with
#   selected_features: per label, the kept scores in decreasing order
#   vocabulary:        sorted distinct tokens kept for at least one label
#
# Reads the global `label_list`.
features_selection <- function(mutual_information, k) {

    # selected features
    selected_features <- list()

    for (label in label_list) {

        key <- paste0('Label_', label)

        # rank by decreasing score (sort() drops NA/NaN scores)
        ranked <- sort(mutual_information[[key]], decreasing = TRUE)

        # number of features to keep, clamped to [0, length(ranked)] so that
        # a tiny k selects nothing and k > 1 cannot create NA entries
        n_keep <- min(length(ranked), max(0, floor(k*length(ranked))))

        # add selected features to the list
        selected_features[[key]] <- ranked[seq_len(n_keep)]

    }

    # join every named vector into a unique vector
    pooled <- do.call(c, unname(selected_features))

    # new vocabulary after feature selection; as.character() turns the NULL
    # names of an empty selection into an empty character vector
    vocabulary <- sort(unique(as.character(names(pooled))))

    return(list(selected_features = selected_features,
                vocabulary = vocabulary))

}

In [10]:
# Plot, for every label, the mutual information of the tokens ranked in
# decreasing order, with a vertical line at the feature-selection cut.
#
# Arguments:
#   mutual_information: list with one numeric vector per label, under the
#                       names 'Label_<label>'
#   k:                  fraction of tokens kept; the cut is drawn at
#                       floor(k * number of tokens)
#   x_legend:           x position handed to legend()
#
# Side effects: sets the `repr.plot.*` options and `par(mfrow)`.
# Reads the global `label_list`; labels must support `%%`.
plot_MI <- function(mutual_information, k, x_legend) {

    # two panels side by side
    start_page <- function() {
        options(repr.plot.width = 14, repr.plot.height = 7)
        par(mfrow = c(1, 2))
    }
    start_page()

    dark_colors <- brewer.pal(max(c(8, length(label_list))), 'Dark2')
    curve_color <- dark_colors[1]
    cut_color <- dark_colors[2]

    for (label in label_list) {

        # the layout is set again before every even label
        if ((label %% 2) == 0) {
            start_page()
        }

        panel_title <- paste0('Label_', label)

        # NaN scores count as 0 before ranking
        scores <- mutual_information[[panel_title]]
        scores <- sort(replace(scores, is.nan(scores), 0), decreasing = TRUE)

        plot(1:length(scores), scores, ylim = c(0, 0.005), type = 'l', lwd = 2,
            col = curve_color,
            main = panel_title, xlab = 'token index', ylab = 'MI',
            cex.lab = 1.5, cex.axis = 1.5, cex.main = 2)

        # position of the feature-selection cut
        abline(v = floor(k*length(scores)), lwd = 2, lty = 2, col = cut_color)

        legend(x = x_legend, y = 0.005,
            legend = c('mutual information', 'k*len(vocabulary)'),
            lty = c(1, 1),
            lwd = c(2, 2),
            col = c(curve_color, cut_color),
            bty = 'n',
            cex = 1.3)
    }
}

In [11]:
# Table of the selected tokens of each label, for visual inspection.
#
# Arguments:
#   selected_features: list with one named vector per label, under the names
#                      'Label_<label>' (as returned by features_selection())
#   label_names:       column names of the result, one per entry of the
#                      global `label_list` and in the same order
#
# Returns a data frame with one column per label; each column holds the token
# names of that label in the order in which they are stored.
get_feature_ranking <- function(selected_features, label_names) {

    # one column per position in label_list, so the labels do not have to be
    # the numbers 0, 1, 2, ...
    feature_ranking <- lapply(label_list, function(label) {
        names(selected_features[[paste0('Label_', label)]])
    })
    names(feature_ranking) <- paste0('Label_', label_list)

    feature_ranking <- as.data.frame(feature_ranking)
    colnames(feature_ranking) <- label_names

    return(feature_ranking)

}

In [12]:
# Estimate the parameters of the classifier from the training set.
#
# Arguments:
#   train:      data frame with a `Labels` column and a `Tokens` list column
#   vocabulary: character vector of tokens to model
#
# Returns a list with
#   conditional_probs:    data frame with a `Token` column and one column
#                         'Label_<label>' per label holding
#                         (occurrences of the token in the label + 1) /
#                         (tokens in the label + length(vocabulary))
#   priors:               share of training documents of each label
#   tot_tokens_per_label: the denominator above, one value per label
#
# The labels are taken, in order, from the global `label_list`.
learning_model <- function(train, vocabulary) {

    # create dataframe of conditional probabilities
    conditional_probs <- data.frame(Token = vocabulary)

    # one slot per label, filled in the loop below
    priors <- numeric(length(label_list))
    tot_tokens_per_label <- integer(length(label_list))

    # number of training data
    N <- nrow(train)

    # occurrences are tabulated on the distinct tokens and mapped back, so a
    # vocabulary with repeated entries still gets one count per entry
    unique_vocabulary <- unique(vocabulary)
    vocabulary_index <- match(vocabulary, unique_vocabulary)

    position <- 0
    for (label in label_list) {

        position <- position + 1
        column <- paste0('Label_', label)

        # all tokens of the documents with this label
        tokens_list <- do.call(c, train$Tokens[train$Labels == label])

        # Laplace smoothing: every vocabulary token gets one extra occurrence,
        # hence length(vocabulary) is added to the token total
        normalization <- length(tokens_list) + length(vocabulary)

        # save the normalization (will be used in the test set)
        tot_tokens_per_label[position] <- normalization

        # occurrences of every vocabulary token in a single pass; tokens
        # outside the vocabulary map to NA, which tabulate() ignores
        occurrences <- tabulate(match(tokens_list, unique_vocabulary),
                                nbins = length(unique_vocabulary))

        # smoothed and normalized conditional probabilities
        conditional_probs[[column]] <- (occurrences[vocabulary_index] + 1)/normalization

        # calculate prior for the label
        priors[position] <- sum(train$Labels == label)/N
    }

    return(list(conditional_probs = conditional_probs,
                priors = priors,
                tot_tokens_per_label = tot_tokens_per_label))

}

In [13]:
# Predict the label of one document.
#
# Arguments:
#   document:             string to classify
#   vocabulary:           tokens of the model, aligned with the rows of
#                         `conditional_probs`
#   conditional_probs:    data frame whose columns 2, 3, ... hold the
#                         conditional probabilities of each label, in the
#                         order of the global `label_list`
#   priors:               prior probability of each label, same order
#   tot_tokens_per_label: smoothing denominator of each label, same order
#   CLEAN, LEM:           forwarded to pre_processing()
#
# Returns the entry of `label_list` with the highest log-posterior.
# Every distinct token of the document is counted once.
#
# NOTE(review): a global function called `predict` masks the generic of the
# same name from the stats package - confirm this is acceptable.
predict <- function(document, vocabulary,
                    conditional_probs, priors, tot_tokens_per_label,
                    CLEAN = TRUE, LEM = TRUE){

    # pre-process document
    tokens_in_document <- pre_processing(document, CLEAN = CLEAN, LEM = LEM)

    # conditional probabilities of the vocabulary tokens found in the
    # document; drop = FALSE keeps a data frame even with a single label
    label_columns <- 2:(1+length(label_list))
    in_document <- vocabulary %in% tokens_in_document
    token_probs <- conditional_probs[in_document, label_columns, drop = FALSE]

    # calculate log-likelihood of the document for each label
    log_likelihoods <- colSums(log(token_probs))

    # calculate log-posterior for each label
    log_posteriors <- log_likelihoods + log(priors)

    # count the number of tokens not in the vocabulary
    n_unseen_tokens <- length(setdiff(tokens_in_document, vocabulary))

    # a token outside the vocabulary has smoothed probability
    # 1/tot_tokens_per_label for each label
    log_posteriors <- log_posteriors - n_unseen_tokens*log(tot_tokens_per_label)

    # translate the winning position into the label itself, so the labels do
    # not have to be the numbers 0, 1, 2, ...
    predicted_labels <- label_list[which.max(log_posteriors)]

    return(predicted_labels)

}

# Version of predict() that accepts a vector of documents.
v_predict <- Vectorize(predict, vectorize.args = c('document'), USE.NAMES = FALSE)

In [14]:
# Confusion matrix of the predictions against the true labels.
#
# Arguments:
#   true_labels:      reference labels
#   predicted_labels: labels assigned by the classifier
#
# Returns the object produced by confusionMatrix(), with the dimensions
# named 'Prediction' and 'Reference'.
get_confusion_matrix <- function(true_labels, predicted_labels) {

    reference <- as.factor(true_labels)
    prediction <- as.factor(predicted_labels)

    # both factors must share one set of levels: a class that never appears
    # in one of the two vectors would otherwise give factors with different
    # levels. Levels of the reference come first, in their original order.
    shared_levels <- union(levels(reference), levels(prediction))
    reference <- factor(reference, levels = shared_levels)
    prediction <- factor(prediction, levels = shared_levels)

    # confusion matrix
    cm <- confusionMatrix(prediction,
                        reference,
                        dnn = c('Prediction', 'Reference'))
    return(cm)

}

In [15]:
# Micro-averaged F1 measure of the predictions.
#
# Arguments:
#   true_labels:      reference labels
#   predicted_labels: labels assigned by the classifier, same length
#
# Returns a single number: the harmonic mean of micro precision and micro
# recall; 0 when no prediction is correct, NaN for empty input.
#
# True positives, false positives and false negatives are summed over all
# classes, including classes that only occur among the predictions. Each
# document has one true and one predicted label, so a correct prediction is
# one true positive, and a wrong prediction is one false positive (for the
# predicted class) plus one false negative (for the true class).
get_micro_f1_measure <- function(true_labels, predicted_labels) {

    # nothing to evaluate
    if (length(true_labels) == 0) {
        return(NaN)
    }

    # factors are compared through their labels, so that different level
    # sets on the two sides cannot make `==` fail
    as_plain <- function(x) if (is.factor(x)) as.character(x) else x
    is_correct <- as_plain(true_labels) == as_plain(predicted_labels)

    # total true positive
    tot_TP <- sum(is_correct)
    # total false positive
    tot_FP <- sum(!is_correct)
    # total false negative
    tot_FN <- sum(!is_correct)

    # without a true positive, precision and recall are both 0 and the
    # harmonic mean would be 0/0
    if (!is.na(tot_TP) && tot_TP == 0) {
        return(0)
    }

    micro_precision <- tot_TP/(tot_TP + tot_FP)
    micro_recall <- tot_TP/(tot_TP + tot_FN)

    return(2*(micro_precision*micro_recall)/(micro_precision + micro_recall))

    }

In [16]:
# Accuracy of the predictions.
#
# Arguments:
#   true_labels:      reference labels
#   predicted_labels: labels assigned by the classifier
#
# Returns the number of positions where the two vectors agree, divided by
# the number of true labels.
get_accuracy <- function(true_labels, predicted_labels) {

    n_correct <- sum(true_labels == predicted_labels)
    n_documents <- length(true_labels)

    n_correct/n_documents

}

In [17]:
# Compute all the evaluation measures of a set of predictions.
#
# Arguments:
#   true_labels:      reference labels
#   predicted_labels: labels assigned by the classifier
#
# Returns a list with the elements `confusion_matrix`, `micro_f1_measure`
# and `accuracy`, each produced by the get_* function of the same name.
evaluate <- function(true_labels, predicted_labels) {

    confusion_matrix <- get_confusion_matrix(true_labels, predicted_labels)
    micro_f1_measure <- get_micro_f1_measure(true_labels, predicted_labels)
    accuracy <- get_accuracy(true_labels, predicted_labels)

    list(confusion_matrix = confusion_matrix,
         micro_f1_measure = micro_f1_measure,
         accuracy = accuracy)

}

In [18]:
# Plot a confusion matrix as a heat map.
#
# Arguments:
#   cm: object with a `table` element whose rows are the predictions and
#       whose columns are the references, with the dimensions named
#       'Prediction' and 'Reference' (as built by get_confusion_matrix())
#
# Returns the ggplot object. Every column of the table is divided by its
# sum, so each cell shows the share of a reference class that received a
# given prediction, rounded to two decimals.
# Side effect: sets the `repr.plot.*` options. Reads the global `label_list`
# to size the colour palette.
plot_confusion_matrix <- function(cm) {

    options(repr.plot.width = 7, repr.plot.height = 7)

    col <- brewer.pal(max(c(8, length(label_list))), 'Dark2')

    # divide each column by its sum
    plt <- as.data.frame(round(sweep(cm$table, 2, colSums(cm$table), FUN = '/'), 2))

    # reversed prediction levels put the first class at the top of the y
    # axis, so the diagonal runs from the top left to the bottom right
    plt$Prediction <- factor(plt$Prediction, levels = rev(levels(plt$Prediction)))

    # reference on x and prediction on y, matching the axis titles; the tick
    # labels come from the factor levels, so each one is attached to the
    # class it belongs to
    ggplot(plt, aes(Reference, Prediction, fill = Freq)) +
        geom_tile() + geom_text(aes(label = Freq), size = 7) +
        scale_fill_gradient(low = 'white', high = col[1]) +
        labs(title = 'Confusion matrix', x = 'reference', y = 'prediction') +
        theme(axis.text.x = element_text(size = 14),
            axis.text.y = element_text(size = 14),  
            axis.title.x = element_text(size = 16),
            axis.title.y = element_text(size = 16),
            title = element_text(size = 18)) +
        theme(legend.position = 'none')

}

In [19]:
# Write every object of the current environment, including those whose name
# starts with a dot, to the file 'functions'.
save(file = 'functions', list = ls(all.names = TRUE))