In [1]:
# install packages
#install.packages('stringr')
#install.packages('stopwords')
#install.packages('caret')

# import packages
library(stringr)
library(stopwords)
library(caret)

Loading required package: ggplot2

Loading required package: lattice



In [2]:
# import data
data <- read.csv('data/train.csv')
# keep at most the first 5000 rows; unlike data[1:5000, ], head() does not
# append all-NA rows when the file holds fewer rows than that
data <- head(data, 5000)
str(data)

'data.frame':	5000 obs. of  3 variables:
 $ Labels  : int  1 2 3 1 2 5 0 2 2 3 ...
 $ Text    : chr  "Says the Annies List political group supports third-trimester abortions on demand." "When did the decline of coal start? It started when natural gas took off that started to begin in (President Ge"| __truncated__ "Hillary Clinton agrees with John McCain \"by voting to give George Bush the benefit of the doubt on Iran.\"" "Health care reform legislation is likely to mandate free sex change surgeries." ...
 $ Text_Tag: chr  "abortion" "energy,history,job-accomplishments" "foreign-policy" "health-care" ...


In [3]:
# fix the RNG seed so that the split below can be reproduced
set.seed(54321)

# draw one flag per row: TRUE (probability 0.8) sends the row to the training
# set, FALSE (probability 0.2) sends it to the test set
sample <- sample(
    x = c(TRUE, FALSE),
    size = nrow(data),
    replace = TRUE,
    prob = c(0.8, 0.2)
)
train <- data[sample, ]
test <- data[!sample, ]

In [4]:
# Turn one string into its bag of words.
#
# string: the text to clean (only the first element is tokenised)
#
# Returns a character vector with the lowercase words of the text, without
# stop words, single letters and empty strings.
clean_string <- function(string) {

    # lowercase, replace everything that is not a letter or white space by a
    # blank, then squeeze every run of white space into a single blank
    normalized <- str_replace_all(
        str_replace_all(tolower(string), '[^a-z\\s]', ' '),
        '[\\s]+', ' '
    )

    # words of the first input string
    tokens <- str_split(normalized, ' ')[[1]]

    # everything to discard: the 'smart' stop words (themselves split on white
    # space, '|' and apostrophes), the single letters and the empty string
    smart_words <- paste0(stopwords(source = 'smart'), collapse = ' ')
    discard <- c(
        unique(str_split(smart_words, '[\\s|\']')[[1]]),
        letters,
        ''
    )

    tokens[!(tokens %in% discard)]

}

In [5]:
# turn every training text into its vector of cleaned words
train$Text_Clean <- lapply(train$Text, clean_string)

# distinct labels of the whole dataset, in ascending order
label_list <- sort(unique(data$Labels))

# flatten the per-document word vectors into one character vector
res <- unlist(train$Text_Clean)

# vocabulary of the training set: its distinct words, sorted
vocabulary <- sort(unique(res))
str(vocabulary)

 chr [1:7141] "aaron" "aarp" "aba" "abandoned" "abandoning" "abbas" ...


In [6]:
# scratch check: a zero factor times a finite log2 value evaluates to 0
0*log2(0.001)

In [None]:
feature_selection <- TRUE

if (feature_selection) {

    # named list (one entry per label) of named vectors: the mutual
    # information of every retained word with that label
    mutual_information <- list()

    # number of documents (word vectors) in `documents` that contain `word`;
    # vapply keeps the result a plain count even when `documents` is empty
    count_documents <- function(word, documents) {

        counts <- sum(vapply(documents, function(x) word %in% x, logical(1)))
        return(counts)

    }

    # document frequency of every vocabulary word at once: each document
    # contributes its distinct words a single time and the matches against the
    # vocabulary are tallied, so the documents are not rescanned for each word
    document_frequency <- function(documents, vocabulary) {

        words_once <- unlist(lapply(documents, unique))
        counts <- tabulate(match(words_once, vocabulary), nbins = length(vocabulary))
        # doubles, so that the products in the MI formula cannot overflow
        return(as.numeric(counts))

    }

    # one summand of the mutual information,
    #     (n_joint/n_total) * log2(n_total*n_joint / (n_word*n_class)),
    # with the convention 0*log2(0) = 0: evaluated naively an empty cell gives
    # NaN, which used to remove the word from the ranking altogether
    mi_term <- function(n_joint, n_word, n_class, n_total) {

        term <- (n_joint/n_total)*log2(n_total*n_joint/(n_word*n_class))
        term[n_joint == 0] <- 0
        return(term)

    }

    # total number of documents
    N <- nrow(train)
    # number of documents that contain t, whatever their class
    N_1 <- document_frequency(train$Text_Clean, vocabulary)

    # mutual information
    for (label in label_list) {

        in_class <- train$Labels == label
        n_class <- sum(in_class)

        # number of documents that contain t and are in class c
        N_11 <- document_frequency(train$Text_Clean[in_class], vocabulary)
        # number of documents that do not contain t and are in class c
        N_01 <- n_class - N_11

        # number of documents that contain t and are not in class c
        N_10 <- N_1 - N_11
        # number of documents that do not contain t and are not in class c
        N_00 <- (N - n_class) - N_10

        # mutual information
        MI <- mi_term(N_11, N_10 + N_11, N_01 + N_11, N) +
            mi_term(N_01, N_00 + N_01, N_01 + N_11, N) +
            mi_term(N_10, N_10 + N_11, N_00 + N_10, N) +
            mi_term(N_00, N_00 + N_01, N_00 + N_10, N)

        # add names
        names(MI) <- vocabulary
        # sort from most to least informative
        MI <- sort(MI, decreasing = TRUE)
        # select k [%] features; seq_len() selects nothing when the count is 0
        MI <- MI[seq_len(floor(0.5*length(MI)))]

        mutual_information[[paste0('Label_', label)]] <- MI

    }

    str(mutual_information)

    # join every named vector into a unique list
    res <- do.call(c, unname(mutual_information))

    # create the new vocabulary of words of the training set after feature selection
    vocabulary <- sort(unique(names(res)))
    str(vocabulary)

}

In [None]:
# Add-one smoothed count of a word within the words of one class.
#
# word:       the word to look for
# words_list: vector with the words observed in the class
#
# Returns the number of elements of `words_list` equal to `word`, plus one.
# The division that turns this count into a conditional probability is not
# done here.
conditional_prob <- function(word, words_list){

    # TRUE wherever the class contains the word
    occurrences <- words_list == word

    1 + sum(occurrences)
}

In [None]:
# create dataframe of conditional probabilities: one row per vocabulary word,
# one column 'Label_<label>' per class
cond.prob.df <- data.frame(Words = vocabulary)

# number of training data
N <- nrow(train)

# size of the vocabulary: the add-one smoothing contributes one count per word
vocabulary.size <- length(vocabulary)

# per-label results, preallocated instead of grown inside the loop
priors <- numeric(length(label_list))
tot.words.per.lab <- numeric(length(label_list))

for (i in seq_along(label_list)) {

    label <- label_list[i]
    column <- paste0('Label_', label)
    in.class <- train$Labels == label

    # create list of words grouped by class
    words_list <- do.call(c, train$Text_Clean[in.class])

    # count total number of words in class i
    tot.words <- length(words_list)

    # normalization of the smoothed counts; also saved per label because it is
    # needed again when scoring words that are not in the vocabulary
    Normalization <- tot.words + vocabulary.size
    tot.words.per.lab[i] <- Normalization

    # smoothed count of every word of the vocabulary, normalized into a
    # conditional probability; vapply guarantees one number per word
    cond.prob.df[[column]] <- vapply(cond.prob.df$Words, conditional_prob, numeric(1),
                                     words_list = words_list)/Normalization

    # calculate prior for the classes
    N.c <- sum(in.class)
    priors[i] <- N.c/N
}



In [None]:
# inspect the structure of the conditional probability dataframe
str(cond.prob.df)

In [None]:
# Predict the label of one text from the conditional probability table.
#
# text:              the string to classify
# cond.prob.df:      data frame whose first column `Words` holds the vocabulary
#                    and whose other columns, named 'Label_<label>', hold the
#                    conditional probability of every word given that label
# priors:            prior probability of every label, in the order of the
#                    label columns of cond.prob.df
# tot.words.per.lab: normalization constant of every label, in the same order;
#                    a word missing from the vocabulary is scored with
#                    probability 1/tot.words.per.lab
#
# Returns the label with the highest log-posterior (the first one on ties),
# read back from the name of its column.
posterior_prob <- function(text, cond.prob.df, priors, tot.words.per.lab){

    # clean text to list of words
    words_in_text <- clean_string(text)

    # take the vocabulary from the table itself, so that rows and words cannot
    # get out of step with a global vocabulary modified after training
    known.words <- cond.prob.df$Words

    # every column but `Words` belongs to a label; drop = FALSE keeps the data
    # frame shape (needed by colSums) when there is a single label
    likelihood <- cond.prob.df[known.words %in% words_in_text, -1, drop = FALSE]

    # calculate log-posterior per label
    log.posterior <- colSums(log(likelihood)) + log(priors)

    # count the number of words not in dictionary
    number.new.words <- length(setdiff(words_in_text, known.words))

    # update the posterior with the words not in the dictionary
    log.posterior <- log.posterior - number.new.words*log(tot.words.per.lab)

    # map the winning column back to its label instead of assuming that the
    # labels are exactly 0, 1, 2, ...
    best <- which.max(log.posterior)
    label <- sub('^Label_', '', names(likelihood)[best])
    utils::type.convert(label, as.is = TRUE)
}


In [None]:
# true labels of the test set and, for every test text, the predicted label
# (without the texts as names)
labels_true <- test$Labels
labels_predicted <- sapply(
    test$Text,
    posterior_prob,
    cond.prob.df = cond.prob.df,
    priors = priors,
    tot.words.per.lab = tot.words.per.lab,
    USE.NAMES = FALSE
)

In [None]:
# confusion matrix: rows are true labels, columns are predicted labels.
# Both factors share the levels of label_list, so a class that is never
# predicted (or never present) keeps its own row/column instead of shifting
# the counts of the other classes into the wrong cells.
results <- table(
    true = factor(labels_true, levels = label_list),
    predicted = factor(labels_predicted, levels = label_list)
)
results <- unclass(results)

# divide every column by its total: each column then holds the distribution of
# true labels among the texts predicted as that class (NaN for a class that is
# never predicted)
results <- sweep(results, 2, colSums(results), '/')
print(round(results, 2))

In [None]:
# inspect the vocabulary in use at the end of the run
str(vocabulary)

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=4685fa97-7e91-4cd5-abb2-977409f41ab1' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>