In [37]:
library(tidyverse)
library(magrittr)
library(text2vec)
library(tokenizers)
library(glmnet)
library(doParallel)
library(tm)
library(caret)
# Register the doParallel backend so foreach-based code can run in parallel.
# NOTE(review): the value 4 is hard-coded; presumably the requested number of
# workers — confirm it suits the machine this runs on.
registerDoParallel(4)

## Data import

In [38]:
# Load the training table from disk.
# stringsAsFactors = FALSE is set explicitly so that comment_text is read as
# character on every R version (the default was TRUE before R 4.0, which
# would turn each comment into a factor level).
data_set <- read.csv("DATA/train.csv", stringsAsFactors = FALSE)

# Column names treated as prediction targets; they are removed from the
# feature table when the hand-crafted features are built.
targets <- c(
  "toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"
)

In [39]:
# Sanity check of the loaded table: dimensions first, then column names.
data_set %>% dim() %>% print()
#print(head(data_set))
data_set %>% names() %>% print()

[1] 159571      8
[1] "id"            "comment_text"  "toxic"         "severe_toxic" 
[5] "obscene"       "threat"        "insult"        "identity_hate"


## Adding new features to the data set

In [40]:
# Build the hand-crafted text statistics for every comment.
# The target columns and the id are dropped; comment_text is kept because the
# tokenization step reads it.
data <- data_set %>%
  select(-one_of(targets)) %>%
  mutate(
    # Number of characters in the comment.
    length = str_length(comment_text),
    # Number of upper-case ASCII letters.
    ncap = str_count(comment_text, "[A-Z]"),
    # Share of upper-case letters. An empty comment has length 0 and 0 / 0 is
    # NaN; rows containing NaN would be dropped when the model matrix is
    # built, so the ratio is defined as 0 in that case.
    ncap_len = if_else(length > 0, ncap / length, 0),
    # Counts of exclamation marks, question marks and any punctuation.
    nexcl = str_count(comment_text, fixed("!")),
    nquest = str_count(comment_text, fixed("?")),
    npunct = str_count(comment_text, "[[:punct:]]"),
    # Number of runs of word characters.
    nword = str_count(comment_text, "\\w+"),
    # Number of occurrences of the symbols & @ # $ % * ^.
    nsymb = str_count(comment_text, "&|@|#|\\$|%|\\*|\\^")
  ) %>%
  select(-id)

## Preprocessing and comments tokenization  

In [41]:
# Clean the raw comments and wrap them in a text2vec token iterator.
# Steps: lower-case, replace every non-letter with a space, collapse runs of
# whitespace, drop one-letter words, then tokenize with stemming.
comment_tokens <- data %$%
  str_to_lower(comment_text) %>%
  str_replace_all("[^[:alpha:]]", " ") %>%
  str_replace_all("\\s+", " ") %>%
  # Remove words of length 1. The alternative (\\s|$) also catches a
  # one-letter word at the very end of the comment, which has no trailing
  # whitespace to match.
  gsub("\\b\\w{1}(\\s|$)", "", .) %>%
  itoken(tokenizer = tokenize_word_stems)

##  Token vectorization as vocabulary for document term matrix

In [None]:
# Collect the unigram vocabulary from the token iterator, skipping English
# stop words.
# NOTE(review): the tokens were stemmed by the tokenizer while stopwords("en")
# presumably holds unstemmed words — confirm the filter matches as intended.
vocabulary <- create_vocabulary(
  comment_tokens,
  ngram = c(1, 1),
  stopwords = stopwords("en")
)

# Prune the vocabulary with a minimum term count of 3, a maximum document
# proportion of 0.5 and a cap of 4000 terms.
vocabulary <- prune_vocabulary(
  vocabulary,
  term_count_min = 3,
  doc_proportion_max = 0.5,
  vocab_term_max = 4000
)

# Vectorizer used to map tokens onto document-term matrix columns.
vectorizer <- vocab_vectorizer(vocabulary)

## TF-IDF model, fitted and applied to the document-term matrix of the tokens

In [43]:
# TF-IDF model configured with "l2" normalization and sublinear term
# frequency. TRUE is spelled out because T is an ordinary variable that can
# be reassigned.
m_tfidf <- TfIdf$new(norm = "l2", sublinear_tf = TRUE)

# Build the document-term matrix from the tokens, then fit the TF-IDF model
# on it and transform it in a single step.
tfidf <- create_dtm(comment_tokens, vectorizer) %>%
  fit_transform(m_tfidf)

In [44]:
# Show which columns the feature table currently holds.
data %>% names() %>% print()

[1] "comment_text" "length"       "ncap"         "ncap_len"     "nexcl"       
[6] "nquest"       "npunct"       "nword"        "nsymb"       


# Generate the final dataset

## First data set 
Use this data set if you want your classifier to use the following features:
length, ncap, ncap_len, nexcl, nquest, npunct, nword, nsymb, plus the document-term matrix.

In [35]:
# Hand-crafted features only: the raw text column is dropped.
meta_features <- select(data, -comment_text)

# Sparse design matrix over all remaining columns, without an intercept.
meta_matrix <- sparse.model.matrix(~ . - 1, meta_features)

# Final data set: hand-crafted features followed by the TF-IDF columns.
finaldata <- cbind(meta_matrix, tfidf)

## Second data set 
Use this data set if you want your classifier to use only the document-term matrix.

In [34]:
# Alternative data set: use the TF-IDF matrix alone.
# Running this cell after the previous one overwrites the finaldata object
# that combined the hand-crafted features with the TF-IDF matrix.
finaldata <- tfidf

In [36]:
# Fix the RNG seed so the split is reproducible.
set.seed(42)

# Draw 80% of the row indices (rounded down) for training.
n_rows <- nrow(data_set)
smp_size <- floor(0.80 * n_rows)
train_ind <- sample.int(n_rows, size = smp_size)

# Sampled rows form the training set; the remaining rows form the test set.
train <- finaldata[train_ind, ]
test <- finaldata[-train_ind, ]