# Simple Example of Text Classification with R

## https://gist.github.com/primaryobjects/094d24084d1045c011b7

In [3]:
# Install required packages only if missing...
if (!('tm' %in% rownames(installed.packages()))){
  install.packages('tm')
}

if (!('SnowballC' %in% rownames(installed.packages()))){
  install.packages('SnowballC')
}

if (!('minqa' %in% rownames(installed.packages()))){
  install.packages('minqa')
}

if (!('e1071' %in% rownames(installed.packages()))){
  install.packages('e1071')
}

In [4]:
# Load the packages...

library('SnowballC')
library('minqa')
library('e1071')
library('caret')
library('tm')

"package 'caret' was built under R version 3.4.4"Loading required package: lattice
Loading required package: ggplot2
"package 'tm' was built under R version 3.4.4"Loading required package: NLP

Attaching package: 'NLP'

The following object is masked from 'package:ggplot2':

    annotate



In [23]:
# Convert to a data.frame for training and assign a classification (factor) to each document.

# Training data.
data <- c('Patient has diabetes, cardiac issues, COPD, and ICDX51 diagnosed.', 'Patient needs dialysis treatment.')
corpus <- VCorpus(VectorSource(data))

# Create a document term matrix.
tdm <- DocumentTermMatrix(corpus, list(removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE))

# Convert to a data.frame for training and assign a classification (factor) to each document.

train <- as.matrix(tdm)
train <- cbind(train, c(0, 1))

colnames(train)[ncol(train)] <- 'y'

train <- as.data.frame(train)

train$y <- as.factor(train$y)

print(train)

  cardiac copd diabet diagnos dialysi icdx issu need patient treatment y
1       1    1      1       1       0    1    1    0       1         0 0
2       0    0      0       0       1    0    0    1       1         1 1


In [21]:
# Train.
fit <- train(y ~ ., data = train, method = 'bayesglm')

# Check accuracy on training.
print(predict(fit, newdata = train))

"predictions failed for Resample03: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions failed for Resample04: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions failed for Resample05: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions failed for Resample06: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions failed for Resample07: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions failed for Resample09: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions failed for Resample12: parameter=none Error in family(object)$linkinv(pred) : 
  Argument eta must be a nonempty numeric vector
"predictions 

[1] 0 1
Levels: 0 1


## Let's try a new sentence.  Does it match one in the corpus?

In [17]:
# Test data.
data2 <- c('Mary Jones shows signs of cardiac stress, and COPD along with other high blood pressure.')
corpus <- VCorpus(VectorSource(data2))
tdm <- DocumentTermMatrix(corpus, control = list(dictionary = Terms(tdm), removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE))
test <- as.matrix(tdm)

In [18]:
# Check accuracy on test.
# Does the new sentence match the others?
print(data)
print(data2)
print(test)
print('Is the new sentence like the corpuse ones? 0 = No, 1 = Yes')
predict(fit, newdata = test)

[1] "Patient has diabetes, cardiac issues, COPD, and ICDX51 diagnosed."
[2] "Patient needs dialysis treatment."                                
[1] "Mary Jones shows signs of cardiac stress, and COPD along with other high blood pressure."
    Terms
Docs cardiac copd diabet diagnos dialysi icdx issu need patient treatment
   1       1    1      0       0       0    0    0    0       0         0
[1] "Is the new sentence like the corpuse ones? 0 = No, 1 = Yes"


### Notice the model identifies matching terms...

In [24]:
print(test)

    Terms
Docs cardiac copd diabet diagnos dialysi icdx issu need patient treatment
   1       0    0      0       0       1    0    0    1       0         1


## Another sentence that should match...

In [19]:
# Another example that does match...
data3 <- c('Joe Smith needs dialysis treatment.')
corpus <- VCorpus(VectorSource(data3))
tdm <- DocumentTermMatrix(corpus, control = list(dictionary = Terms(tdm), removePunctuation = TRUE, stopwords = TRUE, stemming = TRUE, removeNumbers = TRUE))
test <- as.matrix(tdm)

# Check accuracy on test.
# Correctly identify if the new sentence matches the others.
print(data)
print(data3)
print(test)
print('Is the new sentence like the corpuse ones? 0 = No, 1 = Yes')
predict(fit, newdata = test)

[1] "Patient has diabetes, cardiac issues, COPD, and ICDX51 diagnosed."
[2] "Patient needs dialysis treatment."                                
[1] "Joe Smith needs dialysis treatment."
    Terms
Docs cardiac copd diabet diagnos dialysi icdx issu need patient treatment
   1       0    0      0       0       1    0    0    1       0         1
[1] "Is the new sentence like the corpuse ones? 0 = No, 1 = Yes"
