<font size="7"><b> Text mining </b></font>

In [None]:
#Cargar librerias
library(NLP)
library(tm)
library(SnowballC)
library(wordcloud)
library(ggplot2)
library(dplyr)
library(readr)
library(cluster)
library(xlsx)
library(data.table)
library(plotly)
library(reshape)
library(rpart.plot)
library(Matrix)
library(arules)

# data reading

In [None]:
# Load the semicolon-separated source file (path is relative to the notebook).
# The encoding argument stays commented out, exactly as before.
base <- fread("../data/Datos.csv", sep = ";") # , encoding="UTF-8")
# Show the first three rows as a quick sanity check.
head(base, n = 3)

# data cleaning

In [None]:
# First word cloud, drawn from the raw (not yet cleaned) descriptions.
# copy() gives base_text its own table: a plain assignment of a data.table only
# binds a second name to the same object, so a later by-reference change to one
# could show up in the other.
base_text <- copy(base)
wordcloud(
  base_text$Descripcion,
  max.words = 100,
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  random.order = FALSE,
  colors = brewer.pal(name = "Dark2", n = 8)
)

In [None]:
# Clean the free-text column `Descripcion` in a single pipeline. Steps, in order:
#   1. control characters -> a space
#   2. lower case
#   3. remove punctuation
#   4. remove digits
#   5. remove Spanish stop words (prepositions, filler words)
#   6. remove words judged uninformative by expert criteria ("usuario")
#   7. dictionary replacement: the standalone word "reinicia" -> "reiniciar"
#   8. strip accents and map n-tilde / u-diaeresis to plain letters
#   9. collapse repeated whitespace
base_text$Descripcion <- base_text$Descripcion %>%
  gsub(pattern = "[[:cntrl:]]", replacement = " ") %>%
  tolower() %>%
  removePunctuation() %>%
  removeNumbers() %>%
  # Stop words are removed while the text still carries its accents, the same
  # relative order as before.
  removeWords(words = stopwords("spanish")) %>%
  removeWords(words = c("usuario")) %>%
  # \\b anchors the match to a whole word. Without it the pattern also matched
  # inside "reiniciar" (and other inflections) and produced "reiniciarr".
  gsub(pattern = "\\breinicia\\b", replacement = "reiniciar") %>%
  # One character-for-character translation replaces seven separate gsub() calls.
  chartr(old = "áéíóúñü", new = "aeiounu") %>%
  # Runs last because removeWords() leaves gaps where the deleted words were.
  stripWhitespace()

# Corpus

In [None]:
# Corpus 1: one document per cleaned description.
base_corpus <- Corpus(VectorSource(base_text$Descripcion))

In [None]:
# Corpus 2: the same documents passed through the Spanish stemmer.
# (The original note read: sin "genero" -- presumably because stemming drops
# gender endings; confirm with the author.)
base_corpus_stem <- tm_map(base_corpus, stemDocument, language = "spanish")

In [None]:
# Word cloud of the stemmed corpus.
wordcloud(
  base_corpus_stem,
  max.words = 100,
  # FALSE rather than F: F is an ordinary variable that can be reassigned.
  random.order = FALSE,
  colors = brewer.pal(name = "Dark2", n = 8)
)

In [None]:
# Term-document matrix of the plain corpus.
base_tdm <- TermDocumentMatrix(base_corpus)

In [None]:
# Term-document matrix of the stemmed corpus.
base_tdm_stem <- TermDocumentMatrix(base_corpus_stem)

In [None]:
# Drop sparse terms (maximum allowed sparsity 0.965).
# The earlier `base_new = base_tdm` was a dead store, overwritten on the very
# next line, so it is gone.
base_new <- removeSparseTerms(base_tdm, sparse = 0.965)

In [None]:
# Drop sparse terms from the stemmed matrix, same threshold.
# (The dead store `base_new_stem = base_tdm_stem` is removed here too.)
base_new_stem <- removeSparseTerms(base_tdm_stem, sparse = 0.965)

In [None]:
# Size of the reduced plain matrix.
dim(base_new)

In [None]:
# Size of the reduced stemmed matrix.
dim(base_new_stem)

In [None]:
# Dense matrix version of the reduced plain term-document matrix.
base_mat <- as.matrix(base_new)

In [None]:
# Dense matrix version of the reduced stemmed term-document matrix.
base_mat_stem <- as.matrix(base_new_stem)

# Words frequency

In [None]:
# Row sums of the term matrix, sorted in decreasing order, then stored as a
# data frame with the row name in `palabra` and the sum in `frec`.
base_mat_Freq <- sort(rowSums(base_mat), decreasing = TRUE)
base_mat_Freq <- data.frame(
  palabra = names(base_mat_Freq),
  frec = base_mat_Freq
)

In [None]:
# Same table for the stemmed matrix.
base_mat_Freq_stem <- sort(rowSums(base_mat_stem), decreasing = TRUE)
base_mat_Freq_stem <- data.frame(
  palabra = names(base_mat_Freq_stem),
  frec = base_mat_Freq_stem
)

In [None]:
# Ten most common words.
head(base_mat_Freq, n = 10)

In [None]:
# Ten most common stems.
head(base_mat_Freq_stem, n = 10)

In [None]:
# Horizontal bar chart of the ten most frequent words.
# head() is used instead of [1:10, ] so a table with fewer than ten rows does
# not produce all-NA rows.
head(base_mat_Freq, n = 10) %>%
  # reorder() sorts the bars by frequency; a bare text column would be laid out
  # alphabetically and hide the ranking.
  ggplot(aes(reorder(palabra, frec), frec)) +
  geom_col(color = "black", fill = "darkcyan") +
  # hjust is a fixed offset, not a data mapping, so it sits outside aes().
  geom_text(aes(label = frec), hjust = 1.3) +
  coord_flip() +
  labs(title = "Diez palabras más frecuentes en Niebla",  x = "Palabras", y = "Número de usos")

# Cluster K-means

In [None]:
# Standardise the stemmed term matrix column by column.
# NOTE(review): a column with zero variance would turn into NaN here and make
# the kmeans() calls below fail -- confirm no such column exists.
base_mat2_stem <- scale(base_mat_stem)

In [None]:
# Dimensions of the standardised matrix.
dim(base_mat2_stem)

In [None]:
# Sum of squares for the unscaled stemmed matrix: summed column variances
# times (number of rows - 1).
wss <- sum(apply(base_mat_stem, 2, var)) * (nrow(base_mat_stem) - 1)

In [None]:
# Print it.
wss

In [None]:
###### ------------------------------------ Choose the number of clusters
# Plain matrix: wss[1] is the total sum of squares, wss[i] the total
# within-cluster sum of squares for i = 2..10 clusters.
wss <- sum(apply(base_mat, 2, var)) * (nrow(base_mat) - 1)

for (i in 2:10) {
  wss[i] <- kmeans(base_mat, centers = i)$tot.withinss
}


In [None]:
###### ------------------------------------ Choose the number of clusters
# Standardised stemmed matrix, i = 2..5 clusters. This overwrites the wss
# vector built in the previous cell.
wss <- sum(apply(base_mat2_stem, 2, var)) * (nrow(base_mat2_stem) - 1)

for (i in 2:5) {
  wss[i] <- kmeans(base_mat2_stem, centers = i)$tot.withinss
}

In [None]:
# Elbow plot: sum of squares against the number of clusters.
# wss[k] holds the value for k clusters, so the x axis is built from wss
# itself. The previous call paired ten x values (1:10) with nine y values
# (wss[2:10]) and stopped with "'x' and 'y' lengths differ".
plot(seq_along(wss), wss, type = "b", xlab = "Number of Clusters",
  ylab = "Within groups sum of squares", main = "Clusters) Sum_Cuad")

In [None]:
# Dimensions of the plain and the stemmed term matrices.
dim(base_mat)
dim(base_mat_stem)

In [None]:

###### ---------------------------------- K-means cluster analysis
# Partition the rows of base_mat into 7 clusters.
fit <- kmeans(x = base_mat, centers = 7)
# Cluster means could be inspected with:
# aggregate(base_mat, by = list(fit$cluster), FUN = mean)
# Append each row's cluster label as the column `fit.cluster`.
DatosKmeans <- data.frame(base_mat, fit.cluster = fit$cluster)


In [None]:
# Number of rows assigned to each cluster.
table(DatosKmeans$fit.cluster)

In [None]:
# Rows that ended up in cluster 6.
DatosKmeans[DatosKmeans$fit.cluster == 6, ]

# Cluster Jerárquico aglomerativo

In [None]:
# Standardise the plain term matrix column by column.
# NOTE(review): base_mat2 is not used by the cells below -- the distances are
# computed from base_new. Confirm that is intended.
base_mat2 <- scale(base_mat)

In [None]:
# Pairwise distances between the rows of base_new.
base_dist <- dist(base_new, method = "euclidian")

In [None]:
# Hierarchical clustering of those distances with the "ward.D" method.
base_hclust <- hclust(d = base_dist, method = "ward.D")

In [None]:
# Dendrogram.
plot(base_hclust, xlab = "", sub = "", main = "Dendrograma de palabras")

In [None]:
# Dendrogram again, with the 4-group cut outlined in blue.
plot(base_hclust, xlab = "", sub = "", main = "Dendrograma de palabras")
rect.hclust(tree = base_hclust, k = 4, border = "blue")

# Association rules

In [None]:
# Terms of the stemmed matrix whose correlation with each listed term is at
# least 0.05.
findAssocs(
  base_new_stem,
  terms = c(
    "evolucion", "fractur", "traum", "continu", "establ", "derech",
    "izquierd", "quemadur", "clinic", "diagnost", "accident", "cirugi",
    "hemodinam", "transit", "control", "dolor", "antecedent"
  ),
  corlimit = 0.05
)