# 1. Table Data 

In [2]:
library(tm)
library(tm.plugin.sentiment)
library(glmnet)

## 총 정리 및 복습.

In [27]:
mobile <- read.csv("mobile2014.csv", stringsAsFactors = F, encoding="UTF-8")

In [28]:
corpus <- Corpus(VectorSource(mobile$Texts))

In [29]:
dtm <- DocumentTermMatrix(corpus,
                          control = list(tolower = T,
                                         removePunctuation = T,
                                         removeNumbers = T,
                                         wordLengths=c(2, 10),
                                         stopwords = stopwords("SMART"),
                                         weighting = weightTfIdf))

In weighting(x): empty document(s): 534 1585

In [30]:
my.stopwords <- c(stopwords("SMART"), "aaa") # stopwords에 추가.

In [31]:
my.stopwords[1:10]

In [32]:
X <- as.matrix(dtm)
Y <- mobile$Sentiment

In [33]:
res.lm <- glmnet(X, Y, family = 'binomial', lambda = 0)
coef.lm <- coef(res.lm)[,1]

pos.lm <- coef.lm[coef.lm > 0]
neg.lm <- coef.lm[coef.lm < 0]

pos.lm <- sort(pos.lm, decreasing = T)
neg.lm <- sort(neg.lm, decreasing = F)

In [34]:
res.lasso <- glmnet(X, Y, family = 'binomial', alpha = 1)
set.seed(12345)
res.lasso <- cv.glmnet(X, Y, family = 'binomial', alpha = 1,
                       nfolds = 4, type.measure = 'class') # 최적의 람다값을 찾기위해서 cv.glmnet 을 사용.

# nfodls : 훈련할 학습할 데이터와 검증할 데이터를 나누는 방법을 설정 
# 몇대 몇 : nfodls = 3:! 로 나눠서 트레이닝 및 테스트

In [35]:
coef.lasso <- coef(res.lasso, s = 'lambda.min')[,1]
pos.lasso <- coef.lasso[coef.lasso > 0]
neg.lasso <- coef.lasso[coef.lasso < 0]
pos.lasso <- sort(pos.lasso, decreasing = T)
neg.lasso <- sort(neg.lasso, decreasing = F)

In [36]:
set.seed(12345)
res.ridge <- cv.glmnet(X, Y, family = 'binomial', alpha = 0,
                       nfolds = 4, type.measure = 'class')
coef.ridge <- coef(res.ridge, s = 'lambda.min')[,1]
pos.ridge <- coef.ridge[coef.ridge > 0]
neg.ridge <- coef.ridge[coef.ridge < 0]
pos.ridge <- sort(pos.ridge, decreasing = T)
neg.ridge <- sort(neg.ridge, decreasing = F)

In [37]:
set.seed(12345)
res.elastic <- cv.glmnet(X, Y, family = 'binomial', alpha = .5,
                         nfolds = 4, type.measure = 'class')
coef.elastic <- coef(res.elastic, s = 'lambda.min')[,1]
pos.elastic <- coef.elastic[coef.elastic > 0]
neg.elastic <- coef.elastic[coef.elastic < 0]
pos.elastic <- sort(pos.elastic, decreasing = T)
neg.elastic <- sort(neg.elastic, decreasing = F)

In [38]:
data.test <- read.csv('tablet2014_test.csv', stringsAsFactors = F, encoding="UTF-8")

In [39]:
head(data.test,2)

Unnamed: 0,X,Title,Author,ReviewID,Texts,YMD,Sentiment
1,118530,do not reccomend,April Tyler,R3GRP77KRB556H,"Did not meet expectations. Slow did not work well with wi-fi at all. Slow downloads, games, apps....was returned day after Christmas:(",1/2/14,0
2,49108,Very poor product!,YVONNE BOWDEN,R27E8KSLJMUQZV,"I would advise you not purchase this product. I bought three of them for my grandchildren, and they are all a problem. The screen blacks out, chargers don't work, and if you do get it charged, it lasts for maybe 45 minutes. If it was jus one of them, maybe just a defective item, but all three are bad.",1/16/14,0


In [40]:
corpus <- Corpus(VectorSource(data.test$Texts))

In [42]:
Terms(dtm)

   [1] "african"               "calls"                 "disable"              
   [4] "incoming"              "money"                 "outgoing"             
   [7] "phone"                 "settings"              "south"                
  [10] "text"                  "waste"                 "work"                 
  [13] "bought"                "completely"            "decision"             
  [16] "despiste"              "iphone"                "problem"              
  [19] "product"               "recieved"              "satisfied"            
  [22] "solve"                 "unlocked"              "yo"                   
  [25] "batteries"             "boy"                   "brand"                
  [28] "daughter"              "eats"                  "faster"               
  [31] "lg"                    "model"                 "thought"              
  [34] "wrong"                 "att"                   "bad"                  
  [37] "card"                  "good"               

In [43]:
dtm.test <- DocumentTermMatrix(corpus,
                               control = list(tolower = T,
                                              removePunctuation = T,
                                              removeNumbers = T,
                                              wordLengths=c(2, 5),
                                              stopwords = stopwords("SMART"),
                                              weighting = weightTfIdf,
                                              dictionary = Terms(dtm)))

ERROR: Error in tolower(txt): 'utf8towcs'내에 입력 'its a good good good good good product,sometimes little bit stuck<95>_<ce>all in all it's a good good product any more'이 잘못되었습니다


In [None]:
senti.lm.test <- polarity(dtm.test, names(pos.lm), names(neg.lm))
senti.lasso.test <- polarity(dtm.test, names(pos.lasso), names(neg.lasso))
senti.ridge.test <- polarity(dtm.test, names(pos.ridge), names(neg.ridge))
senti.elastic.test <- polarity(dtm.test, names(pos.elastic), names(neg.elastic))

senti.lm.b.test <- ifelse(senti.lm.test > 0, 1, 0)
senti.lasso.b.test <- ifelse(senti.lasso.test > 0, 1, 0)
senti.ridge.b.test <- ifelse(senti.ridge.test > 0, 1, 0)
senti.elastic.b.test <- ifelse(senti.elastic.test > 0, 1, 0)

library(caret)

confusionMatrix(senti.lm.b.test, data.test$Sentiment)
confusionMatrix(senti.lasso.b.test, data.test$Sentiment)
confusionMatrix(senti.ridge.b.test, data.test$Sentiment)
confusionMatrix(senti.elastic.b.test, data.test$Sentiment)