# train_model.R
# Code taken from http://analyzecore.com/2017/02/08/twitter-sentiment-analysis-doc2vec/
# loading packages
library(twitteR)    # Twitter API client (used for tweet collection in the original post)
library(ROAuth)     # OAuth handshake for twitteR
library(tidyverse)  # read_csv(), mutate(), pipes
library(text2vec)   # itoken(), vocabulary, DTM, TfIdf
library(caret)      # createDataPartition()
library(glmnet)     # cv.glmnet() regularized logistic regression
library(ggrepel)    # label placement for plots in the original post
library(tm)         # general text-mining utilities
### loading and preprocessing a training set of tweets
# helper that converts latin1 text to ASCII, dropping characters that have no ASCII equivalent
conv_fun <- function(x) iconv(x, from = "latin1", to = "ASCII", sub = "")
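# quick illustration (hypothetical input): latin1 bytes with no ASCII mapping
# are dropped rather than mangled, e.g. conv_fun("caf\xe9") -> "caf"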
##### loading classified tweets ######
# source: http://help.sentiment140.com/for-students/
# 0 - the polarity of the tweet (0 = negative, 4 = positive)
# 1 - the id of the tweet
# 2 - the date of the tweet
# 3 - the query. If there is no query, then this value is NO_QUERY.
# 4 - the user that tweeted
# 5 - the text of the tweet
tweets_classified <- read_csv('training.1600000.processed.noemoticon.csv',
                              col_names = c('sentiment', 'id', 'date', 'query', 'user', 'text')) %>%
  # converting symbols (mutate() replaces the retired purrrlyr::dmap_at)
  mutate(text = conv_fun(text)) %>%
  # recoding polarity: 0 stays 0 (negative), 4 becomes 1 (positive)
  mutate(sentiment = ifelse(sentiment == 0, 0, 1))
# some tweets have NA ids; give them dummy ids above the current maximum
# so they cannot collide with existing ids
tweets_classified_na <- tweets_classified %>%
  filter(is.na(id)) %>%
  mutate(id = max(tweets_classified$id, na.rm = TRUE) + seq_len(n()))
tweets_classified <- tweets_classified %>%
  filter(!is.na(id)) %>%
  bind_rows(tweets_classified_na)
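# sanity check (not in the original post): ids must be unique because they
# serve as document ids in itoken() below
stopifnot(!anyDuplicated(tweets_classified$id))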
# splitting the data into train and test sets
set.seed(2340)
trainIndex <- createDataPartition(tweets_classified$sentiment, p = 0.8,
                                  list = FALSE,
                                  times = 1)
tweets_train <- tweets_classified[trainIndex, ]
tweets_test <- tweets_classified[-trainIndex, ]
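# optional check (not in the original post): the partition should preserve
# the class balance on both sides
print(prop.table(table(tweets_train$sentiment)))
print(prop.table(table(tweets_test$sentiment)))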
##### doc2vec #####
# define preprocessing function and tokenization function
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- itoken(tweets_train$text,
                   preprocessor = prep_fun,
                   tokenizer = tok_fun,
                   ids = tweets_train$id,
                   progressbar = TRUE)
it_test <- itoken(tweets_test$text,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = tweets_test$id,
                  progressbar = TRUE)
# creating vocabulary and document-term matrix
vocab <- create_vocabulary(it_train)
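# optional step (not in the original post): prune very rare and very common
# terms to shrink the DTM; the thresholds here are illustrative only
# vocab <- prune_vocabulary(vocab, term_count_min = 10, doc_proportion_max = 0.5)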
vectorizer <- vocab_vectorizer(vocab)
dtm_train <- create_dtm(it_train, vectorizer)
dtm_test <- create_dtm(it_test, vectorizer)
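# the two DTMs share the same columns because the vocabulary (and vectorizer)
# were built on the training data only; quick shape check:
print(dim(dtm_train))
print(dim(dtm_test))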
# define tf-idf model
tfidf <- TfIdf$new()
# fit tf-idf on the train DTM, then apply the fitted weights to the test DTM
# (fit_transform on the test set would re-fit the idf weights and leak test statistics)
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
dtm_test_tfidf <- transform(dtm_test, tfidf)
# train the classifier: logistic regression with L1 penalty, 5-fold CV
t1 <- Sys.time()
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf,
                               y = tweets_train[['sentiment']],
                               family = 'binomial',
                               # L1 penalty (lasso)
                               alpha = 1,
                               # select lambda by area under the ROC curve
                               type.measure = "auc",
                               # 5-fold cross-validation
                               nfolds = 5,
                               # looser convergence threshold: faster, slightly less accurate
                               thresh = 1e-3,
                               # likewise, a lower iteration cap speeds up training
                               maxit = 1e3)
print(difftime(Sys.time(), t1, units = 'mins'))
plot(glmnet_classifier)
# best mean cross-validated AUC across the lambda path
print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
# AUC on the held-out test set; auc() is not exported by recent glmnet
# versions, hence the triple-colon access to the internal helper
preds <- predict(glmnet_classifier, dtm_test_tfidf, type = 'response')[, 1]
glmnet:::auc(as.numeric(tweets_test$sentiment), preds)
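# optional (not in the original post): hard-label metrics at a 0.5 cutoff;
# the cutoff is an illustrative default, not a tuned choice
print(confusionMatrix(factor(ifelse(preds >= 0.5, 1, 0), levels = c(0, 1)),
                      factor(tweets_test$sentiment, levels = c(0, 1))))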
# save the model for future use
saveRDS(glmnet_classifier, 'glmnet_classifier.RDS')
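# note (not in the original post): scoring new tweets later also needs the
# fitted vectorizer and tf-idf model, so persist them alongside the classifier
# (file names are illustrative)
saveRDS(vectorizer, 'vectorizer.RDS')
saveRDS(tfidf, 'tfidf.RDS')
# minimal scoring sketch for a future batch of texts in `new_texts`:
# it_new  <- itoken(new_texts, preprocessor = prep_fun, tokenizer = tok_fun)
# dtm_new <- transform(create_dtm(it_new, vectorizer), tfidf)
# p_new   <- predict(glmnet_classifier, dtm_new, type = 'response')[, 1]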
#######################################################