# code in DMAR chapter
# This code was last run successfully in Nov 2013 in this environment:
# R version 3.0.2 (2013-09-25)
# Platform: x86_64-w64-mingw32/x64 (64-bit)
# and these packages:
# [1] topicmodels_0.1-12 slam_0.1-30 pvclust_1.2-2 plyr_1.8 Snowball_0.0-10
# [6] rJava_0.9-4 tm_0.5-9.1 Matrix_1.1-0 sna_2.3-1 igraph_0.6.6
# [11] ggplot2_0.9.3.1 stringr_0.6.2
# get package with functions for interacting with Twitter.com
require(twitteR)
# get 1500 tweets with #aaa2011 tag, note that 1500 is the max, and it's subject to filtering and other restrictions by Twitter
aaa2011 <- searchTwitter('#AAA2011', n=1500)
# convert to data frame
df <- do.call("rbind", lapply(aaa2011, as.data.frame))
# get column names to see structure of the data
names(df)
# look at the first three rows to check content
head(df,3)
# Note that the code accompanying this chapter allows for the reader to
# collect their own data directly from Twitter. However, the Twitter API changes
# frequently, causing difficulties in reproducing the results of this code. To
# improve the reproducibility of this study, the data used in this chapter
# are also available as a CSV file. The data are available at
# http://dx.doi.org/10.6084/m9.figshare.855630
# The following three lines will read in the
# CSV file to the workspace, ready for further analysis
dir <- "..." # full path to the folder on your computer containing the file
file <- "Marwick_DMAR-chapter-3.csv" # name of the CSV file
df <- read.csv(paste(dir, file, sep = "/"), stringsAsFactors = FALSE)
# see how many unique Twitter accounts in the sample
length(unique(df$screenName))
# Create a new column of random numbers in place of the usernames and redraw the plots
# find out how many random numbers we need
n <- length(unique(df$screenName))
# generate a vector of random numbers to replace the names; four-digit numbers are used just for convenience
randuser <- sample(1000:9999, n) # sample without replacement so no two accounts share the same number
# match up a random number to a username
screenName <- unique(df$screenName)
screenName <- sapply(screenName, as.character)
randuser <- cbind(randuser, screenName)
# Now merge the random numbers with the rest of the Twitter data, and match up the correct random numbers with multiple instances of the usernames...
rand.df <- merge(randuser, df, by="screenName")
# determine the frequency of tweets per account
counts <- table(rand.df$randuser)
# create an ordered data frame for further manipulation and plotting
countsSort <- data.frame(user = names(sort(counts, decreasing = TRUE)), count = as.numeric(sort(counts, decreasing = TRUE)), row.names = NULL)
# create a subset of accounts to plot; the threshold of > 0 keeps every account with at least one tweet, and can be raised (e.g. to >= 5) to show only frequent tweeters
countsSortSubset <- subset(countsSort, countsSort$count > 0)
# extract counts of how many tweets from each account were retweeted
# first clean the twitter messages by removing odd characters
rand.df$text <- sapply(rand.df$text,function(row) iconv(row,to = 'UTF-8'))
# remove @ symbol from user names
trim <- function (x) sub('@','',x)
# pull out who the message is to
require(stringr)
rand.df$to <- sapply(rand.df$text, function(name) trim(name))
# extract who has been retweeted
rand.df$rt <- sapply(rand.df$text, function(tweet)
trim(str_match(tweet,"^RT (@[[:alnum:]_]*)")[2]))
# replace names with corresponding anonymising number
randuser <- data.frame(randuser)
rand.df$rt.rand <- as.character(randuser$randuser)[match(as.character(rand.df$rt),
as.character(randuser$screenName))]
# make a table with anonymised IDs and number of RTs for each account
countRT <- table(rand.df$rt.rand)
countRTSort <- sort(countRT)
# subset of accounts that were retweeted more than twice
countRTSortSubset <- countRTSort[countRTSort > 2]
# create a data frame for plotting
countRTSortSubset.df <-data.frame(user = as.factor(unlist(dimnames(countRTSortSubset))), RT_count = as.numeric(unlist(countRTSortSubset)))
# combine tweet and retweet counts into one data frame
countUser <- merge(randuser, countsSortSubset, by.x = "randuser", by.y = "user")
TweetRetweet <- merge(countUser, countRTSortSubset.df, by.x = "randuser", by.y = "user", all.x = TRUE)
# create a Cleveland dot plot of tweet counts and retweet counts per Twitter account
# solid data point = number of tweets, letter R = number of retweets
require(ggplot2)
require(grid)
ggplot() +
geom_point(data = TweetRetweet, mapping = aes(reorder(randuser, count), count), size = 3) +
geom_point(data = TweetRetweet, mapping = aes(randuser, RT_count), size = 4, shape = "R") +
xlab("Author") +
ylab("Number of messages") +
coord_flip() +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle=90)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
# calculate the number of followers of each Twitter account
# extract the usernames from the non-anonymised dataset
# Note that this may take some time! And it's dependent on the Twitter
# API, which changes frequently.
users <- unique(df$screenName)
users <- sapply(users, as.character)
# make a data frame for further manipulation
users.df <- data.frame(users = users, followers = "", stringsAsFactors = FALSE)
# loop to populate users$followers with a follower count obtained from Twitter API
for (i in 1:nrow(users.df))
{
# tell the loop to skip a user if their account is protected
# or some other error occurs
result <- try(getUser(users.df$users[i])$followersCount, silent = FALSE)
if (inherits(result, "try-error")) next
# store the follower count just fetched (avoids a second API call for the same user)
users.df$followers[i] <- result
# tell the loop to pause for 60 s between iterations to
# avoid exceeding the Twitter API request limit
# this is going to take a long time if there are a lot
# of users, good idea to let it run overnight
print('Sleeping for 60 seconds...')
Sys.sleep(60);
}
# merge follower count with number of tweets per author
followerCounts <- merge(TweetRetweet, users.df, by.x = "screenName", by.y = "users")
# convert values to numeric for further analysis
followerCounts$followers <- as.numeric(followerCounts$followers)
followerCounts$count <- as.numeric(followerCounts$count)
# create a plot
ggplot(data = followerCounts, aes(count, followers)) +
geom_text(aes(label = randuser, size = RT_count)) +
scale_size(range=c(3,10)) +
scale_x_log10(breaks = c(10,20,40,60,80,100)) +
scale_y_log10(breaks = c(10,100,seq(1000,7000,1000))) +
xlab("Number of Messages") +
ylab("Number of Followers") +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle=90)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
# Ratio of retweets to tweets for some more insights
#
# make table with counts of tweets per person
t <- as.data.frame(table(rand.df$randuser))
# make table with counts of retweets per person
rt <- as.data.frame(table(rand.df$rt.rand))
# combine tweet count and retweet count per person
t.rt <- merge(t,rt,by="Var1")
# create a new column with the ratio of retweets received to original tweets
t.rt["ratio"] <- t.rt$Freq.y / t.rt$Freq.x
# sort it to put names in order by ratio
sort.t.rt <- t.rt[order(t.rt$ratio),]
# exclude those with 2 tweets or less
sort.t.rt.subset <- subset(sort.t.rt,sort.t.rt$Freq.x>2)
#
# drop unused levels leftover from subsetting
sort.t.rt.subset.drop <- droplevels(sort.t.rt.subset)
# plot nicely ordered ratios by person for
# people with more than 2 tweets
ggplot(sort.t.rt.subset.drop, aes(reorder(Var1, ratio), ratio)) +
xlab("Author") +
ylab("Ratio of messages retweeted by others to original messages")+
geom_point() +
coord_flip() +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle=90)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
# extract tweeter-retweeted pairs
rt <- data.frame(user=rand.df$randuser, rt=rand.df$rt.rand)
# omit pairs with NA and get only unique pairs
rt.u <- na.omit(unique(rt))
# begin social network analysis plotting
require(igraph)
require(sna)
degree <- sna::degree
g <- graph.data.frame(rt.u, directed = F)
# note that the igraph package appears to have changed its plot
# function between the time that the original code was used for the
# analysis and revision for publication. The following line will
# plot a basic network graph, ready for further customisation:
plot.igraph(g)
# and here is a rough approximation of the published plot
plot(g, #the graph to be plotted
layout=layout.fruchterman.reingold, # the layout method. see the igraph documentation for details
vertex.label.dist=0.5, #puts the name labels slightly off the dots
vertex.frame.color='blue', #the color of the border of the dots
vertex.label.color='black', #the color of the name labels
vertex.label.font=3, #the font of the name labels
vertex.label.cex=0.5, #specifies the size of the font of the labels. can also be made to vary
vertex.size = 0.5
)
# get some basic network attributes
g.adj <- get.adjacency(g, sparse=FALSE)
grecip(g.adj) # reciprocity
gtrans(g.adj) # transitivity
centralization(g.adj, degree) # degree centralisation
# calculate Univariate Conditional Uniform Graph Tests
# density
print(cug.gden <- cug.test(g.adj, gden))
plot(cug.gden)
range(cug.gden$rep.stat)
# reciprocity
cug.recip <- cug.test(g.adj, grecip)
print(cug.recip)
plot(cug.recip)
range(cug.recip$rep.stat)
# transitivity
cug.gtrans <- cug.test(g.adj, gtrans)
print(cug.gtrans)
plot(cug.gtrans)
range(cug.gtrans$rep.stat)
# centralisation
print(cug.cent <- cug.test(g.adj, centralization, FUN.arg=list(FUN=degree)))
# find out how many communities exist in the network using the walktrap algorithm
g.wc <- walktrap.community(g, steps = 1000, modularity=TRUE)
plot(as.dendrogram(g.wc))
length(unique(g.wc$membership)) # number of communities detected
# some basic and widely-used text mining techniques to identify the issues
# that captured the attention of Twitter-using anthropologists during the meeting.
require(tm)
a <- Corpus(VectorSource(df$text)) # create corpus object
a <- tm_map(a, tolower) # convert all text to lower case
a <- tm_map(a, removePunctuation)
a <- tm_map(a, removeNumbers)
a <- tm_map(a, removeWords, stopwords("english")) # this list needs to be edited and this function repeated a few times to remove high frequency context specific words with no semantic value
require(rJava) # needed for stemming function
require(Snowball) # also needed for stemming function
a <- tm_map(a, stemDocument, language = "english") # stem the words so that variant forms are collapsed to a single token (e.g. 'science' and 'sciences' become 'scienc')
a.tdm <- TermDocumentMatrix(a, control = list(minWordLength = 3)) # create a term document matrix, keeping only tokens of at least three characters, since shorter tokens are very hard to interpret
inspect(a.tdm[1:10,1:10]) # have a quick look at the term document matrix
findFreqTerms(a.tdm, lowfreq = 30) # have a look at common words, in this case those that appear at least 30 times; it is good to get the high frequency words, add context-specific ones with no semantic value to the stopword list and re-make the tdm, in this case adding aaa, panel and session
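# A minimal sketch of that iterative clean-up (the object names a2 and a.tdm2 are
# illustrative, not part of the original analysis): add the high frequency,
# low-content words to the stopword list and re-make the term document matrix
a2 <- tm_map(a, removeWords, c("aaa", "panel", "session"))
a.tdm2 <- TermDocumentMatrix(a2, control = list(minWordLength = 3))
findFreqTerms(a.tdm2, lowfreq = 30) # check the high frequency words again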
findAssocs(a.tdm, 'scienc', 0.3) # find words associated with a common word and the strength of the association. I repeated this function for the ten most frequent words (see the sketch below).
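# A minimal sketch of repeating findAssocs() over the ten most frequent tokens
# (term.freqs, top10 and assocs are illustrative object names, not part of the original analysis)
require(slam) # row_sums() gives the total frequency of each token
term.freqs <- sort(row_sums(a.tdm), decreasing = TRUE)
top10 <- names(term.freqs)[1:10]
assocs <- lapply(top10, function(tok) findAssocs(a.tdm, tok, 0.3))
names(assocs) <- top10
assocs # inspect the associations for each of the ten tokens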
# investigate the URLs contained in the Twitter messages
require(stringr)
require(ggplot2)
require(grid)
df$link <- sapply(df$text,function(tweet) str_extract(tweet,("http[[:print:]]+"))) # creates new field and extracts the links contained in the tweet
df$link <- sapply(df$text,function(tweet) str_extract(tweet,"http[[:print:]]{16}")) # limits to just 16 characters after http so I just get the shortened link.
countlink <- data.frame(URL = as.character(unlist(dimnames(sort(table(df$link))))), N = sort(table(df$link))) # get frequencies of each link and put in rank order
rownames(countlink) <- NULL # remove rownames
countlinkSub <- subset(countlink, N>2) # subset of just links appearing more than twice
# plot to see distribution of links
ggplot(countlinkSub, aes(reorder(URL, N), N)) +
xlab("URL") +
ylab("Number of messages containing the URL")+
geom_point() +
coord_flip() +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle=90)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
# plot links
countlink <- data.frame(link = na.omit(df$link))
qplot(countlink$link, geom = "bar") + coord_flip() # but not sorted, so let's keep going...
require(plyr)
links <- count(countlink, "link") # get a data frame with two named vectors for sorting
qplot(reorder(link, freq), freq, data = links, geom = "bar", stat = "identity") + coord_flip() # and here it is in order
links2 <- subset(links, links$freq > 2) # subset of just links appearing more than twice
links2$link <- factor(links2$link, levels = links2$link) # create factor for sorting
qplot(reorder(links2$link, links2$freq), links2$freq, data = links2, geom = "bar", stat = "identity") + coord_flip() # plot nicely sorted
# This is based on Jeffrey Breen's excellent tutorial at http://jeffreybreen.wordpress.com/2011/07/04/twitter-text-mining-r-slides/
# download sentiment word list from here: http://www.cs.uic.edu/~liub/FBS/opinion-lexicon-English.rar un-rar and put somewhere logical on your computer
hu.liu.pos = scan('E:/My Documents/My Papers/conferences/SAA2010/opinion-lexicon-English/positive-words.txt', what = 'character',comment.char=';') #load +ve sentiment word list
hu.liu.neg = scan('E:/My Documents/My Papers/conferences/SAA2010/opinion-lexicon-English/negative-words.txt',what = 'character',comment.char= ';') #load -ve sentiment word list
pos.words = c(hu.liu.pos)
neg.words = c(hu.liu.neg)
score.sentiment = function(sentences, pos.words, neg.words, .progress='none')
{
require(plyr)
require(stringr)
# we got a vector of sentences. plyr will handle a list
# or a vector as an "l" for us
# we want a simple array ("a") of scores back, so we use
# "l" + "a" + "ply" = "laply":
scores = laply(sentences, function(sentence, pos.words, neg.words) {
# clean up sentences with R's regex-driven global substitute, gsub():
sentence = gsub('[[:punct:]]', '', sentence)
sentence = gsub('[[:cntrl:]]', '', sentence)
sentence = gsub('\\d+', '', sentence)
# and convert to lower case:
sentence = tolower(sentence)
# split into words. str_split is in the stringr package
word.list = str_split(sentence, '\\s+')
# sometimes a list() is one level of hierarchy too much
words = unlist(word.list)
# compare our words to the dictionaries of positive & negative terms
pos.matches = match(words, pos.words)
neg.matches = match(words, neg.words)
# match() returns the position of the matched term or NA
# we just want a TRUE/FALSE:
pos.matches = !is.na(pos.matches)
neg.matches = !is.na(neg.matches)
# and conveniently enough, TRUE/FALSE will be treated as 1/0 by sum():
score = sum(pos.matches) - sum(neg.matches)
return(score)
}, pos.words, neg.words, .progress=.progress )
scores.df = data.frame(score=scores, text=sentences)
return(scores.df)
}
require(plyr)
aaa.text <- df$text # get text of tweets
length(aaa.text) #check how many tweets, make sure it agrees with the original sample size
head(aaa.text, 5) #check content sample, see that it looks as expected, no weird characters, etc.
aaa.scores <- score.sentiment(aaa.text,pos.words,neg.words,.progress='text') # get scores for the tweet text
# create a histogram of sentiment scores
ggplot(aaa.scores, aes(x=score)) +
geom_histogram(binwidth=1) +
xlab("Sentiment score") +
ylab("Frequency") +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
aaa.pos <- subset(aaa.scores,aaa.scores$score >= 2) # get tweets with only very +ve scores
aaa.neg <- subset(aaa.scores,aaa.scores$score <= -2) # get tweets with only very -ve scores
# Now create subsets of tweets containing certain words, such as the high frequency words identified in the text mining, e.g. science
scien <- subset(aaa.scores, regexpr("scien", aaa.scores$text) > 0) # extract tweets containing only 'scien'
# plot histogram of sentiment scores for this token
ggplot(scien, aes(x = score)) +
geom_histogram(binwidth = 1) +
xlab("Sentiment score for the token 'scien'") +
ylab("Frequency") +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
# repeat this block with different high frequency words
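# A minimal sketch of a helper for repeating that block with other tokens
# (the function name plot_token_sentiment is illustrative, not part of the original analysis)
plot_token_sentiment <- function(token) {
  ggplot(subset(aaa.scores, regexpr(token, aaa.scores$text) > 0), aes(x = score)) +
    geom_histogram(binwidth = 1) +
    xlab(paste("Sentiment score for the token '", token, "'", sep = "")) +
    ylab("Frequency") +
    theme_bw()
}
plot_token_sentiment("scien") # reproduces the plot above
# substitute other high frequency tokens identified in the text mining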
a.tdm.sp <- removeSparseTerms(a.tdm, sparse=0.989) # I found I had to iterate over this to ensure the tdm doesn't get too small... for example: 0.990 nrow=88, 0.989, nrow=67, 0.985, nrow=37, 0.98 nrow=23, 0.95 nrow=6
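# A minimal sketch of that iteration (not part of the original script): report how
# many terms survive at several candidate sparsity thresholds before choosing one
for (s in c(0.95, 0.98, 0.985, 0.989, 0.99)) {
  cat("sparse =", s, "-> terms kept:", nrow(removeSparseTerms(a.tdm, sparse = s)), "\n")
}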
a.tdm.sp.df <- as.data.frame(inspect(a.tdm.sp )) # convert document term matrix to data frame
nrow(a.tdm.sp.df) # check to see how many words we're left with after removing sparse terms
# this analysis is based on http://www.statmethods.net/advstats/cluster.html
# scale and transpose data for cluster analysis
a.tdm.sp.df.sc.t <- t(scale(a.tdm.sp.df))
require(pvclust)
fit <- pvclust(a.tdm.sp.df.sc.t, method.hclust = "average", method.dist = "correlation", nboot = 10) # the bootstrapping may take a few hours with a large nboot value; you can reduce nboot for a quicker result
plot(fit, cex = 1.5, cex.pv = 1.2, col.pv = c(1,0,0), main="", xlab="", sub="") # draw the dendrogram
require(slam)
a.tdm.sp.t <- t(a.tdm.sp) # transpose document term matrix, necessary for the next steps using mean term frequency-inverse document frequency (tf-idf) to select the vocabulary for topic modeling
summary(col_sums(a.tdm.sp.t)) # check median...
term_tfidf <- tapply(a.tdm.sp.t$v/row_sums(a.tdm.sp.t)[a.tdm.sp.t$i], a.tdm.sp.t$j,mean) * log2(nDocs(a.tdm.sp.t)/col_sums(a.tdm.sp.t>0)) # calculate tf-idf values
summary(term_tfidf) # check median... note value for next line...
a.tdm.sp.t.tdif <- a.tdm.sp.t[, term_tfidf >= 1.0] # keep only those terms that are slightly less frequent than the median
a.tdm.sp.t.tdif <- a.tdm.sp.t.tdif[row_sums(a.tdm.sp.t.tdif) > 0, ] # drop documents left empty after the term filtering
summary(col_sums(a.tdm.sp.t.tdif)) # have a look
# Before going right into generating the topic model and analysing the output, we need to decide on the number of topics that the model should use
# Here we loop over different topic numbers, get the log likelihood of the model for each topic number and plot them so we can pick the best one
# The best number of topics is the one with the highest log likelihood value.
require(topicmodels)
best.model <- lapply(seq(2, 50, by = 1), function(d){LDA(a.tdm.sp.t.tdif, d)}) # this will make a topic model for every number of topics between 2 and 50... it will take some time!
best.model.logLik <- as.data.frame(as.matrix(lapply(best.model, logLik))) # this will produce a list of logLiks for each model...
# plot the distribution of log likelihoods by number of topics
best.model.logLik.df <- data.frame(topics=c(2:50), LL = as.numeric(as.matrix(best.model.logLik)))
ggplot(best.model.logLik.df, aes(x = topics, y = LL)) +
xlab("Number of topics") +
ylab("Log likelihood of the model") +
geom_line() +
theme_bw() +
theme(axis.title.x = element_text(vjust = -0.5, size = 14)) +
theme(axis.title.y = element_text(size = 14, angle = 90, vjust = -0.25)) +
theme(plot.margin = unit(c(1,1,2,2), "lines"))
# ggsave(file = "model_LL_per_topic_number.pdf") # export the plot to a PDF file
# it's not easy to see exactly which topic number has the highest LL, so let's look at the data...
best.model.logLik.df.sort <- best.model.logLik.df[order(-best.model.logLik.df$LL), ] # sort to find out which number of topics has the highest loglik, in this case 23 topics.
best.model.logLik.df.sort # have a look to see what's at the top of the list, the one with the highest score
ntop <- best.model.logLik.df.sort[1,]$topics
lda <- LDA(a.tdm.sp.t.tdif, ntop) # generate an LDA model with the optimum number of topics
get_terms(lda, 5) # get keywords for each topic, just for a quick look
get_topics(lda, 5) # get the five most likely topics for each document
lda_topics <- get_topics(lda, 5)
beta <- lda@beta # create object containing parameters of the word distribution for each topic
gamma <- lda@gamma # create object containing posterior topic distribution for each document
terms <- lda@terms # create object containing terms (words) that can be used to line up with beta and gamma
colnames(beta) <- terms # puts the terms (or words) as the column names for the topic weights.
id <- t(apply(beta, 1, order)) # order the beta values
beta_ranked <- lapply(1:nrow(id),function(i)beta[i,id[i,]]) # gives table of words per topic with words ranked in order of beta values. Useful for determining the most important words per topic
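# A minimal sketch (the object top.words is illustrative, not part of the original analysis):
# pull out the ten highest-weighted words for each topic from the ranked beta values
top.words <- lapply(beta_ranked, function(x) rev(names(tail(x, 10))))
top.words[[1]] # e.g. inspect the most important words of the first topic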