Skip to content

Commit

Permalink
Concise alerts as HTML for blog posts with sentiment scores, PESL top…
Browse files Browse the repository at this point in the history
…ics or novelty.
  • Loading branch information
Adam Cooper committed Sep 21, 2012
1 parent a542834 commit 841197a
Show file tree
Hide file tree
Showing 4 changed files with 288 additions and 0 deletions.
182 changes: 182 additions & 0 deletions Alerts/BlogScore Alert.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
##
## A simple alerter. Produces a concise report to highlight blog posts in a (short) time window with
## pos/neg/subj, econ/polit/legal/knowing/doing, novelty scores.
## Relies on the database for source data (no CSV) and assumes that pre-process.R has already been run.
##
library("RSQLite")
library("tm")
library("slam")
library("brew")
#library("RColorBrewer")

# NOTE(review): absolute paths are specific to one machine - confirm before running elsewhere
home.dir<-"/home/arc1/R Projects/Text Mining Weak Signals"
output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output"
db.dir<-paste(home.dir,"Source Data",sep="/")
template.dir<-paste(home.dir,"Alerts",sep="/")

##
## RUN PARAMETERS - Often Changed
# Title shown in the report heading
title<-"TEL Blog Scan - Last 14 Days"
# The date of the report. Posts up to and including this date are candidates for output
report.date<-as.POSIXlt("2012-06-07")
# How many days back from report.date to include when seeking candidate posts
report.days<-14

##
## RUN PARAMETERS - Not often changed
# locate the database
sqlite.filename <- "TMWS Data A.sqlite"
# set output subfolder
output.subdir <- "BlogScore Alert"
# score columns. **** NB score.mins must follow this order *****
score.cols<-c("pos_score", "neg_score", "subj_score", "econ_score", "polit_score", "legal_score",
"doing_score", "knowing_score")
# Reader-friendly captions to match score.cols. These determine what is reported (set an element to NA to omit)
score.title<-c("Positive Sentiment", "Negative Sentiment",NA,"Economics Topic", "Political Topic", "Legal Topic", "Doing: Aims and Means", "Knowing and Problem-solving")
# These are shown as explanatory text under the titles in the report.
score.caption<-c("Posts containing a high proportion of words associated with positive sentiment",
"Posts containing a high proportion of words associated with negative sentiment",
"Posts with high subjective character (both positive and negative sentiment", # NOTE(review): unbalanced paren in this user-visible string; left as-is (never shown while score.title[3] is NA)
"Posts containing typically economic keywords",
"Posts containing typically political keywords",
"Posts containing typically legal keywords",
"Posts containing keywords associated with motivation, goals, the means of achieving goals and also the state of failure",
"Posts containing keywords associated with knowledge, knowledge acquisition and problem solving")
# These are used to scale the scores to give a "rating" for approximate comparability,
# i.e. so that readers are not distracted by inherent differences between the dictionary word occurrence in the corpus. Use something like 1.1*(corpus max score)
rating.fullscale<-c(.325,.181,.382,
.275,.240,.154, #econ, polit, legal
.1925,.224)
# How many months to use for novelty baseline.
nov.baseline.months<-12
# Thresholds for selection
# only ever show the top few. This is a max; if the score thresholds are not met nothing shows
top.n<-4
# dictionary score minima by dictionary. Order must be same as score.cols.
# NB: these are unweighted and are used to define the point at which rating=1
score.mins<-c(0.15, #pos
0.05, #neg
0.15, #subj
0.12, #econ
0.12, #polit
0.04, #legal
0.08,#doing
0.08)#knowing
# novelty score min
nov.min<-0.82
# novelty rating zero-point. make this less than nov.min. This is the far-left position on the rating gauge
nov.ref<-0.7

##
## PRELIMINARIES - some initial setup-specific working
# qdate: wrap a date in single quotes for inclusion in a SQL statement.
#   d - a date-like object (anything as.character() renders, e.g. POSIXlt)
# Returns a character scalar such as "'2012-06-07'".
# NB: simple quoting, not escaping - acceptable here because the inputs are
# script-generated dates, never user input.
qdate<-function(d){
  paste0("'", as.character(d), "'")
}
# this query defines the posts to be considered for report. BETWEEN is inclusive
# NOTE(review): the date arithmetic below mutates POSIXlt fields directly, so mday/mon
# can go out of range; this relies on format()/as.character() normalising the
# broken-down time when qdate() renders it - confirm on the target R version.
report.start<-report.date
report.start$mday<-report.start$mday - report.days
sql<- paste("select content, title, authors, datestamp, url,",paste(score.cols, collapse=", "),
"from blog_post where datestamp between",qdate(report.start),"and", qdate(report.date))
# this query fetches the baseline (large) for novelty calculation
# baseline window: the nov.baseline.months months ending the day before report.start
nov.end<-report.start
nov.end$mday<-nov.end$mday-1
nov.start<-nov.end
nov.start$mon<-nov.start$mon-nov.baseline.months
baseline.sql<- paste("select content, datestamp from blog_post where datestamp between",qdate(nov.start),"and", qdate(nov.end))
# initialise database access
# instantiate the SQLite driver in the R process
sqlite<- dbDriver("SQLite")
# open sqlite connection. db is a "connection"
db<- dbConnect(sqlite, dbname=paste(db.dir,sqlite.filename,sep="/"))
summary(db) # side effect only: prints connection details to the console
# preparation for output destination
#setwd(paste(output.dir, output.subdir,sep="/"))
# report file is named after the report date, e.g. ".../BlogScore Alert/2012-06-07.html"
# (the outer paste() wraps a single argument and is redundant but harmless)
reportFile<-paste(paste(output.dir, output.subdir,paste(report.date,".html",sep=""),sep="/"))

# mapping of database columns to tm document fields, used by readTabular() below
map<-list(Content="content", DateTimeStamp="datestamp")# Heading="title", Author="authors", URL="url")

##
## MAIN
# Write out the HTML Header
#this palette is used in brew for color-coding scores (brew templates read variables
#from the calling environment, so score.pal/pal.len are referenced by the templates)
score.pal<-c("#00FFFF","#0000FF","#800080","#FF0000","#FF8040")
pal.len<-length(score.pal)
# open the report file for writing; closed after the footer is emitted at the end of the script
brew.conn<-file(reportFile, open="wt")
brew(file=paste(template.dir,"BlogScore BrewHeader.html",sep="/"), output=brew.conn)

# Loop over the dictionaries, emitting a section of HTML if there are any posts matching the thresholds.
# For each reportable dictionary (i.e. score.title[i] not NA):
#  - query the top.n highest-scoring posts above the dictionary's score minimum
#  - rescale raw scores to 0-100 "ratings" (capped at 100) for the gauge display
#  - emit an HTML section via the brew chunk template, which reads section, caption,
#    hits, scores, ratings and ratings.capped from this environment
for(i in seq_along(score.caption)){
   section<-score.title[i]
   caption<-score.caption[i]
   if(!is.na(section)){
      sect.sql<-paste(sql,"and ", score.cols[i],">",score.mins[i], "order by", score.cols[i],"desc limit", as.character(top.n))
      hits<-dbGetQuery(db,sect.sql)#query, fetch all records to dataframe and clear resultset in one go
      #only create output if there are some "hits"
      if(nrow(hits)>0){
         #extract and massage the scores for "friendly" display
         scores<-hits[,score.cols[i]]
         # linear rescale: score.mins[i] -> 0, rating.fullscale[i] -> 100, then cap at 100
         ratings<-round(100*(scores-score.mins[i])/(rating.fullscale[i]-score.mins[i]))
         ratings.capped<-pmin(100,ratings)
         #write out
         brew(file=paste(template.dir,"BlogScore BrewChunk.html",sep="/"), output=brew.conn)
      }
   }
}
# The novelty calculation requires some real work since we need to compare all posts in the reporting-period window against all those in the baseline period.
# This is not quite the same as in "Rising and Falling Terms" since the two sets are disjoint here
#candidates are in the reporting time window
candidates<-dbGetQuery(db,sql)
candidates.corp<-Corpus(DataframeSource(candidates), readerControl=list(reader= readTabular(mapping=map)))
candidates.corp<-tm_map(candidates.corp,removeNumbers)
candidates.corp<-tm_map(candidates.corp,removePunctuation)
# binary weighting: each cell is 1 if the (stemmed, non-stopword) term occurs, so row
# sums count distinct terms per document
candidates.dtm.bin<-DocumentTermMatrix(candidates.corp, control=list(stemming=TRUE, stopwords=TRUE, minWordLength=3, weighting=weightBin))
#eliminate very short docs as they give unreliable novelty calcs (too sparse). must have >15 different non-stopwords
ok.posts<-row_sums(candidates.dtm.bin)>15
candidates<-candidates[ok.posts,]
candidates.dtm.bin<-candidates.dtm.bin[ok.posts,]
# the baseline is what novelty is calculated with respect to
# NOTE(review): baseline docs are NOT filtered for minimum length the way candidates
# are - an empty baseline doc would have a zero norm (see NaN handling below)
baseline<-dbGetQuery(db,baseline.sql)
baseline.corp<-Corpus(DataframeSource(baseline), readerControl=list(reader= readTabular(mapping=map)))
baseline.corp<-tm_map(baseline.corp,removeNumbers)
baseline.corp<-tm_map(baseline.corp,removePunctuation)
baseline.dtm.bin<-DocumentTermMatrix(baseline.corp, control=list(stemming=TRUE, stopwords=TRUE, minWordLength=3, weighting=weightBin))
# Calculate distance using cosine measure, i.e. as a scalar product
#first vector norms. With binary weighting row_sums() is the count of distinct terms,
#so sqrt of the outer product is the product of the two documents' vector norms.
doc.norm.mat<-sqrt(tcrossprod(row_sums(candidates.dtm.bin),row_sums(baseline.dtm.bin)))
#now eliminate terms that are not shared since their product will be zero anyway and we need to avoid attempts to allocate memory beyond what is permitted. Norms calc must be BEFORE this.
shared.terms<-Terms(candidates.dtm.bin)[Terms(candidates.dtm.bin) %in% Terms(baseline.dtm.bin)]
candidates.dtm.bin<-candidates.dtm.bin[,shared.terms]
baseline.dtm.bin<-baseline.dtm.bin[,shared.terms]
#the dot product: entry [i,j] is 1 - cosine similarity of candidate i vs baseline doc j
#(/ binds tighter than -, so the division happens before the subtraction, as intended)
difference.mat<-1.0-tcrossprod(as.matrix(candidates.dtm.bin),
as.matrix(baseline.dtm.bin)) / doc.norm.mat
#this should not be necessary since sets are disjoint but it is possible identical posts are present, one in both sets, so make sure that any such do not interfere
difference.mat[difference.mat[,]==0]<-1.0
#sometimes NaNs creep in (not sure why)
# NOTE(review): NaN would arise from 0/0 where a document norm is zero; unfiltered
# empty baseline docs are a plausible cause - confirm
difference.mat[is.nan(difference.mat[,])]<-1.0
#novelty means there is no other close doc so find the smallest difference
#(per candidate: distance to its nearest baseline neighbour; larger = more novel)
novelty<-apply(difference.mat,1,min)
# "hits" are those candidates above the threshold and in the top.n.
# seq_len() (rather than 1:min(...)) guards the case of an empty novelty vector,
# where 1:0 would wrongly yield indices c(1, 0).
top.n.idx<-order(novelty, decreasing=TRUE)[seq_len(min(top.n,length(novelty)))]
hits<-candidates[top.n.idx,]
nov.top.n<-novelty[top.n.idx]
# keep only those that actually clear the novelty threshold
hits<-hits[nov.top.n>=nov.min,]
#only create output if there are some "hits"
if(nrow(hits)>0){
   section<-"Novelty"
   caption<-paste("Posts with an unusual combination of words compared to posts from the previous",nov.baseline.months,"months")
   #extract and massage the scores for "friendly" display
   scores<-nov.top.n[nov.top.n>=nov.min]
   # NOTE(review): the numerator's zero-point is nov.min but the denominator span is
   # (1-nov.ref); the parameter comments describe nov.ref as the rating zero-point, so
   # this mixed formula looks inconsistent - confirm intent before changing.
   ratings<-round(100*(scores-nov.min)/(1-nov.ref))
   ratings.capped<-pmin(100,ratings)
   #write out via the shared section template (reads section/caption/hits/scores/ratings*)
   brew(file=paste(template.dir,"BlogScore BrewChunk.html",sep="/"), output=brew.conn)
}

#Write out the HTML Footer and close the report file
brew(file=paste(template.dir,"BlogScore BrewFooter.html",sep="/"), output=brew.conn)
close(brew.conn)


# release the database connection (the driver object remains loaded)
dbDisconnect(db)
22 changes: 22 additions & 0 deletions Alerts/BlogScore BrewChunk.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
<!-- Brew template chunk: emitted once per alert section by BlogScore Alert.R.
     Reads from the calling environment: section, hits (data frame with columns
     authors, url, title, content), ratings, ratings.capped, scores, score.pal,
     pal.len. (caption is set by the caller but not rendered here - TODO confirm.) -->
<a name="<%=section%>"/>
<h2><%=section%></h2>

<table>
<%for(kk in 1:length(hits[,1])){%>
<tr style="border-style: solid; border-color:<%=score.pal[min(pal.len*ratings.capped[kk]/100+1,pal.len)]%>">
<td style="border-style: solid; border-color:<%=score.pal[min(pal.len*ratings.capped[kk]/100+1,pal.len)]%>">
<strong><%=hits[kk,"authors"]%></strong><br/>
<a href="<%=hits[kk,"url"]%>"><%=hits[kk,"title"]%></a><br/>
<%=strtrim(hits[kk,"content"], 500)%>...</td>
<td align="center"><img src="http://chart.googleapis.com/chart?chs=200x125&cht=gom&chd=t:<%=ratings.capped[kk]%>&chco=00FFFF,0000FF,800080,FF0000,FF8040&chxt=x,y&chxl=0:|<%=ratings[kk]%>|1:|unusual|notable|wild"/>
<br/>
(raw score=<%=format(scores[kk],digits=3)%>)</td>
</tr>
<tr>
<td colspan="2">
<em>
<!-- NOTE(review): the line below is malformed brew markup ("< =" rather than "<%="),
     so it never executes; it also references a variable the caller does not define.
     Confirm before "repairing" it - brew would execute a valid tag even inside an
     HTML comment. -->
<!--Contains terms: < =paste(Terms(dtm.tf.unstemmed.lex[kk,col_sums(dtm.tf.unstemmed.lex[kk,])>0]), collapse=", ")%> -->
</em>
</tr>
<%}%>
</table>
19 changes: 19 additions & 0 deletions Alerts/BlogScore BrewFooter.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@

<h2>Info</h2>

<p>Documents are scored according to the occurrence of words in the following lexicons derived from the categories of the <a href="http://www.wjh.harvard.edu/~inquirer/homecat.htm">Harvard General Inquirer</a>: <a href="#Economic">Economic</a> combines &quot;Econ@&quot; and &quot;ECON&quot;, <a href="#Legal">Legal</a> uses &quot;Legal&quot;, <a href="#Political">Political</a> combines &quot;Polit@&quot; and &quot;POLIT&quot;, <a href="#Doing: Aims and Means">Doing</a> combines a group of dictionaries associated with motivation (&quot;Need&quot;, &quot;Goal&quot;, &quot;Try&quot;, &quot;Means&quot;, &quot;Persist&quot;, &quot;Complet&quot; and &quot;Fail&quot;), and <a href="#Knowing and Problem-solving">Knowing and Problem-solving</a> combines &quot;Know&quot; and &quot;Solve&quot;. Sometimes a section is omitted from the alert; this occurs when no posts score above the relevant threshold. At most <%=top.n%> posts appear in any section. The <em>raw scores</em> given are the fraction of words in each document that are found in the relevant lexicon. The ratings are an estimate of the size of the score relative to a nominal full-scale value observed in the corpus, allowing approximate comparison between sections.</p>


<h2>Copyright, Licence and Credits</h2>
<p>This work was undertaken as part of the <a href="http://www.telmap.org">TEL-Map Project</a>; TEL-Map is a support and coordination action within EC IST FP7 Technology Enhanced Learning.<img src="http://www.learningfrontiers.eu/sites/all/themes/learningfrontiers/images/footer-EU-logo.jpg" /></p>
<table>
<tr>
<td><a rel="license" href="http://creativecommons.org/licenses/by/3.0/"><img alt="Creative Commons Licence" style="border-width:0" src="http://i.creativecommons.org/l/by/3.0/88x31.png" /></a></td>
<td>This work and original text are &copy;2012 Adam Cooper, Institute for Educational Cybernetics, University of Bolton, UK.<BR/><span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Adam Cooper</span> has licenced it under a <a rel="license" href="http://creativecommons.org/licenses/by/3.0/">Creative Commons Attribution 3.0 Unported License</a></td>
</tr>
<tr><td colspan="2">Referenced blogs and quoted sections are &copy; the relevant authors. The lexicons used are from the <a href="http://www.wjh.harvard.edu/~inquirer/">Harvard Inquirer spreadsheet</a>
</td></tr>
</table>

</body>
</html>
65 changes: 65 additions & 0 deletions Alerts/BlogScore BrewHeader.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
<html>
<head>
<title>Blog Scan Alerts: <%=as.character(report.date)%></title>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
<meta name="AUTHOR" content="Adam Cooper"/>
<!-- Brew template: report header emitted once per run by BlogScore Alert.R.
     Reads report.date from the calling environment; everything else is static. -->
<!--
## ***Made available using the The MIT License (MIT)***
# Copyright (c) 2012, Adam Cooper
#
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
## ************ end licence ***************
-->
<!--adopt styles from TELMap. -->
<style type="text/css">
html, body, div, span, table, td, tr, th, tbody, tfoot, frame, iframe, object {
margin-top: 20px;
margin-bottom: 20px;
font-family: "Lucida Sans Unicode", "Lucida Grande", sans-serif;
font-size: 13px;
color: #404040;
}

a:link, a:hover {
color: #23a0d2;
text-decoration: none;
}

a:visited {
color: #066890;
text-decoration: none;
}

h2 {
color: #e2a12c;
font-size: 30px;
font-family: "Times New Roman", Times, serif;
}

h3 {
color: #494949;
font-family: "Times New Roman", Times, serif;
font-size: 16px;
text-transform: uppercase;
}
</style>

<!--additional style for plot boxing and captioning-->
<style type="text/css">
.cap-img { background-color: #F9F9F9; border: 1px solid #CCCCCC; padding: 3px; font: 11px/1.4em Arial, sans-serif;
margin: 0.5em 0.8em 0.5em 0.1em; float:left; width:604px;}
.clear { clear: both;}
.cap-img img { border: 1px solid #CCCCCC; vertical-align:middle; margin-bottom: 3px;}
</style>

</head>
<body>
<h1>Blog Scan Alerts: <%=as.character(report.date)%></h1>



0 comments on commit 841197a

Please sign in to comment.