-
Notifications
You must be signed in to change notification settings - Fork 25
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Concise alerts as HTML for blog posts with sentiment scores, PESL top…
…ics or novelty.
- Loading branch information
Adam Cooper
committed
Sep 21, 2012
1 parent
a542834
commit 841197a
Showing
4 changed files
with
288 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,182 @@ | ||
##
## A simple alerter. Produces a concise report to highlight blog posts in a (short) time window with
## pos/neg/subj, econ/polit/legal/knowing/doing, novelty scores.
## Relies on Database for source data (no CSV) and assumes that pre-process.R has already been run
##
library("RSQLite")   # SQLite access to the pre-processed blog post database
library("tm")        # text mining: corpus construction and document-term matrices
library("slam")      # sparse matrix helpers (row_sums etc.) for the DTMs
library("brew")      # templating: renders the HTML report from the *.html templates
#library("RColorBrewer")

# Filesystem layout: project home, report output root, and (relative to home)
# the SQLite database directory and the brew HTML template directory.
home.dir<-"/home/arc1/R Projects/Text Mining Weak Signals"
output.dir<-"/home/arc1/R Projects/Text Mining Weak Signals Output"
db.dir<-paste(home.dir,"Source Data",sep="/")
template.dir<-paste(home.dir,"Alerts",sep="/")
##
## RUN PARAMETERS - Often Changed
title<-"TEL Blog Scan - Last 14 Days"
# The date of the report. Posts up to and including this date are candidates for output
report.date<-as.POSIXlt("2012-06-07")
# how many days to include when seeking candidates
report.days<-14

##
## RUN PARAMETERS - Not often changed
# locate the database
sqlite.filename <- "TMWS Data A.sqlite"
# set output subfolder
output.subdir <- "BlogScore Alert"
# score columns. **** NB score.mins must follow this order *****
score.cols<-c("pos_score", "neg_score", "subj_score", "econ_score", "polit_score", "legal_score",
              "doing_score", "knowing_score")
# Reader-friendly captions to match. These determine what is reported (set an element to NA to omit)
score.title<-c("Positive Sentiment", "Negative Sentiment",NA,"Economics Topic", "Political Topic", "Legal Topic", "Doing: Aims and Means", "Knowing and Problem-solving")
# These are shown as explanatory text under the titles in the report.
# (fixed: the "subjective character" caption previously had an unbalanced parenthesis)
score.caption<-c("Posts containing a high proportion of words associated with positive sentiment",
                 "Posts containing a high proportion of words associated with negative sentiment",
                 "Posts with high subjective character (both positive and negative sentiment)",
                 "Posts containing typically economic keywords",
                 "Posts containing typically political keywords",
                 "Posts containing typically legal keywords",
                 "Posts containing keywords associated with motivation, goals, the means of achieving goals and also the state of failure",
                 "Posts containing keywords associated with knowledge, knowledge acquisition and problem solving")
# These are used to scale the scores to give a "rating" for approximate comparability,
# i.e. so that readers are not distracted by inherent differences between the dictionary word occurrence in the corpus. Use something like 1.1*(corpus max score)
rating.fullscale<-c(.325,.181,.382,
                    .275,.240,.154, #econ, polit, legal
                    .1925,.224) #doing, knowing
# How many months to use for novelty baseline.
nov.baseline.months<-12
# Thresholds for selection
# only ever show the top few. This is a max; if the score thresholds are not met nothing shows
top.n<-4
# dictionary score minima by dictionary. Order must be same as score.cols.
# NB: these are unweighted and are used to define the point at which rating=1
score.mins<-c(0.15, #pos
              0.05, #neg
              0.15, #subj
              0.12, #econ
              0.12, #polit
              0.04, #legal
              0.08, #doing
              0.08) #knowing
# novelty score min
nov.min<-0.82
# novelty rating zero-point. make this less than nov.min. This is the far-left position on the rating gauge
nov.ref<-0.7
||
## | ||
## PRELIMINARIES - some initial setup-specific working | ||
# Wrap a value in single quotes for use as an SQL date literal.
# NB: does not escape embedded quotes; acceptable here because all inputs are
# dates constructed by this script, never user input.
qdate<-function(d){
  paste0("'", as.character(d), "'")
}
# this query defines the posts to be considered for report. BETWEEN is inclusive
report.start<-report.date
# NOTE(review): mday may go out of range here; this appears to rely on POSIXlt
# normalisation when the value is formatted inside qdate() - confirm
report.start$mday<-report.start$mday - report.days
sql<- paste("select content, title, authors, datestamp, url,",paste(score.cols, collapse=", "),
            "from blog_post where datestamp between",qdate(report.start),"and", qdate(report.date))
# this query fetches the baseline (large) for novelty calculation:
# the nov.baseline.months months ending the day before the reporting window starts
nov.end<-report.start
nov.end$mday<-nov.end$mday-1
nov.start<-nov.end
nov.start$mon<-nov.start$mon-nov.baseline.months
baseline.sql<- paste("select content, datestamp from blog_post where datestamp between",qdate(nov.start),"and", qdate(nov.end))
# initialise database access
# instantiate the SQLite driver in the R process
sqlite<- dbDriver("SQLite")
# open sqlite connection. db is a "connection"
db<- dbConnect(sqlite, dbname=paste(db.dir,sqlite.filename,sep="/"))
summary(db)
# preparation for output destination
#setwd(paste(output.dir, output.subdir,sep="/"))
# report path: <output.dir>/<output.subdir>/<report.date>.html
# (fixed: removed a redundant extra paste() wrapper around the path construction)
reportFile<-paste(output.dir, output.subdir, paste(report.date,".html",sep=""), sep="/")

# mapping from data frame columns to tm document attributes, used by readTabular below
map<-list(Content="content", DateTimeStamp="datestamp")# Heading="title", Author="authors", URL="url")
||
##
## MAIN
# Write out the HTML Header
# this palette is used in brew for color-coding scores
score.pal<-c("#00FFFF","#0000FF","#800080","#FF0000","#FF8040")
pal.len<-length(score.pal)
brew.conn<-file(reportFile, open="wt")
brew(file=paste(template.dir,"BlogScore BrewHeader.html",sep="/"), output=brew.conn)

# Loop over the dictionaries, emitting a section of HTML if there are any posts matching the thresholds.
# The brew chunk template reads section, caption, hits, scores, ratings and ratings.capped
# from this scope, so those names must be assigned before brew() is called.
for(i in seq_along(score.caption)){
  section<-score.title[i]
  caption<-score.caption[i]
  # an NA title means this dictionary is deliberately omitted from the report
  if(!is.na(section)){
    sect.sql<-paste(sql,"and ", score.cols[i],">",score.mins[i], "order by", score.cols[i],"desc limit", as.character(top.n))
    hits<-dbGetQuery(db,sect.sql)#query, fetch all records to dataframe and clear resultset in one go
    #only create output if there are some "hits"
    if(nrow(hits)>0){
      #extract and massage the scores for "friendly" display:
      #rating runs from 0 at the dictionary minimum to 100 at the nominal full-scale value
      scores<-hits[,score.cols[i]]
      ratings<-round(100*(scores-score.mins[i])/(rating.fullscale[i]-score.mins[i]))
      ratings.capped<-pmin(100,ratings)
      #write out
      brew(file=paste(template.dir,"BlogScore BrewChunk.html",sep="/"), output=brew.conn)
    }
  }
}
# The novelty calculation requires some real work since we need to compare all posts in the
# reporting-period window against all those in the baseline period.
# This is not quite the same as in "Rising and Falling Terms" since the two sets are disjoint here
#candidates are in the reporting time window
candidates<-dbGetQuery(db,sql)
candidates.corp<-Corpus(DataframeSource(candidates), readerControl=list(reader= readTabular(mapping=map)))
candidates.corp<-tm_map(candidates.corp,removeNumbers)
candidates.corp<-tm_map(candidates.corp,removePunctuation)
candidates.dtm.bin<-DocumentTermMatrix(candidates.corp, control=list(stemming=TRUE, stopwords=TRUE, minWordLength=3, weighting=weightBin))
#eliminate very short docs as they give unreliable novelty calcs (too sparse). must have >15 different non-stopwords
ok.posts<-row_sums(candidates.dtm.bin)>15
candidates<-candidates[ok.posts,]
candidates.dtm.bin<-candidates.dtm.bin[ok.posts,]
# the baseline is what novelty is calculated with respect to
baseline<-dbGetQuery(db,baseline.sql)
baseline.corp<-Corpus(DataframeSource(baseline), readerControl=list(reader= readTabular(mapping=map)))
baseline.corp<-tm_map(baseline.corp,removeNumbers)
baseline.corp<-tm_map(baseline.corp,removePunctuation)
baseline.dtm.bin<-DocumentTermMatrix(baseline.corp, control=list(stemming=TRUE, stopwords=TRUE, minWordLength=3, weighting=weightBin))
# Calculate distance using cosine measure, i.e. as a scalar product
#first vector norms
doc.norm.mat<-sqrt(tcrossprod(row_sums(candidates.dtm.bin),row_sums(baseline.dtm.bin)))
#now eliminate terms that are not shared since their product will be zero anyway and we need to avoid attempts to allocate memory beyond what is permitted. Norms calc must be BEFORE this.
shared.terms<-Terms(candidates.dtm.bin)[Terms(candidates.dtm.bin) %in% Terms(baseline.dtm.bin)]
candidates.dtm.bin<-candidates.dtm.bin[,shared.terms]
baseline.dtm.bin<-baseline.dtm.bin[,shared.terms]
#the dot product: difference = 1 - cosine similarity, candidates x baseline
difference.mat<-1.0-tcrossprod(as.matrix(candidates.dtm.bin),
                               as.matrix(baseline.dtm.bin)) / doc.norm.mat
#this should not be necessary since sets are disjoint but it is possible identical posts are present, one in both sets, so make sure that any such do not interfere
difference.mat[difference.mat[,]==0]<-1.0
#sometimes NaNs creep in (not sure why)
difference.mat[is.nan(difference.mat[,])]<-1.0
#novelty means there is no other close doc so find the smallest difference
novelty<-apply(difference.mat,1,min)
# "hits" are those candidates above the threshold and in the top.n
# (seq_len is safe when there are no candidates at all, unlike 1:min(...))
top.n.idx<-order(novelty, decreasing=TRUE)[seq_len(min(top.n,length(novelty)))]
hits<-candidates[top.n.idx,]
nov.top.n<-novelty[top.n.idx]
hits<-hits[nov.top.n>=nov.min,]
#only create output if there are some "hits"
if(nrow(hits)>0){
  section<-"Novelty"
  caption<-paste("Posts with an unusual combination of words compared to posts from the previous",nov.baseline.months,"months")
  #extract and massage the scores for "friendly" display
  scores<-nov.top.n[nov.top.n>=nov.min]
  # BUG FIX: the rating previously used nov.min in the numerator but nov.ref in the
  # denominator; nov.ref is documented as the rating zero-point (far-left of the gauge),
  # so use it consistently in both places
  ratings<-round(100*(scores-nov.ref)/(1-nov.ref))
  ratings.capped<-pmin(100,ratings)
  #write out
  brew(file=paste(template.dir,"BlogScore BrewChunk.html",sep="/"), output=brew.conn)
}
|
||
# Emit the HTML footer template, then release the report connection and the database.
footer.template<-file.path(template.dir,"BlogScore BrewFooter.html")
brew(file=footer.template, output=brew.conn)
close(brew.conn)

dbDisconnect(db)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,22 @@ | ||
<a name="<%=section%>"/> | ||
<h2><%=section%></h2> | ||
|
||
<table> | ||
<%for(kk in 1:length(hits[,1])){%> | ||
<tr style="border-style: solid; border-color:<%=score.pal[min(pal.len*ratings.capped[kk]/100+1,pal.len)]%>"> | ||
<td style="border-style: solid; border-color:<%=score.pal[min(pal.len*ratings.capped[kk]/100+1,pal.len)]%>"> | ||
<strong><%=hits[kk,"authors"]%></strong><br/> | ||
<a href="<%=hits[kk,"url"]%>"><%=hits[kk,"title"]%></a><br/> | ||
<%=strtrim(hits[kk,"content"], 500)%>...</td> | ||
<td align="center"><img src="http://chart.googleapis.com/chart?chs=200x125&cht=gom&chd=t:<%=ratings.capped[kk]%>&chco=00FFFF,0000FF,800080,FF0000,FF8040&chxt=x,y&chxl=0:|<%=ratings[kk]%>|1:|unusual|notable|wild"/> | ||
<br/> | ||
(raw score=<%=format(scores[kk],digits=3)%>)</td> | ||
</tr> | ||
<tr> | ||
<td colspan="2"> | ||
<em> | ||
<!--Contains terms: < =paste(Terms(dtm.tf.unstemmed.lex[kk,col_sums(dtm.tf.unstemmed.lex[kk,])>0]), collapse=", ")%> --> | ||
</em> | ||
</tr> | ||
<%}%> | ||
</table> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
|
||
<h2>Info</h2> | ||
|
||
<p>Documents are scored according to the occurrence of words in the following lexicons derived from the categories of the <a href="http://www.wjh.harvard.edu/~inquirer/homecat.htm">Harvard General Inquirer</a>: <a href="#Economic">Economic</a> combines "Econ@" and "ECON", <a href="#Legal">Legal</a> uses "Legal", <a href="#Political">Political</a> combines "Polit@" and "POLIT", <a href="#Doing: Aims and Means">Doing</a> combines a group of dictionaries associated with motivation ("Need", "Goal", "Try", "Means", "Persist", "Complet" and "Fail"), and <a href="#Knowing and Problem-solving">Knowing and Problem-solving</a> combines "Know" and "Solve". Sometimes a section is omitted from the alert; this occurs when there are no scores above threshold, and at most <%=top.n%> posts appear in any section. The <em>raw scores</em> given are the fraction of words in each document that are found in the relevant lexicon. The ratings are an estimate of the size of the score compared to a nominal full-scale value for each lexicon (roughly 1.1 times the highest score previously observed in the corpus).</p>
|
||
|
||
<h2>Copyright, Licence and Credits</h2> | ||
<p>This work was undertaken as part of the <a href="http://www.telmap.org">TEL-Map Project</a>; TEL-Map is a support and coordination action within EC IST FP7 Technology Enhanced Learning.<img src="http://www.learningfrontiers.eu/sites/all/themes/learningfrontiers/images/footer-EU-logo.jpg" /></p> | ||
<table> | ||
<tr> | ||
<td><a rel="license" href="http://creativecommons.org/licenses/by/3.0/"><img alt="Creative Commons Licence" style="border-width:0" src="http://i.creativecommons.org/l/by/3.0/88x31.png" /></a></td> | ||
<td>This work and original text are ©2012 Adam Cooper, Institute for Educational Cybernetics, University of Bolton, UK.<BR/><span xmlns:cc="http://creativecommons.org/ns#" property="cc:attributionName">Adam Cooper</span> has licenced it under a <a rel="license" href="http://creativecommons.org/licenses/by/3.0/">Creative Commons Attribution 3.0 Unported License</a></td> | ||
</tr> | ||
<tr><td colspan="2">Referenced blogs and quoted sections are © the relevant authors. The lexicons used are from the <a href="http://www.wjh.harvard.edu/~inquirer/">Harvard Inquirer spreadsheet</a> | ||
</td></tr> | ||
</table> | ||
|
||
</body> | ||
</html> |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> | ||
<html> | ||
<head> | ||
<title>Blog Scan Alerts: <%=as.character(report.date)%></title> | ||
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/> | ||
<meta name="AUTHOR" content="Adam Cooper"/> | ||
<!-- | ||
## ***Made available using the The MIT License (MIT)*** | ||
# Copyright (c) 2012, Adam Cooper | ||
# | ||
# Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: | ||
# | ||
# The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. | ||
# | ||
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | ||
## ************ end licence *************** | ||
--> | ||
<!--adopt styles from TELMap. --> | ||
<style type="text/css"> | ||
html, body, div, span, table, td, tr, th, tbody, tfoot, frame, iframe, object { | ||
margin-top: 20px; | ||
margin-bottom: 20px; | ||
font-family: "Lucida Sans Unicode", "Lucida Grande", sans-serif; | ||
font-size: 13px; | ||
color: #404040; | ||
} | ||
|
||
a:link, a:hover { | ||
color: #23a0d2; | ||
text-decoration: none; | ||
} | ||
|
||
a:visited { | ||
color: #066890; | ||
text-decoration: none; | ||
} | ||
|
||
h2 { | ||
color: #e2a12c; | ||
font-size: 30px; | ||
font-family: "Times New Roman", Times, serif; | ||
} | ||
|
||
h3 { | ||
color: #494949; | ||
font-family: "Times New Roman", Times, serif; | ||
font-size: 16px; | ||
text-transform: uppercase; | ||
} | ||
</style> | ||
|
||
<!--additional style for plot boxing and captioning--> | ||
<style type="text/css"> | ||
.cap-img { background-color: #F9F9F9; border: 1px solid #CCCCCC; padding: 3px; font: 11px/1.4em Arial, sans-serif; | ||
margin: 0.5em 0.8em 0.5em 0.1em; float:left; width:604px;} | ||
.clear { clear: both;} | ||
.cap-img img { border: 1px solid #CCCCCC; vertical-align:middle; margin-bottom: 3px;} | ||
</style> | ||
|
||
</head> | ||
<body> | ||
<h1>Blog Scan Alerts: <%=as.character(report.date)%></h1> | ||
|
||
|
||
|