golib: add tags support
bcampbell committed Sep 15, 2015
1 parent 6892858 commit 0189c9e
Showing 7 changed files with 159 additions and 90 deletions.
15 changes: 12 additions & 3 deletions golib/jl/article.go
@@ -60,6 +60,8 @@ type Article struct {
LastCommentCheck
LastSimilar
*/

Tags []Tag
}

// InsertArticle loads a new article into the database.
@@ -97,7 +99,7 @@ func InsertArticle(tx *sql.Tx, art *Article) error {
art.Permalink,
art.SrcURL,
art.Publication.ID,
sql.NullInt64{0, false},
art.WordCount,
pq.NullTime{},
).Scan(&artID)
if err != nil {
@@ -137,6 +139,13 @@ func InsertArticle(tx *sql.Tx, art *Article) error {
}
}

// Tags
for _, t := range art.Tags {
_, err := tx.Exec(`INSERT INTO article_tag (article_id, tag, freq) VALUES ($1,$2,$3)`, artID, t.Name, t.Freq)
if err != nil {
return err
}
}
// queue article for xapian indexing
tx.Exec(`DELETE FROM article_needs_indexing WHERE article_id=$1`, artID)
tx.Exec(`INSERT INTO article_needs_indexing (article_id) VALUES ($1)`, artID)
@@ -147,7 +156,6 @@ func InsertArticle(tx *sql.Tx, art *Article) error {
// TODO: images into article_image table?

// commentlinks
// TODO: tags
// TODO: wordcount

art.ID = artID
@@ -200,6 +208,8 @@ func UpdateArticle(tx *sql.Tx, art *Article) error {
}
}

panic("TODO: update tags")

panic("TODO: update journo links")
// queue it for xapian indexing
tx.Exec(`DELETE FROM article_needs_indexing WHERE article_id=$1`, art.ID)
@@ -208,7 +218,6 @@ func UpdateArticle(tx *sql.Tx, art *Article) error {
// TODO:
// article_image
// article_commentlink
// article_tag

return nil
}
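For context: the tag-insert loop above reads t.Name and t.Freq off each Tag, but the Tag type itself is defined elsewhere in golib/jl and is not shown in this diff. A minimal sketch of what it might look like — only the two field names are implied by the INSERT INTO article_tag; the concrete types are assumptions:

// Hypothetical sketch only — the real jl.Tag lives elsewhere in the package.
// Name and Freq are implied by the INSERT INTO article_tag above; the
// string/int types chosen here are assumptions.
type Tag struct {
	Name string // tag text, e.g. "housing"
	Freq int    // occurrences of the tag term within the article text
}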
135 changes: 70 additions & 65 deletions golib/jl/cmd/jlfeeder/main.go
@@ -37,11 +37,11 @@ func main() {
flag.StringVar(&opts.serverBase, "server", "http://localhost:12345", `base URL of slurp server`)
flag.BoolVar(&opts.verbose, "v", false, `verbose`)
flag.BoolVar(&opts.forceRescrape, "force", false, `force rescrape`)
flag.StringVar(&opts.sinceIDFile, "sinceid", "", `file to track since_id with ""=none`)
flag.StringVar(&opts.sinceIDFile, "sinceid", "", `file to track since_id with (""=none)`)
flag.Parse()

if opts.verbose {
infoLog = log.New(os.Stderr, "", 0)
infoLog = log.New(os.Stdout, "", 0)
} else {
infoLog = log.New(ioutil.Discard, "", 0)
}
@@ -90,6 +90,7 @@ func doIt() error {
defer func() {
db.Close()

infoLog.Println(stats.summary())
if opts.sinceIDFile != "" && stats.HighID != 0 {
putSinceID(opts.sinceIDFile, stats.HighID)
infoLog.Printf("%s: new since_id is %d\n", opts.sinceIDFile, sinceID)
@@ -103,8 +104,8 @@ func doIt() error {
for {
filt := &slurp.Filter{
SinceID: sinceID,
Count: 30,
PubCodes: []string{"guardian", "dailymail"},
Count: 2000,
PubCodes: []string{},
}
infoLog.Printf("slurp %v...\n", filt)
incoming := client.Slurp(filt)
@@ -125,22 +126,22 @@ }
}
infoLog.Printf("batch received (%d arts)\n", len(arts))

if len(arts) > 0 {
// load the batch into the db
for _, art := range arts {
tx, err := db.Begin()
if err != nil {
return err
}
err = loadBatch(tx, arts, &stats)
err = loadArt(tx, art, &stats)
if err != nil {
tx.Rollback()
return err
// TODO: check error count against threshold here!
continue
}
// TODO: commit!
tx.Rollback()
tx.Commit()

sinceID = stats.HighID
}
// FORCE BREAK FOR NOW!
break
// end of articles?
if len(arts) < filt.Count {
@@ -162,59 +163,67 @@ type loadStats struct {
HighID int
}

// load a batch of articles into the database
func loadBatch(tx *sql.Tx, rawArts []*slurp.Article, stats *loadStats) error {
for _, raw := range rawArts {
art, authors := convertArt(raw)
// already got the article?
foundID, err := jl.FindArticle(tx, art.URLs)

if err != nil && err != jl.ErrNoArticle {
errLog.Printf(err.Error())
stats.ErrCnt++
// TODO: implement abort here if too many errors?
continue
}
func (stats *loadStats) summary() string {

rescrape := false
if err == nil {
// article was found
if opts.forceRescrape {
rescrape = true
art.ID = foundID
} else {
// skip this one. already got it.
// TODO: possible that we've got new URLs to add...
stats.Skipped++
continue
}
}
return fmt.Sprintf("%d new, %d skipped, %d errors", stats.NewCnt, stats.Skipped, stats.ErrCnt)
}

err = stash(tx, art, authors, "")
if err != nil {
errLog.Printf(err.Error())
stats.ErrCnt++
continue
}
// log it
bylineBits := make([]string, len(art.Authors))
for i, j := range art.Authors {
bylineBits[i] = j.Ref
}
infoLog.Printf("%d: new [a%d] %s (%s)\n", raw.ID, art.ID, art.Permalink, strings.Join(bylineBits, ","))
// convert and load an article into the database
func loadArt(tx *sql.Tx, rawArt *slurp.Article, stats *loadStats) error {
art, authors := convertArt(rawArt)
// already got the article?
foundID, err := jl.FindArticle(tx, art.URLs)

if err != nil && err != jl.ErrNoArticle {
errLog.Printf(err.Error())
stats.ErrCnt++
return err
}

if rescrape {
stats.Reloaded++
//infoLog.Printf("reloaded %s %s\n", art.Title, art.Permalink)
rescrape := false
if err == nil {
// article was found
if opts.forceRescrape {
rescrape = true
art.ID = foundID
} else {
stats.NewCnt++
//infoLog.Printf("new %s %s\n", art.Title, art.Permalink)
// skip this one. already got it.
// TODO: possible that we've got new URLs to add...
stats.Skipped++
// bump the since_id
if rawArt.ID > stats.HighID {
stats.HighID = rawArt.ID
}
return nil
}
}

// record highest serverside ID encountered so far
if raw.ID > stats.HighID {
stats.HighID = raw.ID
}
logPrefix := fmt.Sprintf("%d: ", rawArt.ID)

err = stash(tx, art, authors, "", logPrefix)
if err != nil {
errLog.Printf("%sError: %s\n", logPrefix, err.Error())
stats.ErrCnt++
return err
}
// log it
bylineBits := make([]string, len(art.Authors))
for i, j := range art.Authors {
bylineBits[i] = j.Ref
}
infoLog.Printf("%snew [a%d] %s (%s)\n", logPrefix, art.ID, art.Permalink, strings.Join(bylineBits, ","))

if rescrape {
stats.Reloaded++
//infoLog.Printf("reloaded %s %s\n", art.Title, art.Permalink)
} else {
stats.NewCnt++
//infoLog.Printf("new %s %s\n", art.Title, art.Permalink)
}

// record highest serverside ID encountered so far
if rawArt.ID > stats.HighID {
stats.HighID = rawArt.ID
}
return nil
}
@@ -228,12 +237,7 @@ func convertArt(src *slurp.Article) (*jl.Article, []*jl.UnresolvedJourno) {
bestURL = src.URLs[0]
}

pub := &jl.Publication{
ID: 0, // unresolved
ShortName: src.Publication.Code,
PrettyName: src.Publication.Name,
Domains: []string{src.Publication.Domain},
}
pub := jl.NewPublication(src.Publication.Domain, src.Publication.Name)

var pubDate pq.NullTime
t, err := time.Parse(time.RFC3339, src.Published)
@@ -252,15 +256,15 @@ func convertArt(src *slurp.Article) (*jl.Article, []*jl.UnresolvedJourno) {
}
}

tags := []jl.Tag{}
if rawTxt != "" {
// count words
cnt := len(wordPat.FindAllString(rawTxt, -1))
if cnt > 0 {
wordCnt = sql.NullInt64{int64(cnt), true}
}

// extract tags
//tags := jl.ExtractTags(rawText)
tags = jl.ExtractTagsFromText(rawTxt)
}

art := &jl.Article{
@@ -280,6 +284,7 @@ func convertArt(src *slurp.Article) (*jl.Article, []*jl.UnresolvedJourno) {

WordCount: wordCnt,
Status: 'a',
Tags: tags,
}
copy(art.URLs, src.URLs)

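For context: convertArt now extracts tags from the plain article text and hands them to InsertArticle via Article.Tags. A rough sketch of that flow, assuming only what the call sites above show (ExtractTagsFromText takes the raw text and returns []jl.Tag; its implementation is not in this diff):

// Hypothetical sketch of the flow implied above, not code from the commit.
// Assumed from the call site: jl.ExtractTagsFromText(string) []jl.Tag.
rawTxt := "...plain article text..."    // in convertArt this comes from the scraped content
tags := jl.ExtractTagsFromText(rawTxt)  // weighted tags derived from the body text
art.Tags = tags                         // attached to the Article in convertArt
// jl.InsertArticle then writes one article_tag (article_id, tag, freq) row
// per entry, inside the same transaction as the article itself.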
24 changes: 16 additions & 8 deletions golib/jl/cmd/jlfeeder/stash.go
@@ -14,7 +14,7 @@ import (

var ErrAmbiguousJourno = errors.New("Ambiguous Journo")

func stash(tx *sql.Tx, art *jl.Article, authors []*jl.UnresolvedJourno, expectedJournoRef string) error {
func stash(tx *sql.Tx, art *jl.Article, authors []*jl.UnresolvedJourno, expectedJournoRef string, logPrefix string) error {

var err error
// sanity checks
@@ -29,7 +29,10 @@ func stash(tx *sql.Tx, art *jl.Article, authors []*jl.UnresolvedJourno, expected
if err == jl.ErrNoPub {
// not found - create a new one
err = jl.InsertPublication(tx, art.Publication)
infoLog.Printf("new publication [%d] %s\n", art.Publication.ID, art.Publication.ShortName)
if err != nil {
return err
}
infoLog.Printf("%snew publication [%d] %s\n", logPrefix, art.Publication.ID, art.Publication.ShortName)
}
if err != nil {
return err
@@ -53,15 +56,20 @@ func stash(tx *sql.Tx, art *jl.Article, authors []*jl.UnresolvedJourno, expected
// find/create journos

journos := []*jl.Journo{}

// de-dupe (just in case!)
authors = jl.UniqUnresolvedJournos(authors)
for _, author := range authors {
j, err := sussJourno(tx, author, art.Publication.ID, expectedJournoRef)
j, err := sussJourno(tx, author, art.Publication.ID, expectedJournoRef, logPrefix)
if err == ErrAmbiguousJourno {
// TODO: need a better mechanism to notify ambiguous journos!
// maybe a new database table?
warnLog.Printf("[a%d] %s\n", art.ID, err)
warnLog.Printf("%sAmbiguous Journo (%s) \n", logPrefix, author.Name)
continue
} else if err != nil {
return fmt.Errorf("sussJourno() failed: %s\n", err)
warnLog.Printf("%s%s (%s) \n", logPrefix, err, author.Name)
//
continue
}
journos = append(journos, j)
}
@@ -78,7 +86,7 @@ func stash(tx *sql.Tx, art *jl.Article, authors []*jl.UnresolvedJourno, expected
// journo_attr
_, err = tx.Exec("INSERT INTO journo_attr (journo_id,article_id) VALUES ($1,$2)", j.ID, art.ID)
if err != nil {
return err
return fmt.Errorf("failed to attribute journo (j%d to a%d): %s", j.ID, art.ID, err)
}

// apply journo activation policy
@@ -101,7 +109,7 @@ func stash(tx *sql.Tx, art *jl.Article, authors []*jl.UnresolvedJourno, expected
}

// find or create a journo
func sussJourno(tx *sql.Tx, author *jl.UnresolvedJourno, pubID int, expectedJournoRef string) (*jl.Journo, error) {
func sussJourno(tx *sql.Tx, author *jl.UnresolvedJourno, pubID int, expectedJournoRef string, logPrefix string) (*jl.Journo, error) {

prospects, err := jl.ResolveJourno(tx, author, pubID, expectedJournoRef)
if err != nil {
@@ -120,6 +128,6 @@ func sussJourno(tx *sql.Tx, author *jl.UnresolvedJourno, pubID int, expectedJour
if err != nil {
return nil, err
}
infoLog.Printf("new journo [j%d] %s\n", newJourno.ID, newJourno.Prettyname)
infoLog.Printf("%snew journo [j%d] %s\n", logPrefix, newJourno.ID, newJourno.Prettyname)
return newJourno, nil
}
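For context: stash now de-dupes authors via jl.UniqUnresolvedJournos before resolving them. That helper is not shown in this diff; a plausible minimal sketch, assuming the UnresolvedJourno Name field (used above in the warning messages) is the de-dupe key:

// Hypothetical sketch only — the real jl.UniqUnresolvedJournos may key on
// more than Name (e.g. a normalised ref); this just illustrates the intent.
func uniqByName(authors []*jl.UnresolvedJourno) []*jl.UnresolvedJourno {
	seen := map[string]bool{}
	out := make([]*jl.UnresolvedJourno, 0, len(authors))
	for _, a := range authors {
		if seen[a.Name] {
			continue
		}
		seen[a.Name] = true
		out = append(out, a)
	}
	return out
}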
(diffs for the remaining 4 changed files are not shown in this view)
