From ca2b4677039aa25e872da1d6a9229cf04e2ece64 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 7 Sep 2015 17:38:11 -0600 Subject: [PATCH 01/26] Renaming JsonPostResponse.go -> JsonResponse.go --- app/api/{JsonPostResponse.go => JsonResponse.go} | 1 + app/api/api.go | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) rename app/api/{JsonPostResponse.go => JsonResponse.go} (92%) diff --git a/app/api/JsonPostResponse.go b/app/api/JsonResponse.go similarity index 92% rename from app/api/JsonPostResponse.go rename to app/api/JsonResponse.go index f3a748d..ac4d16a 100644 --- a/app/api/JsonPostResponse.go +++ b/app/api/JsonResponse.go @@ -36,6 +36,7 @@ func (res *JsonResponse) write(w http.ResponseWriter) error { } else { out = string(str_items) } + w.Header().Set("Content-Type", "application/json; charset=utf-8") fmt.Fprint(w, out) return err } diff --git a/app/api/api.go b/app/api/api.go index 75f8174..51c5baf 100644 --- a/app/api/api.go +++ b/app/api/api.go @@ -25,7 +25,6 @@ func get_url_count(url *url.URL) int { // Actual API functions func random(c appengine.Context, w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json; charset=utf-8") count := get_url_count(r.URL) c.Infof("Requested %v random posts", count) result := NewJsonResponse(500, "Unknown Error", nil) From 3a5f3244f7664cff1be4452da33124a08aa6e3ed Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 7 Sep 2015 18:04:52 -0600 Subject: [PATCH 02/26] Begin spliting cron into multiple phases --- app/cron/crawler.go | 89 ++++++++++++++++++++++++++++++++++++++++++++ app/cron/cron.go | 1 + app/cron/parser.go | 63 +++++++++++++++++++++++++++++++ app/models/models.go | 5 ++- 4 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 app/cron/crawler.go create mode 100644 app/cron/parser.go diff --git a/app/cron/crawler.go b/app/cron/crawler.go new file mode 100644 index 0000000..c5dfbcd --- /dev/null +++ b/app/cron/crawler.go @@ -0,0 +1,89 @@ +package cron + +import ( + // "app/models" + // "app/helpers/keycache" + "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + "appengine/urlfetch" + "encoding/xml" + "encoding/json" + "fmt" + "net/http" +) + +// Sourcer: this is a source for defered work chains + +type ChivePost struct { + KEY string `xml:"guid"` + XML string `xml:",innerxml"` +} + +type ChivePostMiner struct { + Item ChivePost `xml:"channel>item"` +} + + +func crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { + url := page_url(0) + + // Get Response + c.Infof("Parsing index 0 (%v)", url) + resp, err := urlfetch.Client(c).Get(url) + if err != nil { + fmt.Fprint(w, "client error") + return + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + fmt.Fprint(w, "unexpected error code") + } + + // Decode Response + var feed []ChivePostMiner + decoder := xml.NewDecoder(resp.Body) + if err := decoder.Decode(&feed); err != nil { + c.Errorf("decode error %v", err) + fmt.Fprint(w, "decode error") + return + } + + feed[0].Item.XML = "" + feed[0].Item.XML + "" + + c.Infof("Something %v", feed) + + // TODO: store all items to datastore + + + // DEBUGGING ONLY.... 
HERE DOWN + + post, err := parseData(feed[0].Item.XML) + if err != nil { + c.Errorf("error parsing %v", err) + return + } + + // JSONIFY Response + str_items, err := json.MarshalIndent(&post, "", " ") + var out string + if err != nil { + out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" + } else { + out = string(str_items) + } + w.Header().Set("Content-Type", "application/json; charset=utf-8") + fmt.Fprint(w, out) +} + + +type FeedCrawler struct { + context appengine.Context + client *http.Client +} + +func (fc *FeedCrawler) Init(c appengine.Context) { + fc.context = c + fc.client = urlfetch.Client(c) +} diff --git a/app/cron/cron.go b/app/cron/cron.go index 7b92c57..f7ef0ca 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -25,6 +25,7 @@ const ( ) func Init() { + http.Handle("/cron/crawl", appstats.NewHandler(crawl)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) } diff --git a/app/cron/parser.go b/app/cron/parser.go new file mode 100644 index 0000000..e869203 --- /dev/null +++ b/app/cron/parser.go @@ -0,0 +1,63 @@ +package cron + +import ( + // "app/models" + // "app/helpers/keycache" + // "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + // "appengine/urlfetch" + "encoding/xml" + // "encoding/json" + // "fmt" + // "net/http" + "html/template" +) + +type Node struct { + // XML string `xml:",innerxml"` + // ATTR []string + // DATA string `xml:",chardata"` + XMLName xml.Name + XMLAttrs []xml.Attr `xml:",any"` + DATA string `xml:",chardata"` +} + +type Post struct { + Guid string `xml:"guid"` + Tags []string `xml:"category"` + Link string `xml:"link"` + Date string `xml:"pubDate"` + Title string `xml:"title"` + Creator string `xml:"creator"` + Media []Img `xml:"content"` + CommentRSS string `xml:"commentRss"` + Comment []string `xml:"comments"` + Desc template.HTML `xml:"description"` + Enclosure struct { + Url string `xml:"url,attr"` + Children []Node `xml:",any"` + } `xml:"enclosure"` + Thumbnail struct { + Url string `xml:"url,attr"` + Children []Node `xml:",any"` + } `xml:"thumbnail"` + Children []Node `xml:",any"` + Content template.HTML `xml:"encoded"` +} + +type Img struct { + Url string `xml:"url,attr"` + Title string `xml:"title"` + Rating string `xml:"rating"` + Category string `xml:"category"` +} + +// Worker: this will be a worker on defered work chains + +func parseData(data string) (*Post, error) { + var post Post + err := xml.Unmarshal([]byte(data), &post) + return &post, err +} diff --git a/app/models/models.go b/app/models/models.go index b7e363e..9795add 100644 --- a/app/models/models.go +++ b/app/models/models.go @@ -1,3 +1,6 @@ package models -const DB_POST_TABLE = "PostNew" +const ( + DB_POST_TABLE = "PostNew" + DB_RAW_XML_POST_TABLE = "RawXMLPosts" +) From c2f57667eb31b3d76188b2471b02da26322a041e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 7 Sep 2015 23:24:23 -0600 Subject: [PATCH 03/26] Breaking crawler up into pieces --- app/cron/crawler/Batcher.go | 23 +++++ app/cron/crawler/FeedCrawler.go | 137 ++++++++++++++++++++++++++++++ app/cron/{ => crawler}/crawler.go | 60 +++++++------ app/cron/cron.go | 4 +- 4 files changed, 197 insertions(+), 27 deletions(-) create mode 100644 app/cron/crawler/Batcher.go create mode 100644 app/cron/crawler/FeedCrawler.go rename app/cron/{ => crawler}/crawler.go (51%) diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go new file mode 100644 index 0000000..1c68a88 --- 
/dev/null +++ b/app/cron/crawler/Batcher.go @@ -0,0 +1,23 @@ +package crawler + +func Batcher(in <-chan ChivePost, batch_size int) <-chan []ChivePost { + out := make(chan []ChivePost) + go func() { + defer close(out) + batch := make([]ChivePost, batch_size) + count := 0 + for post := range in { + batch[count] = post + count++ + if count >= batch_size { + count = 0 + out <- batch + batch = make([]ChivePost, batch_size) // allocate another chunk of memory + } + } + if count > 0 { + out <- batch[:count] + } + }() + return out +} diff --git a/app/cron/crawler/FeedCrawler.go b/app/cron/crawler/FeedCrawler.go new file mode 100644 index 0000000..57a3af4 --- /dev/null +++ b/app/cron/crawler/FeedCrawler.go @@ -0,0 +1,137 @@ +package crawler + +import ( + // "app/models" + // "app/helpers/keycache" + "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + "appengine/urlfetch" + // "encoding/xml" + // "fmt" + "net/http" + "strconv" +) + +var ( + DEBUG = true + DEBUG_DEPTH = 1 +) + +func NewFeedCrawler(c appengine.Context) *FeedCrawler { + return &FeedCrawler{ + context: c, + client: urlfetch.Client(c), + results: make(chan ChivePost), + } +} + +type FeedCrawler struct { + context appengine.Context + client *http.Client + + todo []int + guids map[string]bool // this could be extremely large + results chan ChivePost +} + +func (fc *FeedCrawler) StartSearch() <-chan ChivePost { + go func() { + defer close(fc.results) + for i := 0; i < 99; i++ { + fc.results <- ChivePost{KEY:"asdf", XML:strconv.Itoa(i)} + } + // fc.search(1, -1) + }() + return fc.results +} + +func (fc *FeedCrawler) addRange(bot, top int) { + // TODO: isn't there a better way to perform this operation!? + for i := bot + 1; i < top; i++ { + fc.todo = append(fc.todo, i) + } +} + +// func (fc *FeedCrawler) search(bot, top int) (err error) { +// /* +// def infinite_length(bottom=1, top=-1): +// if bottom == 1 and not item_exists(1): return 0 # Starting edge case +// if bottom == top - 1: return bottom # Result found! (top doesn’t exist) +// if top < 0: # Searching forward +// top = bottom << 1 # Base 2 hops +// if item_exists(top): +// top, bottom = -1, top # continue searching forward +// else: # Binary search between bottom and top +// middle = (bottom + top) // 2 +// bottom, top = middle, top if item_exists(middle) else bottom, middle +// return infinite_length(bottom, top) # Tail recursion!!! +// */ +// if bot == top - 1 { +// fc.context.Infof("TOP OF RANGE FOUND! @%d", top) +// fc.addRange(bot, top) +// return nil +// } +// var full_stop, is_stop bool = false, false +// if top < 0 { // Searching forward +// top = bot << 1 // Base 2 hops forward +// is_stop, full_stop, err = fc.isStop(top) +// if err != nil { +// return err +// } +// if !is_stop { +// fc.addRange(bot, top) +// top, bot = -1, top +// } +// } else { // Binary search between top and bottom +// mid := (bot + top) / 2 +// is_stop, full_stop, err = fc.isStop(mid) +// if err != nil { +// return err +// } +// if is_stop { +// top = mid +// } else { +// fc.addRange(bot, mid) +// bot = mid +// } +// } +// if full_stop { +// return nil +// } +// return fc.search(bot, top) // TAIL RECURSION!!! 
+// } +// +// func (fc *FeedCrawler) isStop(idx int) (is_stop, full_stop bool, err error) { +// // Gather posts as necessary +// posts, err := fc.getAndParseFeed(idx) +// if err == FeedParse404Error { +// fc.context.Infof("Reached the end of the feed list (%v)", idx) +// return true, false, nil +// } +// if err != nil { +// fc.context.Errorf("Error decoding ChiveFeed: %s", err) +// return false, false, err +// } +// +// // Check for Duplicates +// store_count := 0 +// for _, post := range posts { +// id, _, err := guidToInt(post.Guid) +// if x.guids[id] || err != nil { +// continue +// } +// store_count += 1 +// } +// fc.posts = append(fc.posts, posts...) +// +// // Use store_count info to determine if isStop +// is_stop = store_count == 0 || DEBUG +// full_stop = len(posts) != store_count && store_count > 0 +// if DEBUG { +// is_stop = idx > DEBUG_DEPTH +// full_stop = idx == DEBUG_DEPTH +// } +// return +// } diff --git a/app/cron/crawler.go b/app/cron/crawler/crawler.go similarity index 51% rename from app/cron/crawler.go rename to app/cron/crawler/crawler.go index c5dfbcd..fd8f710 100644 --- a/app/cron/crawler.go +++ b/app/cron/crawler/crawler.go @@ -1,4 +1,4 @@ -package cron +package crawler import ( // "app/models" @@ -9,7 +9,6 @@ import ( // "appengine/taskqueue" "appengine/urlfetch" "encoding/xml" - "encoding/json" "fmt" "net/http" ) @@ -25,8 +24,12 @@ type ChivePostMiner struct { Item ChivePost `xml:"channel>item"` } +func page_url(idx int) string { + return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) +} + -func crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { +func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { url := page_url(0) // Get Response @@ -59,31 +62,36 @@ func crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { // DEBUGGING ONLY.... 
HERE DOWN - post, err := parseData(feed[0].Item.XML) - if err != nil { - c.Errorf("error parsing %v", err) - return - } - - // JSONIFY Response - str_items, err := json.MarshalIndent(&post, "", " ") - var out string - if err != nil { - out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" - } else { - out = string(str_items) - } - w.Header().Set("Content-Type", "application/json; charset=utf-8") - fmt.Fprint(w, out) + // post, err := parseData(feed[0].Item.XML) + // if err != nil { + // c.Errorf("error parsing %v", err) + // return + // } + // + // // JSONIFY Response + // str_items, err := json.MarshalIndent(&post, "", " ") + // var out string + // if err != nil { + // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" + // } else { + // out = string(str_items) + // } + // w.Header().Set("Content-Type", "application/json; charset=utf-8") + // fmt.Fprint(w, out) } - -type FeedCrawler struct { - context appengine.Context - client *http.Client +func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { + crawler := NewFeedCrawler(c) + found_posts := crawler.StartSearch() + batch_posts := Batcher(found_posts, 20) + Storage(batch_posts, c) } -func (fc *FeedCrawler) Init(c appengine.Context) { - fc.context = c - fc.client = urlfetch.Client(c) +func Storage(in <-chan []ChivePost, c appengine.Context) { + go func() { + for batch := range in { + fmt.Println(batch) + c.Infof("Storing %v", batch) + } + }() } diff --git a/app/cron/cron.go b/app/cron/cron.go index f7ef0ca..4f7f119 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -1,6 +1,7 @@ package cron import ( + "app/cron/crawler" "app/models" "app/helpers/keycache" "appengine" @@ -25,7 +26,8 @@ const ( ) func Init() { - http.Handle("/cron/crawl", appstats.NewHandler(crawl)) + http.Handle("/cron/crawl", appstats.NewHandler(crawler.Crawl)) + http.Handle("/cron/crawl2", appstats.NewHandler(crawler.Crawl2)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) } From e70b32956ac68088b67796363f70ec13943fabdb Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 14:32:07 -0700 Subject: [PATCH 04/26] Macking app work with new version of appengine builder --- app.yaml => yaml/app.yaml | 14 +++++--------- cron.yaml => yaml/cron.yaml | 1 + index.yaml => yaml/index.yaml | 0 yaml/main.go | 16 ++++++++++++++++ yaml/module-cron.yaml | 22 ++++++++++++++++++++++ 5 files changed, 44 insertions(+), 9 deletions(-) rename app.yaml => yaml/app.yaml (59%) rename cron.yaml => yaml/cron.yaml (85%) rename index.yaml => yaml/index.yaml (100%) create mode 100644 yaml/main.go create mode 100644 yaml/module-cron.yaml diff --git a/app.yaml b/yaml/app.yaml similarity index 59% rename from app.yaml rename to yaml/app.yaml index ebcc2cc..6efa065 100644 --- a/app.yaml +++ b/yaml/app.yaml @@ -8,19 +8,15 @@ skip_files: handlers: - url: /static - static_dir: static + static_dir: ../static - url: / - static_files: static/index.html - upload: static/index.html + static_files: ../static/index.html + upload: ../static/index.html - url: /(favicon\.ico|index\.html) - static_files: static/\1 - upload: static/(favicon\.ico|index\.html) - -- url: /cron/.* - script: _go_app - login: admin + static_files: ../static/\1 + upload: ../static/(favicon\.ico|index\.html) - url: /.* script: _go_app diff --git a/cron.yaml b/yaml/cron.yaml similarity index 85% rename from cron.yaml rename to yaml/cron.yaml index 407d25e..08dbe64 100644 --- 
a/cron.yaml +++ b/yaml/cron.yaml @@ -2,3 +2,4 @@ cron: - description: Parse feeds from source url: /cron/parse schedule: every 6 hours + target: cron diff --git a/index.yaml b/yaml/index.yaml similarity index 100% rename from index.yaml rename to yaml/index.yaml diff --git a/yaml/main.go b/yaml/main.go new file mode 100644 index 0000000..6cd9712 --- /dev/null +++ b/yaml/main.go @@ -0,0 +1,16 @@ +package main + +import ( + "net/http" + + "github.com/bign8/chive-show/app/api" + "github.com/bign8/chive-show/app/cron" +) + +func init() { + http.HandleFunc("/", http.NotFound) // Default Handler + + // Setup Other routes routes + api.Init() + cron.Init() +} diff --git a/yaml/module-cron.yaml b/yaml/module-cron.yaml new file mode 100644 index 0000000..a70f2c3 --- /dev/null +++ b/yaml/module-cron.yaml @@ -0,0 +1,22 @@ +application: crucial-alpha-706 +module: cron +version: uno +runtime: go +api_version: go1 +instance_class: B1 +basic_scaling: + max_instances: 1 + idle_timeout: 30m + +skip_files: +- test/* + +handlers: +- url: /cron/.* + script: _go_app + login: admin + +error_handlers: + - file: err/default.html + - error_code: over_quota + file: err/over_quota.html From dd9dd11325050784cae822239b127461b1b68b1e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 14:33:28 -0700 Subject: [PATCH 05/26] Breaking up cron into (fetcher, dePager, parser, batcher, saver) [in progress] --- app/cron/crawler/Batcher.go | 41 ++++---- app/cron/crawler/FeedCrawler.go | 137 --------------------------- app/cron/crawler/Fetcher.go | 155 +++++++++++++++++++++++++++++++ app/cron/crawler/UnPager.go | 20 ++++ app/cron/crawler/crawler.go | 159 ++++++++++++++++---------------- app/cron/cron.go | 3 + 6 files changed, 279 insertions(+), 236 deletions(-) delete mode 100644 app/cron/crawler/FeedCrawler.go create mode 100644 app/cron/crawler/Fetcher.go create mode 100644 app/cron/crawler/UnPager.go diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index 1c68a88..8bbaf2a 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -1,23 +1,24 @@ package crawler -func Batcher(in <-chan ChivePost, batch_size int) <-chan []ChivePost { - out := make(chan []ChivePost) - go func() { - defer close(out) - batch := make([]ChivePost, batch_size) - count := 0 - for post := range in { - batch[count] = post - count++ - if count >= batch_size { - count = 0 - out <- batch - batch = make([]ChivePost, batch_size) // allocate another chunk of memory - } - } - if count > 0 { - out <- batch[:count] - } - }() - return out +// Batcher takes input and batches to given sizes +func Batcher(in <-chan string, size int) <-chan []string { + out := make(chan []string) + go func() { + defer close(out) + batch := make([]string, size) + count := 0 + for post := range in { + batch[count] = post + count++ + if count >= size { + count = 0 + out <- batch + batch = make([]string, size) // allocate another chunk of memory + } + } + if count > 0 { + out <- batch[:count] + } + }() + return out } diff --git a/app/cron/crawler/FeedCrawler.go b/app/cron/crawler/FeedCrawler.go deleted file mode 100644 index 57a3af4..0000000 --- a/app/cron/crawler/FeedCrawler.go +++ /dev/null @@ -1,137 +0,0 @@ -package crawler - -import ( - // "app/models" - // "app/helpers/keycache" - "appengine" - // "appengine/datastore" - // "appengine/delay" - // "appengine/taskqueue" - "appengine/urlfetch" - // "encoding/xml" - // "fmt" - "net/http" - "strconv" -) - -var ( - DEBUG = true - DEBUG_DEPTH = 1 -) - -func NewFeedCrawler(c 
appengine.Context) *FeedCrawler { - return &FeedCrawler{ - context: c, - client: urlfetch.Client(c), - results: make(chan ChivePost), - } -} - -type FeedCrawler struct { - context appengine.Context - client *http.Client - - todo []int - guids map[string]bool // this could be extremely large - results chan ChivePost -} - -func (fc *FeedCrawler) StartSearch() <-chan ChivePost { - go func() { - defer close(fc.results) - for i := 0; i < 99; i++ { - fc.results <- ChivePost{KEY:"asdf", XML:strconv.Itoa(i)} - } - // fc.search(1, -1) - }() - return fc.results -} - -func (fc *FeedCrawler) addRange(bot, top int) { - // TODO: isn't there a better way to perform this operation!? - for i := bot + 1; i < top; i++ { - fc.todo = append(fc.todo, i) - } -} - -// func (fc *FeedCrawler) search(bot, top int) (err error) { -// /* -// def infinite_length(bottom=1, top=-1): -// if bottom == 1 and not item_exists(1): return 0 # Starting edge case -// if bottom == top - 1: return bottom # Result found! (top doesn’t exist) -// if top < 0: # Searching forward -// top = bottom << 1 # Base 2 hops -// if item_exists(top): -// top, bottom = -1, top # continue searching forward -// else: # Binary search between bottom and top -// middle = (bottom + top) // 2 -// bottom, top = middle, top if item_exists(middle) else bottom, middle -// return infinite_length(bottom, top) # Tail recursion!!! -// */ -// if bot == top - 1 { -// fc.context.Infof("TOP OF RANGE FOUND! @%d", top) -// fc.addRange(bot, top) -// return nil -// } -// var full_stop, is_stop bool = false, false -// if top < 0 { // Searching forward -// top = bot << 1 // Base 2 hops forward -// is_stop, full_stop, err = fc.isStop(top) -// if err != nil { -// return err -// } -// if !is_stop { -// fc.addRange(bot, top) -// top, bot = -1, top -// } -// } else { // Binary search between top and bottom -// mid := (bot + top) / 2 -// is_stop, full_stop, err = fc.isStop(mid) -// if err != nil { -// return err -// } -// if is_stop { -// top = mid -// } else { -// fc.addRange(bot, mid) -// bot = mid -// } -// } -// if full_stop { -// return nil -// } -// return fc.search(bot, top) // TAIL RECURSION!!! -// } -// -// func (fc *FeedCrawler) isStop(idx int) (is_stop, full_stop bool, err error) { -// // Gather posts as necessary -// posts, err := fc.getAndParseFeed(idx) -// if err == FeedParse404Error { -// fc.context.Infof("Reached the end of the feed list (%v)", idx) -// return true, false, nil -// } -// if err != nil { -// fc.context.Errorf("Error decoding ChiveFeed: %s", err) -// return false, false, err -// } -// -// // Check for Duplicates -// store_count := 0 -// for _, post := range posts { -// id, _, err := guidToInt(post.Guid) -// if x.guids[id] || err != nil { -// continue -// } -// store_count += 1 -// } -// fc.posts = append(fc.posts, posts...) 
-// -// // Use store_count info to determine if isStop -// is_stop = store_count == 0 || DEBUG -// full_stop = len(posts) != store_count && store_count > 0 -// if DEBUG { -// is_stop = idx > DEBUG_DEPTH -// full_stop = idx == DEBUG_DEPTH -// } -// return -// } diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go new file mode 100644 index 0000000..89a3c10 --- /dev/null +++ b/app/cron/crawler/Fetcher.go @@ -0,0 +1,155 @@ +package crawler + +import ( + "fmt" + "io/ioutil" + "net/http" + + "appengine" + "appengine/urlfetch" +) + +const ( + // DEBUG enable if troubleshooting algorithm + DEBUG = true + + // DEPTH depth of feed mining + DEPTH = 3 +) + +func pageURL(idx int) string { + return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) +} + +// Fetcher returns stream of un-processed xml posts +func Fetcher(c appengine.Context) <-chan string { + res := make(chan string) + worker := &fetcher{ + res: res, + context: c, + client: urlfetch.Client(c), + } + go worker.Main() + return res +} + +type fetcher struct { + res chan<- string + context appengine.Context + client *http.Client + todo chan int +} + +func (x *fetcher) Main() error { + // Check first item edge case + if isStop, err := x.isStop(1); isStop || err != nil { + x.context.Infof("Fetcher: Finished without recursive searching %v", err) + return err + } + + // Defer as many todo workers as necessary + x.todo = make(chan int) + go x.processTODO() + return x.Search(1, -1) +} + +func (x *fetcher) Search(bottom, top int) (err error) { + /* + def infinite_length(bottom=1, top=-1): + if bottom == 1 and not item_exists(1): return 0 # Starting edge case + if bottom == top - 1: return bottom # Result found! (top doesn’t exist) + if top < 0: # Searching forward + top = bottom << 1 # Base 2 hops + if item_exists(top): + top, bottom = -1, top # continue searching forward + else: # Binary search between bottom and top + middle = (bottom + top) // 2 + bottom, top = middle, top if item_exists(middle) else bottom, middle + return infinite_length(bottom, top) # Tail recursion!!! + */ + if bottom == top-1 { + x.context.Infof("Fetcher: TOP OF RANGE FOUND! @%d", top) + x.addRange(bottom, top) + close(x.res) + return nil + } + x.context.Infof("Fetcher: Search(%d, %d)", bottom, top) + var isStop = false + + // Searching forward + if top < 0 { + top = bottom << 1 // Base 2 hops forward + isStop, err = x.isStop(top) + if err != nil { + close(x.res) + return err + } + if !isStop { + x.addRange(bottom, top) + top, bottom = -1, top + } + + // Binary search between top and bottom + } else { + middle := (bottom + top) / 2 + isStop, err = x.isStop(middle) + if err != nil { + close(x.res) + return err + } + if isStop { + top = middle + } else { + x.addRange(bottom, middle) + bottom = middle + } + } + return x.Search(bottom, top) // TAIL RECURSION!!! 
+} + +func (x *fetcher) isStop(idx int) (isStop bool, err error) { + + // Gather posts as necessary + url := pageURL(idx) + x.context.Infof("Fetcher: Fetching %s", url) + resp, err := x.client.Get(url) + if err != nil { + x.context.Errorf("Fetcher: Error decoding ChiveFeed: %s", err) + return true, err + } + defer resp.Body.Close() + + // Check Response Codes for non-200 responses + if resp.StatusCode != 200 { + if resp.StatusCode == 404 { + x.context.Infof("Fetcher: Reached the end of the feed list (%v)", idx) + return true, nil + } + return true, fmt.Errorf("Fetcher: Feed parcing received a %d Status Code on (%s)", resp.StatusCode, url) + } + + // Pull response content into String + contents, err := ioutil.ReadAll(resp.Body) + if err != nil { + return true, err + } + x.res <- string(contents) + + // Use store_count info to determine if isStop + if DEBUG { + isStop = idx >= DEPTH + } + return isStop, nil +} + +func (x *fetcher) addRange(bottom, top int) { + for i := bottom + 1; i < top; i++ { + x.todo <- i + } +} + +func (x *fetcher) processTODO() { + for idx := range x.todo { + x.isStop(idx) + } +} diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go new file mode 100644 index 0000000..fdbb5d0 --- /dev/null +++ b/app/cron/crawler/UnPager.go @@ -0,0 +1,20 @@ +package crawler + +import "appengine" + +// UnPager process pages of posts to individual posts +func UnPager(c appengine.Context, pages <-chan string) <-chan string { + res := make(chan string) + go runUnPager(c, pages, res) + return res +} + +func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { + defer close(out) + + for page := range in { + c.Infof("Retrieved Page %s", page) + + // TODO: decompress page + } +} diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index fd8f710..5e240b8 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -1,97 +1,98 @@ package crawler import ( - // "app/models" - // "app/helpers/keycache" - "appengine" - // "appengine/datastore" - // "appengine/delay" - // "appengine/taskqueue" - "appengine/urlfetch" - "encoding/xml" - "fmt" - "net/http" + // "app/models" + // "app/helpers/keycache" + "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + "encoding/xml" + "fmt" + "net/http" + + "appengine/urlfetch" ) // Sourcer: this is a source for defered work chains -type ChivePost struct { - KEY string `xml:"guid"` - XML string `xml:",innerxml"` +type chivePost struct { + KEY string `xml:"guid"` + XML string `xml:",innerxml"` } -type ChivePostMiner struct { - Item ChivePost `xml:"channel>item"` +type chivePostMiner struct { + Item []chivePost `xml:"channel>item"` } -func page_url(idx int) string { - return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) -} - - func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { - url := page_url(0) - - // Get Response - c.Infof("Parsing index 0 (%v)", url) - resp, err := urlfetch.Client(c).Get(url) - if err != nil { - fmt.Fprint(w, "client error") - return - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - fmt.Fprint(w, "unexpected error code") - } - - // Decode Response - var feed []ChivePostMiner - decoder := xml.NewDecoder(resp.Body) - if err := decoder.Decode(&feed); err != nil { - c.Errorf("decode error %v", err) - fmt.Fprint(w, "decode error") - return - } - - feed[0].Item.XML = "" + feed[0].Item.XML + "" - - c.Infof("Something %v", feed) - - // TODO: store all items to datastore - - - // DEBUGGING ONLY.... 
HERE DOWN - - // post, err := parseData(feed[0].Item.XML) - // if err != nil { - // c.Errorf("error parsing %v", err) - // return - // } - // - // // JSONIFY Response - // str_items, err := json.MarshalIndent(&post, "", " ") - // var out string - // if err != nil { - // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" - // } else { - // out = string(str_items) - // } - // w.Header().Set("Content-Type", "application/json; charset=utf-8") - // fmt.Fprint(w, out) + url := pageURL(9999) + + // Get Response + c.Infof("Parsing index 0 (%v)", url) + resp, err := urlfetch.Client(c).Get(url) + if err != nil { + fmt.Fprint(w, "client error") + return + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + fmt.Fprint(w, "unexpected error code") + } + + // Decode Response + var feed chivePostMiner + decoder := xml.NewDecoder(resp.Body) + if err := decoder.Decode(&feed); err != nil { + c.Errorf("decode error %v", err) + fmt.Fprint(w, "decode error") + return + } + + // Wrap posts in xml + for idx, post := range feed.Item { + feed.Item[idx].XML = "" + post.XML + "" + } + + c.Infof("Something %v", feed) + + // TODO: store all items to datastore + + // DEBUGGING ONLY.... HERE DOWN + + // post, err := parseData(feed[0].Item.XML) + // if err != nil { + // c.Errorf("error parsing %v", err) + // return + // } + // + // // JSONIFY Response + // str_items, err := json.MarshalIndent(&post, "", " ") + // var out string + // if err != nil { + // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" + // } else { + // out = string(str_items) + // } + // w.Header().Set("Content-Type", "application/json; charset=utf-8") + // fmt.Fprint(w, out) } func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { - crawler := NewFeedCrawler(c) - found_posts := crawler.StartSearch() - batch_posts := Batcher(found_posts, 20) - Storage(batch_posts, c) + pages := Fetcher(c) + for _ = range pages { + c.Infof("Found page") + } + // posts := UnPager(c, pages) + // batch := Batcher(posts, 20) + // Storage(c, batch) } -func Storage(in <-chan []ChivePost, c appengine.Context) { - go func() { - for batch := range in { - fmt.Println(batch) - c.Infof("Storing %v", batch) - } - }() +func Storage(c appengine.Context, in <-chan []string) { + go func() { + for batch := range in { + fmt.Println(batch) + c.Infof("Storing %v", batch) + } + }() } diff --git a/app/cron/cron.go b/app/cron/cron.go index 08808b5..475c1e7 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -41,6 +41,9 @@ func Init() { http.Handle("/cron/crawl2", appstats.NewHandler(crawler.Crawl2)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) + http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { + + }) } var ( From 87aa042c7fa8f298eae768b86548acf03cdf6fb3 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 14:44:38 -0700 Subject: [PATCH 06/26] Cleaning up fetcher channel closures --- app/cron/crawler/Fetcher.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 89a3c10..650ce58 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -11,7 +11,7 @@ import ( const ( // DEBUG enable if troubleshooting algorithm - DEBUG = true + DEBUG = false // DEPTH depth of feed mining DEPTH = 3 @@ -41,6 +41,8 @@ type fetcher struct { } func (x *fetcher) Main() error { + defer close(x.res) + // 
Check first item edge case if isStop, err := x.isStop(1); isStop || err != nil { x.context.Infof("Fetcher: Finished without recursive searching %v", err) @@ -49,6 +51,7 @@ func (x *fetcher) Main() error { // Defer as many todo workers as necessary x.todo = make(chan int) + defer close(x.todo) go x.processTODO() return x.Search(1, -1) } @@ -70,7 +73,6 @@ func (x *fetcher) Search(bottom, top int) (err error) { if bottom == top-1 { x.context.Infof("Fetcher: TOP OF RANGE FOUND! @%d", top) x.addRange(bottom, top) - close(x.res) return nil } x.context.Infof("Fetcher: Search(%d, %d)", bottom, top) @@ -81,7 +83,6 @@ func (x *fetcher) Search(bottom, top int) (err error) { top = bottom << 1 // Base 2 hops forward isStop, err = x.isStop(top) if err != nil { - close(x.res) return err } if !isStop { @@ -94,7 +95,6 @@ func (x *fetcher) Search(bottom, top int) (err error) { middle := (bottom + top) / 2 isStop, err = x.isStop(middle) if err != nil { - close(x.res) return err } if isStop { @@ -150,6 +150,7 @@ func (x *fetcher) addRange(bottom, top int) { func (x *fetcher) processTODO() { for idx := range x.todo { - x.isStop(idx) + x.context.Infof("Fetcher: NOT processing TODO %d", idx) + //x.isStop(idx) } } From a64b79080e9ea9323357f2ff5c4d57ea6253e7b9 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 15:06:55 -0700 Subject: [PATCH 07/26] Finishing un-paginator --- app/cron/crawler/Fetcher.go | 4 ++-- app/cron/crawler/UnPager.go | 26 +++++++++++++++++++++++--- app/cron/crawler/crawler.go | 9 +++++---- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 650ce58..6cf8252 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -11,10 +11,10 @@ import ( const ( // DEBUG enable if troubleshooting algorithm - DEBUG = false + DEBUG = true // DEPTH depth of feed mining - DEPTH = 3 + DEPTH = 1 ) func pageURL(idx int) string { diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go index fdbb5d0..ef3a800 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -1,10 +1,16 @@ package crawler -import "appengine" +import ( + "encoding/xml" + + "appengine" +) // UnPager process pages of posts to individual posts func UnPager(c appengine.Context, pages <-chan string) <-chan string { res := make(chan string) + + // TODO: spin up as many unpages as desired go runUnPager(c, pages, res) return res } @@ -12,9 +18,23 @@ func UnPager(c appengine.Context, pages <-chan string) <-chan string { func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { defer close(out) + var miner struct { + Item []struct { + KEY string `xml:"guid"` + XML string `xml:",innerxml"` + } `xml:"channel>item"` + } + for page := range in { - c.Infof("Retrieved Page %s", page) + c.Infof("UnPager: Retrieved Page") + + if err := xml.Unmarshal([]byte(page), &miner); err != nil { + c.Errorf("UnPager: Error %s", err) + } - // TODO: decompress page + for _, post := range miner.Item { + c.Infof("UnPager: Found Post %s", post.KEY) + out <- post.XML + } } } diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 5e240b8..418c620 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -26,7 +26,7 @@ type chivePostMiner struct { } func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { - url := pageURL(9999) + url := pageURL(1) // Get Response c.Infof("Parsing index 0 (%v)", url) @@ -79,11 +79,12 @@ func Crawl(c appengine.Context, w 
http.ResponseWriter, r *http.Request) { } func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { + // fetcher, dePager, parser, batcher, saver pages := Fetcher(c) - for _ = range pages { - c.Infof("Found page") + posts := UnPager(c, pages) + for post := range posts { + c.Infof("Post: %v", post) } - // posts := UnPager(c, pages) // batch := Batcher(posts, 20) // Storage(c, batch) } From ad46e1cd8c71bdba49fe7c5135d06986a7bf9e1e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 15:42:48 -0700 Subject: [PATCH 08/26] Storing crawled posts --- app/cron/crawler/Batcher.go | 8 ++-- app/cron/crawler/Fetcher.go | 8 ---- app/cron/crawler/Storage.go | 33 +++++++++++++ app/cron/crawler/UnPager.go | 11 +++-- app/cron/crawler/crawler.go | 93 +++++++------------------------------ app/cron/cron.go | 1 - 6 files changed, 60 insertions(+), 94 deletions(-) create mode 100644 app/cron/crawler/Storage.go diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index 8bbaf2a..a4a99f3 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -1,11 +1,11 @@ package crawler // Batcher takes input and batches to given sizes -func Batcher(in <-chan string, size int) <-chan []string { - out := make(chan []string) +func Batcher(in <-chan Data, size int) <-chan []Data { + out := make(chan []Data) go func() { defer close(out) - batch := make([]string, size) + batch := make([]Data, size) count := 0 for post := range in { batch[count] = post @@ -13,7 +13,7 @@ func Batcher(in <-chan string, size int) <-chan []string { if count >= size { count = 0 out <- batch - batch = make([]string, size) // allocate another chunk of memory + batch = make([]Data, size) // allocate another chunk of memory } } if count > 0 { diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 6cf8252..9d147c3 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -9,14 +9,6 @@ import ( "appengine/urlfetch" ) -const ( - // DEBUG enable if troubleshooting algorithm - DEBUG = true - - // DEPTH depth of feed mining - DEPTH = 1 -) - func pageURL(idx int) string { return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) } diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go new file mode 100644 index 0000000..b2a1346 --- /dev/null +++ b/app/cron/crawler/Storage.go @@ -0,0 +1,33 @@ +package crawler + +import ( + "appengine" + "appengine/datastore" +) + +func Storage(c appengine.Context, in <-chan []Data) { + runStorage(c, in) +} + +type Store struct { + XML []byte +} + +func runStorage(c appengine.Context, in <-chan []Data) { + var keys []*datastore.Key + var items []Store + for batch := range in { + keys = make([]*datastore.Key, len(batch)) + items = make([]Store, len(batch)) + for i, item := range batch { + keys[i] = datastore.NewKey(c, XML, item.KEY, 0, nil) + items[i].XML = []byte(item.XML) + } + + c.Infof("Storage: Storing %v", keys) + _, err := datastore.PutMulti(c, keys, items) + if err != nil { + c.Errorf("Storage: Error storing batch %s", err) + } + } +} diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go index ef3a800..f7ac41b 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -7,15 +7,15 @@ import ( ) // UnPager process pages of posts to individual posts -func UnPager(c appengine.Context, pages <-chan string) <-chan string { - res := make(chan string) +func UnPager(c appengine.Context, pages <-chan string) <-chan Data { + res := make(chan Data) // TODO: spin up as many 
unpages as desired go runUnPager(c, pages, res) return res } -func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { +func runUnPager(c appengine.Context, in <-chan string, out chan<- Data) { defer close(out) var miner struct { @@ -34,7 +34,10 @@ func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { for _, post := range miner.Item { c.Infof("UnPager: Found Post %s", post.KEY) - out <- post.XML + out <- Data{ + KEY: post.KEY, + XML: post.XML, + } } } } diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 418c620..5227b3f 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -7,93 +7,32 @@ import ( // "appengine/datastore" // "appengine/delay" // "appengine/taskqueue" - "encoding/xml" + "fmt" "net/http" - - "appengine/urlfetch" ) -// Sourcer: this is a source for defered work chains - -type chivePost struct { - KEY string `xml:"guid"` - XML string `xml:",innerxml"` -} - -type chivePostMiner struct { - Item []chivePost `xml:"channel>item"` -} - -func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { - url := pageURL(1) - - // Get Response - c.Infof("Parsing index 0 (%v)", url) - resp, err := urlfetch.Client(c).Get(url) - if err != nil { - fmt.Fprint(w, "client error") - return - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - fmt.Fprint(w, "unexpected error code") - } - - // Decode Response - var feed chivePostMiner - decoder := xml.NewDecoder(resp.Body) - if err := decoder.Decode(&feed); err != nil { - c.Errorf("decode error %v", err) - fmt.Fprint(w, "decode error") - return - } - - // Wrap posts in xml - for idx, post := range feed.Item { - feed.Item[idx].XML = "" + post.XML + "" - } - - c.Infof("Something %v", feed) +const ( + // DEBUG enable if troubleshooting algorithm + DEBUG = false - // TODO: store all items to datastore + // DEPTH depth of feed mining + DEPTH = 1 - // DEBUGGING ONLY.... 
HERE DOWN + // XML name of where xml posts are stored + XML = "xml" +) - // post, err := parseData(feed[0].Item.XML) - // if err != nil { - // c.Errorf("error parsing %v", err) - // return - // } - // - // // JSONIFY Response - // str_items, err := json.MarshalIndent(&post, "", " ") - // var out string - // if err != nil { - // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" - // } else { - // out = string(str_items) - // } - // w.Header().Set("Content-Type", "application/json; charset=utf-8") - // fmt.Fprint(w, out) +type Data struct { + KEY string + XML string } -func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { +func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { // fetcher, dePager, parser, batcher, saver pages := Fetcher(c) posts := UnPager(c, pages) - for post := range posts { - c.Infof("Post: %v", post) - } - // batch := Batcher(posts, 20) - // Storage(c, batch) -} - -func Storage(c appengine.Context, in <-chan []string) { - go func() { - for batch := range in { - fmt.Println(batch) - c.Infof("Storing %v", batch) - } - }() + batch := Batcher(posts, 50) + Storage(c, batch) + fmt.Fprint(w, "Crawl Complete!") } diff --git a/app/cron/cron.go b/app/cron/cron.go index 475c1e7..aa23eca 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -38,7 +38,6 @@ const ( // Init initializes cron handlers func Init() { http.Handle("/cron/crawl", appstats.NewHandler(crawler.Crawl)) - http.Handle("/cron/crawl2", appstats.NewHandler(crawler.Crawl2)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { From 8dca8c35c8546ade6f7abfe8dcb02afa3e08b22e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Tue, 10 Nov 2015 00:49:08 -0700 Subject: [PATCH 09/26] Splitting up cron into multiple parts --- app/cron/crawler/Batcher.go | 7 ++++-- app/cron/crawler/Fetcher.go | 49 ++++++++++++++++++++++++++----------- app/cron/crawler/Storage.go | 19 +++++++++++--- app/cron/crawler/UnPager.go | 27 ++++++++++++++------ app/cron/crawler/crawler.go | 36 ++++++++++++++++++++++----- app/cron/cron.go | 6 +++-- 6 files changed, 108 insertions(+), 36 deletions(-) diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index a4a99f3..3ff3d77 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -1,8 +1,10 @@ package crawler +import "appengine" + // Batcher takes input and batches to given sizes -func Batcher(in <-chan Data, size int) <-chan []Data { - out := make(chan []Data) +func Batcher(c appengine.Context, in <-chan Data, size int) <-chan []Data { + out := make(chan []Data, 10000) go func() { defer close(out) batch := make([]Data, size) @@ -16,6 +18,7 @@ func Batcher(in <-chan Data, size int) <-chan []Data { batch = make([]Data, size) // allocate another chunk of memory } } + c.Infof("Batcher: Finished Batching") if count > 0 { out <- batch[:count] } diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 9d147c3..c43c9a5 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -4,6 +4,8 @@ import ( "fmt" "io/ioutil" "net/http" + "sync" + "time" "appengine" "appengine/urlfetch" @@ -14,25 +16,25 @@ func pageURL(idx int) string { } // Fetcher returns stream of un-processed xml posts -func Fetcher(c appengine.Context) <-chan string { - res := make(chan string) +func Fetcher(c appengine.Context, workers int) <-chan Data { + res := make(chan 
Data, 100) worker := &fetcher{ res: res, context: c, client: urlfetch.Client(c), } - go worker.Main() + go worker.Main(workers) return res } type fetcher struct { - res chan<- string + res chan<- Data context appengine.Context client *http.Client todo chan int } -func (x *fetcher) Main() error { +func (x *fetcher) Main(workers int) error { defer close(x.res) // Check first item edge case @@ -42,10 +44,24 @@ func (x *fetcher) Main() error { } // Defer as many todo workers as necessary - x.todo = make(chan int) - defer close(x.todo) - go x.processTODO() - return x.Search(1, -1) + x.todo = make(chan int, 1000) + + // Number of batch fetchers to process + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + go func(idx int) { + x.processTODO() + wg.Done() + }(i) + } + wg.Add(workers) + + err := x.Search(1, -1) + + // wait for processTODOs to finish + wg.Wait() + x.context.Infof("Complete with FETCHING") + return err } func (x *fetcher) Search(bottom, top int) (err error) { @@ -65,6 +81,7 @@ func (x *fetcher) Search(bottom, top int) (err error) { if bottom == top-1 { x.context.Infof("Fetcher: TOP OF RANGE FOUND! @%d", top) x.addRange(bottom, top) + close(x.todo) return nil } x.context.Infof("Fetcher: Search(%d, %d)", bottom, top) @@ -106,8 +123,9 @@ func (x *fetcher) isStop(idx int) (isStop bool, err error) { x.context.Infof("Fetcher: Fetching %s", url) resp, err := x.client.Get(url) if err != nil { - x.context.Errorf("Fetcher: Error decoding ChiveFeed: %s", err) - return true, err + x.context.Errorf("Fetcher: Error decoding ChiveFeed (1s sleep): %s", err) + time.Sleep(time.Second) + return x.isStop(idx) // Tail recursion (this loop may get us into trouble) } defer resp.Body.Close() @@ -125,7 +143,10 @@ func (x *fetcher) isStop(idx int) (isStop bool, err error) { if err != nil { return true, err } - x.res <- string(contents) + x.res <- Data{ + KEY: url, + XML: string(contents), + } // Use store_count info to determine if isStop if DEBUG { @@ -142,7 +163,7 @@ func (x *fetcher) addRange(bottom, top int) { func (x *fetcher) processTODO() { for idx := range x.todo { - x.context.Infof("Fetcher: NOT processing TODO %d", idx) - //x.isStop(idx) + // x.context.Infof("Fetcher: NOT processing TODO %d", idx) + x.isStop(idx) } } diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go index b2a1346..6d62bbc 100644 --- a/app/cron/crawler/Storage.go +++ b/app/cron/crawler/Storage.go @@ -1,22 +1,33 @@ package crawler import ( + "sync" + "appengine" "appengine/datastore" ) -func Storage(c appengine.Context, in <-chan []Data) { - runStorage(c, in) +func Storage(c appengine.Context, in <-chan []Data, workers int) { + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + go func(x int) { + runStorage(c, in, x) + wg.Done() + }(i) + } + wg.Add(workers) + wg.Wait() } type Store struct { XML []byte } -func runStorage(c appengine.Context, in <-chan []Data) { +func runStorage(c appengine.Context, in <-chan []Data, x int) { var keys []*datastore.Key var items []Store for batch := range in { + c.Infof("Storage %d: Storing chunk", x) keys = make([]*datastore.Key, len(batch)) items = make([]Store, len(batch)) for i, item := range batch { @@ -24,7 +35,7 @@ func runStorage(c appengine.Context, in <-chan []Data) { items[i].XML = []byte(item.XML) } - c.Infof("Storage: Storing %v", keys) + // c.Infof("Storage: Storing %v", keys) _, err := datastore.PutMulti(c, keys, items) if err != nil { c.Errorf("Storage: Error storing batch %s", err) diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go 
index f7ac41b..2a3a21d 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -2,22 +2,33 @@ package crawler import ( "encoding/xml" + "sync" "appengine" ) // UnPager process pages of posts to individual posts -func UnPager(c appengine.Context, pages <-chan string) <-chan Data { - res := make(chan Data) +func UnPager(c appengine.Context, pages <-chan string, workers int) <-chan Data { + res := make(chan Data, 100000) // TODO: spin up as many unpages as desired - go runUnPager(c, pages, res) + var wg sync.WaitGroup + wg.Add(workers) + for i := 0; i < workers; i++ { + go func(x int) { + runUnPager(c, pages, res, x) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(res) + }() + return res } -func runUnPager(c appengine.Context, in <-chan string, out chan<- Data) { - defer close(out) - +func runUnPager(c appengine.Context, in <-chan string, out chan<- Data, idx int) { var miner struct { Item []struct { KEY string `xml:"guid"` @@ -26,14 +37,14 @@ func runUnPager(c appengine.Context, in <-chan string, out chan<- Data) { } for page := range in { - c.Infof("UnPager: Retrieved Page") + c.Infof("UnPager %d: Retrieved Page", idx) if err := xml.Unmarshal([]byte(page), &miner); err != nil { c.Errorf("UnPager: Error %s", err) } for _, post := range miner.Item { - c.Infof("UnPager: Found Post %s", post.KEY) + // c.Infof("UnPager: Found Post %s", post.KEY) out <- Data{ KEY: post.KEY, XML: post.XML, diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 5227b3f..8e322ce 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -3,8 +3,9 @@ package crawler import ( // "app/models" // "app/helpers/keycache" + "appengine" - // "appengine/datastore" + "appengine/datastore" // "appengine/delay" // "appengine/taskqueue" @@ -28,11 +29,34 @@ type Data struct { XML string } -func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { +func Crawl(w http.ResponseWriter, r *http.Request) { + c := appengine.NewContext(r) + + fetchers, storers := 50, 20 + // fetcher, dePager, parser, batcher, saver - pages := Fetcher(c) - posts := UnPager(c, pages) - batch := Batcher(posts, 50) - Storage(c, batch) + pages := Fetcher(c, fetchers) + // posts := UnPager(c, pages, pagers) + batch := Batcher(c, pages, 10) + Storage(c, batch, storers) + fmt.Fprint(w, "Crawl Complete!") } + +func Stats(c appengine.Context, w http.ResponseWriter, r *http.Request) { + + q := datastore.NewQuery("xml") + + var data []Store + keys, err := q.GetAll(c, &data) + if err != nil { + fmt.Fprintf(w, "Error %s", err) + return + } + + for idx, key := range keys { + fmt.Fprintf(w, "Data %s: len %d\n", key, len(data[idx].XML)) + } + + fmt.Fprintf(w, "Overall %d", len(data)) +} diff --git a/app/cron/cron.go b/app/cron/cron.go index aa23eca..4bc1439 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -37,11 +37,13 @@ const ( // Init initializes cron handlers func Init() { - http.Handle("/cron/crawl", appstats.NewHandler(crawler.Crawl)) + http.HandleFunc("/cron/crawl", crawler.Crawl) + http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) + http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { - + fmt.Fprintf(w, "Here boys") }) } From 8738e9a7a7e290de40fdea62adbe5b7951822e14 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sat, 14 Nov 2015 10:32:31 -0700 Subject: [PATCH 10/26] Doing a bad idea (storing all the things super flat) --- 
app/cron/crawler/Batcher.go | 8 +-- app/cron/crawler/Fetcher.go | 6 +-- app/cron/crawler/Miner.go | 76 ++++++++++++++++++++++++++ app/cron/crawler/Storage.go | 103 +++++++++++++++++++++++++++++++++--- app/cron/crawler/UnPager.go | 31 ++++++++++- app/cron/crawler/crawler.go | 7 ++- app/cron/cron.go | 32 +++++++++-- 7 files changed, 242 insertions(+), 21 deletions(-) create mode 100644 app/cron/crawler/Miner.go diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index 3ff3d77..720b28e 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -3,11 +3,11 @@ package crawler import "appengine" // Batcher takes input and batches to given sizes -func Batcher(c appengine.Context, in <-chan Data, size int) <-chan []Data { - out := make(chan []Data, 10000) +func Batcher(c appengine.Context, in <-chan interface{}, size int) <-chan []interface{} { + out := make(chan []interface{}, 10000) go func() { defer close(out) - batch := make([]Data, size) + batch := make([]interface{}, size) count := 0 for post := range in { batch[count] = post @@ -15,7 +15,7 @@ func Batcher(c appengine.Context, in <-chan Data, size int) <-chan []Data { if count >= size { count = 0 out <- batch - batch = make([]Data, size) // allocate another chunk of memory + batch = make([]interface{}, size) // allocate another chunk of memory } } c.Infof("Batcher: Finished Batching") diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index c43c9a5..9afd088 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -16,8 +16,8 @@ func pageURL(idx int) string { } // Fetcher returns stream of un-processed xml posts -func Fetcher(c appengine.Context, workers int) <-chan Data { - res := make(chan Data, 100) +func Fetcher(c appengine.Context, workers int) <-chan interface{} { + res := make(chan interface{}, 100) worker := &fetcher{ res: res, context: c, @@ -28,7 +28,7 @@ func Fetcher(c appengine.Context, workers int) <-chan Data { } type fetcher struct { - res chan<- Data + res chan<- interface{} context appengine.Context client *http.Client todo chan int diff --git a/app/cron/crawler/Miner.go b/app/cron/crawler/Miner.go new file mode 100644 index 0000000..3a8133b --- /dev/null +++ b/app/cron/crawler/Miner.go @@ -0,0 +1,76 @@ +package crawler + +import ( + "encoding/xml" + "sync" + "time" + + "appengine" +) + +// Vertex of the graph +type Vertex struct { + Type string + Value string + Count int64 +} + +// Edge of the graph +type Edge struct { + Nodes []string +} + +// Miner takes posts and mines out a graph +func Miner(c appengine.Context, posts <-chan Data, workers int) (<-chan interface{}, <-chan interface{}) { + vertexes := make(chan interface{}, 10000) + edges := make(chan interface{}, 10000) + + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + go func(i int) { + miner(c, posts, vertexes, edges, i) + wg.Done() + }(i) + } + wg.Add(workers) + + go func() { + wg.Wait() + close(vertexes) + close(edges) + }() + return vertexes, edges +} + +func miner(c appengine.Context, posts <-chan Data, vertexes chan<- interface{}, edges chan<- interface{}, i int) { + var data struct { + Tags []string `xml:"category"` + Imgs []struct { + URL string `xml:"url,attr"` + } `xml:"content"` + } + + for post := range posts { + vertexes <- Vertex{"Pst", post.KEY, 0} + + // log.Printf("Miner %d: Got Post: %s", i, post.KEY) + // log.Printf("Data: %s", post.XML) + + if err := xml.Unmarshal([]byte(""+post.XML+""), &data); err != nil { + c.Errorf("Miner %d: Error %s", i, err) + } + + for _, 
tag := range data.Tags { + // log.Printf("Found Tag: %s", tag) + vertexes <- Vertex{"Tag", tag, 0} + edges <- Edge{[]string{"Tag" + tag, "Pst" + post.KEY}} + } + + for _, img := range data.Imgs { + // log.Printf("Found Img: %s", img.URL) + vertexes <- Vertex{"Img", img.URL, 0} + edges <- Edge{[]string{"Img" + img.URL, "Pst" + post.KEY}} + } + time.Sleep(time.Second) + } +} diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go index 6d62bbc..2e803c6 100644 --- a/app/cron/crawler/Storage.go +++ b/app/cron/crawler/Storage.go @@ -7,11 +7,23 @@ import ( "appengine/datastore" ) -func Storage(c appengine.Context, in <-chan []Data, workers int) { +// Storage push items to datastore +func Storage(c appengine.Context, in <-chan []interface{}, workers int, loc string) { + var store func(c appengine.Context, in <-chan []interface{}, x int, loc string) + + switch loc { + case XML: + store = runStorageData + case "vertex": + store = runStorageVertex + case "edge": + store = runStorageEdge + } + var wg sync.WaitGroup for i := 0; i < workers; i++ { go func(x int) { - runStorage(c, in, x) + store(c, in, x, loc) wg.Done() }(i) } @@ -19,26 +31,103 @@ func Storage(c appengine.Context, in <-chan []Data, workers int) { wg.Wait() } +// Puller pull items from datastore +// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) +func Puller(c appengine.Context, loc string) <-chan string { + out := make(chan string, 10000) + + go func() { + defer close(out) + q := datastore.NewQuery(loc) + t := q.Run(c) + for { + var s Store + _, err := t.Next(&s) + if err == datastore.Done { + break // No further entities match the query. + } + if err != nil { + c.Errorf("fetching next Person: %v", err) + break + } + + // Do something with Person p and Key k + out <- string(s.XML) + } + }() + return out +} + +// Store single xml item to put in storage type Store struct { XML []byte } -func runStorage(c appengine.Context, in <-chan []Data, x int) { +func runStorageData(c appengine.Context, in <-chan []interface{}, x int, loc string) { var keys []*datastore.Key var items []Store + for batch := range in { - c.Infof("Storage %d: Storing chunk", x) + c.Infof("Storage %d: Storing Post chunk", x) keys = make([]*datastore.Key, len(batch)) items = make([]Store, len(batch)) for i, item := range batch { - keys[i] = datastore.NewKey(c, XML, item.KEY, 0, nil) - items[i].XML = []byte(item.XML) + x := item.(Data) + keys[i] = datastore.NewKey(c, loc, x.KEY, 0, nil) + items[i] = Store{[]byte(x.XML)} + } + + // c.Infof("Storage: Storing %v", keys) + _, err := datastore.PutMulti(c, keys, items) + if err != nil { + c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) + panic(err) + } + } +} + +func runStorageVertex(c appengine.Context, in <-chan []interface{}, x int, loc string) { + var keys []*datastore.Key + var items []Vertex + + for batch := range in { + c.Infof("Storage %d: Storing Vertex chunk", x) + keys = make([]*datastore.Key, len(batch)) + items = make([]Vertex, len(batch)) + for i, item := range batch { + x := item.(Vertex) + keys[i] = datastore.NewKey(c, loc, x.Type+":"+x.Value, 0, nil) + items[i] = x + } + + // c.Infof("Storage: Storing %v", keys) + _, err := datastore.PutMulti(c, keys, items) + if err != nil { + c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) + panic(err) + } + } +} + +func runStorageEdge(c appengine.Context, in <-chan []interface{}, x int, loc string) { + var keys []*datastore.Key + var items []Edge + + for batch := range in { + c.Infof("Storage %d: 
Storing Edge chunk", x) + keys = make([]*datastore.Key, len(batch)) + items = make([]Edge, len(batch)) + for i, item := range batch { + x := item.(Edge) + keys[i] = datastore.NewIncompleteKey(c, loc, nil) + items[i] = x } // c.Infof("Storage: Storing %v", keys) _, err := datastore.PutMulti(c, keys, items) if err != nil { - c.Errorf("Storage: Error storing batch %s", err) + c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) + panic(err) } } } diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go index 2a3a21d..7cf4168 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -2,11 +2,40 @@ package crawler import ( "encoding/xml" + "fmt" + "net/http" "sync" "appengine" ) +// UnPage unpage and flatten data from Crawling +func UnPage(w http.ResponseWriter, r *http.Request) { + c := appengine.NewContext(r) + + pages := Puller(c, XML) + posts := UnPager(c, pages, 10) + + vertexes, edges := Miner(c, posts, 30) + + vbatch := Batcher(c, vertexes, 100) + ebatch := Batcher(c, edges, 100) + + var wg sync.WaitGroup + wg.Add(2) + go func() { + Storage(c, vbatch, 10, "vertex") + wg.Done() + }() + go func() { + Storage(c, ebatch, 10, "edge") + wg.Done() + }() + wg.Wait() + + fmt.Fprintf(w, "Done") +} + // UnPager process pages of posts to individual posts func UnPager(c appengine.Context, pages <-chan string, workers int) <-chan Data { res := make(chan Data, 100000) @@ -37,7 +66,7 @@ func runUnPager(c appengine.Context, in <-chan string, out chan<- Data, idx int) } for page := range in { - c.Infof("UnPager %d: Retrieved Page", idx) + // c.Infof("UnPager %d: Retrieved Page", idx) if err := xml.Unmarshal([]byte(page), &miner); err != nil { c.Errorf("UnPager: Error %s", err) diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 8e322ce..ca203ab 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -20,8 +20,11 @@ const ( // DEPTH depth of feed mining DEPTH = 1 - // XML name of where xml posts are stored + // XML name of where xml posts pages are stored XML = "xml" + + // POST name of where xml posts are stored + POST = "post" ) type Data struct { @@ -38,7 +41,7 @@ func Crawl(w http.ResponseWriter, r *http.Request) { pages := Fetcher(c, fetchers) // posts := UnPager(c, pages, pagers) batch := Batcher(c, pages, 10) - Storage(c, batch, storers) + Storage(c, batch, storers, XML) fmt.Fprint(w, "Crawl Complete!") } diff --git a/app/cron/cron.go b/app/cron/cron.go index 4bc1439..4612c8f 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -35,15 +35,39 @@ const ( DEFERRED = true ) +func cleanup(c appengine.Context, name string) error { + q := datastore.NewQuery(name).KeysOnly() + keys, err := q.GetAll(c, nil) + s := 100 + for len(keys) > 0 { + if len(keys) < 100 { + s = len(keys) + } + err = datastore.DeleteMulti(c, keys[:s]) + keys = keys[s:] + } + return err +} + // Init initializes cron handlers func Init() { - http.HandleFunc("/cron/crawl", crawler.Crawl) + http.HandleFunc("/cron/stage/1", crawler.Crawl) + http.HandleFunc("/cron/stage/2", crawler.UnPage) + http.HandleFunc("/cron/stage/2/clean", func(w http.ResponseWriter, r *http.Request) { + c := appengine.NewContext(r) + cleanup(c, "edge") + cleanup(c, "vertex") + cleanup(c, "post") + }) http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) - http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) - http.HandleFunc("/cron/delete", delete) + // http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) + // http.HandleFunc("/cron/delete", delete) 
http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { - fmt.Fprintf(w, "Here boys") + fmt.Fprintf(w, "Start") + }) + http.HandleFunc("/_ah/stop", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, "Stop") }) } From ac6a52f0ccb9de2fea1740a08f830987b0058661 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sat, 14 Nov 2015 17:02:57 -0700 Subject: [PATCH 11/26] Finding tags based on response data --- app/cron/chain/chain.go | 51 +++ app/cron/crawler/Miner.go | 76 ---- app/cron/crawler/Storage.go | 77 ---- app/cron/crawler/UnPager.go | 83 ---- app/cron/cron.go | 761 ++++++++++++++++++------------------ app/cron/proj/graph.go | 28 ++ app/cron/proj/proj.go | 118 ++++++ app/cron/proj/tags.go | 87 +++++ yaml/module-cron.yaml | 4 + 9 files changed, 667 insertions(+), 618 deletions(-) create mode 100644 app/cron/chain/chain.go delete mode 100644 app/cron/crawler/Miner.go delete mode 100644 app/cron/crawler/UnPager.go create mode 100644 app/cron/proj/graph.go create mode 100644 app/cron/proj/proj.go create mode 100644 app/cron/proj/tags.go diff --git a/app/cron/chain/chain.go b/app/cron/chain/chain.go new file mode 100644 index 0000000..67b235a --- /dev/null +++ b/app/cron/chain/chain.go @@ -0,0 +1,51 @@ +package chain + +import "sync" + +// Worker is a function designed to fan out and perform work on a piece of Data +type Worker func(in <-chan interface{}, out chan<- interface{}, idx int) + +// FanOut allows lengthy workers to fan out on chanel operations +func FanOut(count int, buff int, in <-chan interface{}, doIt Worker) <-chan interface{} { + out := make(chan interface{}, buff) + var wg sync.WaitGroup + wg.Add(count) + for i := 0; i < count; i++ { + go func(idx int) { + doIt(in, out, idx) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(out) + }() + return out +} + +// FanIn takes multiple chanels and pushes their results into a single channel +func FanIn(buff int, cs ...<-chan interface{}) <-chan interface{} { + var wg sync.WaitGroup + out := make(chan interface{}) + + // Start an output goroutine for each input channel in cs. output + // copies values from c to out until c is closed, then calls wg.Done. + output := func(c <-chan interface{}) { + for n := range c { + out <- n + } + wg.Done() + } + wg.Add(len(cs)) + for _, c := range cs { + go output(c) + } + + // Start a goroutine to close out once all the output goroutines are + // done. This must start after the wg.Add call. 
+ go func() { + wg.Wait() + close(out) + }() + return out +} diff --git a/app/cron/crawler/Miner.go b/app/cron/crawler/Miner.go deleted file mode 100644 index 3a8133b..0000000 --- a/app/cron/crawler/Miner.go +++ /dev/null @@ -1,76 +0,0 @@ -package crawler - -import ( - "encoding/xml" - "sync" - "time" - - "appengine" -) - -// Vertex of the graph -type Vertex struct { - Type string - Value string - Count int64 -} - -// Edge of the graph -type Edge struct { - Nodes []string -} - -// Miner takes posts and mines out a graph -func Miner(c appengine.Context, posts <-chan Data, workers int) (<-chan interface{}, <-chan interface{}) { - vertexes := make(chan interface{}, 10000) - edges := make(chan interface{}, 10000) - - var wg sync.WaitGroup - for i := 0; i < workers; i++ { - go func(i int) { - miner(c, posts, vertexes, edges, i) - wg.Done() - }(i) - } - wg.Add(workers) - - go func() { - wg.Wait() - close(vertexes) - close(edges) - }() - return vertexes, edges -} - -func miner(c appengine.Context, posts <-chan Data, vertexes chan<- interface{}, edges chan<- interface{}, i int) { - var data struct { - Tags []string `xml:"category"` - Imgs []struct { - URL string `xml:"url,attr"` - } `xml:"content"` - } - - for post := range posts { - vertexes <- Vertex{"Pst", post.KEY, 0} - - // log.Printf("Miner %d: Got Post: %s", i, post.KEY) - // log.Printf("Data: %s", post.XML) - - if err := xml.Unmarshal([]byte(""+post.XML+""), &data); err != nil { - c.Errorf("Miner %d: Error %s", i, err) - } - - for _, tag := range data.Tags { - // log.Printf("Found Tag: %s", tag) - vertexes <- Vertex{"Tag", tag, 0} - edges <- Edge{[]string{"Tag" + tag, "Pst" + post.KEY}} - } - - for _, img := range data.Imgs { - // log.Printf("Found Img: %s", img.URL) - vertexes <- Vertex{"Img", img.URL, 0} - edges <- Edge{[]string{"Img" + img.URL, "Pst" + post.KEY}} - } - time.Sleep(time.Second) - } -} diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go index 2e803c6..a527a16 100644 --- a/app/cron/crawler/Storage.go +++ b/app/cron/crawler/Storage.go @@ -14,10 +14,6 @@ func Storage(c appengine.Context, in <-chan []interface{}, workers int, loc stri switch loc { case XML: store = runStorageData - case "vertex": - store = runStorageVertex - case "edge": - store = runStorageEdge } var wg sync.WaitGroup @@ -31,33 +27,6 @@ func Storage(c appengine.Context, in <-chan []interface{}, workers int, loc stri wg.Wait() } -// Puller pull items from datastore -// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) -func Puller(c appengine.Context, loc string) <-chan string { - out := make(chan string, 10000) - - go func() { - defer close(out) - q := datastore.NewQuery(loc) - t := q.Run(c) - for { - var s Store - _, err := t.Next(&s) - if err == datastore.Done { - break // No further entities match the query. 
- } - if err != nil { - c.Errorf("fetching next Person: %v", err) - break - } - - // Do something with Person p and Key k - out <- string(s.XML) - } - }() - return out -} - // Store single xml item to put in storage type Store struct { XML []byte @@ -85,49 +54,3 @@ func runStorageData(c appengine.Context, in <-chan []interface{}, x int, loc str } } } - -func runStorageVertex(c appengine.Context, in <-chan []interface{}, x int, loc string) { - var keys []*datastore.Key - var items []Vertex - - for batch := range in { - c.Infof("Storage %d: Storing Vertex chunk", x) - keys = make([]*datastore.Key, len(batch)) - items = make([]Vertex, len(batch)) - for i, item := range batch { - x := item.(Vertex) - keys[i] = datastore.NewKey(c, loc, x.Type+":"+x.Value, 0, nil) - items[i] = x - } - - // c.Infof("Storage: Storing %v", keys) - _, err := datastore.PutMulti(c, keys, items) - if err != nil { - c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) - panic(err) - } - } -} - -func runStorageEdge(c appengine.Context, in <-chan []interface{}, x int, loc string) { - var keys []*datastore.Key - var items []Edge - - for batch := range in { - c.Infof("Storage %d: Storing Edge chunk", x) - keys = make([]*datastore.Key, len(batch)) - items = make([]Edge, len(batch)) - for i, item := range batch { - x := item.(Edge) - keys[i] = datastore.NewIncompleteKey(c, loc, nil) - items[i] = x - } - - // c.Infof("Storage: Storing %v", keys) - _, err := datastore.PutMulti(c, keys, items) - if err != nil { - c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) - panic(err) - } - } -} diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go deleted file mode 100644 index 7cf4168..0000000 --- a/app/cron/crawler/UnPager.go +++ /dev/null @@ -1,83 +0,0 @@ -package crawler - -import ( - "encoding/xml" - "fmt" - "net/http" - "sync" - - "appengine" -) - -// UnPage unpage and flatten data from Crawling -func UnPage(w http.ResponseWriter, r *http.Request) { - c := appengine.NewContext(r) - - pages := Puller(c, XML) - posts := UnPager(c, pages, 10) - - vertexes, edges := Miner(c, posts, 30) - - vbatch := Batcher(c, vertexes, 100) - ebatch := Batcher(c, edges, 100) - - var wg sync.WaitGroup - wg.Add(2) - go func() { - Storage(c, vbatch, 10, "vertex") - wg.Done() - }() - go func() { - Storage(c, ebatch, 10, "edge") - wg.Done() - }() - wg.Wait() - - fmt.Fprintf(w, "Done") -} - -// UnPager process pages of posts to individual posts -func UnPager(c appengine.Context, pages <-chan string, workers int) <-chan Data { - res := make(chan Data, 100000) - - // TODO: spin up as many unpages as desired - var wg sync.WaitGroup - wg.Add(workers) - for i := 0; i < workers; i++ { - go func(x int) { - runUnPager(c, pages, res, x) - wg.Done() - }(i) - } - go func() { - wg.Wait() - close(res) - }() - - return res -} - -func runUnPager(c appengine.Context, in <-chan string, out chan<- Data, idx int) { - var miner struct { - Item []struct { - KEY string `xml:"guid"` - XML string `xml:",innerxml"` - } `xml:"channel>item"` - } - - for page := range in { - // c.Infof("UnPager %d: Retrieved Page", idx) - - if err := xml.Unmarshal([]byte(page), &miner); err != nil { - c.Errorf("UnPager: Error %s", err) - } - - for _, post := range miner.Item { - // c.Infof("UnPager: Found Post %s", post.KEY) - out <- Data{ - KEY: post.KEY, - XML: post.XML, - } - } - } -} diff --git a/app/cron/cron.go b/app/cron/cron.go index 4612c8f..ddeb2ff 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -1,41 +1,34 @@ package cron import ( - "encoding/xml" 
"fmt" "net/http" - "net/url" - "regexp" - "strconv" "github.com/bign8/chive-show/app/cron/crawler" - "github.com/bign8/chive-show/app/helpers/keycache" - "github.com/bign8/chive-show/app/models" + "github.com/bign8/chive-show/app/cron/proj" "gopkg.in/mjibson/v1/appstats" "appengine" "appengine/datastore" - "appengine/delay" - "appengine/taskqueue" - "appengine/urlfetch" ) -const ( - // SIZE of a batch - SIZE = 10 - - // DEBUG enable if troubleshooting algorithm - DEBUG = true - - // DEPTH depth of feed mining - DEPTH = 1 - - // DEFERRED if deferreds should be processed deferred - DEFERRED = true -) +// const ( +// // SIZE of a batch +// SIZE = 10 +// +// // DEBUG enable if troubleshooting algorithm +// DEBUG = true +// +// // DEPTH depth of feed mining +// DEPTH = 1 +// +// // DEFERRED if deferreds should be processed deferred +// DEFERRED = true +// ) func cleanup(c appengine.Context, name string) error { + c.Infof("Cleaning %s", name) q := datastore.NewQuery(name).KeysOnly() keys, err := q.GetAll(c, nil) s := 100 @@ -52,13 +45,17 @@ func cleanup(c appengine.Context, name string) error { // Init initializes cron handlers func Init() { http.HandleFunc("/cron/stage/1", crawler.Crawl) - http.HandleFunc("/cron/stage/2", crawler.UnPage) - http.HandleFunc("/cron/stage/2/clean", func(w http.ResponseWriter, r *http.Request) { + + http.Handle("/proj/tags", appstats.NewHandler(proj.Tags)) + + http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) + cleanup(c, "buff") cleanup(c, "edge") cleanup(c, "vertex") cleanup(c, "post") }) + http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) // http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) @@ -71,361 +68,361 @@ func Init() { }) } -var ( - // ErrFeedParse404 if feed page is not found - ErrFeedParse404 = fmt.Errorf("Feed parcing recieved a %d Status Code", 404) -) - -func pageURL(idx int) string { - return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) -} - -func parseFeeds(c appengine.Context, w http.ResponseWriter, r *http.Request) { - fp := new(feedParser) - err := fp.Main(c, w) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - } else { - fmt.Fprint(w, "Parsed") - } -} - -type feedParser struct { - context appengine.Context - client *http.Client - - todo []int - guids map[int64]bool // this could be extremely large - posts []models.Post -} - -func (x *feedParser) Main(c appengine.Context, w http.ResponseWriter) error { - x.context = c - x.client = urlfetch.Client(c) - - // Load guids from DB - // TODO: do this with sharded keys - keys, err := datastore.NewQuery(models.POST).KeysOnly().GetAll(c, nil) - if err != nil { - c.Errorf("Error finding keys %v %v", err, appengine.IsOverQuota(err)) - return err - } - x.guids = map[int64]bool{} - for _, key := range keys { - x.guids[key.IntID()] = true - } - keys = nil - - // // DEBUG ONLY - // data, err := json.MarshalIndent(x.guids, "", " ") - // fmt.Fprint(w, string(data)) - // return err - x.posts = make([]models.Post, 0) - - // Initial recursive edge case - isStop, fullStop, err := x.isStop(1) - if isStop || fullStop || err != nil { - c.Infof("Finished without recursive searching %v", err) - if err == nil { - err = x.storePosts(x.posts) - } - return err - } - - // Recursive search strategy - err = x.Search(1, -1) - - // storePosts and processTodo - if err == nil { - errc := make(chan error) - go func() { - errc <- x.storePosts(x.posts) - }() - go func() { - errc <- x.processTodo() - }() - err1, err2 := 
<-errc, <-errc - if err1 != nil { - err = err1 - } else if err2 != nil { - err = err2 - } - } - - if err != nil { - c.Errorf("Error in Main %v", err) - } - return err -} - -var processBatchDeferred = delay.Func("process-todo-batch", func(c appengine.Context, ids []int) { - parser := feedParser{ - context: c, - client: urlfetch.Client(c), - } - parser.processBatch(ids) -}) - -func (x *feedParser) processBatch(ids []int) error { - done := make(chan error) - for _, idx := range ids { - go func(idx int) { - posts, err := x.getAndParseFeed(idx) - if err == nil { - err = x.storePosts(posts) - } - done <- err - }(idx) - } - for i := 0; i < len(ids); i++ { - err := <-done - if err != nil { - x.context.Errorf("error storing feed (at index %d): %v", i, err) - return err - } - } - return nil -} - -func (x *feedParser) processTodo() error { - x.context.Infof("Processing TODO: %v", x.todo) - - var batch []int - var task *taskqueue.Task - var allTasks []*taskqueue.Task - var err error - for _, idx := range x.todo { - if batch == nil { - batch = make([]int, 0) - } - batch = append(batch, idx) - if len(batch) >= SIZE { - if DEFERRED { - task, err = processBatchDeferred.Task(batch) - if err == nil { - allTasks = append(allTasks, task) - } - } else { - err = x.processBatch(batch) - } - if err != nil { - return err - } - batch = nil - } - } - if len(batch) > 0 { - if DEFERRED { - task, err = processBatchDeferred.Task(batch) - if err == nil { - allTasks = append(allTasks, task) - } - } else { - err = x.processBatch(batch) - } - } - if DEFERRED && len(allTasks) > 0 { - x.context.Infof("Adding %d task(s) to the default queue", len(allTasks)) - taskqueue.AddMulti(x.context, allTasks, "default") - } - return err -} - -func (x *feedParser) addRange(bottom, top int) { - for i := bottom + 1; i < top; i++ { - x.todo = append(x.todo, i) - } -} - -func (x *feedParser) Search(bottom, top int) (err error) { - /* - def infinite_length(bottom=1, top=-1): - if bottom == 1 and not item_exists(1): return 0 # Starting edge case - if bottom == top - 1: return bottom # Result found! (top doesn’t exist) - if top < 0: # Searching forward - top = bottom << 1 # Base 2 hops - if item_exists(top): - top, bottom = -1, top # continue searching forward - else: # Binary search between bottom and top - middle = (bottom + top) // 2 - bottom, top = middle, top if item_exists(middle) else bottom, middle - return infinite_length(bottom, top) # Tail recursion!!! - */ - if bottom == top-1 { - x.context.Infof("TOP OF RANGE FOUND! @%d", top) - x.addRange(bottom, top) - return nil - } - var fullStop, isStop bool = false, false - if top < 0 { // Searching forward - top = bottom << 1 // Base 2 hops forward - isStop, fullStop, err = x.isStop(top) - if err != nil { - return err - } - if !isStop { - x.addRange(bottom, top) - top, bottom = -1, top - } - } else { // Binary search between top and bottom - middle := (bottom + top) / 2 - isStop, fullStop, err = x.isStop(middle) - if err != nil { - return err - } - if isStop { - top = middle - } else { - x.addRange(bottom, middle) - bottom = middle - } - } - if fullStop { - return nil - } - return x.Search(bottom, top) // TAIL RECURSION!!! 
-} - -func (x *feedParser) isStop(idx int) (isStop, fullStop bool, err error) { - // Gather posts as necessary - posts, err := x.getAndParseFeed(idx) - if err == ErrFeedParse404 { - x.context.Infof("Reached the end of the feed list (%v)", idx) - return true, false, nil - } - if err != nil { - x.context.Errorf("Error decoding ChiveFeed: %s", err) - return false, false, err - } - - // Check for Duplicates - count := 0 - for _, post := range posts { - id, _, err := guidToInt(post.GUID) - if x.guids[id] || err != nil { - continue - } - count++ - } - x.posts = append(x.posts, posts...) - - // Use store_count info to determine if isStop - isStop = count == 0 || DEBUG - fullStop = len(posts) != count && count > 0 - if DEBUG { - isStop = idx > DEPTH - fullStop = idx == DEPTH - } - return -} - -func (x *feedParser) getAndParseFeed(idx int) ([]models.Post, error) { - url := pageURL(idx) - - // Get Response - x.context.Infof("Parsing index %v (%v)", idx, url) - resp, err := x.client.Get(url) - if err != nil { - return nil, err - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - if resp.StatusCode == 404 { - return nil, ErrFeedParse404 - } - return nil, fmt.Errorf("Feed parcing recieved a %d Status Code", resp.StatusCode) - } - - // Decode Response - decoder := xml.NewDecoder(resp.Body) - var feed struct { - Items []models.Post `xml:"channel>item"` - } - if decoder.Decode(&feed) != nil { - return nil, err - } - - // Cleanup Response - for idx := range feed.Items { - post := &feed.Items[idx] - for i, img := range post.Media { - post.Media[i].URL = stripQuery(img.URL) - } - post.MugShot = post.Media[0].URL - post.Media = post.Media[1:] - } - return feed.Items, err -} - -func (x *feedParser) storePosts(dirty []models.Post) (err error) { - var posts []models.Post - var keys []*datastore.Key - for _, post := range dirty { - key, err := x.cleanPost(&post) - if err != nil { - continue - } - posts = append(posts, post) - keys = append(keys, key) - } - if len(keys) > 0 { - complete, err := datastore.PutMulti(x.context, keys, posts) - if err == nil { - err = keycache.AddKeys(x.context, models.POST, complete) - } - } - return err -} - -func (x *feedParser) cleanPost(p *models.Post) (*datastore.Key, error) { - id, link, err := guidToInt(p.GUID) - if err != nil { - return nil, err - } - // Remove link posts - if link { - x.context.Infof("Ignoring links post %v \"%v\"", p.GUID, p.Title) - return nil, fmt.Errorf("Ignoring links post") - } - - // Detect video only posts - video := regexp.MustCompile("\\([^&]*Video.*\\)") - if video.MatchString(p.Title) { - x.context.Infof("Ignoring video post %v \"%v\"", p.GUID, p.Title) - return nil, fmt.Errorf("Ignoring video post") - } - x.context.Infof("Storing post %v \"%v\"", p.GUID, p.Title) - - // Cleanup post titles - clean := regexp.MustCompile("\\W\\(([^\\)]*)\\)$") - p.Title = clean.ReplaceAllLiteralString(p.Title, "") - - // Post - // temp_key := datastore.NewIncompleteKey(x.context, DB_POST_TABLE, nil) - key := datastore.NewKey(x.context, models.POST, "", id, nil) - return key, nil -} - -func guidToInt(guid string) (int64, bool, error) { - // Remove link posts - url, err := url.Parse(guid) - if err != nil { - return -1, false, err - } - - // Parsing post id from guid url - id, err := strconv.Atoi(url.Query().Get("p")) - if err != nil { - return -1, false, err - } - return int64(id), url.Query().Get("post_type") == "sdac_links", nil -} - -func stripQuery(dirty string) string { - obj, err := url.Parse(dirty) - if err != nil { - return dirty - } - obj.RawQuery = 
"" - return obj.String() -} +// var ( +// // ErrFeedParse404 if feed page is not found +// ErrFeedParse404 = fmt.Errorf("Feed parcing recieved a %d Status Code", 404) +// ) +// +// func pageURL(idx int) string { +// return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) +// } +// +// func parseFeeds(c appengine.Context, w http.ResponseWriter, r *http.Request) { +// fp := new(feedParser) +// err := fp.Main(c, w) +// if err != nil { +// http.Error(w, err.Error(), http.StatusInternalServerError) +// } else { +// fmt.Fprint(w, "Parsed") +// } +// } +// +// type feedParser struct { +// context appengine.Context +// client *http.Client +// +// todo []int +// guids map[int64]bool // this could be extremely large +// posts []models.Post +// } +// +// func (x *feedParser) Main(c appengine.Context, w http.ResponseWriter) error { +// x.context = c +// x.client = urlfetch.Client(c) +// +// // Load guids from DB +// // TODO: do this with sharded keys +// keys, err := datastore.NewQuery(models.POST).KeysOnly().GetAll(c, nil) +// if err != nil { +// c.Errorf("Error finding keys %v %v", err, appengine.IsOverQuota(err)) +// return err +// } +// x.guids = map[int64]bool{} +// for _, key := range keys { +// x.guids[key.IntID()] = true +// } +// keys = nil +// +// // // DEBUG ONLY +// // data, err := json.MarshalIndent(x.guids, "", " ") +// // fmt.Fprint(w, string(data)) +// // return err +// x.posts = make([]models.Post, 0) +// +// // Initial recursive edge case +// isStop, fullStop, err := x.isStop(1) +// if isStop || fullStop || err != nil { +// c.Infof("Finished without recursive searching %v", err) +// if err == nil { +// err = x.storePosts(x.posts) +// } +// return err +// } +// +// // Recursive search strategy +// err = x.Search(1, -1) +// +// // storePosts and processTodo +// if err == nil { +// errc := make(chan error) +// go func() { +// errc <- x.storePosts(x.posts) +// }() +// go func() { +// errc <- x.processTodo() +// }() +// err1, err2 := <-errc, <-errc +// if err1 != nil { +// err = err1 +// } else if err2 != nil { +// err = err2 +// } +// } +// +// if err != nil { +// c.Errorf("Error in Main %v", err) +// } +// return err +// } +// +// var processBatchDeferred = delay.Func("process-todo-batch", func(c appengine.Context, ids []int) { +// parser := feedParser{ +// context: c, +// client: urlfetch.Client(c), +// } +// parser.processBatch(ids) +// }) +// +// func (x *feedParser) processBatch(ids []int) error { +// done := make(chan error) +// for _, idx := range ids { +// go func(idx int) { +// posts, err := x.getAndParseFeed(idx) +// if err == nil { +// err = x.storePosts(posts) +// } +// done <- err +// }(idx) +// } +// for i := 0; i < len(ids); i++ { +// err := <-done +// if err != nil { +// x.context.Errorf("error storing feed (at index %d): %v", i, err) +// return err +// } +// } +// return nil +// } +// +// func (x *feedParser) processTodo() error { +// x.context.Infof("Processing TODO: %v", x.todo) +// +// var batch []int +// var task *taskqueue.Task +// var allTasks []*taskqueue.Task +// var err error +// for _, idx := range x.todo { +// if batch == nil { +// batch = make([]int, 0) +// } +// batch = append(batch, idx) +// if len(batch) >= SIZE { +// if DEFERRED { +// task, err = processBatchDeferred.Task(batch) +// if err == nil { +// allTasks = append(allTasks, task) +// } +// } else { +// err = x.processBatch(batch) +// } +// if err != nil { +// return err +// } +// batch = nil +// } +// } +// if len(batch) > 0 { +// if DEFERRED { +// task, err = processBatchDeferred.Task(batch) 
+// if err == nil { +// allTasks = append(allTasks, task) +// } +// } else { +// err = x.processBatch(batch) +// } +// } +// if DEFERRED && len(allTasks) > 0 { +// x.context.Infof("Adding %d task(s) to the default queue", len(allTasks)) +// taskqueue.AddMulti(x.context, allTasks, "default") +// } +// return err +// } +// +// func (x *feedParser) addRange(bottom, top int) { +// for i := bottom + 1; i < top; i++ { +// x.todo = append(x.todo, i) +// } +// } +// +// func (x *feedParser) Search(bottom, top int) (err error) { +// /* +// def infinite_length(bottom=1, top=-1): +// if bottom == 1 and not item_exists(1): return 0 # Starting edge case +// if bottom == top - 1: return bottom # Result found! (top doesn’t exist) +// if top < 0: # Searching forward +// top = bottom << 1 # Base 2 hops +// if item_exists(top): +// top, bottom = -1, top # continue searching forward +// else: # Binary search between bottom and top +// middle = (bottom + top) // 2 +// bottom, top = middle, top if item_exists(middle) else bottom, middle +// return infinite_length(bottom, top) # Tail recursion!!! +// */ +// if bottom == top-1 { +// x.context.Infof("TOP OF RANGE FOUND! @%d", top) +// x.addRange(bottom, top) +// return nil +// } +// var fullStop, isStop bool = false, false +// if top < 0 { // Searching forward +// top = bottom << 1 // Base 2 hops forward +// isStop, fullStop, err = x.isStop(top) +// if err != nil { +// return err +// } +// if !isStop { +// x.addRange(bottom, top) +// top, bottom = -1, top +// } +// } else { // Binary search between top and bottom +// middle := (bottom + top) / 2 +// isStop, fullStop, err = x.isStop(middle) +// if err != nil { +// return err +// } +// if isStop { +// top = middle +// } else { +// x.addRange(bottom, middle) +// bottom = middle +// } +// } +// if fullStop { +// return nil +// } +// return x.Search(bottom, top) // TAIL RECURSION!!! +// } +// +// func (x *feedParser) isStop(idx int) (isStop, fullStop bool, err error) { +// // Gather posts as necessary +// posts, err := x.getAndParseFeed(idx) +// if err == ErrFeedParse404 { +// x.context.Infof("Reached the end of the feed list (%v)", idx) +// return true, false, nil +// } +// if err != nil { +// x.context.Errorf("Error decoding ChiveFeed: %s", err) +// return false, false, err +// } +// +// // Check for Duplicates +// count := 0 +// for _, post := range posts { +// id, _, err := guidToInt(post.GUID) +// if x.guids[id] || err != nil { +// continue +// } +// count++ +// } +// x.posts = append(x.posts, posts...) 
+// +// // Use store_count info to determine if isStop +// isStop = count == 0 || DEBUG +// fullStop = len(posts) != count && count > 0 +// if DEBUG { +// isStop = idx > DEPTH +// fullStop = idx == DEPTH +// } +// return +// } +// +// func (x *feedParser) getAndParseFeed(idx int) ([]models.Post, error) { +// url := pageURL(idx) +// +// // Get Response +// x.context.Infof("Parsing index %v (%v)", idx, url) +// resp, err := x.client.Get(url) +// if err != nil { +// return nil, err +// } +// defer resp.Body.Close() +// if resp.StatusCode != 200 { +// if resp.StatusCode == 404 { +// return nil, ErrFeedParse404 +// } +// return nil, fmt.Errorf("Feed parcing recieved a %d Status Code", resp.StatusCode) +// } +// +// // Decode Response +// decoder := xml.NewDecoder(resp.Body) +// var feed struct { +// Items []models.Post `xml:"channel>item"` +// } +// if decoder.Decode(&feed) != nil { +// return nil, err +// } +// +// // Cleanup Response +// for idx := range feed.Items { +// post := &feed.Items[idx] +// for i, img := range post.Media { +// post.Media[i].URL = stripQuery(img.URL) +// } +// post.MugShot = post.Media[0].URL +// post.Media = post.Media[1:] +// } +// return feed.Items, err +// } +// +// func (x *feedParser) storePosts(dirty []models.Post) (err error) { +// var posts []models.Post +// var keys []*datastore.Key +// for _, post := range dirty { +// key, err := x.cleanPost(&post) +// if err != nil { +// continue +// } +// posts = append(posts, post) +// keys = append(keys, key) +// } +// if len(keys) > 0 { +// complete, err := datastore.PutMulti(x.context, keys, posts) +// if err == nil { +// err = keycache.AddKeys(x.context, models.POST, complete) +// } +// } +// return err +// } +// +// func (x *feedParser) cleanPost(p *models.Post) (*datastore.Key, error) { +// id, link, err := guidToInt(p.GUID) +// if err != nil { +// return nil, err +// } +// // Remove link posts +// if link { +// x.context.Infof("Ignoring links post %v \"%v\"", p.GUID, p.Title) +// return nil, fmt.Errorf("Ignoring links post") +// } +// +// // Detect video only posts +// video := regexp.MustCompile("\\([^&]*Video.*\\)") +// if video.MatchString(p.Title) { +// x.context.Infof("Ignoring video post %v \"%v\"", p.GUID, p.Title) +// return nil, fmt.Errorf("Ignoring video post") +// } +// x.context.Infof("Storing post %v \"%v\"", p.GUID, p.Title) +// +// // Cleanup post titles +// clean := regexp.MustCompile("\\W\\(([^\\)]*)\\)$") +// p.Title = clean.ReplaceAllLiteralString(p.Title, "") +// +// // Post +// // temp_key := datastore.NewIncompleteKey(x.context, DB_POST_TABLE, nil) +// key := datastore.NewKey(x.context, models.POST, "", id, nil) +// return key, nil +// } +// +// func guidToInt(guid string) (int64, bool, error) { +// // Remove link posts +// url, err := url.Parse(guid) +// if err != nil { +// return -1, false, err +// } +// +// // Parsing post id from guid url +// id, err := strconv.Atoi(url.Query().Get("p")) +// if err != nil { +// return -1, false, err +// } +// return int64(id), url.Query().Get("post_type") == "sdac_links", nil +// } +// +// func stripQuery(dirty string) string { +// obj, err := url.Parse(dirty) +// if err != nil { +// return dirty +// } +// obj.RawQuery = "" +// return obj.String() +// } diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go new file mode 100644 index 0000000..44b211b --- /dev/null +++ b/app/cron/proj/graph.go @@ -0,0 +1,28 @@ +package proj + +import ( + "net/http" + "time" + + "appengine" +) + +// Graph processes all posts in attempt to create a graph +func Graph(c 
appengine.Context, w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // pages := puller(c) + // dirtyTags := getNod(c, pages, 100) + // tags := cleaner(dirtyTags) + // + // found := map[string]int64{} + // for tag := range tags { + // found[tag]++ + // } + // + // for key, value := range found { + // fmt.Fprintf(w, "%s,%d\n", key, value) + // } + + c.Infof("Time took: %v", time.Since(start)) +} diff --git a/app/cron/proj/proj.go b/app/cron/proj/proj.go new file mode 100644 index 0000000..5ad4d23 --- /dev/null +++ b/app/cron/proj/proj.go @@ -0,0 +1,118 @@ +package proj + +import ( + "encoding/xml" + "log" + "sync" + + "appengine" + "appengine/datastore" + + "github.com/bign8/chive-show/app/cron/chain" + "github.com/bign8/chive-show/app/cron/crawler" +) + +// XMLPage xml processor for a page +type XMLPage struct { + Items []struct { + GUID string `xml:"guid"` + Tags []string `xml:"category"` + Imgs []struct { + URL string `xml:"url,attr"` + } `xml:"content"` + } `xml:"channel>item"` +} + +// Item is a post item +type Item struct { + GUID string + Tags []string + Imgs []string +} + +// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) +func puller(c appengine.Context) <-chan []byte { + out := make(chan []byte, 10000) + + go func() { + defer close(out) + q := datastore.NewQuery(crawler.XML) + t := q.Run(c) + for { + var s crawler.Store + _, err := t.Next(&s) + if err == datastore.Done { + break // No further entities match the query. + } + if err != nil { + c.Errorf("fetching next Person: %v", err) + break + } + out <- s.XML + } + }() + return out +} + +func flatten(c appengine.Context, in <-chan []byte) <-chan Item { + const WORKERS = 100 + out := make(chan Item, 10000) + var wg sync.WaitGroup + wg.Add(WORKERS) + for i := 0; i < WORKERS; i++ { + go func(idx int) { + flattenWorker(c, in, out, idx) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(out) + }() + return out +} + +func flattenWorker(c appengine.Context, in <-chan []byte, out chan<- Item, idx int) { + var xmlPage XMLPage + var imgs []string + + for data := range in { + if err := xml.Unmarshal(data, &xmlPage); err != nil { + c.Errorf("Flatten %d: %v", idx, err) + continue + } + for _, item := range xmlPage.Items { + imgs = make([]string, len(item.Imgs)) + for i, img := range item.Imgs { + imgs[i] = img.URL + } + + out <- Item{ + GUID: item.GUID, + Tags: item.Tags, + Imgs: imgs, + } + } + } +} + +func doMagic() { + start := make(chan interface{}, 10) + out := chain.FanOut(10, 10, start, worker) + go func() { + for o := range out { + log.Printf("Something: %v", o) + } + }() + start <- 1 + start <- 2 + start <- 3 +} + +func worker(in <-chan interface{}, out chan<- interface{}, idx int) { + var bytes []byte + for x := range in { + bytes = x.([]byte) + out <- bytes + } +} diff --git a/app/cron/proj/tags.go b/app/cron/proj/tags.go new file mode 100644 index 0000000..337d884 --- /dev/null +++ b/app/cron/proj/tags.go @@ -0,0 +1,87 @@ +package proj + +import ( + "encoding/xml" + "fmt" + "net/http" + "strings" + "sync" + "time" + + "appengine" +) + +// Tags etrieves the tags from the dataset +func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { + start := time.Now() + + pages := puller(c) + dirtyTags := getTags(c, pages, 100) + tags := cleaner(dirtyTags) + + found := map[string]int64{} + for tag := range tags { + found[tag]++ + } + + for key, value := range found { + fmt.Fprintf(w, "%s,%d\n", key, value) + } + + c.Infof("Time took: %v", time.Since(start)) +} + +func 
getTags(c appengine.Context, in <-chan []byte, workers int) <-chan string { + out := make(chan string, 10000) + var wg sync.WaitGroup + wg.Add(workers) + for i := 0; i < workers; i++ { + go func(idx int) { + tags(c, in, out, idx) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(out) + }() + return out +} + +func tags(c appengine.Context, in <-chan []byte, out chan<- string, idx int) { + var xmlPage = XMLPage{} + + for data := range in { + if err := xml.Unmarshal(data, &xmlPage); err != nil { + c.Errorf("Miner %d: Error %s", idx, err) + continue + } + + for _, item := range xmlPage.Items { + for _, tag := range item.Tags { + out <- tag + } + } + } +} + +func cleaner(in <-chan string) <-chan string { + // http://xpo6.com/list-of-english-stop-words/ + var stopWords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" + var stops = map[string]bool{} + for _, s := range strings.Split(stopWords, ",") { + stops[s] = true + } + + out := make(chan string, 10000) + go func() { + for s := range in { + s = strings.ToLower(s) + if !stops[s] { + out <- s + } + } + close(out) + }() + return out +} diff --git a/yaml/module-cron.yaml b/yaml/module-cron.yaml index a70f2c3..c9ed0dd 100644 --- a/yaml/module-cron.yaml +++ b/yaml/module-cron.yaml @@ -16,6 +16,10 @@ handlers: script: _go_app login: admin +- url: /proj/.* + script: _go_app + login: admin + error_handlers: - file: err/default.html - error_code: over_quota From a5ddd2a1dd067f0d13e25da6202f327bbaf046df Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sat, 14 Nov 2015 20:46:35 -0700 Subject: [PATCH 12/26] Optimizing tags to take <15s --- app/cron/chain/chain.go | 6 ++- app/cron/proj/proj.go | 78 ++++++++-------------------- app/cron/proj/tags.go | 110 +++++++++++++++++++++------------------- 3 files changed, 81 insertions(+), 113 deletions(-) diff --git a/app/cron/chain/chain.go b/app/cron/chain/chain.go index 67b235a..35e2952 100644 --- a/app/cron/chain/chain.go +++ b/app/cron/chain/chain.go @@ -3,7 +3,7 @@ package chain import "sync" // Worker is a function designed to fan out and perform work on a piece of Data -type Worker func(in <-chan interface{}, out chan<- interface{}, idx int) +type Worker func(obj interface{}, out chan<- interface{}, idx int) // FanOut allows lengthy workers to fan out on chanel operations func FanOut(count int, buff int, in <-chan interface{}, doIt Worker) <-chan interface{} { @@ -12,7 +12,9 @@ func FanOut(count int, buff int, in <-chan interface{}, doIt Worker) <-chan inte wg.Add(count) for i := 0; i < count; i++ { go func(idx int) { - doIt(in, out, idx) + for obj := range in { + doIt(obj, out, idx) + } wg.Done() }(i) } diff --git a/app/cron/proj/proj.go b/app/cron/proj/proj.go index 5ad4d23..6e56ec3 100644 --- a/app/cron/proj/proj.go +++ b/app/cron/proj/proj.go @@ -2,8 +2,6 @@ package proj import ( "encoding/xml" - "log" - "sync" "appengine" "appengine/datastore" @@ -30,17 +28,22 @@ type Item struct { Imgs []string } -// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) -func puller(c 
appengine.Context) <-chan []byte { - out := make(chan []byte, 10000) +func getItems(c appengine.Context) <-chan interface{} { + pages := puller(c) + return chain.FanOut(50, 10000, pages, flatten(c)) +} + +func puller(c appengine.Context) <-chan interface{} { + out := make(chan interface{}, 10000) + // TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) go func() { defer close(out) q := datastore.NewQuery(crawler.XML) - t := q.Run(c) + iterator := q.Run(c) for { var s crawler.Store - _, err := t.Next(&s) + _, err := iterator.Next(&s) if err == datastore.Done { break // No further entities match the query. } @@ -54,65 +57,24 @@ func puller(c appengine.Context) <-chan []byte { return out } -func flatten(c appengine.Context, in <-chan []byte) <-chan Item { - const WORKERS = 100 - out := make(chan Item, 10000) - var wg sync.WaitGroup - wg.Add(WORKERS) - for i := 0; i < WORKERS; i++ { - go func(idx int) { - flattenWorker(c, in, out, idx) - wg.Done() - }(i) - } - go func() { - wg.Wait() - close(out) - }() - return out -} - -func flattenWorker(c appengine.Context, in <-chan []byte, out chan<- Item, idx int) { - var xmlPage XMLPage - var imgs []string +func flatten(c appengine.Context) chain.Worker { + return func(obj interface{}, out chan<- interface{}, idx int) { + var xmlPage XMLPage + var imgs []string - for data := range in { - if err := xml.Unmarshal(data, &xmlPage); err != nil { + // Parse the XML of an object + if err := xml.Unmarshal(obj.([]byte), &xmlPage); err != nil { c.Errorf("Flatten %d: %v", idx, err) - continue + return } + + // Process items in a particular page for _, item := range xmlPage.Items { imgs = make([]string, len(item.Imgs)) for i, img := range item.Imgs { imgs[i] = img.URL } - - out <- Item{ - GUID: item.GUID, - Tags: item.Tags, - Imgs: imgs, - } + out <- Item{item.GUID, item.Tags, imgs} } } } - -func doMagic() { - start := make(chan interface{}, 10) - out := chain.FanOut(10, 10, start, worker) - go func() { - for o := range out { - log.Printf("Something: %v", o) - } - }() - start <- 1 - start <- 2 - start <- 3 -} - -func worker(in <-chan interface{}, out chan<- interface{}, idx int) { - var bytes []byte - for x := range in { - bytes = x.([]byte) - out <- bytes - } -} diff --git a/app/cron/proj/tags.go b/app/cron/proj/tags.go index 337d884..eb8b45c 100644 --- a/app/cron/proj/tags.go +++ b/app/cron/proj/tags.go @@ -1,87 +1,91 @@ package proj import ( - "encoding/xml" + "bytes" "fmt" "net/http" + "runtime" "strings" - "sync" "time" + "github.com/bign8/chive-show/app/cron/chain" + "appengine" + "appengine/memcache" ) +const tagsMemcacheKey = "tags-baby" + // Tags etrieves the tags from the dataset func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() + defer func() { + c.Infof("Time took: %v", time.Since(start)) + }() + + // Check from memcache + if item, err := memcache.Get(c, tagsMemcacheKey); err == nil { + w.Write(item.Value) + return + } - pages := puller(c) - dirtyTags := getTags(c, pages, 100) - tags := cleaner(dirtyTags) + // Pretty sure this doesn't work on prod, but works awesome in dev + runtime.GOMAXPROCS(runtime.NumCPU()) + tags := chain.FanOut(50, 10000, getItems(c), tags) // Pull and clean tags + // Build a counter dictionary found := map[string]int64{} for tag := range tags { - found[tag]++ + found[tag.(string)]++ } + // Output results + var buffer bytes.Buffer for key, value := range found { - fmt.Fprintf(w, "%s,%d\n", key, value) + buffer.WriteString(fmt.Sprintf("%s,%d\n", key, value)) 
} + data := buffer.String() - c.Infof("Time took: %v", time.Since(start)) -} + fmt.Fprint(w, data) + c.Infof("Num tags: %v", len(found)) -func getTags(c appengine.Context, in <-chan []byte, workers int) <-chan string { - out := make(chan string, 10000) - var wg sync.WaitGroup - wg.Add(workers) - for i := 0; i < workers; i++ { - go func(idx int) { - tags(c, in, out, idx) - wg.Done() - }(i) - } + // Save to memcache, but only wait up to 3ms. + done := make(chan bool, 1) go func() { - wg.Wait() - close(out) + memcache.Set(c, &memcache.Item{ + Key: tagsMemcacheKey, + Value: []byte(data), + }) + done <- true }() - return out + select { + case <-done: + case <-time.After(3 * time.Millisecond): + } } -func tags(c appengine.Context, in <-chan []byte, out chan<- string, idx int) { - var xmlPage = XMLPage{} - - for data := range in { - if err := xml.Unmarshal(data, &xmlPage); err != nil { - c.Errorf("Miner %d: Error %s", idx, err) - continue - } - - for _, item := range xmlPage.Items { - for _, tag := range item.Tags { - out <- tag - } - } +func tags(obj interface{}, out chan<- interface{}, idx int) { + for _, tag := range validTags((obj.(Item)).Tags) { + out <- tag } } -func cleaner(in <-chan string) <-chan string { - // http://xpo6.com/list-of-english-stop-words/ - var stopWords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" - var stops = map[string]bool{} - for _, s := range strings.Split(stopWords, ",") { - stops[s] = true - } +// http://xpo6.com/list-of-english-stop-words/ +var chiveWords = "web only," +var stopWords = chiveWords + "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" +var stops = map[string]bool{} - out := make(chan string, 10000) - go func() { - for s := range in { - s = strings.ToLower(s) - if !stops[s] { - out <- s - } +func validTags(tags []string) (res []string) { + if len(stops) == 0 { + for _, s := range strings.Split(stopWords, ",") { + stops[s] = true } - close(out) - }() - return out + } + for _, tag := range tags { + tag = strings.ToLower(tag) + if !stops[tag] { + res = append(res, tag) + } + } + return } From ea37ea5939f1c59cd1aeff8138423c4c730a9aae Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 13:00:45 -0700 Subject: [PATCH 13/26] Adding graph package (for serialized graphs) Yes I committed the graph.pb.go file here, because I haven't updated any of the build scripts yet... 
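For reference, a rough usage sketch of the API this package exposes (it mirrors
the demo in graph.go's main() below; the node values are made up, and everything
still lives in "package main" until the build scripts catch up):

    g := New(false)                      // undirected graph
    a := g.Add("guid-123", NodeType_POST, 0)
    b := g.Add("funny", NodeType_TAG, 0)
    g.Connect(a, b, 0)                   // undirected, so the reverse edge is added too
    bits, err := g.Bytes()               // protobuf marshal, then gzip (see serialGraph.go)
    if err == nil {
        back, _ := DecodeGraph(bits)     // rebuilds the id -> *Node lookup
        _ = back
    }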
--- app/cron/proj/graph/graph.go | 110 +++++++++++++++++++ app/cron/proj/graph/graph.pb.go | 163 +++++++++++++++++++++++++++++ app/cron/proj/graph/graph.proto | 24 +++++ app/cron/proj/graph/load.sh | 14 +++ app/cron/proj/graph/serialGraph.go | 76 ++++++++++++++ 5 files changed, 387 insertions(+) create mode 100644 app/cron/proj/graph/graph.go create mode 100644 app/cron/proj/graph/graph.pb.go create mode 100644 app/cron/proj/graph/graph.proto create mode 100755 app/cron/proj/graph/load.sh create mode 100644 app/cron/proj/graph/serialGraph.go diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go new file mode 100644 index 0000000..ee1485e --- /dev/null +++ b/app/cron/proj/graph/graph.go @@ -0,0 +1,110 @@ +package main + +import ( + "errors" + "log" + + "github.com/golang/protobuf/proto" +) + +// TODO: add some graph processing functions + +// Graph is the serializable graph we have all been looking for +type Graph struct { + s *SerialGraph + nodes map[uint64]*Node // Optimal lookup with pointers goes here +} + +// New creates a new Graph +func New(isDirected bool) *Graph { + return &Graph{ + s: &SerialGraph{ + Nodes: make([]*Node, 0), + Directed: proto.Bool(isDirected), + NodeCount: proto.Uint64(0), + }, + nodes: make(map[uint64]*Node), + } +} + +// Add creates and adds a node to the graph +func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { + n := &Node{ + Id: proto.Uint64(g.genNodeID()), + Value: proto.String(value), + Weight: proto.Int64(weight), + Type: ttype.Enum(), + Adjacent: make([]uint64, 0), + } + g.nodes[*n.Id] = n + g.s.Nodes = append(g.s.Nodes, n) + return n +} + +// Connect connects nodes to and from with an edge of weight w +func (g *Graph) Connect(to, from *Node, weight int64) error { + if to == nil || from == nil { + return errors.New("Cannot add edge to nil node") + } + from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge + from.Weights = append(from.Weights, weight) + + if !g.s.GetDirected() { // UnDirected edge (return trip) + to.Adjacent = append(to.Adjacent, *from.Id) + to.Weights = append(to.Weights, weight) + } + return nil +} + +func (g *Graph) genNodeID() (id uint64) { + id = g.s.GetNodeCount() + *g.s.NodeCount++ + return id +} + +// DecodeGraph hydrates a graph from a serialized format (returned by Bytes()). +func DecodeGraph(data []byte) (*Graph, error) { + sg, err := DecodeSerialGraph(data) + if err != nil { + return nil, err + } + g := &Graph{sg, make(map[uint64]*Node)} + + // Hydrate Graph from SerialGraph + for _, node := range sg.Nodes { + g.nodes[*node.Id] = node + } + return g, nil +} + +// Bytes flattens a graph to a flat file format +func (g *Graph) Bytes() ([]byte, error) { + // TODO: use smaller numbers for encoding... 
+ return g.s.Bytes() +} + +func main() { + log.Println("Do stuff...") + + graph := New(false) + a := graph.Add("http://super-stupid-long-url.com/more-crap-over-here1", NodeType_UNKNOWN, 0) + b := graph.Add("http://super-stupid-long-url.com/more-crap-over-here2", NodeType_UNKNOWN, 0) + graph.Connect(a, b, 0) + + // Compress + bits, err := graph.Bytes() + if err != nil { + panic(err) + } + + // Decompress + result, err := DecodeGraph(bits) + if err != nil { + panic(err) + } + + // Compare + log.Printf("Message (%d): %q", len(bits), string(bits)) + log.Printf("Digit:\n%v\n%v", graph, result) + log.Printf("Nodes:\n%v\n%v", graph.s.Nodes, result.s.Nodes) +} diff --git a/app/cron/proj/graph/graph.pb.go b/app/cron/proj/graph/graph.pb.go new file mode 100644 index 0000000..75953ae --- /dev/null +++ b/app/cron/proj/graph/graph.pb.go @@ -0,0 +1,163 @@ +// Code generated by protoc-gen-go. +// source: graph.proto +// DO NOT EDIT! + +/* +Package main is a generated protocol buffer package. + +It is generated from these files: + graph.proto + +It has these top-level messages: + SerialGraph + Node +*/ +package main + +import proto "github.com/golang/protobuf/proto" +import fmt "fmt" +import math "math" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +type NodeType int32 + +const ( + NodeType_UNKNOWN NodeType = 0 + NodeType_POST NodeType = 1 + NodeType_IMG NodeType = 2 + NodeType_TAG NodeType = 3 + NodeType_USER NodeType = 4 +) + +var NodeType_name = map[int32]string{ + 0: "UNKNOWN", + 1: "POST", + 2: "IMG", + 3: "TAG", + 4: "USER", +} +var NodeType_value = map[string]int32{ + "UNKNOWN": 0, + "POST": 1, + "IMG": 2, + "TAG": 3, + "USER": 4, +} + +func (x NodeType) Enum() *NodeType { + p := new(NodeType) + *p = x + return p +} +func (x NodeType) String() string { + return proto.EnumName(NodeType_name, int32(x)) +} +func (x *NodeType) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(NodeType_value, data, "NodeType") + if err != nil { + return err + } + *x = NodeType(value) + return nil +} + +type SerialGraph struct { + Nodes []*Node `protobuf:"bytes,1,rep,name=nodes" json:"nodes,omitempty"` + Directed *bool `protobuf:"varint,2,opt,name=directed,def=0" json:"directed,omitempty"` + NodeCount *uint64 `protobuf:"varint,3,req,name=nodeCount,def=0" json:"nodeCount,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *SerialGraph) Reset() { *m = SerialGraph{} } +func (m *SerialGraph) String() string { return proto.CompactTextString(m) } +func (*SerialGraph) ProtoMessage() {} + +const Default_SerialGraph_Directed bool = false +const Default_SerialGraph_NodeCount uint64 = 0 + +func (m *SerialGraph) GetNodes() []*Node { + if m != nil { + return m.Nodes + } + return nil +} + +func (m *SerialGraph) GetDirected() bool { + if m != nil && m.Directed != nil { + return *m.Directed + } + return Default_SerialGraph_Directed +} + +func (m *SerialGraph) GetNodeCount() uint64 { + if m != nil && m.NodeCount != nil { + return *m.NodeCount + } + return Default_SerialGraph_NodeCount +} + +type Node struct { + Id *uint64 `protobuf:"varint,1,req,name=id" json:"id,omitempty"` + Value *string `protobuf:"bytes,2,req,name=value" json:"value,omitempty"` + Weight *int64 `protobuf:"varint,3,opt,name=weight" json:"weight,omitempty"` + Type *NodeType `protobuf:"varint,4,opt,name=type,enum=main.NodeType,def=0" json:"type,omitempty"` + Adjacent []uint64 `protobuf:"varint,5,rep,name=adjacent" json:"adjacent,omitempty"` + 
Weights []int64 `protobuf:"varint,6,rep,name=weights" json:"weights,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *Node) Reset() { *m = Node{} } +func (m *Node) String() string { return proto.CompactTextString(m) } +func (*Node) ProtoMessage() {} + +const Default_Node_Type NodeType = NodeType_UNKNOWN + +func (m *Node) GetId() uint64 { + if m != nil && m.Id != nil { + return *m.Id + } + return 0 +} + +func (m *Node) GetValue() string { + if m != nil && m.Value != nil { + return *m.Value + } + return "" +} + +func (m *Node) GetWeight() int64 { + if m != nil && m.Weight != nil { + return *m.Weight + } + return 0 +} + +func (m *Node) GetType() NodeType { + if m != nil && m.Type != nil { + return *m.Type + } + return Default_Node_Type +} + +func (m *Node) GetAdjacent() []uint64 { + if m != nil { + return m.Adjacent + } + return nil +} + +func (m *Node) GetWeights() []int64 { + if m != nil { + return m.Weights + } + return nil +} + +func init() { + proto.RegisterEnum("main.NodeType", NodeType_name, NodeType_value) +} diff --git a/app/cron/proj/graph/graph.proto b/app/cron/proj/graph/graph.proto new file mode 100644 index 0000000..ca34de3 --- /dev/null +++ b/app/cron/proj/graph/graph.proto @@ -0,0 +1,24 @@ +package main; + +message SerialGraph { + repeated Node nodes = 1; + optional bool directed = 2 [default = false]; + required uint64 nodeCount = 3 [default = 0]; +} + +enum NodeType { + UNKNOWN = 0; + POST = 1; + IMG = 2; + TAG = 3; + USER = 4; +} + +message Node { + required uint64 id = 1; + required string value = 2; + optional int64 weight = 3; + optional NodeType type = 4 [default = UNKNOWN]; + repeated uint64 adjacent = 5; + repeated int64 weights = 6; +} diff --git a/app/cron/proj/graph/load.sh b/app/cron/proj/graph/load.sh new file mode 100755 index 0000000..4ebec6a --- /dev/null +++ b/app/cron/proj/graph/load.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +if ! which proto >/dev/null; then + echo "Installing proto and protoc-gen-go" + go get -u github.com/golang/protobuf/{proto,protoc-gen-go} + export PATH=$PATH:$GOPATH/bin +else + echo "Proto and protoc-gen-go already installed" +fi + +echo "Generating Protobuff files..." +protoc --go_out=. *.proto +sed -i '' '/RegisterType/d' graph.pb.go +echo "Protobuff files generated." diff --git a/app/cron/proj/graph/serialGraph.go b/app/cron/proj/graph/serialGraph.go new file mode 100644 index 0000000..a251871 --- /dev/null +++ b/app/cron/proj/graph/serialGraph.go @@ -0,0 +1,76 @@ +package main + +import ( + "bytes" + "compress/gzip" + + "github.com/golang/protobuf/proto" +) + +const shouldGZIP = true + +// DecodeSerialGraph converts a byte string back into a hydrated SerialGraph. 
+func DecodeSerialGraph(data []byte) (g *SerialGraph, err error) { + if shouldGZIP { + if data, err = decompress(data); err != nil { + return nil, err + } + } + + // log.Printf("DecodeSerialGraph: %q", data) + + g = &SerialGraph{} + if err := proto.Unmarshal(data, g); err != nil { + return nil, err + } + return g, nil +} + +// Bytes converts a serial graph to a gzipped graph (used for storage) +func (g *SerialGraph) Bytes() (data []byte, err error) { + data, err = proto.Marshal(g) + if err != nil { + return nil, err + } + + // log.Printf(" Graph.Bytes: %q", data) + + if shouldGZIP { + if data, err = compress(data); err != nil { + return nil, err + } + } + return data, nil +} + +// Simple GZIP decompression +func decompress(garbage []byte) ([]byte, error) { + gz, err := gzip.NewReader(bytes.NewBuffer(garbage)) + if err != nil { + return nil, err + } + var buff bytes.Buffer + if _, err := buff.ReadFrom(gz); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + return buff.Bytes(), nil +} + +// Simple GZIP compression +func compress(data []byte) ([]byte, error) { + var buff bytes.Buffer + gz := gzip.NewWriter(&buff) + if _, err := gz.Write(data); err != nil { + return nil, err + } + if err := gz.Flush(); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + return buff.Bytes(), nil +} From 41184396199c0f286240d8da9f472bb5bc2cefe2 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 13:01:47 -0700 Subject: [PATCH 14/26] Improving the /proj/tags endpoint performance --- app/cron/proj/tags.go | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/app/cron/proj/tags.go b/app/cron/proj/tags.go index eb8b45c..e28bf8e 100644 --- a/app/cron/proj/tags.go +++ b/app/cron/proj/tags.go @@ -22,6 +22,7 @@ func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { defer func() { c.Infof("Time took: %v", time.Since(start)) }() + // w.Header().Set("Content-Type", "text/csv; charset=utf-8") // Check from memcache if item, err := memcache.Get(c, tagsMemcacheKey); err == nil { @@ -39,22 +40,45 @@ func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { found[tag.(string)]++ } + // Compute average (used to clip data, so it's not huge) + avg := int64(0) + for _, value := range found { + avg += value + } + avg /= int64(len(found)) + c.Infof("Num tags: %v; Avg: %v", len(found), avg) + + // Compute the 75%-tile + cap := int64(0) + for key, value := range found { + if avg <= value { + cap += value + } else { + delete(found, key) + } + } + cap /= int64(len(found)) + c.Infof("Above average tags: %v; 75%%-tile: %v", len(found), cap) + // Output results var buffer bytes.Buffer + result := int64(0) for key, value := range found { - buffer.WriteString(fmt.Sprintf("%s,%d\n", key, value)) + if cap <= value { + buffer.WriteString(fmt.Sprintf("%s,%d\n", key, value)) + result++ + } } - data := buffer.String() - - fmt.Fprint(w, data) - c.Infof("Num tags: %v", len(found)) + data := buffer.Bytes() + w.Write(data) + c.Infof("Returned tags: %v", result) // Save to memcache, but only wait up to 3ms. 
done := make(chan bool, 1) go func() { memcache.Set(c, &memcache.Item{ Key: tagsMemcacheKey, - Value: []byte(data), + Value: data, }) done <- true }() @@ -71,7 +95,7 @@ func tags(obj interface{}, out chan<- interface{}, idx int) { } // http://xpo6.com/list-of-english-stop-words/ -var chiveWords = "web only," +var chiveWords = "web only,thebrigade,theberry,thechive,chive,chive humanity," var stopWords = chiveWords + "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" var stops = map[string]bool{} From e61d9c4c164748bde2e7e2ae36f24589044ea7b4 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 14:16:25 -0700 Subject: [PATCH 15/26] Filling graph with proj endpoint (todo: shard and store) --- app/cron/cron.go | 1 + app/cron/proj/graph.go | 79 ++++++++++++++++++++++----- app/cron/proj/graph/graph.go | 87 ++++++++++++++++++++---------- app/cron/proj/graph/graph.pb.go | 2 +- app/cron/proj/graph/graph.proto | 2 +- app/cron/proj/graph/serialGraph.go | 2 +- 6 files changed, 129 insertions(+), 44 deletions(-) diff --git a/app/cron/cron.go b/app/cron/cron.go index ddeb2ff..f4953f2 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -47,6 +47,7 @@ func Init() { http.HandleFunc("/cron/stage/1", crawler.Crawl) http.Handle("/proj/tags", appstats.NewHandler(proj.Tags)) + http.Handle("/proj/graph", appstats.NewHandler(proj.Graph)) http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 44b211b..1db8379 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -5,24 +5,79 @@ import ( "time" "appengine" + + "github.com/bign8/chive-show/app/cron/proj/graph" ) // Graph processes all posts in attempt to create a graph func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() - // pages := puller(c) - // dirtyTags := getNod(c, pages, 100) - // tags := cleaner(dirtyTags) - // - // found := map[string]int64{} - // for tag := range tags { - // found[tag]++ - // } - // - // for key, value := range found { - // fmt.Fprintf(w, "%s,%d\n", key, value) - // } + var item Item + var post, ntag, nimg *graph.Node + + idx := 0 + + g := graph.New(false) + for idk := range getItems(c) { + item = idk.(Item) + post = g.Add(item.GUID, graph.NodeType_POST, 0) + + for _, tag := range validTags(item.Tags) { + ntag = g.Add(tag, graph.NodeType_TAG, 0) + g.Connect(post, ntag, 0) + } + + for _, img := range item.Imgs { + nimg = g.Add(img, graph.NodeType_IMG, 0) + g.Connect(post, nimg, 0) + } + + // This is a SLOW/DEBUG only operation + if idx%2000 == 0 { + c.Infof("Current Duration (%v)", time.Since(start)) + } + idx++ + } + + // Write result + bits, err := g.Bytes() + if err != nil { + c.Errorf("Error in Graph.Bytes: %v", err) + } + w.Write(bits) + + // Count types of nodes + binCtr := make(map[graph.NodeType]uint64) + for _, node := range g.Nodes() { + binCtr[*node.Type]++ + } + + // Log out types of nodes + total := uint64(0) + for key, value := range binCtr { + 
c.Infof("Nodes (%s): %d", key, value) + total += value + } + c.Infof("Nodes (ALL): %d", total) + + // w/dupes w/invalid tags + // 2015/11/15 20:52:26 INFO: Nodes (IMG): 928728 + // 2015/11/15 20:52:26 INFO: Nodes (TAG): 244212 + // 2015/11/15 20:52:26 INFO: Nodes (POST): 40920 + // 2015/11/15 20:52:26 INFO: Time took: 31.310686059s + + // w/dupes w/o invalid Tags + // 2015/11/15 21:03:06 INFO: Nodes (IMG): 928728 + // 2015/11/15 21:03:06 INFO: Nodes (TAG): 237122 + // 2015/11/15 21:03:06 INFO: Nodes (POST): 40920 + // 2015/11/15 21:03:06 INFO: Time took: 31.850210891s + + // w/o dupes w/o invalid Tags + // 2015/11/15 21:06:18 INFO: Nodes (IMG): 886831 + // 2015/11/15 21:06:18 INFO: Nodes (POST): 40920 + // 2015/11/15 21:06:18 INFO: Nodes (TAG): 18221 + // 2015/11/15 21:06:18 INFO: Time took: 32.651739532s c.Infof("Time took: %v", time.Since(start)) } diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index ee1485e..18a8530 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -1,8 +1,7 @@ -package main +package graph import ( "errors" - "log" "github.com/golang/protobuf/proto" ) @@ -12,7 +11,8 @@ import ( // Graph is the serializable graph we have all been looking for type Graph struct { s *SerialGraph - nodes map[uint64]*Node // Optimal lookup with pointers goes here + nodes map[uint64]*Node // Optimal lookup with pointers goes here + dupes map[NodeType]map[string]*Node // type > value > node } // New creates a new Graph @@ -24,11 +24,21 @@ func New(isDirected bool) *Graph { NodeCount: proto.Uint64(0), }, nodes: make(map[uint64]*Node), + dupes: make(map[NodeType]map[string]*Node), } } // Add creates and adds a node to the graph func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { + + // Check duplicate node (add weight) + dupe := g.dupes[ttype][value] + if dupe != nil { + *dupe.Weight += weight + return dupe + } + + // Create new node n := &Node{ Id: proto.Uint64(g.genNodeID()), Value: proto.String(value), @@ -38,11 +48,18 @@ func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { } g.nodes[*n.Id] = n g.s.Nodes = append(g.s.Nodes, n) + + // Add dupe check to list + if g.dupes[ttype] == nil { + g.dupes[ttype] = make(map[string]*Node) + } + g.dupes[ttype][value] = n return n } // Connect connects nodes to and from with an edge of weight w func (g *Graph) Connect(to, from *Node, weight int64) error { + // TODO: collision checks if to == nil || from == nil { return errors.New("Cannot add edge to nil node") } @@ -62,13 +79,25 @@ func (g *Graph) genNodeID() (id uint64) { return id } +// Nodes returns all the nodes in the Graph +func (g *Graph) Nodes() []*Node { + n := make([]*Node, len(g.nodes)) + ctr := 0 + for _, node := range g.nodes { + n[ctr] = node + ctr++ + } + return n +} + // DecodeGraph hydrates a graph from a serialized format (returned by Bytes()). 
func DecodeGraph(data []byte) (*Graph, error) { sg, err := DecodeSerialGraph(data) if err != nil { return nil, err } - g := &Graph{sg, make(map[uint64]*Node)} + g := New(false) // Don't care about directed because it's stored on s (assigned below) + g.s = sg // Hydrate Graph from SerialGraph for _, node := range sg.Nodes { @@ -83,28 +112,28 @@ func (g *Graph) Bytes() ([]byte, error) { return g.s.Bytes() } -func main() { - log.Println("Do stuff...") - - graph := New(false) - a := graph.Add("http://super-stupid-long-url.com/more-crap-over-here1", NodeType_UNKNOWN, 0) - b := graph.Add("http://super-stupid-long-url.com/more-crap-over-here2", NodeType_UNKNOWN, 0) - graph.Connect(a, b, 0) - - // Compress - bits, err := graph.Bytes() - if err != nil { - panic(err) - } - - // Decompress - result, err := DecodeGraph(bits) - if err != nil { - panic(err) - } - - // Compare - log.Printf("Message (%d): %q", len(bits), string(bits)) - log.Printf("Digit:\n%v\n%v", graph, result) - log.Printf("Nodes:\n%v\n%v", graph.s.Nodes, result.s.Nodes) -} +// func main() { +// log.Println("Do stuff...") +// +// graph := New(false) +// a := graph.Add("http://super-stupid-long-url.com/more-crap-over-here1", NodeType_UNKNOWN, 0) +// b := graph.Add("http://super-stupid-long-url.com/more-crap-over-here2", NodeType_UNKNOWN, 0) +// graph.Connect(a, b, 0) +// +// // Compress +// bits, err := graph.Bytes() +// if err != nil { +// panic(err) +// } +// +// // Decompress +// result, err := DecodeGraph(bits) +// if err != nil { +// panic(err) +// } +// +// // Compare +// log.Printf("Message (%d): %q", len(bits), string(bits)) +// log.Printf("Digit:\n%v\n%v", graph, result) +// log.Printf("Nodes:\n%v\n%v", graph.s.Nodes, result.s.Nodes) +// } diff --git a/app/cron/proj/graph/graph.pb.go b/app/cron/proj/graph/graph.pb.go index 75953ae..3d661a5 100644 --- a/app/cron/proj/graph/graph.pb.go +++ b/app/cron/proj/graph/graph.pb.go @@ -12,7 +12,7 @@ It has these top-level messages: SerialGraph Node */ -package main +package graph import proto "github.com/golang/protobuf/proto" import fmt "fmt" diff --git a/app/cron/proj/graph/graph.proto b/app/cron/proj/graph/graph.proto index ca34de3..d5693dd 100644 --- a/app/cron/proj/graph/graph.proto +++ b/app/cron/proj/graph/graph.proto @@ -1,4 +1,4 @@ -package main; +package graph; message SerialGraph { repeated Node nodes = 1; diff --git a/app/cron/proj/graph/serialGraph.go b/app/cron/proj/graph/serialGraph.go index a251871..af059f2 100644 --- a/app/cron/proj/graph/serialGraph.go +++ b/app/cron/proj/graph/serialGraph.go @@ -1,4 +1,4 @@ -package main +package graph import ( "bytes" From a964f986b6609b72a2fa3d379af543a6df26999e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 20:07:44 -0700 Subject: [PATCH 16/26] Adding datastore based sharder --- app/cron/cron.go | 1 + app/cron/proj/graph.go | 62 ++++++++++++++++++----- app/cron/proj/graph/graph.go | 8 +-- app/helpers/sharder/reader.go | 38 ++++++++++++++ app/helpers/sharder/sharder.go | 68 +++++++++++++++++++++++++ app/helpers/sharder/writer.go | 91 ++++++++++++++++++++++++++++++++++ 6 files changed, 253 insertions(+), 15 deletions(-) create mode 100644 app/helpers/sharder/reader.go create mode 100644 app/helpers/sharder/sharder.go create mode 100644 app/helpers/sharder/writer.go diff --git a/app/cron/cron.go b/app/cron/cron.go index f4953f2..45740a7 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -48,6 +48,7 @@ func Init() { http.Handle("/proj/tags", appstats.NewHandler(proj.Tags)) http.Handle("/proj/graph", 
appstats.NewHandler(proj.Graph)) + http.Handle("/proj/shard", appstats.NewHandler(proj.TestShard)) http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 1db8379..ec82ebd 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -7,8 +7,41 @@ import ( "appengine" "github.com/bign8/chive-show/app/cron/proj/graph" + "github.com/bign8/chive-show/app/helpers/sharder" ) +// TestShard to delete +func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { + start := time.Now() + + s, err := sharder.NewWriter(c, "test") + if err != nil { + c.Errorf("Writer Error: %s", err) + return + } + + s.Write([]byte("012345678901234567890")) + s.Close() + + key, err := s.Key() + if err != nil { + c.Errorf("Error in Key: %s", err) + } + c.Infof("Has Key: %s", key) + + c.Infof("Write took: %v", time.Since(start)) + start = time.Now() + + read, err := sharder.Reader(c, "test") + if err != nil { + c.Errorf("Reader Error: %s", err) + return + } + c.Infof("Data: %q", read.String()) + + c.Infof("Read took: %v", time.Since(start)) +} + // Graph processes all posts in attempt to create a graph func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() @@ -62,22 +95,27 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { c.Infof("Nodes (ALL): %d", total) // w/dupes w/invalid tags - // 2015/11/15 20:52:26 INFO: Nodes (IMG): 928728 - // 2015/11/15 20:52:26 INFO: Nodes (TAG): 244212 - // 2015/11/15 20:52:26 INFO: Nodes (POST): 40920 - // 2015/11/15 20:52:26 INFO: Time took: 31.310686059s + // INFO: Nodes (IMG): 928728 + // INFO: Nodes (TAG): 244212 + // INFO: Nodes (POST): 40920 + // INFO: Nodes (ALL): 1213860 + // INFO: Time took: 31.310686059s // w/dupes w/o invalid Tags - // 2015/11/15 21:03:06 INFO: Nodes (IMG): 928728 - // 2015/11/15 21:03:06 INFO: Nodes (TAG): 237122 - // 2015/11/15 21:03:06 INFO: Nodes (POST): 40920 - // 2015/11/15 21:03:06 INFO: Time took: 31.850210891s + // INFO: Nodes (IMG): 928728 + // INFO: Nodes (TAG): 237122 + // INFO: Nodes (POST): 40920 + // INFO: Nodes (ALL): 1206770 + // INFO: Time took: 31.850210891s // w/o dupes w/o invalid Tags - // 2015/11/15 21:06:18 INFO: Nodes (IMG): 886831 - // 2015/11/15 21:06:18 INFO: Nodes (POST): 40920 - // 2015/11/15 21:06:18 INFO: Nodes (TAG): 18221 - // 2015/11/15 21:06:18 INFO: Time took: 32.651739532s + // INFO: Nodes (IMG): 886831 + // INFO: Nodes (POST): 40920 + // INFO: Nodes (TAG): 18221 + // INFO: Nodes (ALL): 945972 + // INFO: Time took: 32.651739532s + + // TODO: write to sharded datastore entity c.Infof("Time took: %v", time.Since(start)) } diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 18a8530..65950ab 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -50,10 +50,12 @@ func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { g.s.Nodes = append(g.s.Nodes, n) // Add dupe check to list - if g.dupes[ttype] == nil { - g.dupes[ttype] = make(map[string]*Node) + dub, ok := g.dupes[ttype] + if !ok { + dub = make(map[string]*Node) + g.dupes[ttype] = dub } - g.dupes[ttype][value] = n + dub[value] = n return n } diff --git a/app/helpers/sharder/reader.go b/app/helpers/sharder/reader.go new file mode 100644 index 0000000..58fe6d2 --- /dev/null +++ b/app/helpers/sharder/reader.go @@ -0,0 +1,38 @@ +package sharder + +import ( + "bytes" + + "appengine" + "appengine/datastore" +) + +// 
Reader creates a new shard reader to retrieve data from datastore +func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { + if name == "" { + return nil, ErrInvalidName + } + + var master shardMaster + if err := datastore.Get(c, masterKey(c, name), &master); err != nil { + panic(err) + return nil, err + } + + data := make([]byte, master.Size) + for i := 0; i < master.Shards; i++ { + var shardData shard + if err := datastore.Get(c, shardKey(c, name, i), &shardData); err != nil { + return nil, err + } + c.Infof("Out Data %d: %q", i, string(shardData.Data)) + + end := i*divisor + divisor + if end > master.Size { + end = master.Size + } + copy(data[i*divisor:end], shardData.Data) + } + + return bytes.NewBuffer(data), nil +} diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go new file mode 100644 index 0000000..2dcaa90 --- /dev/null +++ b/app/helpers/sharder/sharder.go @@ -0,0 +1,68 @@ +package sharder + +import ( + "errors" + "fmt" + "time" + + "appengine" + "appengine/datastore" +) + +const ( + masterKind = "shard-master" + shardKind = "shard-pieces" + divisor = 10 // 9e6 +) + +// ErrInvalidName because reasons +var ErrInvalidName = errors.New("Must provide name of sharded item") + +// ShardKey is an identifying string for shards +type ShardKey string + +func (sk *ShardKey) String() string { + return fmt.Sprint(*sk) +} + +// newKey takes the name of a file and creates a ShardKey +func newKey(c appengine.Context, name string) ShardKey { + return ShardKey(masterKey(c, name).Encode()) +} + +func masterKey(c appengine.Context, name string) *datastore.Key { + return datastore.NewKey(c, masterKind, name, 0, nil) +} + +func shardKey(c appengine.Context, name string, idx int) *datastore.Key { + return datastore.NewKey(c, shardKind, fmt.Sprintf("%s-%d", name, idx), 0, nil) +} + +// ShardInfo implements the io.writer interface and allows for sharding data +type ShardInfo struct { + Key ShardKey + CreationTime time.Time + Size int + MD5 string +} + +type shardMaster struct { + Name string `datastore:"name"` + Stamp time.Time `datastore:"stamp"` + Shards int `datastore:"shards"` + MD5 string `datastore:"md5_hash"` + Size int `datastore:"size"` +} + +func (sm *shardMaster) toInfo(c appengine.Context) *ShardInfo { + return &ShardInfo{ + Key: newKey(c, sm.Name), + CreationTime: sm.Stamp, + Size: sm.Size, + MD5: sm.MD5, + } +} + +type shard struct { + Data []byte +} diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go new file mode 100644 index 0000000..08aa219 --- /dev/null +++ b/app/helpers/sharder/writer.go @@ -0,0 +1,91 @@ +package sharder + +import ( + "bytes" + "errors" + "time" + + "appengine" + "appengine/datastore" +) + +// NewWriter creates a new Sharder to write sharded data to datastore +func NewWriter(c appengine.Context, name string) (*Writer, error) { + if name == "" { + return nil, ErrInvalidName + } + return &Writer{ + ctx: c, + key: nil, + buff: bytes.NewBufferString(""), + name: name, + }, nil +} + +// Writer is the item that deals with writing sharded data +type Writer struct { + buff *bytes.Buffer + ctx appengine.Context + key *ShardKey + name string +} + +// Write pushed p bytes to underlying data stream. +func (w *Writer) Write(p []byte) (n int, err error) { + if w.buff == nil { + return 0, errors.New("Buffer is closed") + } + return w.buff.Write(p) +} + +// Close finishes off the current buffer, shards and stores the data. +// Once Close is called, the user may call Key to get the key of the stored object. 
+func (w *Writer) Close() error { + // TODO: datastore.RunInTransaction + go-routines with waitGroups + + length := w.buff.Len() + shards := (length-1)/divisor + 1 + key := masterKey(w.ctx, w.name) + + // Store shardMaster + master := shardMaster{ + Name: w.name, + Stamp: time.Now(), + Shards: shards, + MD5: "TO-IMPLEMENT", + Size: length, + } + if _, err := datastore.Put(w.ctx, key, &master); err != nil { + panic(err) + return err + } + + // shard data and store shards + data := w.buff.Bytes() + for i := 0; i < shards; i++ { + shardKey := shardKey(w.ctx, w.name, i) + shardData := data[i*divisor:] + if len(shardData) > divisor { + shardData = data[:divisor] + } + s := shard{shardData} + w.ctx.Infof("Inn Data %d: %q", i, s.Data) + if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { + panic(err) + return err + } + } + + w.key = new(ShardKey) + *w.key = ShardKey(key.Encode()) + w.buff = nil + return nil +} + +// Key returns the key of the sharded data. Note: will return an error if not Closed. +func (w *Writer) Key() (*ShardKey, error) { + if w.key == nil { + return nil, errors.New("Writer must be closed before a Key is available") + } + return w.key, nil +} From 3429d97935b912778c8034bf504073c0489e41b2 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 00:49:17 -0700 Subject: [PATCH 17/26] Pulling in go-routines --- app/cron/cron.go | 4 +++- app/cron/proj/graph.go | 20 +++++++------------- app/helpers/sharder/reader.go | 30 +++++++++++++++++++----------- app/helpers/sharder/sharder.go | 2 +- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/app/cron/cron.go b/app/cron/cron.go index 45740a7..068d867 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -50,12 +50,14 @@ func Init() { http.Handle("/proj/graph", appstats.NewHandler(proj.Graph)) http.Handle("/proj/shard", appstats.NewHandler(proj.TestShard)) - http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { + http.HandleFunc("/cron/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) cleanup(c, "buff") cleanup(c, "edge") cleanup(c, "vertex") cleanup(c, "post") + cleanup(c, "shard-pieces") + cleanup(c, "shard-master") }) http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index ec82ebd..ffc86df 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -2,6 +2,7 @@ package proj import ( "net/http" + "strings" "time" "appengine" @@ -12,33 +13,26 @@ import ( // TestShard to delete func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { - start := time.Now() + // Writing + start := time.Now() s, err := sharder.NewWriter(c, "test") if err != nil { c.Errorf("Writer Error: %s", err) return } - - s.Write([]byte("012345678901234567890")) + s.Write([]byte(strings.Repeat("01234567890123456789", 1e6))) s.Close() - - key, err := s.Key() - if err != nil { - c.Errorf("Error in Key: %s", err) - } - c.Infof("Has Key: %s", key) - c.Infof("Write took: %v", time.Since(start)) - start = time.Now() + // Reading + start = time.Now() read, err := sharder.Reader(c, "test") if err != nil { c.Errorf("Reader Error: %s", err) return } - c.Infof("Data: %q", read.String()) - + c.Infof("Data Length: %d", read.Len()) c.Infof("Read took: %v", time.Since(start)) } diff --git a/app/helpers/sharder/reader.go b/app/helpers/sharder/reader.go index 58fe6d2..b9a77c8 100644 --- a/app/helpers/sharder/reader.go +++ b/app/helpers/sharder/reader.go @@ -2,6 +2,7 @@ package sharder import ( "bytes" 
+ "sync" "appengine" "appengine/datastore" @@ -18,21 +19,28 @@ func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { panic(err) return nil, err } + shards := (master.Size-1)/divisor + 1 + var wg sync.WaitGroup + wg.Add(shards) data := make([]byte, master.Size) - for i := 0; i < master.Shards; i++ { - var shardData shard - if err := datastore.Get(c, shardKey(c, name, i), &shardData); err != nil { - return nil, err - } - c.Infof("Out Data %d: %q", i, string(shardData.Data)) + for i := 0; i < shards; i++ { + go func(i int) { + var shardData shard + if err := datastore.Get(c, shardKey(c, name, i), &shardData); err != nil { + panic(err) + } + // c.Infof("Out Data %d: %q", i, string(shardData.Data)) - end := i*divisor + divisor - if end > master.Size { - end = master.Size - } - copy(data[i*divisor:end], shardData.Data) + end := i*divisor + divisor + if end > master.Size { + end = master.Size + } + copy(data[i*divisor:end], shardData.Data) + wg.Done() + }(i) } + wg.Wait() return bytes.NewBuffer(data), nil } diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index 2dcaa90..4b538a9 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -12,7 +12,7 @@ import ( const ( masterKind = "shard-master" shardKind = "shard-pieces" - divisor = 10 // 9e6 + divisor = 1e6 // 1MB ) // ErrInvalidName because reasons From 5781d36aafb5147bf522d0dfd3ce867a60f09928 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 00:57:37 -0700 Subject: [PATCH 18/26] Removing un-used master data --- app/helpers/sharder/sharder.go | 36 +----------------------- app/helpers/sharder/writer.go | 50 ++++++++++++++-------------------- 2 files changed, 21 insertions(+), 65 deletions(-) diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index 4b538a9..ba30d7a 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -3,7 +3,6 @@ package sharder import ( "errors" "fmt" - "time" "appengine" "appengine/datastore" @@ -18,18 +17,6 @@ const ( // ErrInvalidName because reasons var ErrInvalidName = errors.New("Must provide name of sharded item") -// ShardKey is an identifying string for shards -type ShardKey string - -func (sk *ShardKey) String() string { - return fmt.Sprint(*sk) -} - -// newKey takes the name of a file and creates a ShardKey -func newKey(c appengine.Context, name string) ShardKey { - return ShardKey(masterKey(c, name).Encode()) -} - func masterKey(c appengine.Context, name string) *datastore.Key { return datastore.NewKey(c, masterKind, name, 0, nil) } @@ -38,29 +25,8 @@ func shardKey(c appengine.Context, name string, idx int) *datastore.Key { return datastore.NewKey(c, shardKind, fmt.Sprintf("%s-%d", name, idx), 0, nil) } -// ShardInfo implements the io.writer interface and allows for sharding data -type ShardInfo struct { - Key ShardKey - CreationTime time.Time - Size int - MD5 string -} - type shardMaster struct { - Name string `datastore:"name"` - Stamp time.Time `datastore:"stamp"` - Shards int `datastore:"shards"` - MD5 string `datastore:"md5_hash"` - Size int `datastore:"size"` -} - -func (sm *shardMaster) toInfo(c appengine.Context) *ShardInfo { - return &ShardInfo{ - Key: newKey(c, sm.Name), - CreationTime: sm.Stamp, - Size: sm.Size, - MD5: sm.MD5, - } + Size int `datastore:"size"` } type shard struct { diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go index 08aa219..e33e6b0 100644 --- a/app/helpers/sharder/writer.go +++ b/app/helpers/sharder/writer.go @@ -3,7 
+3,7 @@ package sharder import ( "bytes" "errors" - "time" + "sync" "appengine" "appengine/datastore" @@ -16,7 +16,6 @@ func NewWriter(c appengine.Context, name string) (*Writer, error) { } return &Writer{ ctx: c, - key: nil, buff: bytes.NewBufferString(""), name: name, }, nil @@ -26,7 +25,6 @@ func NewWriter(c appengine.Context, name string) (*Writer, error) { type Writer struct { buff *bytes.Buffer ctx appengine.Context - key *ShardKey name string } @@ -41,7 +39,8 @@ func (w *Writer) Write(p []byte) (n int, err error) { // Close finishes off the current buffer, shards and stores the data. // Once Close is called, the user may call Key to get the key of the stored object. func (w *Writer) Close() error { - // TODO: datastore.RunInTransaction + go-routines with waitGroups + // TODO: datastore.RunInTransaction + // TODO: delete existing shards greater than current length := w.buff.Len() shards := (length-1)/divisor + 1 @@ -49,11 +48,7 @@ func (w *Writer) Close() error { // Store shardMaster master := shardMaster{ - Name: w.name, - Stamp: time.Now(), - Shards: shards, - MD5: "TO-IMPLEMENT", - Size: length, + Size: length, } if _, err := datastore.Put(w.ctx, key, &master); err != nil { panic(err) @@ -62,30 +57,25 @@ func (w *Writer) Close() error { // shard data and store shards data := w.buff.Bytes() + var wg sync.WaitGroup + wg.Add(shards) for i := 0; i < shards; i++ { - shardKey := shardKey(w.ctx, w.name, i) - shardData := data[i*divisor:] - if len(shardData) > divisor { - shardData = data[:divisor] - } - s := shard{shardData} - w.ctx.Infof("Inn Data %d: %q", i, s.Data) - if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { - panic(err) - return err - } + go func(i int) { + shardKey := shardKey(w.ctx, w.name, i) + shardData := data[i*divisor:] + if len(shardData) > divisor { + shardData = data[:divisor] + } + s := shard{shardData} + // w.ctx.Infof("Inn Data %d: %q", i, s.Data) + if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { + panic(err) + } + wg.Done() + }(i) } - w.key = new(ShardKey) - *w.key = ShardKey(key.Encode()) + wg.Wait() w.buff = nil return nil } - -// Key returns the key of the sharded data. Note: will return an error if not Closed. 
-func (w *Writer) Key() (*ShardKey, error) { - if w.key == nil { - return nil, errors.New("Writer must be closed before a Key is available") - } - return w.key, nil -} From b4dec061ac51c4af7c3c8734571098dca5595250 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 01:12:19 -0700 Subject: [PATCH 19/26] Removing un-needed writer complexity --- app/cron/proj/graph.go | 9 +++--- app/helpers/sharder/reader.go | 8 ++---- app/helpers/sharder/sharder.go | 9 ++++++ app/helpers/sharder/writer.go | 52 ++++++---------------------------- 4 files changed, 25 insertions(+), 53 deletions(-) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index ffc86df..0bd49de 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -1,6 +1,7 @@ package proj import ( + "bytes" "net/http" "strings" "time" @@ -14,15 +15,15 @@ import ( // TestShard to delete func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { + data := []byte(strings.Repeat("01234567890123456789", 1e6)) + // Writing start := time.Now() - s, err := sharder.NewWriter(c, "test") + err := sharder.Writer(c, "test", data) if err != nil { c.Errorf("Writer Error: %s", err) return } - s.Write([]byte(strings.Repeat("01234567890123456789", 1e6))) - s.Close() c.Infof("Write took: %v", time.Since(start)) // Reading @@ -32,7 +33,7 @@ func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { c.Errorf("Reader Error: %s", err) return } - c.Infof("Data Length: %d", read.Len()) + c.Infof("Data Length: %d; isSame: %v", len(read), bytes.Equal(read, data)) c.Infof("Read took: %v", time.Since(start)) } diff --git a/app/helpers/sharder/reader.go b/app/helpers/sharder/reader.go index b9a77c8..29c9950 100644 --- a/app/helpers/sharder/reader.go +++ b/app/helpers/sharder/reader.go @@ -1,7 +1,6 @@ package sharder import ( - "bytes" "sync" "appengine" @@ -9,7 +8,7 @@ import ( ) // Reader creates a new shard reader to retrieve data from datastore -func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { +func Reader(c appengine.Context, name string) ([]byte, error) { if name == "" { return nil, ErrInvalidName } @@ -19,7 +18,7 @@ func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { panic(err) return nil, err } - shards := (master.Size-1)/divisor + 1 + shards := numShards(master.Size) var wg sync.WaitGroup wg.Add(shards) @@ -41,6 +40,5 @@ func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { }(i) } wg.Wait() - - return bytes.NewBuffer(data), nil + return data, nil } diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index ba30d7a..b14fc55 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -8,6 +8,11 @@ import ( "appengine/datastore" ) +// TODO: datastore.RunInTransaction +// TODO: delete existing shards greater than current +// TODO: don't panic and actually use error chans +// TODO: possibly use put and get multi for up to 10MB + const ( masterKind = "shard-master" shardKind = "shard-pieces" @@ -25,6 +30,10 @@ func shardKey(c appengine.Context, name string, idx int) *datastore.Key { return datastore.NewKey(c, shardKind, fmt.Sprintf("%s-%d", name, idx), 0, nil) } +func numShards(size int) int { + return (size-1)/divisor + 1 +} + type shardMaster struct { Size int `datastore:"size"` } diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go index e33e6b0..d9aef2e 100644 --- a/app/helpers/sharder/writer.go +++ b/app/helpers/sharder/writer.go @@ -1,74 +1,39 @@ package sharder import ( - 
"bytes" - "errors" "sync" "appengine" "appengine/datastore" ) -// NewWriter creates a new Sharder to write sharded data to datastore -func NewWriter(c appengine.Context, name string) (*Writer, error) { +// Writer shards and stores a byte String +func Writer(c appengine.Context, name string, data []byte) error { if name == "" { - return nil, ErrInvalidName + return ErrInvalidName } - return &Writer{ - ctx: c, - buff: bytes.NewBufferString(""), - name: name, - }, nil -} - -// Writer is the item that deals with writing sharded data -type Writer struct { - buff *bytes.Buffer - ctx appengine.Context - name string -} - -// Write pushed p bytes to underlying data stream. -func (w *Writer) Write(p []byte) (n int, err error) { - if w.buff == nil { - return 0, errors.New("Buffer is closed") - } - return w.buff.Write(p) -} -// Close finishes off the current buffer, shards and stores the data. -// Once Close is called, the user may call Key to get the key of the stored object. -func (w *Writer) Close() error { - // TODO: datastore.RunInTransaction - // TODO: delete existing shards greater than current - - length := w.buff.Len() - shards := (length-1)/divisor + 1 - key := masterKey(w.ctx, w.name) + master := shardMaster{len(data)} + shards := numShards(master.Size) // Store shardMaster - master := shardMaster{ - Size: length, - } - if _, err := datastore.Put(w.ctx, key, &master); err != nil { - panic(err) + if _, err := datastore.Put(c, masterKey(c, name), &master); err != nil { return err } // shard data and store shards - data := w.buff.Bytes() var wg sync.WaitGroup wg.Add(shards) for i := 0; i < shards; i++ { go func(i int) { - shardKey := shardKey(w.ctx, w.name, i) + shardKey := shardKey(c, name, i) shardData := data[i*divisor:] if len(shardData) > divisor { shardData = data[:divisor] } s := shard{shardData} // w.ctx.Infof("Inn Data %d: %q", i, s.Data) - if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { + if _, err := datastore.Put(c, shardKey, &s); err != nil { panic(err) } wg.Done() @@ -76,6 +41,5 @@ func (w *Writer) Close() error { } wg.Wait() - w.buff = nil return nil } From 2c19198a89de06718f043ba8721464476ebb230e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 01:36:41 -0700 Subject: [PATCH 20/26] Attempting to add a unit test --- app/helpers/sharder/sharder_test.go | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 app/helpers/sharder/sharder_test.go diff --git a/app/helpers/sharder/sharder_test.go b/app/helpers/sharder/sharder_test.go new file mode 100644 index 0000000..a66928f --- /dev/null +++ b/app/helpers/sharder/sharder_test.go @@ -0,0 +1,51 @@ +package sharder + +import ( + "bytes" + "strings" + "testing" + + "appengine/aetest" +) + +func TestFullCircle(t *testing.T) { + // TODO: verify 20 shards + + c, err := aetest.NewContext(nil) + if err != nil { + t.Fatal(err) + } + defer c.Close() + + data := []byte(strings.Repeat("01234567890123456789", 1e6)) + + // Writing + err = Writer(c, "test", data) + if err != nil { + t.Fatal(err) + } + + // Reading + read, err := Reader(c, "test") + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(read, data) { + t.Fail() + } +} + +var test bool + +func BenchmarkFullCycle(b *testing.B) { + c, _ := aetest.NewContext(nil) + defer c.Close() + data := []byte(strings.Repeat("1", 1e6)) + + for i := 0; i < b.N; i++ { + Writer(c, "test", data) + read, _ := Reader(c, "test") + test = bytes.Equal(read, data) + } +} From c6ef61e4236d8f88f401085ff7dbe4ae39971cd8 Mon Sep 17 00:00:00 2001 From: 
Nate Woods Date: Mon, 16 Nov 2015 22:59:19 -0700 Subject: [PATCH 21/26] Storing the serialized graph --- app/cron/proj/graph.go | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 0bd49de..108a38a 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -45,7 +45,7 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { var post, ntag, nimg *graph.Node idx := 0 - + timeout := time.After(time.Second) g := graph.New(false) for idk := range getItems(c) { item = idk.(Item) @@ -61,19 +61,30 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { g.Connect(post, nimg, 0) } - // This is a SLOW/DEBUG only operation - if idx%2000 == 0 { - c.Infof("Current Duration (%v)", time.Since(start)) + // This is a DEBUG only operation + select { + case <-timeout: + c.Infof("Index: %d; Duration: %v", idx, time.Since(start)) + timeout = time.After(time.Second) + default: } idx++ } + c.Infof("End Loop: %d; Duration: %v", idx, time.Since(start)) // Write result bits, err := g.Bytes() if err != nil { c.Errorf("Error in Graph.Bytes: %v", err) } - w.Write(bits) + c.Infof("End Serialization: Len(%d); Duration: %v", len(bits), time.Since(start)) + + // Storage + if err := sharder.Writer(c, "graph", bits); err != nil { + c.Errorf("Writer Error: %s", err) + return + } + c.Infof("Write Complete; Duration: %v", time.Since(start)) // Count types of nodes binCtr := make(map[graph.NodeType]uint64) From 240c66a4514efdd4657c7265c891edf931dde01b Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 23:01:31 -0700 Subject: [PATCH 22/26] Adding adjacency duplicate checks --- app/cron/proj/graph/graph.go | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 65950ab..1395277 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -13,6 +13,7 @@ type Graph struct { s *SerialGraph nodes map[uint64]*Node // Optimal lookup with pointers goes here dupes map[NodeType]map[string]*Node // type > value > node + edges map[uint64]map[uint64]bool // Edge duplicate detection } // New creates a new Graph @@ -25,6 +26,7 @@ func New(isDirected bool) *Graph { }, nodes: make(map[uint64]*Node), dupes: make(map[NodeType]map[string]*Node), + edges: make(map[uint64]map[uint64]bool), } } @@ -60,17 +62,24 @@ func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { } // Connect connects nodes to and from with an edge of weight w -func (g *Graph) Connect(to, from *Node, weight int64) error { - // TODO: collision checks +func (g *Graph) Connect(from, to *Node, weight int64) error { if to == nil || from == nil { return errors.New("Cannot add edge to nil node") } - from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge - from.Weights = append(from.Weights, weight) - if !g.s.GetDirected() { // UnDirected edge (return trip) - to.Adjacent = append(to.Adjacent, *from.Id) - to.Weights = append(to.Weights, weight) + mm := g.edges[*from.Id] + if mm == nil { + mm = make(map[uint64]bool) + g.edges[*from.Id] = mm + } + if !mm[*to.Id] { + from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge + from.Weights = append(from.Weights, weight) + mm[*to.Id] = true + } + + if !g.s.GetDirected() && !g.edges[*to.Id][*from.Id] { // UnDirected edge (return trip) + g.Connect(to, from, weight) } return nil } @@ -104,6 +113,18 @@ func DecodeGraph(data []byte) 
(*Graph, error) { // Hydrate Graph from SerialGraph for _, node := range sg.Nodes { g.nodes[*node.Id] = node + + // initialize node adjacency map + mm := g.edges[*node.Id] + if mm == nil { + mm = make(map[uint64]bool) + g.edges[*node.Id] = mm + } + + // populate node adjacency map + for _, adjID := range node.GetAdjacent() { + mm[adjID] = true + } } return g, nil } From 6861dad09451343650f1baf2fd045c41c6499af7 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Wed, 18 Nov 2015 18:51:15 -0700 Subject: [PATCH 23/26] Fixing shards to not leave dead data in storage on write --- app/helpers/sharder/sharder.go | 2 +- app/helpers/sharder/writer.go | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index b14fc55..4fc6316 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -39,5 +39,5 @@ type shardMaster struct { } type shard struct { - Data []byte + Data []byte `datastore:"data"` } diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go index d9aef2e..7d018e0 100644 --- a/app/helpers/sharder/writer.go +++ b/app/helpers/sharder/writer.go @@ -13,11 +13,18 @@ func Writer(c appengine.Context, name string, data []byte) error { return ErrInvalidName } - master := shardMaster{len(data)} - shards := numShards(master.Size) + // Attempt to get existing key + key := masterKey(c, name) + oldMaster := shardMaster{} + oldShards := 0 + if datastore.Get(c, key, &oldMaster) == nil { + oldShards = numShards(oldMaster.Size) + } // Store shardMaster - if _, err := datastore.Put(c, masterKey(c, name), &master); err != nil { + master := shardMaster{len(data)} + shards := numShards(master.Size) + if _, err := datastore.Put(c, key, &master); err != nil { return err } @@ -40,6 +47,15 @@ func Writer(c appengine.Context, name string, data []byte) error { }(i) } + // Delete shards that shouldn't be in datastore (write something smaller than before) + if oldShards > shards { + keys := make([]*datastore.Key, oldShards-shards) + for i := shards; i < oldShards; i++ { + keys[i-shards] = shardKey(c, name, i) + } + datastore.DeleteMulti(c, keys) + } + wg.Wait() return nil } From b0398f932125ee509fc0a95506265a72c337e36e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Wed, 18 Nov 2015 19:14:22 -0700 Subject: [PATCH 24/26] Fixing flatten problems --- app/cron/proj/proj.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/app/cron/proj/proj.go b/app/cron/proj/proj.go index 6e56ec3..525bfd9 100644 --- a/app/cron/proj/proj.go +++ b/app/cron/proj/proj.go @@ -1,6 +1,7 @@ package proj import ( + "bytes" "encoding/xml" "appengine" @@ -62,8 +63,15 @@ func flatten(c appengine.Context) chain.Worker { var xmlPage XMLPage var imgs []string + // Clean FormFeed characters from data + data := bytes.Replace(obj.([]byte), []byte("\u000C"), nil, -1) + + // Start up decoder + decoder := xml.NewDecoder(bytes.NewReader(data)) + decoder.Entity = xml.HTMLEntity + // Parse the XML of an object - if err := xml.Unmarshal(obj.([]byte), &xmlPage); err != nil { + if err := decoder.Decode(&xmlPage); err != nil { c.Errorf("Flatten %d: %v", idx, err) return } From 348600f675e5cb4d493cb99fad324f19b22b547d Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Thu, 19 Nov 2015 21:09:59 -0700 Subject: [PATCH 25/26] Adding edge weights when duped --- app/cron/proj/graph.go | 8 ++++---- app/cron/proj/graph/graph.go | 32 ++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 10 deletions(-) 
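Note on the hunks below: they bump the tag/img hookups from weight 0 to weight 1 and teach connect() to fold a repeated edge into the existing adjacency entry (adding to its weight) instead of appending a duplicate. A minimal sketch of the intended behaviour, using the API as it stands at this point in the series (*Node handles; the GUID and tag values here are made up):

    g := graph.New(false)
    post := g.Add("post-guid", graph.NodeType_POST, 1)
    tag := g.Add("funny", graph.NodeType_TAG, 1)
    g.Connect(post, tag, 1)
    g.Connect(post, tag, 1) // same pair again: the existing edge's weight becomes 2

The linear scan over from.Adjacent that this requires is flagged in the code as slow for highly connected nodes; the proto3 migration in the final patch replaces it with a map lookup.
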
diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 108a38a..5507f3c 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -52,13 +52,13 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { post = g.Add(item.GUID, graph.NodeType_POST, 0) for _, tag := range validTags(item.Tags) { - ntag = g.Add(tag, graph.NodeType_TAG, 0) - g.Connect(post, ntag, 0) + ntag = g.Add(tag, graph.NodeType_TAG, 1) + g.Connect(post, ntag, 1) } for _, img := range item.Imgs { - nimg = g.Add(img, graph.NodeType_IMG, 0) - g.Connect(post, nimg, 0) + nimg = g.Add(img, graph.NodeType_IMG, 1) + g.Connect(post, nimg, 1) } // This is a DEBUG only operation diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 1395277..08a9876 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -66,22 +66,34 @@ func (g *Graph) Connect(from, to *Node, weight int64) error { if to == nil || from == nil { return errors.New("Cannot add edge to nil node") } + g.connect(from, to, weight) // Directed edge + if !g.s.GetDirected() { + g.connect(to, from, weight) // UnDirected edge (return trip) + } + return nil +} +func (g *Graph) connect(from, to *Node, weight int64) { mm := g.edges[*from.Id] if mm == nil { mm = make(map[uint64]bool) g.edges[*from.Id] = mm } if !mm[*to.Id] { - from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge + from.Adjacent = append(from.Adjacent, *to.Id) from.Weights = append(from.Weights, weight) mm[*to.Id] = true + } else { + // This si SUPER SLOW for highly connected nodes. TODO: make this not suck + idx := 0 + for i, nodeID := range from.Adjacent { + if nodeID == *to.Id { + idx = i + break + } + } + from.Weights[idx] += weight } - - if !g.s.GetDirected() && !g.edges[*to.Id][*from.Id] { // UnDirected edge (return trip) - g.Connect(to, from, weight) - } - return nil } func (g *Graph) genNodeID() (id uint64) { @@ -114,6 +126,14 @@ func DecodeGraph(data []byte) (*Graph, error) { for _, node := range sg.Nodes { g.nodes[*node.Id] = node + // Initialize dupes map + nn := g.dupes[node.GetType()] + if nn == nil { + nn = make(map[string]*Node) + g.dupes[node.GetType()] = nn + } + nn[node.GetValue()] = node + // initialize node adjacency map mm := g.edges[*node.Id] if mm == nil { From 8ff6affb7be33d58fa4585876245cff0c84ea94b Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Thu, 19 Nov 2015 22:55:20 -0700 Subject: [PATCH 26/26] Migrating Protobuffers to v3.0.0-beta-1 All maps - no pointers - sweet!!! 
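For reference, the regenerated schema matches the Go struct fields below: SerialGraph.nodes is a map from node id (uint64) to Node, and Node.adjacent is a map from neighbour id (uint64) to edge weight (int64), replacing the parallel Adjacent/Weights slices. A short sketch (illustrative only, not code from the diff; g, post and img are placeholder names) of how that simplifies edge bookkeeping:

    n := g.Get(post)             // NodeID -> *Node lookup
    n.Adjacent[uint64(img)] += 1 // a duplicate edge simply accumulates weight

Add now hands back a NodeID value instead of a *Node, and the scalar fields (Value, Weight, Type) are plain values, so walking the graph no longer needs pointer dereferences.
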
--- app/cron/proj/graph.go | 4 +- app/cron/proj/graph/graph.go | 121 ++++++++++------------------- app/cron/proj/graph/graph.pb.go | 131 +++++++++++--------------------- app/cron/proj/graph/graph.proto | 17 ++--- app/cron/proj/graph/load.sh | 4 +- 5 files changed, 96 insertions(+), 181 deletions(-) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 5507f3c..ab7e7aa 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -42,7 +42,7 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() var item Item - var post, ntag, nimg *graph.Node + var post, ntag, nimg graph.NodeID idx := 0 timeout := time.After(time.Second) @@ -89,7 +89,7 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { // Count types of nodes binCtr := make(map[graph.NodeType]uint64) for _, node := range g.Nodes() { - binCtr[*node.Type]++ + binCtr[node.Type]++ } // Log out types of nodes diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 08a9876..4bcae7a 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -1,112 +1,89 @@ package graph -import ( - "errors" - - "github.com/golang/protobuf/proto" -) +import "errors" // TODO: add some graph processing functions +// NodeID is a graph identifier +type NodeID uint64 + // Graph is the serializable graph we have all been looking for type Graph struct { s *SerialGraph - nodes map[uint64]*Node // Optimal lookup with pointers goes here - dupes map[NodeType]map[string]*Node // type > value > node - edges map[uint64]map[uint64]bool // Edge duplicate detection + dupes map[NodeType]map[string]NodeID // type > value > node } // New creates a new Graph func New(isDirected bool) *Graph { return &Graph{ s: &SerialGraph{ - Nodes: make([]*Node, 0), - Directed: proto.Bool(isDirected), - NodeCount: proto.Uint64(0), + Nodes: make(map[uint64]*Node), + Directed: isDirected, + NodeCount: 0, }, - nodes: make(map[uint64]*Node), - dupes: make(map[NodeType]map[string]*Node), - edges: make(map[uint64]map[uint64]bool), + dupes: make(map[NodeType]map[string]NodeID), } } +// Get returns an associated node for a given ID +func (g *Graph) Get(id NodeID) *Node { + return g.s.Nodes[uint64(id)] +} + // Add creates and adds a node to the graph -func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { +func (g *Graph) Add(value string, ttype NodeType, weight int64) NodeID { // Check duplicate node (add weight) dupe := g.dupes[ttype][value] - if dupe != nil { - *dupe.Weight += weight + if dupe != 0 { + g.Get(dupe).Weight += weight return dupe } // Create new node + id := g.genNodeID() n := &Node{ - Id: proto.Uint64(g.genNodeID()), - Value: proto.String(value), - Weight: proto.Int64(weight), - Type: ttype.Enum(), - Adjacent: make([]uint64, 0), + Value: value, + Weight: weight, + Type: ttype, + Adjacent: make(map[uint64]int64, 0), } - g.nodes[*n.Id] = n - g.s.Nodes = append(g.s.Nodes, n) + g.s.Nodes[id] = n // Add dupe check to list dub, ok := g.dupes[ttype] if !ok { - dub = make(map[string]*Node) + dub = make(map[string]NodeID) g.dupes[ttype] = dub } - dub[value] = n - return n + nid := NodeID(id) + dub[value] = nid + return nid } // Connect connects nodes to and from with an edge of weight w -func (g *Graph) Connect(from, to *Node, weight int64) error { - if to == nil || from == nil { +func (g *Graph) Connect(from, to NodeID, weight int64) error { + if to == 0 || from == 0 { return errors.New("Cannot add edge to nil node") } - g.connect(from, to, weight) // 
Directed edge - if !g.s.GetDirected() { - g.connect(to, from, weight) // UnDirected edge (return trip) + g.Get(from).Adjacent[uint64(to)] += weight // Directed edge + if !g.s.Directed { + g.Get(to).Adjacent[uint64(from)] += weight // UnDirected edge (return trip) } return nil } -func (g *Graph) connect(from, to *Node, weight int64) { - mm := g.edges[*from.Id] - if mm == nil { - mm = make(map[uint64]bool) - g.edges[*from.Id] = mm - } - if !mm[*to.Id] { - from.Adjacent = append(from.Adjacent, *to.Id) - from.Weights = append(from.Weights, weight) - mm[*to.Id] = true - } else { - // This si SUPER SLOW for highly connected nodes. TODO: make this not suck - idx := 0 - for i, nodeID := range from.Adjacent { - if nodeID == *to.Id { - idx = i - break - } - } - from.Weights[idx] += weight - } -} - func (g *Graph) genNodeID() (id uint64) { - id = g.s.GetNodeCount() - *g.s.NodeCount++ + g.s.NodeCount++ + id = g.s.NodeCount return id } // Nodes returns all the nodes in the Graph func (g *Graph) Nodes() []*Node { - n := make([]*Node, len(g.nodes)) + n := make([]*Node, len(g.s.Nodes)) ctr := 0 - for _, node := range g.nodes { + for _, node := range g.s.Nodes { n[ctr] = node ctr++ } @@ -123,35 +100,19 @@ func DecodeGraph(data []byte) (*Graph, error) { g.s = sg // Hydrate Graph from SerialGraph - for _, node := range sg.Nodes { - g.nodes[*node.Id] = node - - // Initialize dupes map - nn := g.dupes[node.GetType()] + for id, node := range sg.Nodes { + nn := g.dupes[node.Type] if nn == nil { - nn = make(map[string]*Node) - g.dupes[node.GetType()] = nn - } - nn[node.GetValue()] = node - - // initialize node adjacency map - mm := g.edges[*node.Id] - if mm == nil { - mm = make(map[uint64]bool) - g.edges[*node.Id] = mm - } - - // populate node adjacency map - for _, adjID := range node.GetAdjacent() { - mm[adjID] = true + nn = make(map[string]NodeID) + g.dupes[node.Type] = nn } + nn[node.Value] = NodeID(id) } return g, nil } // Bytes flattens a graph to a flat file format func (g *Graph) Bytes() ([]byte, error) { - // TODO: use smaller numbers for encoding... return g.s.Bytes() } diff --git a/app/cron/proj/graph/graph.pb.go b/app/cron/proj/graph/graph.pb.go index 3d661a5..9e19a04 100644 --- a/app/cron/proj/graph/graph.pb.go +++ b/app/cron/proj/graph/graph.pb.go @@ -3,7 +3,7 @@ // DO NOT EDIT! /* -Package main is a generated protocol buffer package. +Package graph is a generated protocol buffer package. 
It is generated from these files: graph.proto @@ -48,116 +48,71 @@ var NodeType_value = map[string]int32{ "USER": 4, } -func (x NodeType) Enum() *NodeType { - p := new(NodeType) - *p = x - return p -} func (x NodeType) String() string { return proto.EnumName(NodeType_name, int32(x)) } -func (x *NodeType) UnmarshalJSON(data []byte) error { - value, err := proto.UnmarshalJSONEnum(NodeType_value, data, "NodeType") - if err != nil { - return err - } - *x = NodeType(value) - return nil -} +func (NodeType) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } type SerialGraph struct { - Nodes []*Node `protobuf:"bytes,1,rep,name=nodes" json:"nodes,omitempty"` - Directed *bool `protobuf:"varint,2,opt,name=directed,def=0" json:"directed,omitempty"` - NodeCount *uint64 `protobuf:"varint,3,req,name=nodeCount,def=0" json:"nodeCount,omitempty"` - XXX_unrecognized []byte `json:"-"` + Nodes map[uint64]*Node `protobuf:"bytes,1,rep,name=nodes" json:"nodes,omitempty" protobuf_key:"varint,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + Directed bool `protobuf:"varint,2,opt,name=directed" json:"directed,omitempty"` + NodeCount uint64 `protobuf:"varint,3,opt,name=nodeCount" json:"nodeCount,omitempty"` } -func (m *SerialGraph) Reset() { *m = SerialGraph{} } -func (m *SerialGraph) String() string { return proto.CompactTextString(m) } -func (*SerialGraph) ProtoMessage() {} +func (m *SerialGraph) Reset() { *m = SerialGraph{} } +func (m *SerialGraph) String() string { return proto.CompactTextString(m) } +func (*SerialGraph) ProtoMessage() {} +func (*SerialGraph) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } -const Default_SerialGraph_Directed bool = false -const Default_SerialGraph_NodeCount uint64 = 0 - -func (m *SerialGraph) GetNodes() []*Node { +func (m *SerialGraph) GetNodes() map[uint64]*Node { if m != nil { return m.Nodes } return nil } -func (m *SerialGraph) GetDirected() bool { - if m != nil && m.Directed != nil { - return *m.Directed - } - return Default_SerialGraph_Directed -} - -func (m *SerialGraph) GetNodeCount() uint64 { - if m != nil && m.NodeCount != nil { - return *m.NodeCount - } - return Default_SerialGraph_NodeCount -} - type Node struct { - Id *uint64 `protobuf:"varint,1,req,name=id" json:"id,omitempty"` - Value *string `protobuf:"bytes,2,req,name=value" json:"value,omitempty"` - Weight *int64 `protobuf:"varint,3,opt,name=weight" json:"weight,omitempty"` - Type *NodeType `protobuf:"varint,4,opt,name=type,enum=main.NodeType,def=0" json:"type,omitempty"` - Adjacent []uint64 `protobuf:"varint,5,rep,name=adjacent" json:"adjacent,omitempty"` - Weights []int64 `protobuf:"varint,6,rep,name=weights" json:"weights,omitempty"` - XXX_unrecognized []byte `json:"-"` + Value string `protobuf:"bytes,1,opt,name=value" json:"value,omitempty"` + Weight int64 `protobuf:"varint,2,opt,name=weight" json:"weight,omitempty"` + Type NodeType `protobuf:"varint,3,opt,name=type,enum=graph.NodeType" json:"type,omitempty"` + Adjacent map[uint64]int64 `protobuf:"bytes,4,rep,name=adjacent" json:"adjacent,omitempty" protobuf_key:"varint,1,opt,name=key" protobuf_val:"varint,2,opt,name=value"` } -func (m *Node) Reset() { *m = Node{} } -func (m *Node) String() string { return proto.CompactTextString(m) } -func (*Node) ProtoMessage() {} - -const Default_Node_Type NodeType = NodeType_UNKNOWN - -func (m *Node) GetId() uint64 { - if m != nil && m.Id != nil { - return *m.Id - } - return 0 -} +func (m *Node) Reset() { *m = Node{} } +func (m *Node) String() string { return 
proto.CompactTextString(m) } +func (*Node) ProtoMessage() {} +func (*Node) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} } -func (m *Node) GetValue() string { - if m != nil && m.Value != nil { - return *m.Value - } - return "" -} - -func (m *Node) GetWeight() int64 { - if m != nil && m.Weight != nil { - return *m.Weight - } - return 0 -} - -func (m *Node) GetType() NodeType { - if m != nil && m.Type != nil { - return *m.Type - } - return Default_Node_Type -} - -func (m *Node) GetAdjacent() []uint64 { +func (m *Node) GetAdjacent() map[uint64]int64 { if m != nil { return m.Adjacent } return nil } -func (m *Node) GetWeights() []int64 { - if m != nil { - return m.Weights - } - return nil -} - func init() { - proto.RegisterEnum("main.NodeType", NodeType_name, NodeType_value) + proto.RegisterEnum("graph.NodeType", NodeType_name, NodeType_value) +} + +var fileDescriptor0 = []byte{ + // 294 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x54, 0x91, 0x41, 0x4b, 0xc3, 0x40, + 0x10, 0x85, 0xdd, 0xee, 0xb6, 0x4d, 0x67, 0x69, 0x5d, 0xe7, 0x14, 0x03, 0x85, 0xd2, 0x53, 0x51, + 0x89, 0x10, 0x2f, 0x2a, 0x78, 0x28, 0x52, 0x82, 0x88, 0xa9, 0x98, 0x14, 0xcf, 0xb1, 0x59, 0xda, + 0x68, 0x49, 0x42, 0xdc, 0x2a, 0xf9, 0x2d, 0xde, 0xfc, 0xa5, 0x66, 0xd7, 0x2a, 0xe9, 0x2d, 0x33, + 0xef, 0x7d, 0xf3, 0x5e, 0x58, 0xe0, 0xab, 0x32, 0x2e, 0xd6, 0x6e, 0x51, 0xe6, 0x2a, 0xc7, 0xb6, + 0x19, 0xc6, 0x5f, 0x04, 0x78, 0x28, 0xcb, 0x34, 0xde, 0xf8, 0x7a, 0xc6, 0x33, 0x68, 0x67, 0x79, + 0x22, 0xdf, 0x6d, 0x32, 0xa2, 0x13, 0xee, 0x0d, 0xdd, 0x5f, 0xa6, 0x61, 0x71, 0x03, 0xad, 0xcf, + 0x32, 0x55, 0x56, 0x28, 0xc0, 0x4a, 0xd2, 0x52, 0x2e, 0x95, 0x4c, 0xec, 0xd6, 0x88, 0x4c, 0x2c, + 0x3c, 0x82, 0x9e, 0xe6, 0x6f, 0xf3, 0x6d, 0xa6, 0x6c, 0x5a, 0xaf, 0x98, 0x73, 0x05, 0xd0, 0x40, + 0x38, 0xd0, 0x37, 0x59, 0xd5, 0xe7, 0x6b, 0x09, 0x1d, 0x68, 0x7f, 0xc4, 0x9b, 0xad, 0x34, 0x30, + 0xf7, 0xf8, 0x2e, 0x4d, 0xdb, 0xaf, 0x5b, 0x97, 0x64, 0xfc, 0x4d, 0x80, 0xe9, 0x01, 0xfb, 0x7f, + 0x46, 0xcd, 0xf5, 0x70, 0x00, 0x9d, 0x4f, 0x99, 0xae, 0xd6, 0xca, 0x80, 0x14, 0x87, 0xc0, 0x54, + 0x55, 0x48, 0x13, 0x38, 0xf0, 0x0e, 0x1b, 0x67, 0xa2, 0x7a, 0x8d, 0xa7, 0x60, 0xc5, 0xc9, 0x6b, + 0xbc, 0x94, 0x75, 0x27, 0x66, 0xfe, 0xeb, 0xb8, 0x61, 0x71, 0xa7, 0x3b, 0xcd, 0x14, 0x74, 0xce, + 0xa1, 0xbf, 0xb7, 0xd8, 0x6f, 0xdc, 0x6f, 0x36, 0xa6, 0xba, 0xe4, 0xc9, 0x0d, 0x58, 0xff, 0x49, + 0x1c, 0xba, 0x8b, 0xe0, 0x3e, 0x98, 0x3f, 0x07, 0xe2, 0x00, 0x2d, 0x60, 0x8f, 0xf3, 0x30, 0x12, + 0x04, 0xbb, 0x40, 0xef, 0x1e, 0x7c, 0xd1, 0xd2, 0x1f, 0xd1, 0xd4, 0x17, 0x54, 0x6b, 0x8b, 0x70, + 0xf6, 0x24, 0xd8, 0x4b, 0xc7, 0xbc, 0xc7, 0xc5, 0x4f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xbf, 0x47, + 0x15, 0xe3, 0x9e, 0x01, 0x00, 0x00, } diff --git a/app/cron/proj/graph/graph.proto b/app/cron/proj/graph/graph.proto index d5693dd..3d9bee5 100644 --- a/app/cron/proj/graph/graph.proto +++ b/app/cron/proj/graph/graph.proto @@ -1,9 +1,10 @@ +syntax = "proto3"; package graph; message SerialGraph { - repeated Node nodes = 1; - optional bool directed = 2 [default = false]; - required uint64 nodeCount = 3 [default = 0]; + map nodes = 1; + bool directed = 2; + uint64 nodeCount = 3; } enum NodeType { @@ -15,10 +16,8 @@ enum NodeType { } message Node { - required uint64 id = 1; - required string value = 2; - optional int64 weight = 3; - optional NodeType type = 4 [default = UNKNOWN]; - repeated uint64 adjacent = 5; - repeated int64 weights = 6; + string value = 1; + int64 weight = 2; + NodeType type = 3; + map adjacent = 4; } diff 
--git a/app/cron/proj/graph/load.sh b/app/cron/proj/graph/load.sh index 4ebec6a..b45f7ee 100755 --- a/app/cron/proj/graph/load.sh +++ b/app/cron/proj/graph/load.sh @@ -1,9 +1,9 @@ #!/bin/sh - +set -e +export PATH=$PATH:$GOPATH/bin if ! which proto >/dev/null; then echo "Installing proto and protoc-gen-go" go get -u github.com/golang/protobuf/{proto,protoc-gen-go} - export PATH=$PATH:$GOPATH/bin else echo "Proto and protoc-gen-go already installed" fi
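
Taken together, the series ends with a pipeline of graph build -> proto3 serialization (gzip-compressed) -> sharded datastore storage. A self-contained sketch of that round trip, using only the APIs introduced above; roundTrip is a hypothetical helper (not in the repo), the node values are made up, and the "graph" shard name is the one the /proj/graph handler writes to:

    package proj

    import (
        "appengine"

        "github.com/bign8/chive-show/app/cron/proj/graph"
        "github.com/bign8/chive-show/app/helpers/sharder"
    )

    // roundTrip is a sketch, not part of the series: build a small graph,
    // serialize it (proto3 + gzip), shard it into datastore, then read it
    // back and decode it.
    func roundTrip(c appengine.Context) error {
        g := graph.New(false)
        post := g.Add("post-guid", graph.NodeType_POST, 1)
        tag := g.Add("funny", graph.NodeType_TAG, 1)
        g.Connect(post, tag, 1) // errors only for the zero NodeID sentinel

        bits, err := g.Bytes()
        if err != nil {
            return err
        }
        if err := sharder.Writer(c, "graph", bits); err != nil {
            return err
        }

        data, err := sharder.Reader(c, "graph")
        if err != nil {
            return err
        }
        g2, err := graph.DecodeGraph(data)
        if err != nil {
            return err
        }
        c.Infof("round-tripped %d nodes", len(g2.Nodes()))
        return nil
    }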