From ca2b4677039aa25e872da1d6a9229cf04e2ece64 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 7 Sep 2015 17:38:11 -0600 Subject: [PATCH 01/26] Renaming JsonPostResponse.go -> JsonResponse.go --- app/api/{JsonPostResponse.go => JsonResponse.go} | 1 + app/api/api.go | 1 - 2 files changed, 1 insertion(+), 1 deletion(-) rename app/api/{JsonPostResponse.go => JsonResponse.go} (92%) diff --git a/app/api/JsonPostResponse.go b/app/api/JsonResponse.go similarity index 92% rename from app/api/JsonPostResponse.go rename to app/api/JsonResponse.go index f3a748d..ac4d16a 100644 --- a/app/api/JsonPostResponse.go +++ b/app/api/JsonResponse.go @@ -36,6 +36,7 @@ func (res *JsonResponse) write(w http.ResponseWriter) error { } else { out = string(str_items) } + w.Header().Set("Content-Type", "application/json; charset=utf-8") fmt.Fprint(w, out) return err } diff --git a/app/api/api.go b/app/api/api.go index 75f8174..51c5baf 100644 --- a/app/api/api.go +++ b/app/api/api.go @@ -25,7 +25,6 @@ func get_url_count(url *url.URL) int { // Actual API functions func random(c appengine.Context, w http.ResponseWriter, r *http.Request) { - w.Header().Set("Content-Type", "application/json; charset=utf-8") count := get_url_count(r.URL) c.Infof("Requested %v random posts", count) result := NewJsonResponse(500, "Unknown Error", nil) From 3a5f3244f7664cff1be4452da33124a08aa6e3ed Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 7 Sep 2015 18:04:52 -0600 Subject: [PATCH 02/26] Begin spliting cron into multiple phases --- app/cron/crawler.go | 89 ++++++++++++++++++++++++++++++++++++++++++++ app/cron/cron.go | 1 + app/cron/parser.go | 63 +++++++++++++++++++++++++++++++ app/models/models.go | 5 ++- 4 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 app/cron/crawler.go create mode 100644 app/cron/parser.go diff --git a/app/cron/crawler.go b/app/cron/crawler.go new file mode 100644 index 0000000..c5dfbcd --- /dev/null +++ b/app/cron/crawler.go @@ -0,0 +1,89 @@ +package cron + +import ( + // "app/models" + // "app/helpers/keycache" + "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + "appengine/urlfetch" + "encoding/xml" + "encoding/json" + "fmt" + "net/http" +) + +// Sourcer: this is a source for defered work chains + +type ChivePost struct { + KEY string `xml:"guid"` + XML string `xml:",innerxml"` +} + +type ChivePostMiner struct { + Item ChivePost `xml:"channel>item"` +} + + +func crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { + url := page_url(0) + + // Get Response + c.Infof("Parsing index 0 (%v)", url) + resp, err := urlfetch.Client(c).Get(url) + if err != nil { + fmt.Fprint(w, "client error") + return + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + fmt.Fprint(w, "unexpected error code") + } + + // Decode Response + var feed []ChivePostMiner + decoder := xml.NewDecoder(resp.Body) + if err := decoder.Decode(&feed); err != nil { + c.Errorf("decode error %v", err) + fmt.Fprint(w, "decode error") + return + } + + feed[0].Item.XML = "" + feed[0].Item.XML + "" + + c.Infof("Something %v", feed) + + // TODO: store all items to datastore + + + // DEBUGGING ONLY.... 
HERE DOWN + + post, err := parseData(feed[0].Item.XML) + if err != nil { + c.Errorf("error parsing %v", err) + return + } + + // JSONIFY Response + str_items, err := json.MarshalIndent(&post, "", " ") + var out string + if err != nil { + out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" + } else { + out = string(str_items) + } + w.Header().Set("Content-Type", "application/json; charset=utf-8") + fmt.Fprint(w, out) +} + + +type FeedCrawler struct { + context appengine.Context + client *http.Client +} + +func (fc *FeedCrawler) Init(c appengine.Context) { + fc.context = c + fc.client = urlfetch.Client(c) +} diff --git a/app/cron/cron.go b/app/cron/cron.go index 7b92c57..f7ef0ca 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -25,6 +25,7 @@ const ( ) func Init() { + http.Handle("/cron/crawl", appstats.NewHandler(crawl)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) } diff --git a/app/cron/parser.go b/app/cron/parser.go new file mode 100644 index 0000000..e869203 --- /dev/null +++ b/app/cron/parser.go @@ -0,0 +1,63 @@ +package cron + +import ( + // "app/models" + // "app/helpers/keycache" + // "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + // "appengine/urlfetch" + "encoding/xml" + // "encoding/json" + // "fmt" + // "net/http" + "html/template" +) + +type Node struct { + // XML string `xml:",innerxml"` + // ATTR []string + // DATA string `xml:",chardata"` + XMLName xml.Name + XMLAttrs []xml.Attr `xml:",any"` + DATA string `xml:",chardata"` +} + +type Post struct { + Guid string `xml:"guid"` + Tags []string `xml:"category"` + Link string `xml:"link"` + Date string `xml:"pubDate"` + Title string `xml:"title"` + Creator string `xml:"creator"` + Media []Img `xml:"content"` + CommentRSS string `xml:"commentRss"` + Comment []string `xml:"comments"` + Desc template.HTML `xml:"description"` + Enclosure struct { + Url string `xml:"url,attr"` + Children []Node `xml:",any"` + } `xml:"enclosure"` + Thumbnail struct { + Url string `xml:"url,attr"` + Children []Node `xml:",any"` + } `xml:"thumbnail"` + Children []Node `xml:",any"` + Content template.HTML `xml:"encoded"` +} + +type Img struct { + Url string `xml:"url,attr"` + Title string `xml:"title"` + Rating string `xml:"rating"` + Category string `xml:"category"` +} + +// Worker: this will be a worker on defered work chains + +func parseData(data string) (*Post, error) { + var post Post + err := xml.Unmarshal([]byte(data), &post) + return &post, err +} diff --git a/app/models/models.go b/app/models/models.go index b7e363e..9795add 100644 --- a/app/models/models.go +++ b/app/models/models.go @@ -1,3 +1,6 @@ package models -const DB_POST_TABLE = "PostNew" +const ( + DB_POST_TABLE = "PostNew" + DB_RAW_XML_POST_TABLE = "RawXMLPosts" +) From c2f57667eb31b3d76188b2471b02da26322a041e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 7 Sep 2015 23:24:23 -0600 Subject: [PATCH 03/26] Breaking crawler up into pieces --- app/cron/crawler/Batcher.go | 23 +++++ app/cron/crawler/FeedCrawler.go | 137 ++++++++++++++++++++++++++++++ app/cron/{ => crawler}/crawler.go | 60 +++++++------ app/cron/cron.go | 4 +- 4 files changed, 197 insertions(+), 27 deletions(-) create mode 100644 app/cron/crawler/Batcher.go create mode 100644 app/cron/crawler/FeedCrawler.go rename app/cron/{ => crawler}/crawler.go (51%) diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go new file mode 100644 index 0000000..1c68a88 --- 
/dev/null +++ b/app/cron/crawler/Batcher.go @@ -0,0 +1,23 @@ +package crawler + +func Batcher(in <-chan ChivePost, batch_size int) <-chan []ChivePost { + out := make(chan []ChivePost) + go func() { + defer close(out) + batch := make([]ChivePost, batch_size) + count := 0 + for post := range in { + batch[count] = post + count++ + if count >= batch_size { + count = 0 + out <- batch + batch = make([]ChivePost, batch_size) // allocate another chunk of memory + } + } + if count > 0 { + out <- batch[:count] + } + }() + return out +} diff --git a/app/cron/crawler/FeedCrawler.go b/app/cron/crawler/FeedCrawler.go new file mode 100644 index 0000000..57a3af4 --- /dev/null +++ b/app/cron/crawler/FeedCrawler.go @@ -0,0 +1,137 @@ +package crawler + +import ( + // "app/models" + // "app/helpers/keycache" + "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + "appengine/urlfetch" + // "encoding/xml" + // "fmt" + "net/http" + "strconv" +) + +var ( + DEBUG = true + DEBUG_DEPTH = 1 +) + +func NewFeedCrawler(c appengine.Context) *FeedCrawler { + return &FeedCrawler{ + context: c, + client: urlfetch.Client(c), + results: make(chan ChivePost), + } +} + +type FeedCrawler struct { + context appengine.Context + client *http.Client + + todo []int + guids map[string]bool // this could be extremely large + results chan ChivePost +} + +func (fc *FeedCrawler) StartSearch() <-chan ChivePost { + go func() { + defer close(fc.results) + for i := 0; i < 99; i++ { + fc.results <- ChivePost{KEY:"asdf", XML:strconv.Itoa(i)} + } + // fc.search(1, -1) + }() + return fc.results +} + +func (fc *FeedCrawler) addRange(bot, top int) { + // TODO: isn't there a better way to perform this operation!? + for i := bot + 1; i < top; i++ { + fc.todo = append(fc.todo, i) + } +} + +// func (fc *FeedCrawler) search(bot, top int) (err error) { +// /* +// def infinite_length(bottom=1, top=-1): +// if bottom == 1 and not item_exists(1): return 0 # Starting edge case +// if bottom == top - 1: return bottom # Result found! (top doesn’t exist) +// if top < 0: # Searching forward +// top = bottom << 1 # Base 2 hops +// if item_exists(top): +// top, bottom = -1, top # continue searching forward +// else: # Binary search between bottom and top +// middle = (bottom + top) // 2 +// bottom, top = middle, top if item_exists(middle) else bottom, middle +// return infinite_length(bottom, top) # Tail recursion!!! +// */ +// if bot == top - 1 { +// fc.context.Infof("TOP OF RANGE FOUND! @%d", top) +// fc.addRange(bot, top) +// return nil +// } +// var full_stop, is_stop bool = false, false +// if top < 0 { // Searching forward +// top = bot << 1 // Base 2 hops forward +// is_stop, full_stop, err = fc.isStop(top) +// if err != nil { +// return err +// } +// if !is_stop { +// fc.addRange(bot, top) +// top, bot = -1, top +// } +// } else { // Binary search between top and bottom +// mid := (bot + top) / 2 +// is_stop, full_stop, err = fc.isStop(mid) +// if err != nil { +// return err +// } +// if is_stop { +// top = mid +// } else { +// fc.addRange(bot, mid) +// bot = mid +// } +// } +// if full_stop { +// return nil +// } +// return fc.search(bot, top) // TAIL RECURSION!!! 
+// } +// +// func (fc *FeedCrawler) isStop(idx int) (is_stop, full_stop bool, err error) { +// // Gather posts as necessary +// posts, err := fc.getAndParseFeed(idx) +// if err == FeedParse404Error { +// fc.context.Infof("Reached the end of the feed list (%v)", idx) +// return true, false, nil +// } +// if err != nil { +// fc.context.Errorf("Error decoding ChiveFeed: %s", err) +// return false, false, err +// } +// +// // Check for Duplicates +// store_count := 0 +// for _, post := range posts { +// id, _, err := guidToInt(post.Guid) +// if x.guids[id] || err != nil { +// continue +// } +// store_count += 1 +// } +// fc.posts = append(fc.posts, posts...) +// +// // Use store_count info to determine if isStop +// is_stop = store_count == 0 || DEBUG +// full_stop = len(posts) != store_count && store_count > 0 +// if DEBUG { +// is_stop = idx > DEBUG_DEPTH +// full_stop = idx == DEBUG_DEPTH +// } +// return +// } diff --git a/app/cron/crawler.go b/app/cron/crawler/crawler.go similarity index 51% rename from app/cron/crawler.go rename to app/cron/crawler/crawler.go index c5dfbcd..fd8f710 100644 --- a/app/cron/crawler.go +++ b/app/cron/crawler/crawler.go @@ -1,4 +1,4 @@ -package cron +package crawler import ( // "app/models" @@ -9,7 +9,6 @@ import ( // "appengine/taskqueue" "appengine/urlfetch" "encoding/xml" - "encoding/json" "fmt" "net/http" ) @@ -25,8 +24,12 @@ type ChivePostMiner struct { Item ChivePost `xml:"channel>item"` } +func page_url(idx int) string { + return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) +} + -func crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { +func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { url := page_url(0) // Get Response @@ -59,31 +62,36 @@ func crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { // DEBUGGING ONLY.... 
HERE DOWN - post, err := parseData(feed[0].Item.XML) - if err != nil { - c.Errorf("error parsing %v", err) - return - } - - // JSONIFY Response - str_items, err := json.MarshalIndent(&post, "", " ") - var out string - if err != nil { - out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" - } else { - out = string(str_items) - } - w.Header().Set("Content-Type", "application/json; charset=utf-8") - fmt.Fprint(w, out) + // post, err := parseData(feed[0].Item.XML) + // if err != nil { + // c.Errorf("error parsing %v", err) + // return + // } + // + // // JSONIFY Response + // str_items, err := json.MarshalIndent(&post, "", " ") + // var out string + // if err != nil { + // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" + // } else { + // out = string(str_items) + // } + // w.Header().Set("Content-Type", "application/json; charset=utf-8") + // fmt.Fprint(w, out) } - -type FeedCrawler struct { - context appengine.Context - client *http.Client +func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { + crawler := NewFeedCrawler(c) + found_posts := crawler.StartSearch() + batch_posts := Batcher(found_posts, 20) + Storage(batch_posts, c) } -func (fc *FeedCrawler) Init(c appengine.Context) { - fc.context = c - fc.client = urlfetch.Client(c) +func Storage(in <-chan []ChivePost, c appengine.Context) { + go func() { + for batch := range in { + fmt.Println(batch) + c.Infof("Storing %v", batch) + } + }() } diff --git a/app/cron/cron.go b/app/cron/cron.go index f7ef0ca..4f7f119 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -1,6 +1,7 @@ package cron import ( + "app/cron/crawler" "app/models" "app/helpers/keycache" "appengine" @@ -25,7 +26,8 @@ const ( ) func Init() { - http.Handle("/cron/crawl", appstats.NewHandler(crawl)) + http.Handle("/cron/crawl", appstats.NewHandler(crawler.Crawl)) + http.Handle("/cron/crawl2", appstats.NewHandler(crawler.Crawl2)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) } From e70b32956ac68088b67796363f70ec13943fabdb Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 14:32:07 -0700 Subject: [PATCH 04/26] Macking app work with new version of appengine builder --- app.yaml => yaml/app.yaml | 14 +++++--------- cron.yaml => yaml/cron.yaml | 1 + index.yaml => yaml/index.yaml | 0 yaml/main.go | 16 ++++++++++++++++ yaml/module-cron.yaml | 22 ++++++++++++++++++++++ 5 files changed, 44 insertions(+), 9 deletions(-) rename app.yaml => yaml/app.yaml (59%) rename cron.yaml => yaml/cron.yaml (85%) rename index.yaml => yaml/index.yaml (100%) create mode 100644 yaml/main.go create mode 100644 yaml/module-cron.yaml diff --git a/app.yaml b/yaml/app.yaml similarity index 59% rename from app.yaml rename to yaml/app.yaml index ebcc2cc..6efa065 100644 --- a/app.yaml +++ b/yaml/app.yaml @@ -8,19 +8,15 @@ skip_files: handlers: - url: /static - static_dir: static + static_dir: ../static - url: / - static_files: static/index.html - upload: static/index.html + static_files: ../static/index.html + upload: ../static/index.html - url: /(favicon\.ico|index\.html) - static_files: static/\1 - upload: static/(favicon\.ico|index\.html) - -- url: /cron/.* - script: _go_app - login: admin + static_files: ../static/\1 + upload: ../static/(favicon\.ico|index\.html) - url: /.* script: _go_app diff --git a/cron.yaml b/yaml/cron.yaml similarity index 85% rename from cron.yaml rename to yaml/cron.yaml index 407d25e..08dbe64 100644 --- 
a/cron.yaml +++ b/yaml/cron.yaml @@ -2,3 +2,4 @@ cron: - description: Parse feeds from source url: /cron/parse schedule: every 6 hours + target: cron diff --git a/index.yaml b/yaml/index.yaml similarity index 100% rename from index.yaml rename to yaml/index.yaml diff --git a/yaml/main.go b/yaml/main.go new file mode 100644 index 0000000..6cd9712 --- /dev/null +++ b/yaml/main.go @@ -0,0 +1,16 @@ +package main + +import ( + "net/http" + + "github.com/bign8/chive-show/app/api" + "github.com/bign8/chive-show/app/cron" +) + +func init() { + http.HandleFunc("/", http.NotFound) // Default Handler + + // Setup Other routes routes + api.Init() + cron.Init() +} diff --git a/yaml/module-cron.yaml b/yaml/module-cron.yaml new file mode 100644 index 0000000..a70f2c3 --- /dev/null +++ b/yaml/module-cron.yaml @@ -0,0 +1,22 @@ +application: crucial-alpha-706 +module: cron +version: uno +runtime: go +api_version: go1 +instance_class: B1 +basic_scaling: + max_instances: 1 + idle_timeout: 30m + +skip_files: +- test/* + +handlers: +- url: /cron/.* + script: _go_app + login: admin + +error_handlers: + - file: err/default.html + - error_code: over_quota + file: err/over_quota.html From dd9dd11325050784cae822239b127461b1b68b1e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 14:33:28 -0700 Subject: [PATCH 05/26] Breaking up cron into (fetcher, dePager, parser, batcher, saver) [in progress] --- app/cron/crawler/Batcher.go | 41 ++++---- app/cron/crawler/FeedCrawler.go | 137 --------------------------- app/cron/crawler/Fetcher.go | 155 +++++++++++++++++++++++++++++++ app/cron/crawler/UnPager.go | 20 ++++ app/cron/crawler/crawler.go | 159 ++++++++++++++++---------------- app/cron/cron.go | 3 + 6 files changed, 279 insertions(+), 236 deletions(-) delete mode 100644 app/cron/crawler/FeedCrawler.go create mode 100644 app/cron/crawler/Fetcher.go create mode 100644 app/cron/crawler/UnPager.go diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index 1c68a88..8bbaf2a 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -1,23 +1,24 @@ package crawler -func Batcher(in <-chan ChivePost, batch_size int) <-chan []ChivePost { - out := make(chan []ChivePost) - go func() { - defer close(out) - batch := make([]ChivePost, batch_size) - count := 0 - for post := range in { - batch[count] = post - count++ - if count >= batch_size { - count = 0 - out <- batch - batch = make([]ChivePost, batch_size) // allocate another chunk of memory - } - } - if count > 0 { - out <- batch[:count] - } - }() - return out +// Batcher takes input and batches to given sizes +func Batcher(in <-chan string, size int) <-chan []string { + out := make(chan []string) + go func() { + defer close(out) + batch := make([]string, size) + count := 0 + for post := range in { + batch[count] = post + count++ + if count >= size { + count = 0 + out <- batch + batch = make([]string, size) // allocate another chunk of memory + } + } + if count > 0 { + out <- batch[:count] + } + }() + return out } diff --git a/app/cron/crawler/FeedCrawler.go b/app/cron/crawler/FeedCrawler.go deleted file mode 100644 index 57a3af4..0000000 --- a/app/cron/crawler/FeedCrawler.go +++ /dev/null @@ -1,137 +0,0 @@ -package crawler - -import ( - // "app/models" - // "app/helpers/keycache" - "appengine" - // "appengine/datastore" - // "appengine/delay" - // "appengine/taskqueue" - "appengine/urlfetch" - // "encoding/xml" - // "fmt" - "net/http" - "strconv" -) - -var ( - DEBUG = true - DEBUG_DEPTH = 1 -) - -func NewFeedCrawler(c 
appengine.Context) *FeedCrawler { - return &FeedCrawler{ - context: c, - client: urlfetch.Client(c), - results: make(chan ChivePost), - } -} - -type FeedCrawler struct { - context appengine.Context - client *http.Client - - todo []int - guids map[string]bool // this could be extremely large - results chan ChivePost -} - -func (fc *FeedCrawler) StartSearch() <-chan ChivePost { - go func() { - defer close(fc.results) - for i := 0; i < 99; i++ { - fc.results <- ChivePost{KEY:"asdf", XML:strconv.Itoa(i)} - } - // fc.search(1, -1) - }() - return fc.results -} - -func (fc *FeedCrawler) addRange(bot, top int) { - // TODO: isn't there a better way to perform this operation!? - for i := bot + 1; i < top; i++ { - fc.todo = append(fc.todo, i) - } -} - -// func (fc *FeedCrawler) search(bot, top int) (err error) { -// /* -// def infinite_length(bottom=1, top=-1): -// if bottom == 1 and not item_exists(1): return 0 # Starting edge case -// if bottom == top - 1: return bottom # Result found! (top doesn’t exist) -// if top < 0: # Searching forward -// top = bottom << 1 # Base 2 hops -// if item_exists(top): -// top, bottom = -1, top # continue searching forward -// else: # Binary search between bottom and top -// middle = (bottom + top) // 2 -// bottom, top = middle, top if item_exists(middle) else bottom, middle -// return infinite_length(bottom, top) # Tail recursion!!! -// */ -// if bot == top - 1 { -// fc.context.Infof("TOP OF RANGE FOUND! @%d", top) -// fc.addRange(bot, top) -// return nil -// } -// var full_stop, is_stop bool = false, false -// if top < 0 { // Searching forward -// top = bot << 1 // Base 2 hops forward -// is_stop, full_stop, err = fc.isStop(top) -// if err != nil { -// return err -// } -// if !is_stop { -// fc.addRange(bot, top) -// top, bot = -1, top -// } -// } else { // Binary search between top and bottom -// mid := (bot + top) / 2 -// is_stop, full_stop, err = fc.isStop(mid) -// if err != nil { -// return err -// } -// if is_stop { -// top = mid -// } else { -// fc.addRange(bot, mid) -// bot = mid -// } -// } -// if full_stop { -// return nil -// } -// return fc.search(bot, top) // TAIL RECURSION!!! -// } -// -// func (fc *FeedCrawler) isStop(idx int) (is_stop, full_stop bool, err error) { -// // Gather posts as necessary -// posts, err := fc.getAndParseFeed(idx) -// if err == FeedParse404Error { -// fc.context.Infof("Reached the end of the feed list (%v)", idx) -// return true, false, nil -// } -// if err != nil { -// fc.context.Errorf("Error decoding ChiveFeed: %s", err) -// return false, false, err -// } -// -// // Check for Duplicates -// store_count := 0 -// for _, post := range posts { -// id, _, err := guidToInt(post.Guid) -// if x.guids[id] || err != nil { -// continue -// } -// store_count += 1 -// } -// fc.posts = append(fc.posts, posts...) 
-// -// // Use store_count info to determine if isStop -// is_stop = store_count == 0 || DEBUG -// full_stop = len(posts) != store_count && store_count > 0 -// if DEBUG { -// is_stop = idx > DEBUG_DEPTH -// full_stop = idx == DEBUG_DEPTH -// } -// return -// } diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go new file mode 100644 index 0000000..89a3c10 --- /dev/null +++ b/app/cron/crawler/Fetcher.go @@ -0,0 +1,155 @@ +package crawler + +import ( + "fmt" + "io/ioutil" + "net/http" + + "appengine" + "appengine/urlfetch" +) + +const ( + // DEBUG enable if troubleshooting algorithm + DEBUG = true + + // DEPTH depth of feed mining + DEPTH = 3 +) + +func pageURL(idx int) string { + return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) +} + +// Fetcher returns stream of un-processed xml posts +func Fetcher(c appengine.Context) <-chan string { + res := make(chan string) + worker := &fetcher{ + res: res, + context: c, + client: urlfetch.Client(c), + } + go worker.Main() + return res +} + +type fetcher struct { + res chan<- string + context appengine.Context + client *http.Client + todo chan int +} + +func (x *fetcher) Main() error { + // Check first item edge case + if isStop, err := x.isStop(1); isStop || err != nil { + x.context.Infof("Fetcher: Finished without recursive searching %v", err) + return err + } + + // Defer as many todo workers as necessary + x.todo = make(chan int) + go x.processTODO() + return x.Search(1, -1) +} + +func (x *fetcher) Search(bottom, top int) (err error) { + /* + def infinite_length(bottom=1, top=-1): + if bottom == 1 and not item_exists(1): return 0 # Starting edge case + if bottom == top - 1: return bottom # Result found! (top doesn’t exist) + if top < 0: # Searching forward + top = bottom << 1 # Base 2 hops + if item_exists(top): + top, bottom = -1, top # continue searching forward + else: # Binary search between bottom and top + middle = (bottom + top) // 2 + bottom, top = middle, top if item_exists(middle) else bottom, middle + return infinite_length(bottom, top) # Tail recursion!!! + */ + if bottom == top-1 { + x.context.Infof("Fetcher: TOP OF RANGE FOUND! @%d", top) + x.addRange(bottom, top) + close(x.res) + return nil + } + x.context.Infof("Fetcher: Search(%d, %d)", bottom, top) + var isStop = false + + // Searching forward + if top < 0 { + top = bottom << 1 // Base 2 hops forward + isStop, err = x.isStop(top) + if err != nil { + close(x.res) + return err + } + if !isStop { + x.addRange(bottom, top) + top, bottom = -1, top + } + + // Binary search between top and bottom + } else { + middle := (bottom + top) / 2 + isStop, err = x.isStop(middle) + if err != nil { + close(x.res) + return err + } + if isStop { + top = middle + } else { + x.addRange(bottom, middle) + bottom = middle + } + } + return x.Search(bottom, top) // TAIL RECURSION!!! 
+} + +func (x *fetcher) isStop(idx int) (isStop bool, err error) { + + // Gather posts as necessary + url := pageURL(idx) + x.context.Infof("Fetcher: Fetching %s", url) + resp, err := x.client.Get(url) + if err != nil { + x.context.Errorf("Fetcher: Error decoding ChiveFeed: %s", err) + return true, err + } + defer resp.Body.Close() + + // Check Response Codes for non-200 responses + if resp.StatusCode != 200 { + if resp.StatusCode == 404 { + x.context.Infof("Fetcher: Reached the end of the feed list (%v)", idx) + return true, nil + } + return true, fmt.Errorf("Fetcher: Feed parcing received a %d Status Code on (%s)", resp.StatusCode, url) + } + + // Pull response content into String + contents, err := ioutil.ReadAll(resp.Body) + if err != nil { + return true, err + } + x.res <- string(contents) + + // Use store_count info to determine if isStop + if DEBUG { + isStop = idx >= DEPTH + } + return isStop, nil +} + +func (x *fetcher) addRange(bottom, top int) { + for i := bottom + 1; i < top; i++ { + x.todo <- i + } +} + +func (x *fetcher) processTODO() { + for idx := range x.todo { + x.isStop(idx) + } +} diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go new file mode 100644 index 0000000..fdbb5d0 --- /dev/null +++ b/app/cron/crawler/UnPager.go @@ -0,0 +1,20 @@ +package crawler + +import "appengine" + +// UnPager process pages of posts to individual posts +func UnPager(c appengine.Context, pages <-chan string) <-chan string { + res := make(chan string) + go runUnPager(c, pages, res) + return res +} + +func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { + defer close(out) + + for page := range in { + c.Infof("Retrieved Page %s", page) + + // TODO: decompress page + } +} diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index fd8f710..5e240b8 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -1,97 +1,98 @@ package crawler import ( - // "app/models" - // "app/helpers/keycache" - "appengine" - // "appengine/datastore" - // "appengine/delay" - // "appengine/taskqueue" - "appengine/urlfetch" - "encoding/xml" - "fmt" - "net/http" + // "app/models" + // "app/helpers/keycache" + "appengine" + // "appengine/datastore" + // "appengine/delay" + // "appengine/taskqueue" + "encoding/xml" + "fmt" + "net/http" + + "appengine/urlfetch" ) // Sourcer: this is a source for defered work chains -type ChivePost struct { - KEY string `xml:"guid"` - XML string `xml:",innerxml"` +type chivePost struct { + KEY string `xml:"guid"` + XML string `xml:",innerxml"` } -type ChivePostMiner struct { - Item ChivePost `xml:"channel>item"` +type chivePostMiner struct { + Item []chivePost `xml:"channel>item"` } -func page_url(idx int) string { - return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) -} - - func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { - url := page_url(0) - - // Get Response - c.Infof("Parsing index 0 (%v)", url) - resp, err := urlfetch.Client(c).Get(url) - if err != nil { - fmt.Fprint(w, "client error") - return - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - fmt.Fprint(w, "unexpected error code") - } - - // Decode Response - var feed []ChivePostMiner - decoder := xml.NewDecoder(resp.Body) - if err := decoder.Decode(&feed); err != nil { - c.Errorf("decode error %v", err) - fmt.Fprint(w, "decode error") - return - } - - feed[0].Item.XML = "" + feed[0].Item.XML + "" - - c.Infof("Something %v", feed) - - // TODO: store all items to datastore - - - // DEBUGGING ONLY.... 
HERE DOWN - - // post, err := parseData(feed[0].Item.XML) - // if err != nil { - // c.Errorf("error parsing %v", err) - // return - // } - // - // // JSONIFY Response - // str_items, err := json.MarshalIndent(&post, "", " ") - // var out string - // if err != nil { - // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" - // } else { - // out = string(str_items) - // } - // w.Header().Set("Content-Type", "application/json; charset=utf-8") - // fmt.Fprint(w, out) + url := pageURL(9999) + + // Get Response + c.Infof("Parsing index 0 (%v)", url) + resp, err := urlfetch.Client(c).Get(url) + if err != nil { + fmt.Fprint(w, "client error") + return + } + defer resp.Body.Close() + if resp.StatusCode != 200 { + fmt.Fprint(w, "unexpected error code") + } + + // Decode Response + var feed chivePostMiner + decoder := xml.NewDecoder(resp.Body) + if err := decoder.Decode(&feed); err != nil { + c.Errorf("decode error %v", err) + fmt.Fprint(w, "decode error") + return + } + + // Wrap posts in xml + for idx, post := range feed.Item { + feed.Item[idx].XML = "" + post.XML + "" + } + + c.Infof("Something %v", feed) + + // TODO: store all items to datastore + + // DEBUGGING ONLY.... HERE DOWN + + // post, err := parseData(feed[0].Item.XML) + // if err != nil { + // c.Errorf("error parsing %v", err) + // return + // } + // + // // JSONIFY Response + // str_items, err := json.MarshalIndent(&post, "", " ") + // var out string + // if err != nil { + // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" + // } else { + // out = string(str_items) + // } + // w.Header().Set("Content-Type", "application/json; charset=utf-8") + // fmt.Fprint(w, out) } func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { - crawler := NewFeedCrawler(c) - found_posts := crawler.StartSearch() - batch_posts := Batcher(found_posts, 20) - Storage(batch_posts, c) + pages := Fetcher(c) + for _ = range pages { + c.Infof("Found page") + } + // posts := UnPager(c, pages) + // batch := Batcher(posts, 20) + // Storage(c, batch) } -func Storage(in <-chan []ChivePost, c appengine.Context) { - go func() { - for batch := range in { - fmt.Println(batch) - c.Infof("Storing %v", batch) - } - }() +func Storage(c appengine.Context, in <-chan []string) { + go func() { + for batch := range in { + fmt.Println(batch) + c.Infof("Storing %v", batch) + } + }() } diff --git a/app/cron/cron.go b/app/cron/cron.go index 08808b5..475c1e7 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -41,6 +41,9 @@ func Init() { http.Handle("/cron/crawl2", appstats.NewHandler(crawler.Crawl2)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) + http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { + + }) } var ( From 87aa042c7fa8f298eae768b86548acf03cdf6fb3 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 14:44:38 -0700 Subject: [PATCH 06/26] Cleaning up fetcher channel closures --- app/cron/crawler/Fetcher.go | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 89a3c10..650ce58 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -11,7 +11,7 @@ import ( const ( // DEBUG enable if troubleshooting algorithm - DEBUG = true + DEBUG = false // DEPTH depth of feed mining DEPTH = 3 @@ -41,6 +41,8 @@ type fetcher struct { } func (x *fetcher) Main() error { + defer close(x.res) + // 
Check first item edge case if isStop, err := x.isStop(1); isStop || err != nil { x.context.Infof("Fetcher: Finished without recursive searching %v", err) @@ -49,6 +51,7 @@ func (x *fetcher) Main() error { // Defer as many todo workers as necessary x.todo = make(chan int) + defer close(x.todo) go x.processTODO() return x.Search(1, -1) } @@ -70,7 +73,6 @@ func (x *fetcher) Search(bottom, top int) (err error) { if bottom == top-1 { x.context.Infof("Fetcher: TOP OF RANGE FOUND! @%d", top) x.addRange(bottom, top) - close(x.res) return nil } x.context.Infof("Fetcher: Search(%d, %d)", bottom, top) @@ -81,7 +83,6 @@ func (x *fetcher) Search(bottom, top int) (err error) { top = bottom << 1 // Base 2 hops forward isStop, err = x.isStop(top) if err != nil { - close(x.res) return err } if !isStop { @@ -94,7 +95,6 @@ func (x *fetcher) Search(bottom, top int) (err error) { middle := (bottom + top) / 2 isStop, err = x.isStop(middle) if err != nil { - close(x.res) return err } if isStop { @@ -150,6 +150,7 @@ func (x *fetcher) addRange(bottom, top int) { func (x *fetcher) processTODO() { for idx := range x.todo { - x.isStop(idx) + x.context.Infof("Fetcher: NOT processing TODO %d", idx) + //x.isStop(idx) } } From a64b79080e9ea9323357f2ff5c4d57ea6253e7b9 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 15:06:55 -0700 Subject: [PATCH 07/26] Finishing un-paginator --- app/cron/crawler/Fetcher.go | 4 ++-- app/cron/crawler/UnPager.go | 26 +++++++++++++++++++++++--- app/cron/crawler/crawler.go | 9 +++++---- 3 files changed, 30 insertions(+), 9 deletions(-) diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 650ce58..6cf8252 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -11,10 +11,10 @@ import ( const ( // DEBUG enable if troubleshooting algorithm - DEBUG = false + DEBUG = true // DEPTH depth of feed mining - DEPTH = 3 + DEPTH = 1 ) func pageURL(idx int) string { diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go index fdbb5d0..ef3a800 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -1,10 +1,16 @@ package crawler -import "appengine" +import ( + "encoding/xml" + + "appengine" +) // UnPager process pages of posts to individual posts func UnPager(c appengine.Context, pages <-chan string) <-chan string { res := make(chan string) + + // TODO: spin up as many unpages as desired go runUnPager(c, pages, res) return res } @@ -12,9 +18,23 @@ func UnPager(c appengine.Context, pages <-chan string) <-chan string { func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { defer close(out) + var miner struct { + Item []struct { + KEY string `xml:"guid"` + XML string `xml:",innerxml"` + } `xml:"channel>item"` + } + for page := range in { - c.Infof("Retrieved Page %s", page) + c.Infof("UnPager: Retrieved Page") + + if err := xml.Unmarshal([]byte(page), &miner); err != nil { + c.Errorf("UnPager: Error %s", err) + } - // TODO: decompress page + for _, post := range miner.Item { + c.Infof("UnPager: Found Post %s", post.KEY) + out <- post.XML + } } } diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 5e240b8..418c620 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -26,7 +26,7 @@ type chivePostMiner struct { } func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { - url := pageURL(9999) + url := pageURL(1) // Get Response c.Infof("Parsing index 0 (%v)", url) @@ -79,11 +79,12 @@ func Crawl(c appengine.Context, w 
http.ResponseWriter, r *http.Request) { } func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { + // fetcher, dePager, parser, batcher, saver pages := Fetcher(c) - for _ = range pages { - c.Infof("Found page") + posts := UnPager(c, pages) + for post := range posts { + c.Infof("Post: %v", post) } - // posts := UnPager(c, pages) // batch := Batcher(posts, 20) // Storage(c, batch) } From ad46e1cd8c71bdba49fe7c5135d06986a7bf9e1e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 1 Nov 2015 15:42:48 -0700 Subject: [PATCH 08/26] Storing crawled posts --- app/cron/crawler/Batcher.go | 8 ++-- app/cron/crawler/Fetcher.go | 8 ---- app/cron/crawler/Storage.go | 33 +++++++++++++ app/cron/crawler/UnPager.go | 11 +++-- app/cron/crawler/crawler.go | 93 +++++++------------------------------ app/cron/cron.go | 1 - 6 files changed, 60 insertions(+), 94 deletions(-) create mode 100644 app/cron/crawler/Storage.go diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index 8bbaf2a..a4a99f3 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -1,11 +1,11 @@ package crawler // Batcher takes input and batches to given sizes -func Batcher(in <-chan string, size int) <-chan []string { - out := make(chan []string) +func Batcher(in <-chan Data, size int) <-chan []Data { + out := make(chan []Data) go func() { defer close(out) - batch := make([]string, size) + batch := make([]Data, size) count := 0 for post := range in { batch[count] = post @@ -13,7 +13,7 @@ func Batcher(in <-chan string, size int) <-chan []string { if count >= size { count = 0 out <- batch - batch = make([]string, size) // allocate another chunk of memory + batch = make([]Data, size) // allocate another chunk of memory } } if count > 0 { diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 6cf8252..9d147c3 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -9,14 +9,6 @@ import ( "appengine/urlfetch" ) -const ( - // DEBUG enable if troubleshooting algorithm - DEBUG = true - - // DEPTH depth of feed mining - DEPTH = 1 -) - func pageURL(idx int) string { return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) } diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go new file mode 100644 index 0000000..b2a1346 --- /dev/null +++ b/app/cron/crawler/Storage.go @@ -0,0 +1,33 @@ +package crawler + +import ( + "appengine" + "appengine/datastore" +) + +func Storage(c appengine.Context, in <-chan []Data) { + runStorage(c, in) +} + +type Store struct { + XML []byte +} + +func runStorage(c appengine.Context, in <-chan []Data) { + var keys []*datastore.Key + var items []Store + for batch := range in { + keys = make([]*datastore.Key, len(batch)) + items = make([]Store, len(batch)) + for i, item := range batch { + keys[i] = datastore.NewKey(c, XML, item.KEY, 0, nil) + items[i].XML = []byte(item.XML) + } + + c.Infof("Storage: Storing %v", keys) + _, err := datastore.PutMulti(c, keys, items) + if err != nil { + c.Errorf("Storage: Error storing batch %s", err) + } + } +} diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go index ef3a800..f7ac41b 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -7,15 +7,15 @@ import ( ) // UnPager process pages of posts to individual posts -func UnPager(c appengine.Context, pages <-chan string) <-chan string { - res := make(chan string) +func UnPager(c appengine.Context, pages <-chan string) <-chan Data { + res := make(chan Data) // TODO: spin up as many 
unpages as desired go runUnPager(c, pages, res) return res } -func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { +func runUnPager(c appengine.Context, in <-chan string, out chan<- Data) { defer close(out) var miner struct { @@ -34,7 +34,10 @@ func runUnPager(c appengine.Context, in <-chan string, out chan<- string) { for _, post := range miner.Item { c.Infof("UnPager: Found Post %s", post.KEY) - out <- post.XML + out <- Data{ + KEY: post.KEY, + XML: post.XML, + } } } } diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 418c620..5227b3f 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -7,93 +7,32 @@ import ( // "appengine/datastore" // "appengine/delay" // "appengine/taskqueue" - "encoding/xml" + "fmt" "net/http" - - "appengine/urlfetch" ) -// Sourcer: this is a source for defered work chains - -type chivePost struct { - KEY string `xml:"guid"` - XML string `xml:",innerxml"` -} - -type chivePostMiner struct { - Item []chivePost `xml:"channel>item"` -} - -func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { - url := pageURL(1) - - // Get Response - c.Infof("Parsing index 0 (%v)", url) - resp, err := urlfetch.Client(c).Get(url) - if err != nil { - fmt.Fprint(w, "client error") - return - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - fmt.Fprint(w, "unexpected error code") - } - - // Decode Response - var feed chivePostMiner - decoder := xml.NewDecoder(resp.Body) - if err := decoder.Decode(&feed); err != nil { - c.Errorf("decode error %v", err) - fmt.Fprint(w, "decode error") - return - } - - // Wrap posts in xml - for idx, post := range feed.Item { - feed.Item[idx].XML = "" + post.XML + "" - } - - c.Infof("Something %v", feed) +const ( + // DEBUG enable if troubleshooting algorithm + DEBUG = false - // TODO: store all items to datastore + // DEPTH depth of feed mining + DEPTH = 1 - // DEBUGGING ONLY.... 
HERE DOWN + // XML name of where xml posts are stored + XML = "xml" +) - // post, err := parseData(feed[0].Item.XML) - // if err != nil { - // c.Errorf("error parsing %v", err) - // return - // } - // - // // JSONIFY Response - // str_items, err := json.MarshalIndent(&post, "", " ") - // var out string - // if err != nil { - // out = "{\"status\":\"error\",\"code\":500,\"data\":null,\"msg\":\"Error marshaling data\"}" - // } else { - // out = string(str_items) - // } - // w.Header().Set("Content-Type", "application/json; charset=utf-8") - // fmt.Fprint(w, out) +type Data struct { + KEY string + XML string } -func Crawl2(c appengine.Context, w http.ResponseWriter, r *http.Request) { +func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { // fetcher, dePager, parser, batcher, saver pages := Fetcher(c) posts := UnPager(c, pages) - for post := range posts { - c.Infof("Post: %v", post) - } - // batch := Batcher(posts, 20) - // Storage(c, batch) -} - -func Storage(c appengine.Context, in <-chan []string) { - go func() { - for batch := range in { - fmt.Println(batch) - c.Infof("Storing %v", batch) - } - }() + batch := Batcher(posts, 50) + Storage(c, batch) + fmt.Fprint(w, "Crawl Complete!") } diff --git a/app/cron/cron.go b/app/cron/cron.go index 475c1e7..aa23eca 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -38,7 +38,6 @@ const ( // Init initializes cron handlers func Init() { http.Handle("/cron/crawl", appstats.NewHandler(crawler.Crawl)) - http.Handle("/cron/crawl2", appstats.NewHandler(crawler.Crawl2)) http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { From 8dca8c35c8546ade6f7abfe8dcb02afa3e08b22e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Tue, 10 Nov 2015 00:49:08 -0700 Subject: [PATCH 09/26] Splitting up cron into multiple parts --- app/cron/crawler/Batcher.go | 7 ++++-- app/cron/crawler/Fetcher.go | 49 ++++++++++++++++++++++++++----------- app/cron/crawler/Storage.go | 19 +++++++++++--- app/cron/crawler/UnPager.go | 27 ++++++++++++++------ app/cron/crawler/crawler.go | 36 ++++++++++++++++++++++----- app/cron/cron.go | 6 +++-- 6 files changed, 108 insertions(+), 36 deletions(-) diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index a4a99f3..3ff3d77 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -1,8 +1,10 @@ package crawler +import "appengine" + // Batcher takes input and batches to given sizes -func Batcher(in <-chan Data, size int) <-chan []Data { - out := make(chan []Data) +func Batcher(c appengine.Context, in <-chan Data, size int) <-chan []Data { + out := make(chan []Data, 10000) go func() { defer close(out) batch := make([]Data, size) @@ -16,6 +18,7 @@ func Batcher(in <-chan Data, size int) <-chan []Data { batch = make([]Data, size) // allocate another chunk of memory } } + c.Infof("Batcher: Finished Batching") if count > 0 { out <- batch[:count] } diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index 9d147c3..c43c9a5 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -4,6 +4,8 @@ import ( "fmt" "io/ioutil" "net/http" + "sync" + "time" "appengine" "appengine/urlfetch" @@ -14,25 +16,25 @@ func pageURL(idx int) string { } // Fetcher returns stream of un-processed xml posts -func Fetcher(c appengine.Context) <-chan string { - res := make(chan string) +func Fetcher(c appengine.Context, workers int) <-chan Data { + res := make(chan 
Data, 100) worker := &fetcher{ res: res, context: c, client: urlfetch.Client(c), } - go worker.Main() + go worker.Main(workers) return res } type fetcher struct { - res chan<- string + res chan<- Data context appengine.Context client *http.Client todo chan int } -func (x *fetcher) Main() error { +func (x *fetcher) Main(workers int) error { defer close(x.res) // Check first item edge case @@ -42,10 +44,24 @@ func (x *fetcher) Main() error { } // Defer as many todo workers as necessary - x.todo = make(chan int) - defer close(x.todo) - go x.processTODO() - return x.Search(1, -1) + x.todo = make(chan int, 1000) + + // Number of batch fetchers to process + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + go func(idx int) { + x.processTODO() + wg.Done() + }(i) + } + wg.Add(workers) + + err := x.Search(1, -1) + + // wait for processTODOs to finish + wg.Wait() + x.context.Infof("Complete with FETCHING") + return err } func (x *fetcher) Search(bottom, top int) (err error) { @@ -65,6 +81,7 @@ func (x *fetcher) Search(bottom, top int) (err error) { if bottom == top-1 { x.context.Infof("Fetcher: TOP OF RANGE FOUND! @%d", top) x.addRange(bottom, top) + close(x.todo) return nil } x.context.Infof("Fetcher: Search(%d, %d)", bottom, top) @@ -106,8 +123,9 @@ func (x *fetcher) isStop(idx int) (isStop bool, err error) { x.context.Infof("Fetcher: Fetching %s", url) resp, err := x.client.Get(url) if err != nil { - x.context.Errorf("Fetcher: Error decoding ChiveFeed: %s", err) - return true, err + x.context.Errorf("Fetcher: Error decoding ChiveFeed (1s sleep): %s", err) + time.Sleep(time.Second) + return x.isStop(idx) // Tail recursion (this loop may get us into trouble) } defer resp.Body.Close() @@ -125,7 +143,10 @@ func (x *fetcher) isStop(idx int) (isStop bool, err error) { if err != nil { return true, err } - x.res <- string(contents) + x.res <- Data{ + KEY: url, + XML: string(contents), + } // Use store_count info to determine if isStop if DEBUG { @@ -142,7 +163,7 @@ func (x *fetcher) addRange(bottom, top int) { func (x *fetcher) processTODO() { for idx := range x.todo { - x.context.Infof("Fetcher: NOT processing TODO %d", idx) - //x.isStop(idx) + // x.context.Infof("Fetcher: NOT processing TODO %d", idx) + x.isStop(idx) } } diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go index b2a1346..6d62bbc 100644 --- a/app/cron/crawler/Storage.go +++ b/app/cron/crawler/Storage.go @@ -1,22 +1,33 @@ package crawler import ( + "sync" + "appengine" "appengine/datastore" ) -func Storage(c appengine.Context, in <-chan []Data) { - runStorage(c, in) +func Storage(c appengine.Context, in <-chan []Data, workers int) { + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + go func(x int) { + runStorage(c, in, x) + wg.Done() + }(i) + } + wg.Add(workers) + wg.Wait() } type Store struct { XML []byte } -func runStorage(c appengine.Context, in <-chan []Data) { +func runStorage(c appengine.Context, in <-chan []Data, x int) { var keys []*datastore.Key var items []Store for batch := range in { + c.Infof("Storage %d: Storing chunk", x) keys = make([]*datastore.Key, len(batch)) items = make([]Store, len(batch)) for i, item := range batch { @@ -24,7 +35,7 @@ func runStorage(c appengine.Context, in <-chan []Data) { items[i].XML = []byte(item.XML) } - c.Infof("Storage: Storing %v", keys) + // c.Infof("Storage: Storing %v", keys) _, err := datastore.PutMulti(c, keys, items) if err != nil { c.Errorf("Storage: Error storing batch %s", err) diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go 
index f7ac41b..2a3a21d 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -2,22 +2,33 @@ package crawler import ( "encoding/xml" + "sync" "appengine" ) // UnPager process pages of posts to individual posts -func UnPager(c appengine.Context, pages <-chan string) <-chan Data { - res := make(chan Data) +func UnPager(c appengine.Context, pages <-chan string, workers int) <-chan Data { + res := make(chan Data, 100000) // TODO: spin up as many unpages as desired - go runUnPager(c, pages, res) + var wg sync.WaitGroup + wg.Add(workers) + for i := 0; i < workers; i++ { + go func(x int) { + runUnPager(c, pages, res, x) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(res) + }() + return res } -func runUnPager(c appengine.Context, in <-chan string, out chan<- Data) { - defer close(out) - +func runUnPager(c appengine.Context, in <-chan string, out chan<- Data, idx int) { var miner struct { Item []struct { KEY string `xml:"guid"` @@ -26,14 +37,14 @@ func runUnPager(c appengine.Context, in <-chan string, out chan<- Data) { } for page := range in { - c.Infof("UnPager: Retrieved Page") + c.Infof("UnPager %d: Retrieved Page", idx) if err := xml.Unmarshal([]byte(page), &miner); err != nil { c.Errorf("UnPager: Error %s", err) } for _, post := range miner.Item { - c.Infof("UnPager: Found Post %s", post.KEY) + // c.Infof("UnPager: Found Post %s", post.KEY) out <- Data{ KEY: post.KEY, XML: post.XML, diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 5227b3f..8e322ce 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -3,8 +3,9 @@ package crawler import ( // "app/models" // "app/helpers/keycache" + "appengine" - // "appengine/datastore" + "appengine/datastore" // "appengine/delay" // "appengine/taskqueue" @@ -28,11 +29,34 @@ type Data struct { XML string } -func Crawl(c appengine.Context, w http.ResponseWriter, r *http.Request) { +func Crawl(w http.ResponseWriter, r *http.Request) { + c := appengine.NewContext(r) + + fetchers, storers := 50, 20 + // fetcher, dePager, parser, batcher, saver - pages := Fetcher(c) - posts := UnPager(c, pages) - batch := Batcher(posts, 50) - Storage(c, batch) + pages := Fetcher(c, fetchers) + // posts := UnPager(c, pages, pagers) + batch := Batcher(c, pages, 10) + Storage(c, batch, storers) + fmt.Fprint(w, "Crawl Complete!") } + +func Stats(c appengine.Context, w http.ResponseWriter, r *http.Request) { + + q := datastore.NewQuery("xml") + + var data []Store + keys, err := q.GetAll(c, &data) + if err != nil { + fmt.Fprintf(w, "Error %s", err) + return + } + + for idx, key := range keys { + fmt.Fprintf(w, "Data %s: len %d\n", key, len(data[idx].XML)) + } + + fmt.Fprintf(w, "Overall %d", len(data)) +} diff --git a/app/cron/cron.go b/app/cron/cron.go index aa23eca..4bc1439 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -37,11 +37,13 @@ const ( // Init initializes cron handlers func Init() { - http.Handle("/cron/crawl", appstats.NewHandler(crawler.Crawl)) + http.HandleFunc("/cron/crawl", crawler.Crawl) + http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) + http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) http.HandleFunc("/cron/delete", delete) http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { - + fmt.Fprintf(w, "Here boys") }) } From 8738e9a7a7e290de40fdea62adbe5b7951822e14 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sat, 14 Nov 2015 10:32:31 -0700 Subject: [PATCH 10/26] Doing a bad idea (storing all the things super flat) --- 
app/cron/crawler/Batcher.go | 8 +-- app/cron/crawler/Fetcher.go | 6 +-- app/cron/crawler/Miner.go | 76 ++++++++++++++++++++++++++ app/cron/crawler/Storage.go | 103 +++++++++++++++++++++++++++++++++--- app/cron/crawler/UnPager.go | 31 ++++++++++- app/cron/crawler/crawler.go | 7 ++- app/cron/cron.go | 32 +++++++++-- 7 files changed, 242 insertions(+), 21 deletions(-) create mode 100644 app/cron/crawler/Miner.go diff --git a/app/cron/crawler/Batcher.go b/app/cron/crawler/Batcher.go index 3ff3d77..720b28e 100644 --- a/app/cron/crawler/Batcher.go +++ b/app/cron/crawler/Batcher.go @@ -3,11 +3,11 @@ package crawler import "appengine" // Batcher takes input and batches to given sizes -func Batcher(c appengine.Context, in <-chan Data, size int) <-chan []Data { - out := make(chan []Data, 10000) +func Batcher(c appengine.Context, in <-chan interface{}, size int) <-chan []interface{} { + out := make(chan []interface{}, 10000) go func() { defer close(out) - batch := make([]Data, size) + batch := make([]interface{}, size) count := 0 for post := range in { batch[count] = post @@ -15,7 +15,7 @@ func Batcher(c appengine.Context, in <-chan Data, size int) <-chan []Data { if count >= size { count = 0 out <- batch - batch = make([]Data, size) // allocate another chunk of memory + batch = make([]interface{}, size) // allocate another chunk of memory } } c.Infof("Batcher: Finished Batching") diff --git a/app/cron/crawler/Fetcher.go b/app/cron/crawler/Fetcher.go index c43c9a5..9afd088 100644 --- a/app/cron/crawler/Fetcher.go +++ b/app/cron/crawler/Fetcher.go @@ -16,8 +16,8 @@ func pageURL(idx int) string { } // Fetcher returns stream of un-processed xml posts -func Fetcher(c appengine.Context, workers int) <-chan Data { - res := make(chan Data, 100) +func Fetcher(c appengine.Context, workers int) <-chan interface{} { + res := make(chan interface{}, 100) worker := &fetcher{ res: res, context: c, @@ -28,7 +28,7 @@ func Fetcher(c appengine.Context, workers int) <-chan Data { } type fetcher struct { - res chan<- Data + res chan<- interface{} context appengine.Context client *http.Client todo chan int diff --git a/app/cron/crawler/Miner.go b/app/cron/crawler/Miner.go new file mode 100644 index 0000000..3a8133b --- /dev/null +++ b/app/cron/crawler/Miner.go @@ -0,0 +1,76 @@ +package crawler + +import ( + "encoding/xml" + "sync" + "time" + + "appengine" +) + +// Vertex of the graph +type Vertex struct { + Type string + Value string + Count int64 +} + +// Edge of the graph +type Edge struct { + Nodes []string +} + +// Miner takes posts and mines out a graph +func Miner(c appengine.Context, posts <-chan Data, workers int) (<-chan interface{}, <-chan interface{}) { + vertexes := make(chan interface{}, 10000) + edges := make(chan interface{}, 10000) + + var wg sync.WaitGroup + for i := 0; i < workers; i++ { + go func(i int) { + miner(c, posts, vertexes, edges, i) + wg.Done() + }(i) + } + wg.Add(workers) + + go func() { + wg.Wait() + close(vertexes) + close(edges) + }() + return vertexes, edges +} + +func miner(c appengine.Context, posts <-chan Data, vertexes chan<- interface{}, edges chan<- interface{}, i int) { + var data struct { + Tags []string `xml:"category"` + Imgs []struct { + URL string `xml:"url,attr"` + } `xml:"content"` + } + + for post := range posts { + vertexes <- Vertex{"Pst", post.KEY, 0} + + // log.Printf("Miner %d: Got Post: %s", i, post.KEY) + // log.Printf("Data: %s", post.XML) + + if err := xml.Unmarshal([]byte(""+post.XML+""), &data); err != nil { + c.Errorf("Miner %d: Error %s", i, err) + } + + for _, 
tag := range data.Tags { + // log.Printf("Found Tag: %s", tag) + vertexes <- Vertex{"Tag", tag, 0} + edges <- Edge{[]string{"Tag" + tag, "Pst" + post.KEY}} + } + + for _, img := range data.Imgs { + // log.Printf("Found Img: %s", img.URL) + vertexes <- Vertex{"Img", img.URL, 0} + edges <- Edge{[]string{"Img" + img.URL, "Pst" + post.KEY}} + } + time.Sleep(time.Second) + } +} diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go index 6d62bbc..2e803c6 100644 --- a/app/cron/crawler/Storage.go +++ b/app/cron/crawler/Storage.go @@ -7,11 +7,23 @@ import ( "appengine/datastore" ) -func Storage(c appengine.Context, in <-chan []Data, workers int) { +// Storage push items to datastore +func Storage(c appengine.Context, in <-chan []interface{}, workers int, loc string) { + var store func(c appengine.Context, in <-chan []interface{}, x int, loc string) + + switch loc { + case XML: + store = runStorageData + case "vertex": + store = runStorageVertex + case "edge": + store = runStorageEdge + } + var wg sync.WaitGroup for i := 0; i < workers; i++ { go func(x int) { - runStorage(c, in, x) + store(c, in, x, loc) wg.Done() }(i) } @@ -19,26 +31,103 @@ func Storage(c appengine.Context, in <-chan []Data, workers int) { wg.Wait() } +// Puller pull items from datastore +// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) +func Puller(c appengine.Context, loc string) <-chan string { + out := make(chan string, 10000) + + go func() { + defer close(out) + q := datastore.NewQuery(loc) + t := q.Run(c) + for { + var s Store + _, err := t.Next(&s) + if err == datastore.Done { + break // No further entities match the query. + } + if err != nil { + c.Errorf("fetching next Person: %v", err) + break + } + + // Do something with Person p and Key k + out <- string(s.XML) + } + }() + return out +} + +// Store single xml item to put in storage type Store struct { XML []byte } -func runStorage(c appengine.Context, in <-chan []Data, x int) { +func runStorageData(c appengine.Context, in <-chan []interface{}, x int, loc string) { var keys []*datastore.Key var items []Store + for batch := range in { - c.Infof("Storage %d: Storing chunk", x) + c.Infof("Storage %d: Storing Post chunk", x) keys = make([]*datastore.Key, len(batch)) items = make([]Store, len(batch)) for i, item := range batch { - keys[i] = datastore.NewKey(c, XML, item.KEY, 0, nil) - items[i].XML = []byte(item.XML) + x := item.(Data) + keys[i] = datastore.NewKey(c, loc, x.KEY, 0, nil) + items[i] = Store{[]byte(x.XML)} + } + + // c.Infof("Storage: Storing %v", keys) + _, err := datastore.PutMulti(c, keys, items) + if err != nil { + c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) + panic(err) + } + } +} + +func runStorageVertex(c appengine.Context, in <-chan []interface{}, x int, loc string) { + var keys []*datastore.Key + var items []Vertex + + for batch := range in { + c.Infof("Storage %d: Storing Vertex chunk", x) + keys = make([]*datastore.Key, len(batch)) + items = make([]Vertex, len(batch)) + for i, item := range batch { + x := item.(Vertex) + keys[i] = datastore.NewKey(c, loc, x.Type+":"+x.Value, 0, nil) + items[i] = x + } + + // c.Infof("Storage: Storing %v", keys) + _, err := datastore.PutMulti(c, keys, items) + if err != nil { + c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) + panic(err) + } + } +} + +func runStorageEdge(c appengine.Context, in <-chan []interface{}, x int, loc string) { + var keys []*datastore.Key + var items []Edge + + for batch := range in { + c.Infof("Storage %d: 
Storing Edge chunk", x) + keys = make([]*datastore.Key, len(batch)) + items = make([]Edge, len(batch)) + for i, item := range batch { + x := item.(Edge) + keys[i] = datastore.NewIncompleteKey(c, loc, nil) + items[i] = x } // c.Infof("Storage: Storing %v", keys) _, err := datastore.PutMulti(c, keys, items) if err != nil { - c.Errorf("Storage: Error storing batch %s", err) + c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) + panic(err) } } } diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go index 2a3a21d..7cf4168 100644 --- a/app/cron/crawler/UnPager.go +++ b/app/cron/crawler/UnPager.go @@ -2,11 +2,40 @@ package crawler import ( "encoding/xml" + "fmt" + "net/http" "sync" "appengine" ) +// UnPage unpage and flatten data from Crawling +func UnPage(w http.ResponseWriter, r *http.Request) { + c := appengine.NewContext(r) + + pages := Puller(c, XML) + posts := UnPager(c, pages, 10) + + vertexes, edges := Miner(c, posts, 30) + + vbatch := Batcher(c, vertexes, 100) + ebatch := Batcher(c, edges, 100) + + var wg sync.WaitGroup + wg.Add(2) + go func() { + Storage(c, vbatch, 10, "vertex") + wg.Done() + }() + go func() { + Storage(c, ebatch, 10, "edge") + wg.Done() + }() + wg.Wait() + + fmt.Fprintf(w, "Done") +} + // UnPager process pages of posts to individual posts func UnPager(c appengine.Context, pages <-chan string, workers int) <-chan Data { res := make(chan Data, 100000) @@ -37,7 +66,7 @@ func runUnPager(c appengine.Context, in <-chan string, out chan<- Data, idx int) } for page := range in { - c.Infof("UnPager %d: Retrieved Page", idx) + // c.Infof("UnPager %d: Retrieved Page", idx) if err := xml.Unmarshal([]byte(page), &miner); err != nil { c.Errorf("UnPager: Error %s", err) diff --git a/app/cron/crawler/crawler.go b/app/cron/crawler/crawler.go index 8e322ce..ca203ab 100644 --- a/app/cron/crawler/crawler.go +++ b/app/cron/crawler/crawler.go @@ -20,8 +20,11 @@ const ( // DEPTH depth of feed mining DEPTH = 1 - // XML name of where xml posts are stored + // XML name of where xml posts pages are stored XML = "xml" + + // POST name of where xml posts are stored + POST = "post" ) type Data struct { @@ -38,7 +41,7 @@ func Crawl(w http.ResponseWriter, r *http.Request) { pages := Fetcher(c, fetchers) // posts := UnPager(c, pages, pagers) batch := Batcher(c, pages, 10) - Storage(c, batch, storers) + Storage(c, batch, storers, XML) fmt.Fprint(w, "Crawl Complete!") } diff --git a/app/cron/cron.go b/app/cron/cron.go index 4bc1439..4612c8f 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -35,15 +35,39 @@ const ( DEFERRED = true ) +func cleanup(c appengine.Context, name string) error { + q := datastore.NewQuery(name).KeysOnly() + keys, err := q.GetAll(c, nil) + s := 100 + for len(keys) > 0 { + if len(keys) < 100 { + s = len(keys) + } + err = datastore.DeleteMulti(c, keys[:s]) + keys = keys[s:] + } + return err +} + // Init initializes cron handlers func Init() { - http.HandleFunc("/cron/crawl", crawler.Crawl) + http.HandleFunc("/cron/stage/1", crawler.Crawl) + http.HandleFunc("/cron/stage/2", crawler.UnPage) + http.HandleFunc("/cron/stage/2/clean", func(w http.ResponseWriter, r *http.Request) { + c := appengine.NewContext(r) + cleanup(c, "edge") + cleanup(c, "vertex") + cleanup(c, "post") + }) http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) - http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) - http.HandleFunc("/cron/delete", delete) + // http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) + // http.HandleFunc("/cron/delete", delete) 
http.HandleFunc("/_ah/start", func(w http.ResponseWriter, r *http.Request) { - fmt.Fprintf(w, "Here boys") + fmt.Fprintf(w, "Start") + }) + http.HandleFunc("/_ah/stop", func(w http.ResponseWriter, r *http.Request) { + fmt.Fprintf(w, "Stop") }) } From ac6a52f0ccb9de2fea1740a08f830987b0058661 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sat, 14 Nov 2015 17:02:57 -0700 Subject: [PATCH 11/26] Finding tags based on response data --- app/cron/chain/chain.go | 51 +++ app/cron/crawler/Miner.go | 76 ---- app/cron/crawler/Storage.go | 77 ---- app/cron/crawler/UnPager.go | 83 ---- app/cron/cron.go | 761 ++++++++++++++++++------------------ app/cron/proj/graph.go | 28 ++ app/cron/proj/proj.go | 118 ++++++ app/cron/proj/tags.go | 87 +++++ yaml/module-cron.yaml | 4 + 9 files changed, 667 insertions(+), 618 deletions(-) create mode 100644 app/cron/chain/chain.go delete mode 100644 app/cron/crawler/Miner.go delete mode 100644 app/cron/crawler/UnPager.go create mode 100644 app/cron/proj/graph.go create mode 100644 app/cron/proj/proj.go create mode 100644 app/cron/proj/tags.go diff --git a/app/cron/chain/chain.go b/app/cron/chain/chain.go new file mode 100644 index 0000000..67b235a --- /dev/null +++ b/app/cron/chain/chain.go @@ -0,0 +1,51 @@ +package chain + +import "sync" + +// Worker is a function designed to fan out and perform work on a piece of Data +type Worker func(in <-chan interface{}, out chan<- interface{}, idx int) + +// FanOut allows lengthy workers to fan out on chanel operations +func FanOut(count int, buff int, in <-chan interface{}, doIt Worker) <-chan interface{} { + out := make(chan interface{}, buff) + var wg sync.WaitGroup + wg.Add(count) + for i := 0; i < count; i++ { + go func(idx int) { + doIt(in, out, idx) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(out) + }() + return out +} + +// FanIn takes multiple chanels and pushes their results into a single channel +func FanIn(buff int, cs ...<-chan interface{}) <-chan interface{} { + var wg sync.WaitGroup + out := make(chan interface{}) + + // Start an output goroutine for each input channel in cs. output + // copies values from c to out until c is closed, then calls wg.Done. + output := func(c <-chan interface{}) { + for n := range c { + out <- n + } + wg.Done() + } + wg.Add(len(cs)) + for _, c := range cs { + go output(c) + } + + // Start a goroutine to close out once all the output goroutines are + // done. This must start after the wg.Add call. 
+ go func() { + wg.Wait() + close(out) + }() + return out +} diff --git a/app/cron/crawler/Miner.go b/app/cron/crawler/Miner.go deleted file mode 100644 index 3a8133b..0000000 --- a/app/cron/crawler/Miner.go +++ /dev/null @@ -1,76 +0,0 @@ -package crawler - -import ( - "encoding/xml" - "sync" - "time" - - "appengine" -) - -// Vertex of the graph -type Vertex struct { - Type string - Value string - Count int64 -} - -// Edge of the graph -type Edge struct { - Nodes []string -} - -// Miner takes posts and mines out a graph -func Miner(c appengine.Context, posts <-chan Data, workers int) (<-chan interface{}, <-chan interface{}) { - vertexes := make(chan interface{}, 10000) - edges := make(chan interface{}, 10000) - - var wg sync.WaitGroup - for i := 0; i < workers; i++ { - go func(i int) { - miner(c, posts, vertexes, edges, i) - wg.Done() - }(i) - } - wg.Add(workers) - - go func() { - wg.Wait() - close(vertexes) - close(edges) - }() - return vertexes, edges -} - -func miner(c appengine.Context, posts <-chan Data, vertexes chan<- interface{}, edges chan<- interface{}, i int) { - var data struct { - Tags []string `xml:"category"` - Imgs []struct { - URL string `xml:"url,attr"` - } `xml:"content"` - } - - for post := range posts { - vertexes <- Vertex{"Pst", post.KEY, 0} - - // log.Printf("Miner %d: Got Post: %s", i, post.KEY) - // log.Printf("Data: %s", post.XML) - - if err := xml.Unmarshal([]byte(""+post.XML+""), &data); err != nil { - c.Errorf("Miner %d: Error %s", i, err) - } - - for _, tag := range data.Tags { - // log.Printf("Found Tag: %s", tag) - vertexes <- Vertex{"Tag", tag, 0} - edges <- Edge{[]string{"Tag" + tag, "Pst" + post.KEY}} - } - - for _, img := range data.Imgs { - // log.Printf("Found Img: %s", img.URL) - vertexes <- Vertex{"Img", img.URL, 0} - edges <- Edge{[]string{"Img" + img.URL, "Pst" + post.KEY}} - } - time.Sleep(time.Second) - } -} diff --git a/app/cron/crawler/Storage.go b/app/cron/crawler/Storage.go index 2e803c6..a527a16 100644 --- a/app/cron/crawler/Storage.go +++ b/app/cron/crawler/Storage.go @@ -14,10 +14,6 @@ func Storage(c appengine.Context, in <-chan []interface{}, workers int, loc stri switch loc { case XML: store = runStorageData - case "vertex": - store = runStorageVertex - case "edge": - store = runStorageEdge } var wg sync.WaitGroup @@ -31,33 +27,6 @@ func Storage(c appengine.Context, in <-chan []interface{}, workers int, loc stri wg.Wait() } -// Puller pull items from datastore -// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) -func Puller(c appengine.Context, loc string) <-chan string { - out := make(chan string, 10000) - - go func() { - defer close(out) - q := datastore.NewQuery(loc) - t := q.Run(c) - for { - var s Store - _, err := t.Next(&s) - if err == datastore.Done { - break // No further entities match the query. 
- } - if err != nil { - c.Errorf("fetching next Person: %v", err) - break - } - - // Do something with Person p and Key k - out <- string(s.XML) - } - }() - return out -} - // Store single xml item to put in storage type Store struct { XML []byte @@ -85,49 +54,3 @@ func runStorageData(c appengine.Context, in <-chan []interface{}, x int, loc str } } } - -func runStorageVertex(c appengine.Context, in <-chan []interface{}, x int, loc string) { - var keys []*datastore.Key - var items []Vertex - - for batch := range in { - c.Infof("Storage %d: Storing Vertex chunk", x) - keys = make([]*datastore.Key, len(batch)) - items = make([]Vertex, len(batch)) - for i, item := range batch { - x := item.(Vertex) - keys[i] = datastore.NewKey(c, loc, x.Type+":"+x.Value, 0, nil) - items[i] = x - } - - // c.Infof("Storage: Storing %v", keys) - _, err := datastore.PutMulti(c, keys, items) - if err != nil { - c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) - panic(err) - } - } -} - -func runStorageEdge(c appengine.Context, in <-chan []interface{}, x int, loc string) { - var keys []*datastore.Key - var items []Edge - - for batch := range in { - c.Infof("Storage %d: Storing Edge chunk", x) - keys = make([]*datastore.Key, len(batch)) - items = make([]Edge, len(batch)) - for i, item := range batch { - x := item.(Edge) - keys[i] = datastore.NewIncompleteKey(c, loc, nil) - items[i] = x - } - - // c.Infof("Storage: Storing %v", keys) - _, err := datastore.PutMulti(c, keys, items) - if err != nil { - c.Errorf("Storage %d: Error %s: %v %v", x, err, keys, items) - panic(err) - } - } -} diff --git a/app/cron/crawler/UnPager.go b/app/cron/crawler/UnPager.go deleted file mode 100644 index 7cf4168..0000000 --- a/app/cron/crawler/UnPager.go +++ /dev/null @@ -1,83 +0,0 @@ -package crawler - -import ( - "encoding/xml" - "fmt" - "net/http" - "sync" - - "appengine" -) - -// UnPage unpage and flatten data from Crawling -func UnPage(w http.ResponseWriter, r *http.Request) { - c := appengine.NewContext(r) - - pages := Puller(c, XML) - posts := UnPager(c, pages, 10) - - vertexes, edges := Miner(c, posts, 30) - - vbatch := Batcher(c, vertexes, 100) - ebatch := Batcher(c, edges, 100) - - var wg sync.WaitGroup - wg.Add(2) - go func() { - Storage(c, vbatch, 10, "vertex") - wg.Done() - }() - go func() { - Storage(c, ebatch, 10, "edge") - wg.Done() - }() - wg.Wait() - - fmt.Fprintf(w, "Done") -} - -// UnPager process pages of posts to individual posts -func UnPager(c appengine.Context, pages <-chan string, workers int) <-chan Data { - res := make(chan Data, 100000) - - // TODO: spin up as many unpages as desired - var wg sync.WaitGroup - wg.Add(workers) - for i := 0; i < workers; i++ { - go func(x int) { - runUnPager(c, pages, res, x) - wg.Done() - }(i) - } - go func() { - wg.Wait() - close(res) - }() - - return res -} - -func runUnPager(c appengine.Context, in <-chan string, out chan<- Data, idx int) { - var miner struct { - Item []struct { - KEY string `xml:"guid"` - XML string `xml:",innerxml"` - } `xml:"channel>item"` - } - - for page := range in { - // c.Infof("UnPager %d: Retrieved Page", idx) - - if err := xml.Unmarshal([]byte(page), &miner); err != nil { - c.Errorf("UnPager: Error %s", err) - } - - for _, post := range miner.Item { - // c.Infof("UnPager: Found Post %s", post.KEY) - out <- Data{ - KEY: post.KEY, - XML: post.XML, - } - } - } -} diff --git a/app/cron/cron.go b/app/cron/cron.go index 4612c8f..ddeb2ff 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -1,41 +1,34 @@ package cron import ( - "encoding/xml" 
"fmt" "net/http" - "net/url" - "regexp" - "strconv" "github.com/bign8/chive-show/app/cron/crawler" - "github.com/bign8/chive-show/app/helpers/keycache" - "github.com/bign8/chive-show/app/models" + "github.com/bign8/chive-show/app/cron/proj" "gopkg.in/mjibson/v1/appstats" "appengine" "appengine/datastore" - "appengine/delay" - "appengine/taskqueue" - "appengine/urlfetch" ) -const ( - // SIZE of a batch - SIZE = 10 - - // DEBUG enable if troubleshooting algorithm - DEBUG = true - - // DEPTH depth of feed mining - DEPTH = 1 - - // DEFERRED if deferreds should be processed deferred - DEFERRED = true -) +// const ( +// // SIZE of a batch +// SIZE = 10 +// +// // DEBUG enable if troubleshooting algorithm +// DEBUG = true +// +// // DEPTH depth of feed mining +// DEPTH = 1 +// +// // DEFERRED if deferreds should be processed deferred +// DEFERRED = true +// ) func cleanup(c appengine.Context, name string) error { + c.Infof("Cleaning %s", name) q := datastore.NewQuery(name).KeysOnly() keys, err := q.GetAll(c, nil) s := 100 @@ -52,13 +45,17 @@ func cleanup(c appengine.Context, name string) error { // Init initializes cron handlers func Init() { http.HandleFunc("/cron/stage/1", crawler.Crawl) - http.HandleFunc("/cron/stage/2", crawler.UnPage) - http.HandleFunc("/cron/stage/2/clean", func(w http.ResponseWriter, r *http.Request) { + + http.Handle("/proj/tags", appstats.NewHandler(proj.Tags)) + + http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) + cleanup(c, "buff") cleanup(c, "edge") cleanup(c, "vertex") cleanup(c, "post") }) + http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) // http.Handle("/cron/parse", appstats.NewHandler(parseFeeds)) @@ -71,361 +68,361 @@ func Init() { }) } -var ( - // ErrFeedParse404 if feed page is not found - ErrFeedParse404 = fmt.Errorf("Feed parcing recieved a %d Status Code", 404) -) - -func pageURL(idx int) string { - return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) -} - -func parseFeeds(c appengine.Context, w http.ResponseWriter, r *http.Request) { - fp := new(feedParser) - err := fp.Main(c, w) - if err != nil { - http.Error(w, err.Error(), http.StatusInternalServerError) - } else { - fmt.Fprint(w, "Parsed") - } -} - -type feedParser struct { - context appengine.Context - client *http.Client - - todo []int - guids map[int64]bool // this could be extremely large - posts []models.Post -} - -func (x *feedParser) Main(c appengine.Context, w http.ResponseWriter) error { - x.context = c - x.client = urlfetch.Client(c) - - // Load guids from DB - // TODO: do this with sharded keys - keys, err := datastore.NewQuery(models.POST).KeysOnly().GetAll(c, nil) - if err != nil { - c.Errorf("Error finding keys %v %v", err, appengine.IsOverQuota(err)) - return err - } - x.guids = map[int64]bool{} - for _, key := range keys { - x.guids[key.IntID()] = true - } - keys = nil - - // // DEBUG ONLY - // data, err := json.MarshalIndent(x.guids, "", " ") - // fmt.Fprint(w, string(data)) - // return err - x.posts = make([]models.Post, 0) - - // Initial recursive edge case - isStop, fullStop, err := x.isStop(1) - if isStop || fullStop || err != nil { - c.Infof("Finished without recursive searching %v", err) - if err == nil { - err = x.storePosts(x.posts) - } - return err - } - - // Recursive search strategy - err = x.Search(1, -1) - - // storePosts and processTodo - if err == nil { - errc := make(chan error) - go func() { - errc <- x.storePosts(x.posts) - }() - go func() { - errc <- x.processTodo() - }() - err1, err2 := 
<-errc, <-errc - if err1 != nil { - err = err1 - } else if err2 != nil { - err = err2 - } - } - - if err != nil { - c.Errorf("Error in Main %v", err) - } - return err -} - -var processBatchDeferred = delay.Func("process-todo-batch", func(c appengine.Context, ids []int) { - parser := feedParser{ - context: c, - client: urlfetch.Client(c), - } - parser.processBatch(ids) -}) - -func (x *feedParser) processBatch(ids []int) error { - done := make(chan error) - for _, idx := range ids { - go func(idx int) { - posts, err := x.getAndParseFeed(idx) - if err == nil { - err = x.storePosts(posts) - } - done <- err - }(idx) - } - for i := 0; i < len(ids); i++ { - err := <-done - if err != nil { - x.context.Errorf("error storing feed (at index %d): %v", i, err) - return err - } - } - return nil -} - -func (x *feedParser) processTodo() error { - x.context.Infof("Processing TODO: %v", x.todo) - - var batch []int - var task *taskqueue.Task - var allTasks []*taskqueue.Task - var err error - for _, idx := range x.todo { - if batch == nil { - batch = make([]int, 0) - } - batch = append(batch, idx) - if len(batch) >= SIZE { - if DEFERRED { - task, err = processBatchDeferred.Task(batch) - if err == nil { - allTasks = append(allTasks, task) - } - } else { - err = x.processBatch(batch) - } - if err != nil { - return err - } - batch = nil - } - } - if len(batch) > 0 { - if DEFERRED { - task, err = processBatchDeferred.Task(batch) - if err == nil { - allTasks = append(allTasks, task) - } - } else { - err = x.processBatch(batch) - } - } - if DEFERRED && len(allTasks) > 0 { - x.context.Infof("Adding %d task(s) to the default queue", len(allTasks)) - taskqueue.AddMulti(x.context, allTasks, "default") - } - return err -} - -func (x *feedParser) addRange(bottom, top int) { - for i := bottom + 1; i < top; i++ { - x.todo = append(x.todo, i) - } -} - -func (x *feedParser) Search(bottom, top int) (err error) { - /* - def infinite_length(bottom=1, top=-1): - if bottom == 1 and not item_exists(1): return 0 # Starting edge case - if bottom == top - 1: return bottom # Result found! (top doesn’t exist) - if top < 0: # Searching forward - top = bottom << 1 # Base 2 hops - if item_exists(top): - top, bottom = -1, top # continue searching forward - else: # Binary search between bottom and top - middle = (bottom + top) // 2 - bottom, top = middle, top if item_exists(middle) else bottom, middle - return infinite_length(bottom, top) # Tail recursion!!! - */ - if bottom == top-1 { - x.context.Infof("TOP OF RANGE FOUND! @%d", top) - x.addRange(bottom, top) - return nil - } - var fullStop, isStop bool = false, false - if top < 0 { // Searching forward - top = bottom << 1 // Base 2 hops forward - isStop, fullStop, err = x.isStop(top) - if err != nil { - return err - } - if !isStop { - x.addRange(bottom, top) - top, bottom = -1, top - } - } else { // Binary search between top and bottom - middle := (bottom + top) / 2 - isStop, fullStop, err = x.isStop(middle) - if err != nil { - return err - } - if isStop { - top = middle - } else { - x.addRange(bottom, middle) - bottom = middle - } - } - if fullStop { - return nil - } - return x.Search(bottom, top) // TAIL RECURSION!!! 
-} - -func (x *feedParser) isStop(idx int) (isStop, fullStop bool, err error) { - // Gather posts as necessary - posts, err := x.getAndParseFeed(idx) - if err == ErrFeedParse404 { - x.context.Infof("Reached the end of the feed list (%v)", idx) - return true, false, nil - } - if err != nil { - x.context.Errorf("Error decoding ChiveFeed: %s", err) - return false, false, err - } - - // Check for Duplicates - count := 0 - for _, post := range posts { - id, _, err := guidToInt(post.GUID) - if x.guids[id] || err != nil { - continue - } - count++ - } - x.posts = append(x.posts, posts...) - - // Use store_count info to determine if isStop - isStop = count == 0 || DEBUG - fullStop = len(posts) != count && count > 0 - if DEBUG { - isStop = idx > DEPTH - fullStop = idx == DEPTH - } - return -} - -func (x *feedParser) getAndParseFeed(idx int) ([]models.Post, error) { - url := pageURL(idx) - - // Get Response - x.context.Infof("Parsing index %v (%v)", idx, url) - resp, err := x.client.Get(url) - if err != nil { - return nil, err - } - defer resp.Body.Close() - if resp.StatusCode != 200 { - if resp.StatusCode == 404 { - return nil, ErrFeedParse404 - } - return nil, fmt.Errorf("Feed parcing recieved a %d Status Code", resp.StatusCode) - } - - // Decode Response - decoder := xml.NewDecoder(resp.Body) - var feed struct { - Items []models.Post `xml:"channel>item"` - } - if decoder.Decode(&feed) != nil { - return nil, err - } - - // Cleanup Response - for idx := range feed.Items { - post := &feed.Items[idx] - for i, img := range post.Media { - post.Media[i].URL = stripQuery(img.URL) - } - post.MugShot = post.Media[0].URL - post.Media = post.Media[1:] - } - return feed.Items, err -} - -func (x *feedParser) storePosts(dirty []models.Post) (err error) { - var posts []models.Post - var keys []*datastore.Key - for _, post := range dirty { - key, err := x.cleanPost(&post) - if err != nil { - continue - } - posts = append(posts, post) - keys = append(keys, key) - } - if len(keys) > 0 { - complete, err := datastore.PutMulti(x.context, keys, posts) - if err == nil { - err = keycache.AddKeys(x.context, models.POST, complete) - } - } - return err -} - -func (x *feedParser) cleanPost(p *models.Post) (*datastore.Key, error) { - id, link, err := guidToInt(p.GUID) - if err != nil { - return nil, err - } - // Remove link posts - if link { - x.context.Infof("Ignoring links post %v \"%v\"", p.GUID, p.Title) - return nil, fmt.Errorf("Ignoring links post") - } - - // Detect video only posts - video := regexp.MustCompile("\\([^&]*Video.*\\)") - if video.MatchString(p.Title) { - x.context.Infof("Ignoring video post %v \"%v\"", p.GUID, p.Title) - return nil, fmt.Errorf("Ignoring video post") - } - x.context.Infof("Storing post %v \"%v\"", p.GUID, p.Title) - - // Cleanup post titles - clean := regexp.MustCompile("\\W\\(([^\\)]*)\\)$") - p.Title = clean.ReplaceAllLiteralString(p.Title, "") - - // Post - // temp_key := datastore.NewIncompleteKey(x.context, DB_POST_TABLE, nil) - key := datastore.NewKey(x.context, models.POST, "", id, nil) - return key, nil -} - -func guidToInt(guid string) (int64, bool, error) { - // Remove link posts - url, err := url.Parse(guid) - if err != nil { - return -1, false, err - } - - // Parsing post id from guid url - id, err := strconv.Atoi(url.Query().Get("p")) - if err != nil { - return -1, false, err - } - return int64(id), url.Query().Get("post_type") == "sdac_links", nil -} - -func stripQuery(dirty string) string { - obj, err := url.Parse(dirty) - if err != nil { - return dirty - } - obj.RawQuery = 
"" - return obj.String() -} +// var ( +// // ErrFeedParse404 if feed page is not found +// ErrFeedParse404 = fmt.Errorf("Feed parcing recieved a %d Status Code", 404) +// ) +// +// func pageURL(idx int) string { +// return fmt.Sprintf("http://thechive.com/feed/?paged=%d", idx) +// } +// +// func parseFeeds(c appengine.Context, w http.ResponseWriter, r *http.Request) { +// fp := new(feedParser) +// err := fp.Main(c, w) +// if err != nil { +// http.Error(w, err.Error(), http.StatusInternalServerError) +// } else { +// fmt.Fprint(w, "Parsed") +// } +// } +// +// type feedParser struct { +// context appengine.Context +// client *http.Client +// +// todo []int +// guids map[int64]bool // this could be extremely large +// posts []models.Post +// } +// +// func (x *feedParser) Main(c appengine.Context, w http.ResponseWriter) error { +// x.context = c +// x.client = urlfetch.Client(c) +// +// // Load guids from DB +// // TODO: do this with sharded keys +// keys, err := datastore.NewQuery(models.POST).KeysOnly().GetAll(c, nil) +// if err != nil { +// c.Errorf("Error finding keys %v %v", err, appengine.IsOverQuota(err)) +// return err +// } +// x.guids = map[int64]bool{} +// for _, key := range keys { +// x.guids[key.IntID()] = true +// } +// keys = nil +// +// // // DEBUG ONLY +// // data, err := json.MarshalIndent(x.guids, "", " ") +// // fmt.Fprint(w, string(data)) +// // return err +// x.posts = make([]models.Post, 0) +// +// // Initial recursive edge case +// isStop, fullStop, err := x.isStop(1) +// if isStop || fullStop || err != nil { +// c.Infof("Finished without recursive searching %v", err) +// if err == nil { +// err = x.storePosts(x.posts) +// } +// return err +// } +// +// // Recursive search strategy +// err = x.Search(1, -1) +// +// // storePosts and processTodo +// if err == nil { +// errc := make(chan error) +// go func() { +// errc <- x.storePosts(x.posts) +// }() +// go func() { +// errc <- x.processTodo() +// }() +// err1, err2 := <-errc, <-errc +// if err1 != nil { +// err = err1 +// } else if err2 != nil { +// err = err2 +// } +// } +// +// if err != nil { +// c.Errorf("Error in Main %v", err) +// } +// return err +// } +// +// var processBatchDeferred = delay.Func("process-todo-batch", func(c appengine.Context, ids []int) { +// parser := feedParser{ +// context: c, +// client: urlfetch.Client(c), +// } +// parser.processBatch(ids) +// }) +// +// func (x *feedParser) processBatch(ids []int) error { +// done := make(chan error) +// for _, idx := range ids { +// go func(idx int) { +// posts, err := x.getAndParseFeed(idx) +// if err == nil { +// err = x.storePosts(posts) +// } +// done <- err +// }(idx) +// } +// for i := 0; i < len(ids); i++ { +// err := <-done +// if err != nil { +// x.context.Errorf("error storing feed (at index %d): %v", i, err) +// return err +// } +// } +// return nil +// } +// +// func (x *feedParser) processTodo() error { +// x.context.Infof("Processing TODO: %v", x.todo) +// +// var batch []int +// var task *taskqueue.Task +// var allTasks []*taskqueue.Task +// var err error +// for _, idx := range x.todo { +// if batch == nil { +// batch = make([]int, 0) +// } +// batch = append(batch, idx) +// if len(batch) >= SIZE { +// if DEFERRED { +// task, err = processBatchDeferred.Task(batch) +// if err == nil { +// allTasks = append(allTasks, task) +// } +// } else { +// err = x.processBatch(batch) +// } +// if err != nil { +// return err +// } +// batch = nil +// } +// } +// if len(batch) > 0 { +// if DEFERRED { +// task, err = processBatchDeferred.Task(batch) 
+// if err == nil { +// allTasks = append(allTasks, task) +// } +// } else { +// err = x.processBatch(batch) +// } +// } +// if DEFERRED && len(allTasks) > 0 { +// x.context.Infof("Adding %d task(s) to the default queue", len(allTasks)) +// taskqueue.AddMulti(x.context, allTasks, "default") +// } +// return err +// } +// +// func (x *feedParser) addRange(bottom, top int) { +// for i := bottom + 1; i < top; i++ { +// x.todo = append(x.todo, i) +// } +// } +// +// func (x *feedParser) Search(bottom, top int) (err error) { +// /* +// def infinite_length(bottom=1, top=-1): +// if bottom == 1 and not item_exists(1): return 0 # Starting edge case +// if bottom == top - 1: return bottom # Result found! (top doesn’t exist) +// if top < 0: # Searching forward +// top = bottom << 1 # Base 2 hops +// if item_exists(top): +// top, bottom = -1, top # continue searching forward +// else: # Binary search between bottom and top +// middle = (bottom + top) // 2 +// bottom, top = middle, top if item_exists(middle) else bottom, middle +// return infinite_length(bottom, top) # Tail recursion!!! +// */ +// if bottom == top-1 { +// x.context.Infof("TOP OF RANGE FOUND! @%d", top) +// x.addRange(bottom, top) +// return nil +// } +// var fullStop, isStop bool = false, false +// if top < 0 { // Searching forward +// top = bottom << 1 // Base 2 hops forward +// isStop, fullStop, err = x.isStop(top) +// if err != nil { +// return err +// } +// if !isStop { +// x.addRange(bottom, top) +// top, bottom = -1, top +// } +// } else { // Binary search between top and bottom +// middle := (bottom + top) / 2 +// isStop, fullStop, err = x.isStop(middle) +// if err != nil { +// return err +// } +// if isStop { +// top = middle +// } else { +// x.addRange(bottom, middle) +// bottom = middle +// } +// } +// if fullStop { +// return nil +// } +// return x.Search(bottom, top) // TAIL RECURSION!!! +// } +// +// func (x *feedParser) isStop(idx int) (isStop, fullStop bool, err error) { +// // Gather posts as necessary +// posts, err := x.getAndParseFeed(idx) +// if err == ErrFeedParse404 { +// x.context.Infof("Reached the end of the feed list (%v)", idx) +// return true, false, nil +// } +// if err != nil { +// x.context.Errorf("Error decoding ChiveFeed: %s", err) +// return false, false, err +// } +// +// // Check for Duplicates +// count := 0 +// for _, post := range posts { +// id, _, err := guidToInt(post.GUID) +// if x.guids[id] || err != nil { +// continue +// } +// count++ +// } +// x.posts = append(x.posts, posts...) 
+// +// // Use store_count info to determine if isStop +// isStop = count == 0 || DEBUG +// fullStop = len(posts) != count && count > 0 +// if DEBUG { +// isStop = idx > DEPTH +// fullStop = idx == DEPTH +// } +// return +// } +// +// func (x *feedParser) getAndParseFeed(idx int) ([]models.Post, error) { +// url := pageURL(idx) +// +// // Get Response +// x.context.Infof("Parsing index %v (%v)", idx, url) +// resp, err := x.client.Get(url) +// if err != nil { +// return nil, err +// } +// defer resp.Body.Close() +// if resp.StatusCode != 200 { +// if resp.StatusCode == 404 { +// return nil, ErrFeedParse404 +// } +// return nil, fmt.Errorf("Feed parcing recieved a %d Status Code", resp.StatusCode) +// } +// +// // Decode Response +// decoder := xml.NewDecoder(resp.Body) +// var feed struct { +// Items []models.Post `xml:"channel>item"` +// } +// if decoder.Decode(&feed) != nil { +// return nil, err +// } +// +// // Cleanup Response +// for idx := range feed.Items { +// post := &feed.Items[idx] +// for i, img := range post.Media { +// post.Media[i].URL = stripQuery(img.URL) +// } +// post.MugShot = post.Media[0].URL +// post.Media = post.Media[1:] +// } +// return feed.Items, err +// } +// +// func (x *feedParser) storePosts(dirty []models.Post) (err error) { +// var posts []models.Post +// var keys []*datastore.Key +// for _, post := range dirty { +// key, err := x.cleanPost(&post) +// if err != nil { +// continue +// } +// posts = append(posts, post) +// keys = append(keys, key) +// } +// if len(keys) > 0 { +// complete, err := datastore.PutMulti(x.context, keys, posts) +// if err == nil { +// err = keycache.AddKeys(x.context, models.POST, complete) +// } +// } +// return err +// } +// +// func (x *feedParser) cleanPost(p *models.Post) (*datastore.Key, error) { +// id, link, err := guidToInt(p.GUID) +// if err != nil { +// return nil, err +// } +// // Remove link posts +// if link { +// x.context.Infof("Ignoring links post %v \"%v\"", p.GUID, p.Title) +// return nil, fmt.Errorf("Ignoring links post") +// } +// +// // Detect video only posts +// video := regexp.MustCompile("\\([^&]*Video.*\\)") +// if video.MatchString(p.Title) { +// x.context.Infof("Ignoring video post %v \"%v\"", p.GUID, p.Title) +// return nil, fmt.Errorf("Ignoring video post") +// } +// x.context.Infof("Storing post %v \"%v\"", p.GUID, p.Title) +// +// // Cleanup post titles +// clean := regexp.MustCompile("\\W\\(([^\\)]*)\\)$") +// p.Title = clean.ReplaceAllLiteralString(p.Title, "") +// +// // Post +// // temp_key := datastore.NewIncompleteKey(x.context, DB_POST_TABLE, nil) +// key := datastore.NewKey(x.context, models.POST, "", id, nil) +// return key, nil +// } +// +// func guidToInt(guid string) (int64, bool, error) { +// // Remove link posts +// url, err := url.Parse(guid) +// if err != nil { +// return -1, false, err +// } +// +// // Parsing post id from guid url +// id, err := strconv.Atoi(url.Query().Get("p")) +// if err != nil { +// return -1, false, err +// } +// return int64(id), url.Query().Get("post_type") == "sdac_links", nil +// } +// +// func stripQuery(dirty string) string { +// obj, err := url.Parse(dirty) +// if err != nil { +// return dirty +// } +// obj.RawQuery = "" +// return obj.String() +// } diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go new file mode 100644 index 0000000..44b211b --- /dev/null +++ b/app/cron/proj/graph.go @@ -0,0 +1,28 @@ +package proj + +import ( + "net/http" + "time" + + "appengine" +) + +// Graph processes all posts in attempt to create a graph +func Graph(c 
appengine.Context, w http.ResponseWriter, r *http.Request) { + start := time.Now() + + // pages := puller(c) + // dirtyTags := getNod(c, pages, 100) + // tags := cleaner(dirtyTags) + // + // found := map[string]int64{} + // for tag := range tags { + // found[tag]++ + // } + // + // for key, value := range found { + // fmt.Fprintf(w, "%s,%d\n", key, value) + // } + + c.Infof("Time took: %v", time.Since(start)) +} diff --git a/app/cron/proj/proj.go b/app/cron/proj/proj.go new file mode 100644 index 0000000..5ad4d23 --- /dev/null +++ b/app/cron/proj/proj.go @@ -0,0 +1,118 @@ +package proj + +import ( + "encoding/xml" + "log" + "sync" + + "appengine" + "appengine/datastore" + + "github.com/bign8/chive-show/app/cron/chain" + "github.com/bign8/chive-show/app/cron/crawler" +) + +// XMLPage xml processor for a page +type XMLPage struct { + Items []struct { + GUID string `xml:"guid"` + Tags []string `xml:"category"` + Imgs []struct { + URL string `xml:"url,attr"` + } `xml:"content"` + } `xml:"channel>item"` +} + +// Item is a post item +type Item struct { + GUID string + Tags []string + Imgs []string +} + +// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) +func puller(c appengine.Context) <-chan []byte { + out := make(chan []byte, 10000) + + go func() { + defer close(out) + q := datastore.NewQuery(crawler.XML) + t := q.Run(c) + for { + var s crawler.Store + _, err := t.Next(&s) + if err == datastore.Done { + break // No further entities match the query. + } + if err != nil { + c.Errorf("fetching next Person: %v", err) + break + } + out <- s.XML + } + }() + return out +} + +func flatten(c appengine.Context, in <-chan []byte) <-chan Item { + const WORKERS = 100 + out := make(chan Item, 10000) + var wg sync.WaitGroup + wg.Add(WORKERS) + for i := 0; i < WORKERS; i++ { + go func(idx int) { + flattenWorker(c, in, out, idx) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(out) + }() + return out +} + +func flattenWorker(c appengine.Context, in <-chan []byte, out chan<- Item, idx int) { + var xmlPage XMLPage + var imgs []string + + for data := range in { + if err := xml.Unmarshal(data, &xmlPage); err != nil { + c.Errorf("Flatten %d: %v", idx, err) + continue + } + for _, item := range xmlPage.Items { + imgs = make([]string, len(item.Imgs)) + for i, img := range item.Imgs { + imgs[i] = img.URL + } + + out <- Item{ + GUID: item.GUID, + Tags: item.Tags, + Imgs: imgs, + } + } + } +} + +func doMagic() { + start := make(chan interface{}, 10) + out := chain.FanOut(10, 10, start, worker) + go func() { + for o := range out { + log.Printf("Something: %v", o) + } + }() + start <- 1 + start <- 2 + start <- 3 +} + +func worker(in <-chan interface{}, out chan<- interface{}, idx int) { + var bytes []byte + for x := range in { + bytes = x.([]byte) + out <- bytes + } +} diff --git a/app/cron/proj/tags.go b/app/cron/proj/tags.go new file mode 100644 index 0000000..337d884 --- /dev/null +++ b/app/cron/proj/tags.go @@ -0,0 +1,87 @@ +package proj + +import ( + "encoding/xml" + "fmt" + "net/http" + "strings" + "sync" + "time" + + "appengine" +) + +// Tags etrieves the tags from the dataset +func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { + start := time.Now() + + pages := puller(c) + dirtyTags := getTags(c, pages, 100) + tags := cleaner(dirtyTags) + + found := map[string]int64{} + for tag := range tags { + found[tag]++ + } + + for key, value := range found { + fmt.Fprintf(w, "%s,%d\n", key, value) + } + + c.Infof("Time took: %v", time.Since(start)) +} + +func 
getTags(c appengine.Context, in <-chan []byte, workers int) <-chan string { + out := make(chan string, 10000) + var wg sync.WaitGroup + wg.Add(workers) + for i := 0; i < workers; i++ { + go func(idx int) { + tags(c, in, out, idx) + wg.Done() + }(i) + } + go func() { + wg.Wait() + close(out) + }() + return out +} + +func tags(c appengine.Context, in <-chan []byte, out chan<- string, idx int) { + var xmlPage = XMLPage{} + + for data := range in { + if err := xml.Unmarshal(data, &xmlPage); err != nil { + c.Errorf("Miner %d: Error %s", idx, err) + continue + } + + for _, item := range xmlPage.Items { + for _, tag := range item.Tags { + out <- tag + } + } + } +} + +func cleaner(in <-chan string) <-chan string { + // http://xpo6.com/list-of-english-stop-words/ + var stopWords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" + var stops = map[string]bool{} + for _, s := range strings.Split(stopWords, ",") { + stops[s] = true + } + + out := make(chan string, 10000) + go func() { + for s := range in { + s = strings.ToLower(s) + if !stops[s] { + out <- s + } + } + close(out) + }() + return out +} diff --git a/yaml/module-cron.yaml b/yaml/module-cron.yaml index a70f2c3..c9ed0dd 100644 --- a/yaml/module-cron.yaml +++ b/yaml/module-cron.yaml @@ -16,6 +16,10 @@ handlers: script: _go_app login: admin +- url: /proj/.* + script: _go_app + login: admin + error_handlers: - file: err/default.html - error_code: over_quota From a5ddd2a1dd067f0d13e25da6202f327bbaf046df Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sat, 14 Nov 2015 20:46:35 -0700 Subject: [PATCH 12/26] Optimizing tags to take <15s --- app/cron/chain/chain.go | 6 ++- app/cron/proj/proj.go | 78 ++++++++-------------------- app/cron/proj/tags.go | 110 +++++++++++++++++++++------------------- 3 files changed, 81 insertions(+), 113 deletions(-) diff --git a/app/cron/chain/chain.go b/app/cron/chain/chain.go index 67b235a..35e2952 100644 --- a/app/cron/chain/chain.go +++ b/app/cron/chain/chain.go @@ -3,7 +3,7 @@ package chain import "sync" // Worker is a function designed to fan out and perform work on a piece of Data -type Worker func(in <-chan interface{}, out chan<- interface{}, idx int) +type Worker func(obj interface{}, out chan<- interface{}, idx int) // FanOut allows lengthy workers to fan out on chanel operations func FanOut(count int, buff int, in <-chan interface{}, doIt Worker) <-chan interface{} { @@ -12,7 +12,9 @@ func FanOut(count int, buff int, in <-chan interface{}, doIt Worker) <-chan inte wg.Add(count) for i := 0; i < count; i++ { go func(idx int) { - doIt(in, out, idx) + for obj := range in { + doIt(obj, out, idx) + } wg.Done() }(i) } diff --git a/app/cron/proj/proj.go b/app/cron/proj/proj.go index 5ad4d23..6e56ec3 100644 --- a/app/cron/proj/proj.go +++ b/app/cron/proj/proj.go @@ -2,8 +2,6 @@ package proj import ( "encoding/xml" - "log" - "sync" "appengine" "appengine/datastore" @@ -30,17 +28,22 @@ type Item struct { Imgs []string } -// TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) -func puller(c 
appengine.Context) <-chan []byte { - out := make(chan []byte, 10000) +func getItems(c appengine.Context) <-chan interface{} { + pages := puller(c) + return chain.FanOut(50, 10000, pages, flatten(c)) +} + +func puller(c appengine.Context) <-chan interface{} { + out := make(chan interface{}, 10000) + // TODO: improve pulling performance (cache number of xml in stage_1, fan out pulling) go func() { defer close(out) q := datastore.NewQuery(crawler.XML) - t := q.Run(c) + iterator := q.Run(c) for { var s crawler.Store - _, err := t.Next(&s) + _, err := iterator.Next(&s) if err == datastore.Done { break // No further entities match the query. } @@ -54,65 +57,24 @@ func puller(c appengine.Context) <-chan []byte { return out } -func flatten(c appengine.Context, in <-chan []byte) <-chan Item { - const WORKERS = 100 - out := make(chan Item, 10000) - var wg sync.WaitGroup - wg.Add(WORKERS) - for i := 0; i < WORKERS; i++ { - go func(idx int) { - flattenWorker(c, in, out, idx) - wg.Done() - }(i) - } - go func() { - wg.Wait() - close(out) - }() - return out -} - -func flattenWorker(c appengine.Context, in <-chan []byte, out chan<- Item, idx int) { - var xmlPage XMLPage - var imgs []string +func flatten(c appengine.Context) chain.Worker { + return func(obj interface{}, out chan<- interface{}, idx int) { + var xmlPage XMLPage + var imgs []string - for data := range in { - if err := xml.Unmarshal(data, &xmlPage); err != nil { + // Parse the XML of an object + if err := xml.Unmarshal(obj.([]byte), &xmlPage); err != nil { c.Errorf("Flatten %d: %v", idx, err) - continue + return } + + // Process items in a particular page for _, item := range xmlPage.Items { imgs = make([]string, len(item.Imgs)) for i, img := range item.Imgs { imgs[i] = img.URL } - - out <- Item{ - GUID: item.GUID, - Tags: item.Tags, - Imgs: imgs, - } + out <- Item{item.GUID, item.Tags, imgs} } } } - -func doMagic() { - start := make(chan interface{}, 10) - out := chain.FanOut(10, 10, start, worker) - go func() { - for o := range out { - log.Printf("Something: %v", o) - } - }() - start <- 1 - start <- 2 - start <- 3 -} - -func worker(in <-chan interface{}, out chan<- interface{}, idx int) { - var bytes []byte - for x := range in { - bytes = x.([]byte) - out <- bytes - } -} diff --git a/app/cron/proj/tags.go b/app/cron/proj/tags.go index 337d884..eb8b45c 100644 --- a/app/cron/proj/tags.go +++ b/app/cron/proj/tags.go @@ -1,87 +1,91 @@ package proj import ( - "encoding/xml" + "bytes" "fmt" "net/http" + "runtime" "strings" - "sync" "time" + "github.com/bign8/chive-show/app/cron/chain" + "appengine" + "appengine/memcache" ) +const tagsMemcacheKey = "tags-baby" + // Tags etrieves the tags from the dataset func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() + defer func() { + c.Infof("Time took: %v", time.Since(start)) + }() + + // Check from memcache + if item, err := memcache.Get(c, tagsMemcacheKey); err == nil { + w.Write(item.Value) + return + } - pages := puller(c) - dirtyTags := getTags(c, pages, 100) - tags := cleaner(dirtyTags) + // Pretty sure this doesn't work on prod, but works awesome in dev + runtime.GOMAXPROCS(runtime.NumCPU()) + tags := chain.FanOut(50, 10000, getItems(c), tags) // Pull and clean tags + // Build a counter dictionary found := map[string]int64{} for tag := range tags { - found[tag]++ + found[tag.(string)]++ } + // Output results + var buffer bytes.Buffer for key, value := range found { - fmt.Fprintf(w, "%s,%d\n", key, value) + buffer.WriteString(fmt.Sprintf("%s,%d\n", key, value)) 
} + data := buffer.String() - c.Infof("Time took: %v", time.Since(start)) -} + fmt.Fprint(w, data) + c.Infof("Num tags: %v", len(found)) -func getTags(c appengine.Context, in <-chan []byte, workers int) <-chan string { - out := make(chan string, 10000) - var wg sync.WaitGroup - wg.Add(workers) - for i := 0; i < workers; i++ { - go func(idx int) { - tags(c, in, out, idx) - wg.Done() - }(i) - } + // Save to memcache, but only wait up to 3ms. + done := make(chan bool, 1) go func() { - wg.Wait() - close(out) + memcache.Set(c, &memcache.Item{ + Key: tagsMemcacheKey, + Value: []byte(data), + }) + done <- true }() - return out + select { + case <-done: + case <-time.After(3 * time.Millisecond): + } } -func tags(c appengine.Context, in <-chan []byte, out chan<- string, idx int) { - var xmlPage = XMLPage{} - - for data := range in { - if err := xml.Unmarshal(data, &xmlPage); err != nil { - c.Errorf("Miner %d: Error %s", idx, err) - continue - } - - for _, item := range xmlPage.Items { - for _, tag := range item.Tags { - out <- tag - } - } +func tags(obj interface{}, out chan<- interface{}, idx int) { + for _, tag := range validTags((obj.(Item)).Tags) { + out <- tag } } -func cleaner(in <-chan string) <-chan string { - // http://xpo6.com/list-of-english-stop-words/ - var stopWords = "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" - var stops = map[string]bool{} - for _, s := range strings.Split(stopWords, ",") { - stops[s] = true - } +// http://xpo6.com/list-of-english-stop-words/ +var chiveWords = "web only," +var stopWords = chiveWords + "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" +var stops = map[string]bool{} - out := make(chan string, 10000) - go func() { - for s := range in { - s = strings.ToLower(s) - if !stops[s] { - out <- s - } +func validTags(tags []string) (res []string) { + if len(stops) == 0 { + for _, s := range strings.Split(stopWords, ",") { + stops[s] = true } - close(out) - }() - return out + } + for _, tag := range tags { + tag = strings.ToLower(tag) + if !stops[tag] { + res = append(res, tag) + } + } + return } From ea37ea5939f1c59cd1aeff8138423c4c730a9aae Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 13:00:45 -0700 Subject: [PATCH 13/26] Adding graph package (for serialized graphs) Yes I committed the graph.pb.go file here, because I haven't updated any of the build scripts yet... 
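For reference, a rough usage sketch of the API this package exposes (it mirrors
the demo in graph.go's main() below; the node values are made up, and everything
still lives in "package main" until the build scripts catch up):

    g := New(false)                      // undirected graph
    a := g.Add("guid-123", NodeType_POST, 0)
    b := g.Add("funny", NodeType_TAG, 0)
    g.Connect(a, b, 0)                   // undirected, so the reverse edge is added too
    bits, err := g.Bytes()               // protobuf marshal, then gzip (see serialGraph.go)
    if err == nil {
        back, _ := DecodeGraph(bits)     // rebuilds the id -> *Node lookup
        _ = back
    }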
--- app/cron/proj/graph/graph.go | 110 +++++++++++++++++++ app/cron/proj/graph/graph.pb.go | 163 +++++++++++++++++++++++++++++ app/cron/proj/graph/graph.proto | 24 +++++ app/cron/proj/graph/load.sh | 14 +++ app/cron/proj/graph/serialGraph.go | 76 ++++++++++++++ 5 files changed, 387 insertions(+) create mode 100644 app/cron/proj/graph/graph.go create mode 100644 app/cron/proj/graph/graph.pb.go create mode 100644 app/cron/proj/graph/graph.proto create mode 100755 app/cron/proj/graph/load.sh create mode 100644 app/cron/proj/graph/serialGraph.go diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go new file mode 100644 index 0000000..ee1485e --- /dev/null +++ b/app/cron/proj/graph/graph.go @@ -0,0 +1,110 @@ +package main + +import ( + "errors" + "log" + + "github.com/golang/protobuf/proto" +) + +// TODO: add some graph processing functions + +// Graph is the serializable graph we have all been looking for +type Graph struct { + s *SerialGraph + nodes map[uint64]*Node // Optimal lookup with pointers goes here +} + +// New creates a new Graph +func New(isDirected bool) *Graph { + return &Graph{ + s: &SerialGraph{ + Nodes: make([]*Node, 0), + Directed: proto.Bool(isDirected), + NodeCount: proto.Uint64(0), + }, + nodes: make(map[uint64]*Node), + } +} + +// Add creates and adds a node to the graph +func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { + n := &Node{ + Id: proto.Uint64(g.genNodeID()), + Value: proto.String(value), + Weight: proto.Int64(weight), + Type: ttype.Enum(), + Adjacent: make([]uint64, 0), + } + g.nodes[*n.Id] = n + g.s.Nodes = append(g.s.Nodes, n) + return n +} + +// Connect connects nodes to and from with an edge of weight w +func (g *Graph) Connect(to, from *Node, weight int64) error { + if to == nil || from == nil { + return errors.New("Cannot add edge to nil node") + } + from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge + from.Weights = append(from.Weights, weight) + + if !g.s.GetDirected() { // UnDirected edge (return trip) + to.Adjacent = append(to.Adjacent, *from.Id) + to.Weights = append(to.Weights, weight) + } + return nil +} + +func (g *Graph) genNodeID() (id uint64) { + id = g.s.GetNodeCount() + *g.s.NodeCount++ + return id +} + +// DecodeGraph hydrates a graph from a serialized format (returned by Bytes()). +func DecodeGraph(data []byte) (*Graph, error) { + sg, err := DecodeSerialGraph(data) + if err != nil { + return nil, err + } + g := &Graph{sg, make(map[uint64]*Node)} + + // Hydrate Graph from SerialGraph + for _, node := range sg.Nodes { + g.nodes[*node.Id] = node + } + return g, nil +} + +// Bytes flattens a graph to a flat file format +func (g *Graph) Bytes() ([]byte, error) { + // TODO: use smaller numbers for encoding... 
+ return g.s.Bytes() +} + +func main() { + log.Println("Do stuff...") + + graph := New(false) + a := graph.Add("http://super-stupid-long-url.com/more-crap-over-here1", NodeType_UNKNOWN, 0) + b := graph.Add("http://super-stupid-long-url.com/more-crap-over-here2", NodeType_UNKNOWN, 0) + graph.Connect(a, b, 0) + + // Compress + bits, err := graph.Bytes() + if err != nil { + panic(err) + } + + // Decompress + result, err := DecodeGraph(bits) + if err != nil { + panic(err) + } + + // Compare + log.Printf("Message (%d): %q", len(bits), string(bits)) + log.Printf("Digit:\n%v\n%v", graph, result) + log.Printf("Nodes:\n%v\n%v", graph.s.Nodes, result.s.Nodes) +} diff --git a/app/cron/proj/graph/graph.pb.go b/app/cron/proj/graph/graph.pb.go new file mode 100644 index 0000000..75953ae --- /dev/null +++ b/app/cron/proj/graph/graph.pb.go @@ -0,0 +1,163 @@ +// Code generated by protoc-gen-go. +// source: graph.proto +// DO NOT EDIT! + +/* +Package main is a generated protocol buffer package. + +It is generated from these files: + graph.proto + +It has these top-level messages: + SerialGraph + Node +*/ +package main + +import proto "github.com/golang/protobuf/proto" +import fmt "fmt" +import math "math" + +// Reference imports to suppress errors if they are not otherwise used. +var _ = proto.Marshal +var _ = fmt.Errorf +var _ = math.Inf + +type NodeType int32 + +const ( + NodeType_UNKNOWN NodeType = 0 + NodeType_POST NodeType = 1 + NodeType_IMG NodeType = 2 + NodeType_TAG NodeType = 3 + NodeType_USER NodeType = 4 +) + +var NodeType_name = map[int32]string{ + 0: "UNKNOWN", + 1: "POST", + 2: "IMG", + 3: "TAG", + 4: "USER", +} +var NodeType_value = map[string]int32{ + "UNKNOWN": 0, + "POST": 1, + "IMG": 2, + "TAG": 3, + "USER": 4, +} + +func (x NodeType) Enum() *NodeType { + p := new(NodeType) + *p = x + return p +} +func (x NodeType) String() string { + return proto.EnumName(NodeType_name, int32(x)) +} +func (x *NodeType) UnmarshalJSON(data []byte) error { + value, err := proto.UnmarshalJSONEnum(NodeType_value, data, "NodeType") + if err != nil { + return err + } + *x = NodeType(value) + return nil +} + +type SerialGraph struct { + Nodes []*Node `protobuf:"bytes,1,rep,name=nodes" json:"nodes,omitempty"` + Directed *bool `protobuf:"varint,2,opt,name=directed,def=0" json:"directed,omitempty"` + NodeCount *uint64 `protobuf:"varint,3,req,name=nodeCount,def=0" json:"nodeCount,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *SerialGraph) Reset() { *m = SerialGraph{} } +func (m *SerialGraph) String() string { return proto.CompactTextString(m) } +func (*SerialGraph) ProtoMessage() {} + +const Default_SerialGraph_Directed bool = false +const Default_SerialGraph_NodeCount uint64 = 0 + +func (m *SerialGraph) GetNodes() []*Node { + if m != nil { + return m.Nodes + } + return nil +} + +func (m *SerialGraph) GetDirected() bool { + if m != nil && m.Directed != nil { + return *m.Directed + } + return Default_SerialGraph_Directed +} + +func (m *SerialGraph) GetNodeCount() uint64 { + if m != nil && m.NodeCount != nil { + return *m.NodeCount + } + return Default_SerialGraph_NodeCount +} + +type Node struct { + Id *uint64 `protobuf:"varint,1,req,name=id" json:"id,omitempty"` + Value *string `protobuf:"bytes,2,req,name=value" json:"value,omitempty"` + Weight *int64 `protobuf:"varint,3,opt,name=weight" json:"weight,omitempty"` + Type *NodeType `protobuf:"varint,4,opt,name=type,enum=main.NodeType,def=0" json:"type,omitempty"` + Adjacent []uint64 `protobuf:"varint,5,rep,name=adjacent" json:"adjacent,omitempty"` + 
Weights []int64 `protobuf:"varint,6,rep,name=weights" json:"weights,omitempty"` + XXX_unrecognized []byte `json:"-"` +} + +func (m *Node) Reset() { *m = Node{} } +func (m *Node) String() string { return proto.CompactTextString(m) } +func (*Node) ProtoMessage() {} + +const Default_Node_Type NodeType = NodeType_UNKNOWN + +func (m *Node) GetId() uint64 { + if m != nil && m.Id != nil { + return *m.Id + } + return 0 +} + +func (m *Node) GetValue() string { + if m != nil && m.Value != nil { + return *m.Value + } + return "" +} + +func (m *Node) GetWeight() int64 { + if m != nil && m.Weight != nil { + return *m.Weight + } + return 0 +} + +func (m *Node) GetType() NodeType { + if m != nil && m.Type != nil { + return *m.Type + } + return Default_Node_Type +} + +func (m *Node) GetAdjacent() []uint64 { + if m != nil { + return m.Adjacent + } + return nil +} + +func (m *Node) GetWeights() []int64 { + if m != nil { + return m.Weights + } + return nil +} + +func init() { + proto.RegisterEnum("main.NodeType", NodeType_name, NodeType_value) +} diff --git a/app/cron/proj/graph/graph.proto b/app/cron/proj/graph/graph.proto new file mode 100644 index 0000000..ca34de3 --- /dev/null +++ b/app/cron/proj/graph/graph.proto @@ -0,0 +1,24 @@ +package main; + +message SerialGraph { + repeated Node nodes = 1; + optional bool directed = 2 [default = false]; + required uint64 nodeCount = 3 [default = 0]; +} + +enum NodeType { + UNKNOWN = 0; + POST = 1; + IMG = 2; + TAG = 3; + USER = 4; +} + +message Node { + required uint64 id = 1; + required string value = 2; + optional int64 weight = 3; + optional NodeType type = 4 [default = UNKNOWN]; + repeated uint64 adjacent = 5; + repeated int64 weights = 6; +} diff --git a/app/cron/proj/graph/load.sh b/app/cron/proj/graph/load.sh new file mode 100755 index 0000000..4ebec6a --- /dev/null +++ b/app/cron/proj/graph/load.sh @@ -0,0 +1,14 @@ +#!/bin/sh + +if ! which proto >/dev/null; then + echo "Installing proto and protoc-gen-go" + go get -u github.com/golang/protobuf/{proto,protoc-gen-go} + export PATH=$PATH:$GOPATH/bin +else + echo "Proto and protoc-gen-go already installed" +fi + +echo "Generating Protobuff files..." +protoc --go_out=. *.proto +sed -i '' '/RegisterType/d' graph.pb.go +echo "Protobuff files generated." diff --git a/app/cron/proj/graph/serialGraph.go b/app/cron/proj/graph/serialGraph.go new file mode 100644 index 0000000..a251871 --- /dev/null +++ b/app/cron/proj/graph/serialGraph.go @@ -0,0 +1,76 @@ +package main + +import ( + "bytes" + "compress/gzip" + + "github.com/golang/protobuf/proto" +) + +const shouldGZIP = true + +// DecodeSerialGraph converts a byte string back into a hydrated SerialGraph. 
+func DecodeSerialGraph(data []byte) (g *SerialGraph, err error) { + if shouldGZIP { + if data, err = decompress(data); err != nil { + return nil, err + } + } + + // log.Printf("DecodeSerialGraph: %q", data) + + g = &SerialGraph{} + if err := proto.Unmarshal(data, g); err != nil { + return nil, err + } + return g, nil +} + +// Bytes converts a serial graph to a gzipped graph (used for storage) +func (g *SerialGraph) Bytes() (data []byte, err error) { + data, err = proto.Marshal(g) + if err != nil { + return nil, err + } + + // log.Printf(" Graph.Bytes: %q", data) + + if shouldGZIP { + if data, err = compress(data); err != nil { + return nil, err + } + } + return data, nil +} + +// Simple GZIP decompression +func decompress(garbage []byte) ([]byte, error) { + gz, err := gzip.NewReader(bytes.NewBuffer(garbage)) + if err != nil { + return nil, err + } + var buff bytes.Buffer + if _, err := buff.ReadFrom(gz); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + return buff.Bytes(), nil +} + +// Simple GZIP compression +func compress(data []byte) ([]byte, error) { + var buff bytes.Buffer + gz := gzip.NewWriter(&buff) + if _, err := gz.Write(data); err != nil { + return nil, err + } + if err := gz.Flush(); err != nil { + return nil, err + } + if err := gz.Close(); err != nil { + return nil, err + } + return buff.Bytes(), nil +} From 41184396199c0f286240d8da9f472bb5bc2cefe2 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 13:01:47 -0700 Subject: [PATCH 14/26] Improving the /proj/tags endpoint performance --- app/cron/proj/tags.go | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/app/cron/proj/tags.go b/app/cron/proj/tags.go index eb8b45c..e28bf8e 100644 --- a/app/cron/proj/tags.go +++ b/app/cron/proj/tags.go @@ -22,6 +22,7 @@ func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { defer func() { c.Infof("Time took: %v", time.Since(start)) }() + // w.Header().Set("Content-Type", "text/csv; charset=utf-8") // Check from memcache if item, err := memcache.Get(c, tagsMemcacheKey); err == nil { @@ -39,22 +40,45 @@ func Tags(c appengine.Context, w http.ResponseWriter, r *http.Request) { found[tag.(string)]++ } + // Compute average (used to clip data, so it's not huge) + avg := int64(0) + for _, value := range found { + avg += value + } + avg /= int64(len(found)) + c.Infof("Num tags: %v; Avg: %v", len(found), avg) + + // Compute the 75%-tile + cap := int64(0) + for key, value := range found { + if avg <= value { + cap += value + } else { + delete(found, key) + } + } + cap /= int64(len(found)) + c.Infof("Above average tags: %v; 75%%-tile: %v", len(found), cap) + // Output results var buffer bytes.Buffer + result := int64(0) for key, value := range found { - buffer.WriteString(fmt.Sprintf("%s,%d\n", key, value)) + if cap <= value { + buffer.WriteString(fmt.Sprintf("%s,%d\n", key, value)) + result++ + } } - data := buffer.String() - - fmt.Fprint(w, data) - c.Infof("Num tags: %v", len(found)) + data := buffer.Bytes() + w.Write(data) + c.Infof("Returned tags: %v", result) // Save to memcache, but only wait up to 3ms. 
done := make(chan bool, 1) go func() { memcache.Set(c, &memcache.Item{ Key: tagsMemcacheKey, - Value: []byte(data), + Value: data, }) done <- true }() @@ -71,7 +95,7 @@ func tags(obj interface{}, out chan<- interface{}, idx int) { } // http://xpo6.com/list-of-english-stop-words/ -var chiveWords = "web only," +var chiveWords = "web only,thebrigade,theberry,thechive,chive,chive humanity," var stopWords = chiveWords + "a,able,about,across,after,all,almost,also,am,among,an,and,any,are,as,at,be,because,been,but,by,can,cannot,could,dear,did,do,does,either,else,ever,every,for,from,get,got,had,has,have,he,her,hers,him,his,how,however,i,if,in,into,is,it,its,just,least,let,like,likely,may,me,might,most,must,my,neither,no,nor,not,of,off,often,on,only,or,other,our,own,rather,said,say,says,she,should,since,so,some,than,that,the,their,them,then,there,these,they,this,tis,to,too,twas,us,wants,was,we,were,what,when,where,which,while,who,whom,why,will,with,would,yet,you,your" var stops = map[string]bool{} From e61d9c4c164748bde2e7e2ae36f24589044ea7b4 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 14:16:25 -0700 Subject: [PATCH 15/26] Filling graph with proj endpoint (todo: shard and store) --- app/cron/cron.go | 1 + app/cron/proj/graph.go | 79 ++++++++++++++++++++++----- app/cron/proj/graph/graph.go | 87 ++++++++++++++++++++---------- app/cron/proj/graph/graph.pb.go | 2 +- app/cron/proj/graph/graph.proto | 2 +- app/cron/proj/graph/serialGraph.go | 2 +- 6 files changed, 129 insertions(+), 44 deletions(-) diff --git a/app/cron/cron.go b/app/cron/cron.go index ddeb2ff..f4953f2 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -47,6 +47,7 @@ func Init() { http.HandleFunc("/cron/stage/1", crawler.Crawl) http.Handle("/proj/tags", appstats.NewHandler(proj.Tags)) + http.Handle("/proj/graph", appstats.NewHandler(proj.Graph)) http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 44b211b..1db8379 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -5,24 +5,79 @@ import ( "time" "appengine" + + "github.com/bign8/chive-show/app/cron/proj/graph" ) // Graph processes all posts in attempt to create a graph func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() - // pages := puller(c) - // dirtyTags := getNod(c, pages, 100) - // tags := cleaner(dirtyTags) - // - // found := map[string]int64{} - // for tag := range tags { - // found[tag]++ - // } - // - // for key, value := range found { - // fmt.Fprintf(w, "%s,%d\n", key, value) - // } + var item Item + var post, ntag, nimg *graph.Node + + idx := 0 + + g := graph.New(false) + for idk := range getItems(c) { + item = idk.(Item) + post = g.Add(item.GUID, graph.NodeType_POST, 0) + + for _, tag := range validTags(item.Tags) { + ntag = g.Add(tag, graph.NodeType_TAG, 0) + g.Connect(post, ntag, 0) + } + + for _, img := range item.Imgs { + nimg = g.Add(img, graph.NodeType_IMG, 0) + g.Connect(post, nimg, 0) + } + + // This is a SLOW/DEBUG only operation + if idx%2000 == 0 { + c.Infof("Current Duration (%v)", time.Since(start)) + } + idx++ + } + + // Write result + bits, err := g.Bytes() + if err != nil { + c.Errorf("Error in Graph.Bytes: %v", err) + } + w.Write(bits) + + // Count types of nodes + binCtr := make(map[graph.NodeType]uint64) + for _, node := range g.Nodes() { + binCtr[*node.Type]++ + } + + // Log out types of nodes + total := uint64(0) + for key, value := range binCtr { + 
c.Infof("Nodes (%s): %d", key, value) + total += value + } + c.Infof("Nodes (ALL): %d", total) + + // w/dupes w/invalid tags + // 2015/11/15 20:52:26 INFO: Nodes (IMG): 928728 + // 2015/11/15 20:52:26 INFO: Nodes (TAG): 244212 + // 2015/11/15 20:52:26 INFO: Nodes (POST): 40920 + // 2015/11/15 20:52:26 INFO: Time took: 31.310686059s + + // w/dupes w/o invalid Tags + // 2015/11/15 21:03:06 INFO: Nodes (IMG): 928728 + // 2015/11/15 21:03:06 INFO: Nodes (TAG): 237122 + // 2015/11/15 21:03:06 INFO: Nodes (POST): 40920 + // 2015/11/15 21:03:06 INFO: Time took: 31.850210891s + + // w/o dupes w/o invalid Tags + // 2015/11/15 21:06:18 INFO: Nodes (IMG): 886831 + // 2015/11/15 21:06:18 INFO: Nodes (POST): 40920 + // 2015/11/15 21:06:18 INFO: Nodes (TAG): 18221 + // 2015/11/15 21:06:18 INFO: Time took: 32.651739532s c.Infof("Time took: %v", time.Since(start)) } diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index ee1485e..18a8530 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -1,8 +1,7 @@ -package main +package graph import ( "errors" - "log" "github.com/golang/protobuf/proto" ) @@ -12,7 +11,8 @@ import ( // Graph is the serializable graph we have all been looking for type Graph struct { s *SerialGraph - nodes map[uint64]*Node // Optimal lookup with pointers goes here + nodes map[uint64]*Node // Optimal lookup with pointers goes here + dupes map[NodeType]map[string]*Node // type > value > node } // New creates a new Graph @@ -24,11 +24,21 @@ func New(isDirected bool) *Graph { NodeCount: proto.Uint64(0), }, nodes: make(map[uint64]*Node), + dupes: make(map[NodeType]map[string]*Node), } } // Add creates and adds a node to the graph func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { + + // Check duplicate node (add weight) + dupe := g.dupes[ttype][value] + if dupe != nil { + *dupe.Weight += weight + return dupe + } + + // Create new node n := &Node{ Id: proto.Uint64(g.genNodeID()), Value: proto.String(value), @@ -38,11 +48,18 @@ func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { } g.nodes[*n.Id] = n g.s.Nodes = append(g.s.Nodes, n) + + // Add dupe check to list + if g.dupes[ttype] == nil { + g.dupes[ttype] = make(map[string]*Node) + } + g.dupes[ttype][value] = n return n } // Connect connects nodes to and from with an edge of weight w func (g *Graph) Connect(to, from *Node, weight int64) error { + // TODO: collision checks if to == nil || from == nil { return errors.New("Cannot add edge to nil node") } @@ -62,13 +79,25 @@ func (g *Graph) genNodeID() (id uint64) { return id } +// Nodes returns all the nodes in the Graph +func (g *Graph) Nodes() []*Node { + n := make([]*Node, len(g.nodes)) + ctr := 0 + for _, node := range g.nodes { + n[ctr] = node + ctr++ + } + return n +} + // DecodeGraph hydrates a graph from a serialized format (returned by Bytes()). 
func DecodeGraph(data []byte) (*Graph, error) { sg, err := DecodeSerialGraph(data) if err != nil { return nil, err } - g := &Graph{sg, make(map[uint64]*Node)} + g := New(false) // Don't care about directed because it's stored on s (assigned below) + g.s = sg // Hydrate Graph from SerialGraph for _, node := range sg.Nodes { @@ -83,28 +112,28 @@ func (g *Graph) Bytes() ([]byte, error) { return g.s.Bytes() } -func main() { - log.Println("Do stuff...") - - graph := New(false) - a := graph.Add("http://super-stupid-long-url.com/more-crap-over-here1", NodeType_UNKNOWN, 0) - b := graph.Add("http://super-stupid-long-url.com/more-crap-over-here2", NodeType_UNKNOWN, 0) - graph.Connect(a, b, 0) - - // Compress - bits, err := graph.Bytes() - if err != nil { - panic(err) - } - - // Decompress - result, err := DecodeGraph(bits) - if err != nil { - panic(err) - } - - // Compare - log.Printf("Message (%d): %q", len(bits), string(bits)) - log.Printf("Digit:\n%v\n%v", graph, result) - log.Printf("Nodes:\n%v\n%v", graph.s.Nodes, result.s.Nodes) -} +// func main() { +// log.Println("Do stuff...") +// +// graph := New(false) +// a := graph.Add("http://super-stupid-long-url.com/more-crap-over-here1", NodeType_UNKNOWN, 0) +// b := graph.Add("http://super-stupid-long-url.com/more-crap-over-here2", NodeType_UNKNOWN, 0) +// graph.Connect(a, b, 0) +// +// // Compress +// bits, err := graph.Bytes() +// if err != nil { +// panic(err) +// } +// +// // Decompress +// result, err := DecodeGraph(bits) +// if err != nil { +// panic(err) +// } +// +// // Compare +// log.Printf("Message (%d): %q", len(bits), string(bits)) +// log.Printf("Digit:\n%v\n%v", graph, result) +// log.Printf("Nodes:\n%v\n%v", graph.s.Nodes, result.s.Nodes) +// } diff --git a/app/cron/proj/graph/graph.pb.go b/app/cron/proj/graph/graph.pb.go index 75953ae..3d661a5 100644 --- a/app/cron/proj/graph/graph.pb.go +++ b/app/cron/proj/graph/graph.pb.go @@ -12,7 +12,7 @@ It has these top-level messages: SerialGraph Node */ -package main +package graph import proto "github.com/golang/protobuf/proto" import fmt "fmt" diff --git a/app/cron/proj/graph/graph.proto b/app/cron/proj/graph/graph.proto index ca34de3..d5693dd 100644 --- a/app/cron/proj/graph/graph.proto +++ b/app/cron/proj/graph/graph.proto @@ -1,4 +1,4 @@ -package main; +package graph; message SerialGraph { repeated Node nodes = 1; diff --git a/app/cron/proj/graph/serialGraph.go b/app/cron/proj/graph/serialGraph.go index a251871..af059f2 100644 --- a/app/cron/proj/graph/serialGraph.go +++ b/app/cron/proj/graph/serialGraph.go @@ -1,4 +1,4 @@ -package main +package graph import ( "bytes" From a964f986b6609b72a2fa3d379af543a6df26999e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Sun, 15 Nov 2015 20:07:44 -0700 Subject: [PATCH 16/26] Adding datastore based sharder --- app/cron/cron.go | 1 + app/cron/proj/graph.go | 62 ++++++++++++++++++----- app/cron/proj/graph/graph.go | 8 +-- app/helpers/sharder/reader.go | 38 ++++++++++++++ app/helpers/sharder/sharder.go | 68 +++++++++++++++++++++++++ app/helpers/sharder/writer.go | 91 ++++++++++++++++++++++++++++++++++ 6 files changed, 253 insertions(+), 15 deletions(-) create mode 100644 app/helpers/sharder/reader.go create mode 100644 app/helpers/sharder/sharder.go create mode 100644 app/helpers/sharder/writer.go diff --git a/app/cron/cron.go b/app/cron/cron.go index f4953f2..45740a7 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -48,6 +48,7 @@ func Init() { http.Handle("/proj/tags", appstats.NewHandler(proj.Tags)) http.Handle("/proj/graph", 
appstats.NewHandler(proj.Graph)) + http.Handle("/proj/shard", appstats.NewHandler(proj.TestShard)) http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 1db8379..ec82ebd 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -7,8 +7,41 @@ import ( "appengine" "github.com/bign8/chive-show/app/cron/proj/graph" + "github.com/bign8/chive-show/app/helpers/sharder" ) +// TestShard to delete +func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { + start := time.Now() + + s, err := sharder.NewWriter(c, "test") + if err != nil { + c.Errorf("Writer Error: %s", err) + return + } + + s.Write([]byte("012345678901234567890")) + s.Close() + + key, err := s.Key() + if err != nil { + c.Errorf("Error in Key: %s", err) + } + c.Infof("Has Key: %s", key) + + c.Infof("Write took: %v", time.Since(start)) + start = time.Now() + + read, err := sharder.Reader(c, "test") + if err != nil { + c.Errorf("Reader Error: %s", err) + return + } + c.Infof("Data: %q", read.String()) + + c.Infof("Read took: %v", time.Since(start)) +} + // Graph processes all posts in attempt to create a graph func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() @@ -62,22 +95,27 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { c.Infof("Nodes (ALL): %d", total) // w/dupes w/invalid tags - // 2015/11/15 20:52:26 INFO: Nodes (IMG): 928728 - // 2015/11/15 20:52:26 INFO: Nodes (TAG): 244212 - // 2015/11/15 20:52:26 INFO: Nodes (POST): 40920 - // 2015/11/15 20:52:26 INFO: Time took: 31.310686059s + // INFO: Nodes (IMG): 928728 + // INFO: Nodes (TAG): 244212 + // INFO: Nodes (POST): 40920 + // INFO: Nodes (ALL): 1213860 + // INFO: Time took: 31.310686059s // w/dupes w/o invalid Tags - // 2015/11/15 21:03:06 INFO: Nodes (IMG): 928728 - // 2015/11/15 21:03:06 INFO: Nodes (TAG): 237122 - // 2015/11/15 21:03:06 INFO: Nodes (POST): 40920 - // 2015/11/15 21:03:06 INFO: Time took: 31.850210891s + // INFO: Nodes (IMG): 928728 + // INFO: Nodes (TAG): 237122 + // INFO: Nodes (POST): 40920 + // INFO: Nodes (ALL): 1206770 + // INFO: Time took: 31.850210891s // w/o dupes w/o invalid Tags - // 2015/11/15 21:06:18 INFO: Nodes (IMG): 886831 - // 2015/11/15 21:06:18 INFO: Nodes (POST): 40920 - // 2015/11/15 21:06:18 INFO: Nodes (TAG): 18221 - // 2015/11/15 21:06:18 INFO: Time took: 32.651739532s + // INFO: Nodes (IMG): 886831 + // INFO: Nodes (POST): 40920 + // INFO: Nodes (TAG): 18221 + // INFO: Nodes (ALL): 945972 + // INFO: Time took: 32.651739532s + + // TODO: write to sharded datastore entity c.Infof("Time took: %v", time.Since(start)) } diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 18a8530..65950ab 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -50,10 +50,12 @@ func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { g.s.Nodes = append(g.s.Nodes, n) // Add dupe check to list - if g.dupes[ttype] == nil { - g.dupes[ttype] = make(map[string]*Node) + dub, ok := g.dupes[ttype] + if !ok { + dub = make(map[string]*Node) + g.dupes[ttype] = dub } - g.dupes[ttype][value] = n + dub[value] = n return n } diff --git a/app/helpers/sharder/reader.go b/app/helpers/sharder/reader.go new file mode 100644 index 0000000..58fe6d2 --- /dev/null +++ b/app/helpers/sharder/reader.go @@ -0,0 +1,38 @@ +package sharder + +import ( + "bytes" + + "appengine" + "appengine/datastore" +) + +// 
Reader creates a new shard reader to retrieve data from datastore +func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { + if name == "" { + return nil, ErrInvalidName + } + + var master shardMaster + if err := datastore.Get(c, masterKey(c, name), &master); err != nil { + panic(err) + return nil, err + } + + data := make([]byte, master.Size) + for i := 0; i < master.Shards; i++ { + var shardData shard + if err := datastore.Get(c, shardKey(c, name, i), &shardData); err != nil { + return nil, err + } + c.Infof("Out Data %d: %q", i, string(shardData.Data)) + + end := i*divisor + divisor + if end > master.Size { + end = master.Size + } + copy(data[i*divisor:end], shardData.Data) + } + + return bytes.NewBuffer(data), nil +} diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go new file mode 100644 index 0000000..2dcaa90 --- /dev/null +++ b/app/helpers/sharder/sharder.go @@ -0,0 +1,68 @@ +package sharder + +import ( + "errors" + "fmt" + "time" + + "appengine" + "appengine/datastore" +) + +const ( + masterKind = "shard-master" + shardKind = "shard-pieces" + divisor = 10 // 9e6 +) + +// ErrInvalidName because reasons +var ErrInvalidName = errors.New("Must provide name of sharded item") + +// ShardKey is an identifying string for shards +type ShardKey string + +func (sk *ShardKey) String() string { + return fmt.Sprint(*sk) +} + +// newKey takes the name of a file and creates a ShardKey +func newKey(c appengine.Context, name string) ShardKey { + return ShardKey(masterKey(c, name).Encode()) +} + +func masterKey(c appengine.Context, name string) *datastore.Key { + return datastore.NewKey(c, masterKind, name, 0, nil) +} + +func shardKey(c appengine.Context, name string, idx int) *datastore.Key { + return datastore.NewKey(c, shardKind, fmt.Sprintf("%s-%d", name, idx), 0, nil) +} + +// ShardInfo implements the io.writer interface and allows for sharding data +type ShardInfo struct { + Key ShardKey + CreationTime time.Time + Size int + MD5 string +} + +type shardMaster struct { + Name string `datastore:"name"` + Stamp time.Time `datastore:"stamp"` + Shards int `datastore:"shards"` + MD5 string `datastore:"md5_hash"` + Size int `datastore:"size"` +} + +func (sm *shardMaster) toInfo(c appengine.Context) *ShardInfo { + return &ShardInfo{ + Key: newKey(c, sm.Name), + CreationTime: sm.Stamp, + Size: sm.Size, + MD5: sm.MD5, + } +} + +type shard struct { + Data []byte +} diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go new file mode 100644 index 0000000..08aa219 --- /dev/null +++ b/app/helpers/sharder/writer.go @@ -0,0 +1,91 @@ +package sharder + +import ( + "bytes" + "errors" + "time" + + "appengine" + "appengine/datastore" +) + +// NewWriter creates a new Sharder to write sharded data to datastore +func NewWriter(c appengine.Context, name string) (*Writer, error) { + if name == "" { + return nil, ErrInvalidName + } + return &Writer{ + ctx: c, + key: nil, + buff: bytes.NewBufferString(""), + name: name, + }, nil +} + +// Writer is the item that deals with writing sharded data +type Writer struct { + buff *bytes.Buffer + ctx appengine.Context + key *ShardKey + name string +} + +// Write pushed p bytes to underlying data stream. +func (w *Writer) Write(p []byte) (n int, err error) { + if w.buff == nil { + return 0, errors.New("Buffer is closed") + } + return w.buff.Write(p) +} + +// Close finishes off the current buffer, shards and stores the data. +// Once Close is called, the user may call Key to get the key of the stored object. 
+func (w *Writer) Close() error { + // TODO: datastore.RunInTransaction + go-routines with waitGroups + + length := w.buff.Len() + shards := (length-1)/divisor + 1 + key := masterKey(w.ctx, w.name) + + // Store shardMaster + master := shardMaster{ + Name: w.name, + Stamp: time.Now(), + Shards: shards, + MD5: "TO-IMPLEMENT", + Size: length, + } + if _, err := datastore.Put(w.ctx, key, &master); err != nil { + panic(err) + return err + } + + // shard data and store shards + data := w.buff.Bytes() + for i := 0; i < shards; i++ { + shardKey := shardKey(w.ctx, w.name, i) + shardData := data[i*divisor:] + if len(shardData) > divisor { + shardData = data[:divisor] + } + s := shard{shardData} + w.ctx.Infof("Inn Data %d: %q", i, s.Data) + if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { + panic(err) + return err + } + } + + w.key = new(ShardKey) + *w.key = ShardKey(key.Encode()) + w.buff = nil + return nil +} + +// Key returns the key of the sharded data. Note: will return an error if not Closed. +func (w *Writer) Key() (*ShardKey, error) { + if w.key == nil { + return nil, errors.New("Writer must be closed before a Key is available") + } + return w.key, nil +} From 3429d97935b912778c8034bf504073c0489e41b2 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 00:49:17 -0700 Subject: [PATCH 17/26] Pulling in go-routines --- app/cron/cron.go | 4 +++- app/cron/proj/graph.go | 20 +++++++------------- app/helpers/sharder/reader.go | 30 +++++++++++++++++++----------- app/helpers/sharder/sharder.go | 2 +- 4 files changed, 30 insertions(+), 26 deletions(-) diff --git a/app/cron/cron.go b/app/cron/cron.go index 45740a7..068d867 100644 --- a/app/cron/cron.go +++ b/app/cron/cron.go @@ -50,12 +50,14 @@ func Init() { http.Handle("/proj/graph", appstats.NewHandler(proj.Graph)) http.Handle("/proj/shard", appstats.NewHandler(proj.TestShard)) - http.HandleFunc("/clean", func(w http.ResponseWriter, r *http.Request) { + http.HandleFunc("/cron/clean", func(w http.ResponseWriter, r *http.Request) { c := appengine.NewContext(r) cleanup(c, "buff") cleanup(c, "edge") cleanup(c, "vertex") cleanup(c, "post") + cleanup(c, "shard-pieces") + cleanup(c, "shard-master") }) http.Handle("/cron/stats", appstats.NewHandler(crawler.Stats)) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index ec82ebd..ffc86df 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -2,6 +2,7 @@ package proj import ( "net/http" + "strings" "time" "appengine" @@ -12,33 +13,26 @@ import ( // TestShard to delete func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { - start := time.Now() + // Writing + start := time.Now() s, err := sharder.NewWriter(c, "test") if err != nil { c.Errorf("Writer Error: %s", err) return } - - s.Write([]byte("012345678901234567890")) + s.Write([]byte(strings.Repeat("01234567890123456789", 1e6))) s.Close() - - key, err := s.Key() - if err != nil { - c.Errorf("Error in Key: %s", err) - } - c.Infof("Has Key: %s", key) - c.Infof("Write took: %v", time.Since(start)) - start = time.Now() + // Reading + start = time.Now() read, err := sharder.Reader(c, "test") if err != nil { c.Errorf("Reader Error: %s", err) return } - c.Infof("Data: %q", read.String()) - + c.Infof("Data Length: %d", read.Len()) c.Infof("Read took: %v", time.Since(start)) } diff --git a/app/helpers/sharder/reader.go b/app/helpers/sharder/reader.go index 58fe6d2..b9a77c8 100644 --- a/app/helpers/sharder/reader.go +++ b/app/helpers/sharder/reader.go @@ -2,6 +2,7 @@ package sharder import ( "bytes" 
+ "sync" "appengine" "appengine/datastore" @@ -18,21 +19,28 @@ func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { panic(err) return nil, err } + shards := (master.Size-1)/divisor + 1 + var wg sync.WaitGroup + wg.Add(shards) data := make([]byte, master.Size) - for i := 0; i < master.Shards; i++ { - var shardData shard - if err := datastore.Get(c, shardKey(c, name, i), &shardData); err != nil { - return nil, err - } - c.Infof("Out Data %d: %q", i, string(shardData.Data)) + for i := 0; i < shards; i++ { + go func(i int) { + var shardData shard + if err := datastore.Get(c, shardKey(c, name, i), &shardData); err != nil { + panic(err) + } + // c.Infof("Out Data %d: %q", i, string(shardData.Data)) - end := i*divisor + divisor - if end > master.Size { - end = master.Size - } - copy(data[i*divisor:end], shardData.Data) + end := i*divisor + divisor + if end > master.Size { + end = master.Size + } + copy(data[i*divisor:end], shardData.Data) + wg.Done() + }(i) } + wg.Wait() return bytes.NewBuffer(data), nil } diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index 2dcaa90..4b538a9 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -12,7 +12,7 @@ import ( const ( masterKind = "shard-master" shardKind = "shard-pieces" - divisor = 10 // 9e6 + divisor = 1e6 // 1MB ) // ErrInvalidName because reasons From 5781d36aafb5147bf522d0dfd3ce867a60f09928 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 00:57:37 -0700 Subject: [PATCH 18/26] Removing un-used master data --- app/helpers/sharder/sharder.go | 36 +----------------------- app/helpers/sharder/writer.go | 50 ++++++++++++++-------------------- 2 files changed, 21 insertions(+), 65 deletions(-) diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index 4b538a9..ba30d7a 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -3,7 +3,6 @@ package sharder import ( "errors" "fmt" - "time" "appengine" "appengine/datastore" @@ -18,18 +17,6 @@ const ( // ErrInvalidName because reasons var ErrInvalidName = errors.New("Must provide name of sharded item") -// ShardKey is an identifying string for shards -type ShardKey string - -func (sk *ShardKey) String() string { - return fmt.Sprint(*sk) -} - -// newKey takes the name of a file and creates a ShardKey -func newKey(c appengine.Context, name string) ShardKey { - return ShardKey(masterKey(c, name).Encode()) -} - func masterKey(c appengine.Context, name string) *datastore.Key { return datastore.NewKey(c, masterKind, name, 0, nil) } @@ -38,29 +25,8 @@ func shardKey(c appengine.Context, name string, idx int) *datastore.Key { return datastore.NewKey(c, shardKind, fmt.Sprintf("%s-%d", name, idx), 0, nil) } -// ShardInfo implements the io.writer interface and allows for sharding data -type ShardInfo struct { - Key ShardKey - CreationTime time.Time - Size int - MD5 string -} - type shardMaster struct { - Name string `datastore:"name"` - Stamp time.Time `datastore:"stamp"` - Shards int `datastore:"shards"` - MD5 string `datastore:"md5_hash"` - Size int `datastore:"size"` -} - -func (sm *shardMaster) toInfo(c appengine.Context) *ShardInfo { - return &ShardInfo{ - Key: newKey(c, sm.Name), - CreationTime: sm.Stamp, - Size: sm.Size, - MD5: sm.MD5, - } + Size int `datastore:"size"` } type shard struct { diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go index 08aa219..e33e6b0 100644 --- a/app/helpers/sharder/writer.go +++ b/app/helpers/sharder/writer.go @@ -3,7 
+3,7 @@ package sharder import ( "bytes" "errors" - "time" + "sync" "appengine" "appengine/datastore" @@ -16,7 +16,6 @@ func NewWriter(c appengine.Context, name string) (*Writer, error) { } return &Writer{ ctx: c, - key: nil, buff: bytes.NewBufferString(""), name: name, }, nil @@ -26,7 +25,6 @@ func NewWriter(c appengine.Context, name string) (*Writer, error) { type Writer struct { buff *bytes.Buffer ctx appengine.Context - key *ShardKey name string } @@ -41,7 +39,8 @@ func (w *Writer) Write(p []byte) (n int, err error) { // Close finishes off the current buffer, shards and stores the data. // Once Close is called, the user may call Key to get the key of the stored object. func (w *Writer) Close() error { - // TODO: datastore.RunInTransaction + go-routines with waitGroups + // TODO: datastore.RunInTransaction + // TODO: delete existing shards greater than current length := w.buff.Len() shards := (length-1)/divisor + 1 @@ -49,11 +48,7 @@ func (w *Writer) Close() error { // Store shardMaster master := shardMaster{ - Name: w.name, - Stamp: time.Now(), - Shards: shards, - MD5: "TO-IMPLEMENT", - Size: length, + Size: length, } if _, err := datastore.Put(w.ctx, key, &master); err != nil { panic(err) @@ -62,30 +57,25 @@ func (w *Writer) Close() error { // shard data and store shards data := w.buff.Bytes() + var wg sync.WaitGroup + wg.Add(shards) for i := 0; i < shards; i++ { - shardKey := shardKey(w.ctx, w.name, i) - shardData := data[i*divisor:] - if len(shardData) > divisor { - shardData = data[:divisor] - } - s := shard{shardData} - w.ctx.Infof("Inn Data %d: %q", i, s.Data) - if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { - panic(err) - return err - } + go func(i int) { + shardKey := shardKey(w.ctx, w.name, i) + shardData := data[i*divisor:] + if len(shardData) > divisor { + shardData = data[:divisor] + } + s := shard{shardData} + // w.ctx.Infof("Inn Data %d: %q", i, s.Data) + if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { + panic(err) + } + wg.Done() + }(i) } - w.key = new(ShardKey) - *w.key = ShardKey(key.Encode()) + wg.Wait() w.buff = nil return nil } - -// Key returns the key of the sharded data. Note: will return an error if not Closed. 
-func (w *Writer) Key() (*ShardKey, error) { - if w.key == nil { - return nil, errors.New("Writer must be closed before a Key is available") - } - return w.key, nil -} From b4dec061ac51c4af7c3c8734571098dca5595250 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 01:12:19 -0700 Subject: [PATCH 19/26] Removing un-needed writer complexity --- app/cron/proj/graph.go | 9 +++--- app/helpers/sharder/reader.go | 8 ++---- app/helpers/sharder/sharder.go | 9 ++++++ app/helpers/sharder/writer.go | 52 ++++++---------------------------- 4 files changed, 25 insertions(+), 53 deletions(-) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index ffc86df..0bd49de 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -1,6 +1,7 @@ package proj import ( + "bytes" "net/http" "strings" "time" @@ -14,15 +15,15 @@ import ( // TestShard to delete func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { + data := []byte(strings.Repeat("01234567890123456789", 1e6)) + // Writing start := time.Now() - s, err := sharder.NewWriter(c, "test") + err := sharder.Writer(c, "test", data) if err != nil { c.Errorf("Writer Error: %s", err) return } - s.Write([]byte(strings.Repeat("01234567890123456789", 1e6))) - s.Close() c.Infof("Write took: %v", time.Since(start)) // Reading @@ -32,7 +33,7 @@ func TestShard(c appengine.Context, w http.ResponseWriter, r *http.Request) { c.Errorf("Reader Error: %s", err) return } - c.Infof("Data Length: %d", read.Len()) + c.Infof("Data Length: %d; isSame: %v", len(read), bytes.Equal(read, data)) c.Infof("Read took: %v", time.Since(start)) } diff --git a/app/helpers/sharder/reader.go b/app/helpers/sharder/reader.go index b9a77c8..29c9950 100644 --- a/app/helpers/sharder/reader.go +++ b/app/helpers/sharder/reader.go @@ -1,7 +1,6 @@ package sharder import ( - "bytes" "sync" "appengine" @@ -9,7 +8,7 @@ import ( ) // Reader creates a new shard reader to retrieve data from datastore -func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { +func Reader(c appengine.Context, name string) ([]byte, error) { if name == "" { return nil, ErrInvalidName } @@ -19,7 +18,7 @@ func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { panic(err) return nil, err } - shards := (master.Size-1)/divisor + 1 + shards := numShards(master.Size) var wg sync.WaitGroup wg.Add(shards) @@ -41,6 +40,5 @@ func Reader(c appengine.Context, name string) (*bytes.Buffer, error) { }(i) } wg.Wait() - - return bytes.NewBuffer(data), nil + return data, nil } diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index ba30d7a..b14fc55 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -8,6 +8,11 @@ import ( "appengine/datastore" ) +// TODO: datastore.RunInTransaction +// TODO: delete existing shards greater than current +// TODO: don't panic and actually use error chans +// TODO: possibly use put and get multi for up to 10MB + const ( masterKind = "shard-master" shardKind = "shard-pieces" @@ -25,6 +30,10 @@ func shardKey(c appengine.Context, name string, idx int) *datastore.Key { return datastore.NewKey(c, shardKind, fmt.Sprintf("%s-%d", name, idx), 0, nil) } +func numShards(size int) int { + return (size-1)/divisor + 1 +} + type shardMaster struct { Size int `datastore:"size"` } diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go index e33e6b0..d9aef2e 100644 --- a/app/helpers/sharder/writer.go +++ b/app/helpers/sharder/writer.go @@ -1,74 +1,39 @@ package sharder import ( - 
"bytes" - "errors" "sync" "appengine" "appengine/datastore" ) -// NewWriter creates a new Sharder to write sharded data to datastore -func NewWriter(c appengine.Context, name string) (*Writer, error) { +// Writer shards and stores a byte String +func Writer(c appengine.Context, name string, data []byte) error { if name == "" { - return nil, ErrInvalidName + return ErrInvalidName } - return &Writer{ - ctx: c, - buff: bytes.NewBufferString(""), - name: name, - }, nil -} - -// Writer is the item that deals with writing sharded data -type Writer struct { - buff *bytes.Buffer - ctx appengine.Context - name string -} - -// Write pushed p bytes to underlying data stream. -func (w *Writer) Write(p []byte) (n int, err error) { - if w.buff == nil { - return 0, errors.New("Buffer is closed") - } - return w.buff.Write(p) -} -// Close finishes off the current buffer, shards and stores the data. -// Once Close is called, the user may call Key to get the key of the stored object. -func (w *Writer) Close() error { - // TODO: datastore.RunInTransaction - // TODO: delete existing shards greater than current - - length := w.buff.Len() - shards := (length-1)/divisor + 1 - key := masterKey(w.ctx, w.name) + master := shardMaster{len(data)} + shards := numShards(master.Size) // Store shardMaster - master := shardMaster{ - Size: length, - } - if _, err := datastore.Put(w.ctx, key, &master); err != nil { - panic(err) + if _, err := datastore.Put(c, masterKey(c, name), &master); err != nil { return err } // shard data and store shards - data := w.buff.Bytes() var wg sync.WaitGroup wg.Add(shards) for i := 0; i < shards; i++ { go func(i int) { - shardKey := shardKey(w.ctx, w.name, i) + shardKey := shardKey(c, name, i) shardData := data[i*divisor:] if len(shardData) > divisor { shardData = data[:divisor] } s := shard{shardData} // w.ctx.Infof("Inn Data %d: %q", i, s.Data) - if _, err := datastore.Put(w.ctx, shardKey, &s); err != nil { + if _, err := datastore.Put(c, shardKey, &s); err != nil { panic(err) } wg.Done() @@ -76,6 +41,5 @@ func (w *Writer) Close() error { } wg.Wait() - w.buff = nil return nil } From 2c19198a89de06718f043ba8721464476ebb230e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 01:36:41 -0700 Subject: [PATCH 20/26] Attempting to add a unit test --- app/helpers/sharder/sharder_test.go | 51 +++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) create mode 100644 app/helpers/sharder/sharder_test.go diff --git a/app/helpers/sharder/sharder_test.go b/app/helpers/sharder/sharder_test.go new file mode 100644 index 0000000..a66928f --- /dev/null +++ b/app/helpers/sharder/sharder_test.go @@ -0,0 +1,51 @@ +package sharder + +import ( + "bytes" + "strings" + "testing" + + "appengine/aetest" +) + +func TestFullCircle(t *testing.T) { + // TODO: verify 20 shards + + c, err := aetest.NewContext(nil) + if err != nil { + t.Fatal(err) + } + defer c.Close() + + data := []byte(strings.Repeat("01234567890123456789", 1e6)) + + // Writing + err = Writer(c, "test", data) + if err != nil { + t.Fatal(err) + } + + // Reading + read, err := Reader(c, "test") + if err != nil { + t.Fatal(err) + } + + if !bytes.Equal(read, data) { + t.Fail() + } +} + +var test bool + +func BenchmarkFullCycle(b *testing.B) { + c, _ := aetest.NewContext(nil) + defer c.Close() + data := []byte(strings.Repeat("1", 1e6)) + + for i := 0; i < b.N; i++ { + Writer(c, "test", data) + read, _ := Reader(c, "test") + test = bytes.Equal(read, data) + } +} From c6ef61e4236d8f88f401085ff7dbe4ae39971cd8 Mon Sep 17 00:00:00 2001 From: 
Nate Woods Date: Mon, 16 Nov 2015 22:59:19 -0700 Subject: [PATCH 21/26] Storing the serialized graph --- app/cron/proj/graph.go | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 0bd49de..108a38a 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -45,7 +45,7 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { var post, ntag, nimg *graph.Node idx := 0 - + timeout := time.After(time.Second) g := graph.New(false) for idk := range getItems(c) { item = idk.(Item) @@ -61,19 +61,30 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { g.Connect(post, nimg, 0) } - // This is a SLOW/DEBUG only operation - if idx%2000 == 0 { - c.Infof("Current Duration (%v)", time.Since(start)) + // This is a DEBUG only operation + select { + case <-timeout: + c.Infof("Index: %d; Duration: %v", idx, time.Since(start)) + timeout = time.After(time.Second) + default: } idx++ } + c.Infof("End Loop: %d; Duration: %v", idx, time.Since(start)) // Write result bits, err := g.Bytes() if err != nil { c.Errorf("Error in Graph.Bytes: %v", err) } - w.Write(bits) + c.Infof("End Serialization: Len(%d); Duration: %v", len(bits), time.Since(start)) + + // Storage + if err := sharder.Writer(c, "graph", bits); err != nil { + c.Errorf("Writer Error: %s", err) + return + } + c.Infof("Write Complete; Duration: %v", time.Since(start)) // Count types of nodes binCtr := make(map[graph.NodeType]uint64) From 240c66a4514efdd4657c7265c891edf931dde01b Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Mon, 16 Nov 2015 23:01:31 -0700 Subject: [PATCH 22/26] Adding adjacency duplicate checks --- app/cron/proj/graph/graph.go | 35 ++++++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 7 deletions(-) diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 65950ab..1395277 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -13,6 +13,7 @@ type Graph struct { s *SerialGraph nodes map[uint64]*Node // Optimal lookup with pointers goes here dupes map[NodeType]map[string]*Node // type > value > node + edges map[uint64]map[uint64]bool // Edge duplicate detection } // New creates a new Graph @@ -25,6 +26,7 @@ func New(isDirected bool) *Graph { }, nodes: make(map[uint64]*Node), dupes: make(map[NodeType]map[string]*Node), + edges: make(map[uint64]map[uint64]bool), } } @@ -60,17 +62,24 @@ func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { } // Connect connects nodes to and from with an edge of weight w -func (g *Graph) Connect(to, from *Node, weight int64) error { - // TODO: collision checks +func (g *Graph) Connect(from, to *Node, weight int64) error { if to == nil || from == nil { return errors.New("Cannot add edge to nil node") } - from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge - from.Weights = append(from.Weights, weight) - if !g.s.GetDirected() { // UnDirected edge (return trip) - to.Adjacent = append(to.Adjacent, *from.Id) - to.Weights = append(to.Weights, weight) + mm := g.edges[*from.Id] + if mm == nil { + mm = make(map[uint64]bool) + g.edges[*from.Id] = mm + } + if !mm[*to.Id] { + from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge + from.Weights = append(from.Weights, weight) + mm[*to.Id] = true + } + + if !g.s.GetDirected() && !g.edges[*to.Id][*from.Id] { // UnDirected edge (return trip) + g.Connect(to, from, weight) } return nil } @@ -104,6 +113,18 @@ func DecodeGraph(data []byte) 
(*Graph, error) { // Hydrate Graph from SerialGraph for _, node := range sg.Nodes { g.nodes[*node.Id] = node + + // initialize node adjacency map + mm := g.edges[*node.Id] + if mm == nil { + mm = make(map[uint64]bool) + g.edges[*node.Id] = mm + } + + // populate node adjacency map + for _, adjID := range node.GetAdjacent() { + mm[adjID] = true + } } return g, nil } From 6861dad09451343650f1baf2fd045c41c6499af7 Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Wed, 18 Nov 2015 18:51:15 -0700 Subject: [PATCH 23/26] Fixing shards to not leave dead data in storage on write --- app/helpers/sharder/sharder.go | 2 +- app/helpers/sharder/writer.go | 22 +++++++++++++++++++--- 2 files changed, 20 insertions(+), 4 deletions(-) diff --git a/app/helpers/sharder/sharder.go b/app/helpers/sharder/sharder.go index b14fc55..4fc6316 100644 --- a/app/helpers/sharder/sharder.go +++ b/app/helpers/sharder/sharder.go @@ -39,5 +39,5 @@ type shardMaster struct { } type shard struct { - Data []byte + Data []byte `datastore:"data"` } diff --git a/app/helpers/sharder/writer.go b/app/helpers/sharder/writer.go index d9aef2e..7d018e0 100644 --- a/app/helpers/sharder/writer.go +++ b/app/helpers/sharder/writer.go @@ -13,11 +13,18 @@ func Writer(c appengine.Context, name string, data []byte) error { return ErrInvalidName } - master := shardMaster{len(data)} - shards := numShards(master.Size) + // Attempt to get existing key + key := masterKey(c, name) + oldMaster := shardMaster{} + oldShards := 0 + if datastore.Get(c, key, &oldMaster) == nil { + oldShards = numShards(oldMaster.Size) + } // Store shardMaster - if _, err := datastore.Put(c, masterKey(c, name), &master); err != nil { + master := shardMaster{len(data)} + shards := numShards(master.Size) + if _, err := datastore.Put(c, key, &master); err != nil { return err } @@ -40,6 +47,15 @@ func Writer(c appengine.Context, name string, data []byte) error { }(i) } + // Delete shards that shouldn't be in datastore (write something smaller than before) + if oldShards > shards { + keys := make([]*datastore.Key, oldShards-shards) + for i := shards; i < oldShards; i++ { + keys[i-shards] = shardKey(c, name, i) + } + datastore.DeleteMulti(c, keys) + } + wg.Wait() return nil } From b0398f932125ee509fc0a95506265a72c337e36e Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Wed, 18 Nov 2015 19:14:22 -0700 Subject: [PATCH 24/26] Fixing flatten problems --- app/cron/proj/proj.go | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/app/cron/proj/proj.go b/app/cron/proj/proj.go index 6e56ec3..525bfd9 100644 --- a/app/cron/proj/proj.go +++ b/app/cron/proj/proj.go @@ -1,6 +1,7 @@ package proj import ( + "bytes" "encoding/xml" "appengine" @@ -62,8 +63,15 @@ func flatten(c appengine.Context) chain.Worker { var xmlPage XMLPage var imgs []string + // Clean FormFeed characters from data + data := bytes.Replace(obj.([]byte), []byte("\u000C"), nil, -1) + + // Start up decoder + decoder := xml.NewDecoder(bytes.NewReader(data)) + decoder.Entity = xml.HTMLEntity + // Parse the XML of an object - if err := xml.Unmarshal(obj.([]byte), &xmlPage); err != nil { + if err := decoder.Decode(&xmlPage); err != nil { c.Errorf("Flatten %d: %v", idx, err) return } From 348600f675e5cb4d493cb99fad324f19b22b547d Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Thu, 19 Nov 2015 21:09:59 -0700 Subject: [PATCH 25/26] Adding edge weights when duped --- app/cron/proj/graph.go | 8 ++++---- app/cron/proj/graph/graph.go | 32 ++++++++++++++++++++++++++------ 2 files changed, 30 insertions(+), 10 deletions(-) 
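Note on the hunks below: they bump the tag/img hookups from weight 0 to weight 1 and teach connect() to fold a repeated edge into the existing adjacency entry (adding to its weight) instead of appending a duplicate. A minimal sketch of the intended behaviour, using the API as it stands at this point in the series (*Node handles; the GUID and tag values here are made up):

    g := graph.New(false)
    post := g.Add("post-guid", graph.NodeType_POST, 1)
    tag := g.Add("funny", graph.NodeType_TAG, 1)
    g.Connect(post, tag, 1)
    g.Connect(post, tag, 1) // same pair again: the existing edge's weight becomes 2

The linear scan over from.Adjacent that this requires is flagged in the code as slow for highly connected nodes; the proto3 migration in the final patch replaces it with a map lookup.
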
diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 108a38a..5507f3c 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -52,13 +52,13 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { post = g.Add(item.GUID, graph.NodeType_POST, 0) for _, tag := range validTags(item.Tags) { - ntag = g.Add(tag, graph.NodeType_TAG, 0) - g.Connect(post, ntag, 0) + ntag = g.Add(tag, graph.NodeType_TAG, 1) + g.Connect(post, ntag, 1) } for _, img := range item.Imgs { - nimg = g.Add(img, graph.NodeType_IMG, 0) - g.Connect(post, nimg, 0) + nimg = g.Add(img, graph.NodeType_IMG, 1) + g.Connect(post, nimg, 1) } // This is a DEBUG only operation diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 1395277..08a9876 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -66,22 +66,34 @@ func (g *Graph) Connect(from, to *Node, weight int64) error { if to == nil || from == nil { return errors.New("Cannot add edge to nil node") } + g.connect(from, to, weight) // Directed edge + if !g.s.GetDirected() { + g.connect(to, from, weight) // UnDirected edge (return trip) + } + return nil +} +func (g *Graph) connect(from, to *Node, weight int64) { mm := g.edges[*from.Id] if mm == nil { mm = make(map[uint64]bool) g.edges[*from.Id] = mm } if !mm[*to.Id] { - from.Adjacent = append(from.Adjacent, *to.Id) // Directed edge + from.Adjacent = append(from.Adjacent, *to.Id) from.Weights = append(from.Weights, weight) mm[*to.Id] = true + } else { + // This si SUPER SLOW for highly connected nodes. TODO: make this not suck + idx := 0 + for i, nodeID := range from.Adjacent { + if nodeID == *to.Id { + idx = i + break + } + } + from.Weights[idx] += weight } - - if !g.s.GetDirected() && !g.edges[*to.Id][*from.Id] { // UnDirected edge (return trip) - g.Connect(to, from, weight) - } - return nil } func (g *Graph) genNodeID() (id uint64) { @@ -114,6 +126,14 @@ func DecodeGraph(data []byte) (*Graph, error) { for _, node := range sg.Nodes { g.nodes[*node.Id] = node + // Initialize dupes map + nn := g.dupes[node.GetType()] + if nn == nil { + nn = make(map[string]*Node) + g.dupes[node.GetType()] = nn + } + nn[node.GetValue()] = node + // initialize node adjacency map mm := g.edges[*node.Id] if mm == nil { From 8ff6affb7be33d58fa4585876245cff0c84ea94b Mon Sep 17 00:00:00 2001 From: Nate Woods Date: Thu, 19 Nov 2015 22:55:20 -0700 Subject: [PATCH 26/26] Migrating Protobuffers to v3.0.0-beta-1 All maps - no pointers - sweet!!! 
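For reference, the regenerated schema matches the Go struct fields below: SerialGraph.nodes is a map from node id (uint64) to Node, and Node.adjacent is a map from neighbour id (uint64) to edge weight (int64), replacing the parallel Adjacent/Weights slices. A short sketch (illustrative only, not code from the diff; g, post and img are placeholder names) of how that simplifies edge bookkeeping:

    n := g.Get(post)             // NodeID -> *Node lookup
    n.Adjacent[uint64(img)] += 1 // a duplicate edge simply accumulates weight

Add now hands back a NodeID value instead of a *Node, and the scalar fields (Value, Weight, Type) are plain values, so walking the graph no longer needs pointer dereferences.
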
--- app/cron/proj/graph.go | 4 +- app/cron/proj/graph/graph.go | 121 ++++++++++------------------- app/cron/proj/graph/graph.pb.go | 131 +++++++++++--------------------- app/cron/proj/graph/graph.proto | 17 ++--- app/cron/proj/graph/load.sh | 4 +- 5 files changed, 96 insertions(+), 181 deletions(-) diff --git a/app/cron/proj/graph.go b/app/cron/proj/graph.go index 5507f3c..ab7e7aa 100644 --- a/app/cron/proj/graph.go +++ b/app/cron/proj/graph.go @@ -42,7 +42,7 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { start := time.Now() var item Item - var post, ntag, nimg *graph.Node + var post, ntag, nimg graph.NodeID idx := 0 timeout := time.After(time.Second) @@ -89,7 +89,7 @@ func Graph(c appengine.Context, w http.ResponseWriter, r *http.Request) { // Count types of nodes binCtr := make(map[graph.NodeType]uint64) for _, node := range g.Nodes() { - binCtr[*node.Type]++ + binCtr[node.Type]++ } // Log out types of nodes diff --git a/app/cron/proj/graph/graph.go b/app/cron/proj/graph/graph.go index 08a9876..4bcae7a 100644 --- a/app/cron/proj/graph/graph.go +++ b/app/cron/proj/graph/graph.go @@ -1,112 +1,89 @@ package graph -import ( - "errors" - - "github.com/golang/protobuf/proto" -) +import "errors" // TODO: add some graph processing functions +// NodeID is a graph identifier +type NodeID uint64 + // Graph is the serializable graph we have all been looking for type Graph struct { s *SerialGraph - nodes map[uint64]*Node // Optimal lookup with pointers goes here - dupes map[NodeType]map[string]*Node // type > value > node - edges map[uint64]map[uint64]bool // Edge duplicate detection + dupes map[NodeType]map[string]NodeID // type > value > node } // New creates a new Graph func New(isDirected bool) *Graph { return &Graph{ s: &SerialGraph{ - Nodes: make([]*Node, 0), - Directed: proto.Bool(isDirected), - NodeCount: proto.Uint64(0), + Nodes: make(map[uint64]*Node), + Directed: isDirected, + NodeCount: 0, }, - nodes: make(map[uint64]*Node), - dupes: make(map[NodeType]map[string]*Node), - edges: make(map[uint64]map[uint64]bool), + dupes: make(map[NodeType]map[string]NodeID), } } +// Get returns an associated node for a given ID +func (g *Graph) Get(id NodeID) *Node { + return g.s.Nodes[uint64(id)] +} + // Add creates and adds a node to the graph -func (g *Graph) Add(value string, ttype NodeType, weight int64) *Node { +func (g *Graph) Add(value string, ttype NodeType, weight int64) NodeID { // Check duplicate node (add weight) dupe := g.dupes[ttype][value] - if dupe != nil { - *dupe.Weight += weight + if dupe != 0 { + g.Get(dupe).Weight += weight return dupe } // Create new node + id := g.genNodeID() n := &Node{ - Id: proto.Uint64(g.genNodeID()), - Value: proto.String(value), - Weight: proto.Int64(weight), - Type: ttype.Enum(), - Adjacent: make([]uint64, 0), + Value: value, + Weight: weight, + Type: ttype, + Adjacent: make(map[uint64]int64, 0), } - g.nodes[*n.Id] = n - g.s.Nodes = append(g.s.Nodes, n) + g.s.Nodes[id] = n // Add dupe check to list dub, ok := g.dupes[ttype] if !ok { - dub = make(map[string]*Node) + dub = make(map[string]NodeID) g.dupes[ttype] = dub } - dub[value] = n - return n + nid := NodeID(id) + dub[value] = nid + return nid } // Connect connects nodes to and from with an edge of weight w -func (g *Graph) Connect(from, to *Node, weight int64) error { - if to == nil || from == nil { +func (g *Graph) Connect(from, to NodeID, weight int64) error { + if to == 0 || from == 0 { return errors.New("Cannot add edge to nil node") } - g.connect(from, to, weight) // 
Directed edge - if !g.s.GetDirected() { - g.connect(to, from, weight) // UnDirected edge (return trip) + g.Get(from).Adjacent[uint64(to)] += weight // Directed edge + if !g.s.Directed { + g.Get(to).Adjacent[uint64(from)] += weight // UnDirected edge (return trip) } return nil } -func (g *Graph) connect(from, to *Node, weight int64) { - mm := g.edges[*from.Id] - if mm == nil { - mm = make(map[uint64]bool) - g.edges[*from.Id] = mm - } - if !mm[*to.Id] { - from.Adjacent = append(from.Adjacent, *to.Id) - from.Weights = append(from.Weights, weight) - mm[*to.Id] = true - } else { - // This si SUPER SLOW for highly connected nodes. TODO: make this not suck - idx := 0 - for i, nodeID := range from.Adjacent { - if nodeID == *to.Id { - idx = i - break - } - } - from.Weights[idx] += weight - } -} - func (g *Graph) genNodeID() (id uint64) { - id = g.s.GetNodeCount() - *g.s.NodeCount++ + g.s.NodeCount++ + id = g.s.NodeCount return id } // Nodes returns all the nodes in the Graph func (g *Graph) Nodes() []*Node { - n := make([]*Node, len(g.nodes)) + n := make([]*Node, len(g.s.Nodes)) ctr := 0 - for _, node := range g.nodes { + for _, node := range g.s.Nodes { n[ctr] = node ctr++ } @@ -123,35 +100,19 @@ func DecodeGraph(data []byte) (*Graph, error) { g.s = sg // Hydrate Graph from SerialGraph - for _, node := range sg.Nodes { - g.nodes[*node.Id] = node - - // Initialize dupes map - nn := g.dupes[node.GetType()] + for id, node := range sg.Nodes { + nn := g.dupes[node.Type] if nn == nil { - nn = make(map[string]*Node) - g.dupes[node.GetType()] = nn - } - nn[node.GetValue()] = node - - // initialize node adjacency map - mm := g.edges[*node.Id] - if mm == nil { - mm = make(map[uint64]bool) - g.edges[*node.Id] = mm - } - - // populate node adjacency map - for _, adjID := range node.GetAdjacent() { - mm[adjID] = true + nn = make(map[string]NodeID) + g.dupes[node.Type] = nn } + nn[node.Value] = NodeID(id) } return g, nil } // Bytes flattens a graph to a flat file format func (g *Graph) Bytes() ([]byte, error) { - // TODO: use smaller numbers for encoding... return g.s.Bytes() } diff --git a/app/cron/proj/graph/graph.pb.go b/app/cron/proj/graph/graph.pb.go index 3d661a5..9e19a04 100644 --- a/app/cron/proj/graph/graph.pb.go +++ b/app/cron/proj/graph/graph.pb.go @@ -3,7 +3,7 @@ // DO NOT EDIT! /* -Package main is a generated protocol buffer package. +Package graph is a generated protocol buffer package. 
It is generated from these files: graph.proto @@ -48,116 +48,71 @@ var NodeType_value = map[string]int32{ "USER": 4, } -func (x NodeType) Enum() *NodeType { - p := new(NodeType) - *p = x - return p -} func (x NodeType) String() string { return proto.EnumName(NodeType_name, int32(x)) } -func (x *NodeType) UnmarshalJSON(data []byte) error { - value, err := proto.UnmarshalJSONEnum(NodeType_value, data, "NodeType") - if err != nil { - return err - } - *x = NodeType(value) - return nil -} +func (NodeType) EnumDescriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } type SerialGraph struct { - Nodes []*Node `protobuf:"bytes,1,rep,name=nodes" json:"nodes,omitempty"` - Directed *bool `protobuf:"varint,2,opt,name=directed,def=0" json:"directed,omitempty"` - NodeCount *uint64 `protobuf:"varint,3,req,name=nodeCount,def=0" json:"nodeCount,omitempty"` - XXX_unrecognized []byte `json:"-"` + Nodes map[uint64]*Node `protobuf:"bytes,1,rep,name=nodes" json:"nodes,omitempty" protobuf_key:"varint,1,opt,name=key" protobuf_val:"bytes,2,opt,name=value"` + Directed bool `protobuf:"varint,2,opt,name=directed" json:"directed,omitempty"` + NodeCount uint64 `protobuf:"varint,3,opt,name=nodeCount" json:"nodeCount,omitempty"` } -func (m *SerialGraph) Reset() { *m = SerialGraph{} } -func (m *SerialGraph) String() string { return proto.CompactTextString(m) } -func (*SerialGraph) ProtoMessage() {} +func (m *SerialGraph) Reset() { *m = SerialGraph{} } +func (m *SerialGraph) String() string { return proto.CompactTextString(m) } +func (*SerialGraph) ProtoMessage() {} +func (*SerialGraph) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{0} } -const Default_SerialGraph_Directed bool = false -const Default_SerialGraph_NodeCount uint64 = 0 - -func (m *SerialGraph) GetNodes() []*Node { +func (m *SerialGraph) GetNodes() map[uint64]*Node { if m != nil { return m.Nodes } return nil } -func (m *SerialGraph) GetDirected() bool { - if m != nil && m.Directed != nil { - return *m.Directed - } - return Default_SerialGraph_Directed -} - -func (m *SerialGraph) GetNodeCount() uint64 { - if m != nil && m.NodeCount != nil { - return *m.NodeCount - } - return Default_SerialGraph_NodeCount -} - type Node struct { - Id *uint64 `protobuf:"varint,1,req,name=id" json:"id,omitempty"` - Value *string `protobuf:"bytes,2,req,name=value" json:"value,omitempty"` - Weight *int64 `protobuf:"varint,3,opt,name=weight" json:"weight,omitempty"` - Type *NodeType `protobuf:"varint,4,opt,name=type,enum=main.NodeType,def=0" json:"type,omitempty"` - Adjacent []uint64 `protobuf:"varint,5,rep,name=adjacent" json:"adjacent,omitempty"` - Weights []int64 `protobuf:"varint,6,rep,name=weights" json:"weights,omitempty"` - XXX_unrecognized []byte `json:"-"` + Value string `protobuf:"bytes,1,opt,name=value" json:"value,omitempty"` + Weight int64 `protobuf:"varint,2,opt,name=weight" json:"weight,omitempty"` + Type NodeType `protobuf:"varint,3,opt,name=type,enum=graph.NodeType" json:"type,omitempty"` + Adjacent map[uint64]int64 `protobuf:"bytes,4,rep,name=adjacent" json:"adjacent,omitempty" protobuf_key:"varint,1,opt,name=key" protobuf_val:"varint,2,opt,name=value"` } -func (m *Node) Reset() { *m = Node{} } -func (m *Node) String() string { return proto.CompactTextString(m) } -func (*Node) ProtoMessage() {} - -const Default_Node_Type NodeType = NodeType_UNKNOWN - -func (m *Node) GetId() uint64 { - if m != nil && m.Id != nil { - return *m.Id - } - return 0 -} +func (m *Node) Reset() { *m = Node{} } +func (m *Node) String() string { return 
proto.CompactTextString(m) } +func (*Node) ProtoMessage() {} +func (*Node) Descriptor() ([]byte, []int) { return fileDescriptor0, []int{1} } -func (m *Node) GetValue() string { - if m != nil && m.Value != nil { - return *m.Value - } - return "" -} - -func (m *Node) GetWeight() int64 { - if m != nil && m.Weight != nil { - return *m.Weight - } - return 0 -} - -func (m *Node) GetType() NodeType { - if m != nil && m.Type != nil { - return *m.Type - } - return Default_Node_Type -} - -func (m *Node) GetAdjacent() []uint64 { +func (m *Node) GetAdjacent() map[uint64]int64 { if m != nil { return m.Adjacent } return nil } -func (m *Node) GetWeights() []int64 { - if m != nil { - return m.Weights - } - return nil -} - func init() { - proto.RegisterEnum("main.NodeType", NodeType_name, NodeType_value) + proto.RegisterEnum("graph.NodeType", NodeType_name, NodeType_value) +} + +var fileDescriptor0 = []byte{ + // 294 bytes of a gzipped FileDescriptorProto + 0x1f, 0x8b, 0x08, 0x00, 0x00, 0x09, 0x6e, 0x88, 0x02, 0xff, 0x54, 0x91, 0x41, 0x4b, 0xc3, 0x40, + 0x10, 0x85, 0xdd, 0xee, 0xb6, 0x4d, 0x67, 0x69, 0x5d, 0xe7, 0x14, 0x03, 0x85, 0xd2, 0x53, 0x51, + 0x89, 0x10, 0x2f, 0x2a, 0x78, 0x28, 0x52, 0x82, 0x88, 0xa9, 0x98, 0x14, 0xcf, 0xb1, 0x59, 0xda, + 0x68, 0x49, 0x42, 0xdc, 0x2a, 0xf9, 0x2d, 0xde, 0xfc, 0xa5, 0x66, 0xd7, 0x2a, 0xe9, 0x2d, 0x33, + 0xef, 0x7d, 0xf3, 0x5e, 0x58, 0xe0, 0xab, 0x32, 0x2e, 0xd6, 0x6e, 0x51, 0xe6, 0x2a, 0xc7, 0xb6, + 0x19, 0xc6, 0x5f, 0x04, 0x78, 0x28, 0xcb, 0x34, 0xde, 0xf8, 0x7a, 0xc6, 0x33, 0x68, 0x67, 0x79, + 0x22, 0xdf, 0x6d, 0x32, 0xa2, 0x13, 0xee, 0x0d, 0xdd, 0x5f, 0xa6, 0x61, 0x71, 0x03, 0xad, 0xcf, + 0x32, 0x55, 0x56, 0x28, 0xc0, 0x4a, 0xd2, 0x52, 0x2e, 0x95, 0x4c, 0xec, 0xd6, 0x88, 0x4c, 0x2c, + 0x3c, 0x82, 0x9e, 0xe6, 0x6f, 0xf3, 0x6d, 0xa6, 0x6c, 0x5a, 0xaf, 0x98, 0x73, 0x05, 0xd0, 0x40, + 0x38, 0xd0, 0x37, 0x59, 0xd5, 0xe7, 0x6b, 0x09, 0x1d, 0x68, 0x7f, 0xc4, 0x9b, 0xad, 0x34, 0x30, + 0xf7, 0xf8, 0x2e, 0x4d, 0xdb, 0xaf, 0x5b, 0x97, 0x64, 0xfc, 0x4d, 0x80, 0xe9, 0x01, 0xfb, 0x7f, + 0x46, 0xcd, 0xf5, 0x70, 0x00, 0x9d, 0x4f, 0x99, 0xae, 0xd6, 0xca, 0x80, 0x14, 0x87, 0xc0, 0x54, + 0x55, 0x48, 0x13, 0x38, 0xf0, 0x0e, 0x1b, 0x67, 0xa2, 0x7a, 0x8d, 0xa7, 0x60, 0xc5, 0xc9, 0x6b, + 0xbc, 0x94, 0x75, 0x27, 0x66, 0xfe, 0xeb, 0xb8, 0x61, 0x71, 0xa7, 0x3b, 0xcd, 0x14, 0x74, 0xce, + 0xa1, 0xbf, 0xb7, 0xd8, 0x6f, 0xdc, 0x6f, 0x36, 0xa6, 0xba, 0xe4, 0xc9, 0x0d, 0x58, 0xff, 0x49, + 0x1c, 0xba, 0x8b, 0xe0, 0x3e, 0x98, 0x3f, 0x07, 0xe2, 0x00, 0x2d, 0x60, 0x8f, 0xf3, 0x30, 0x12, + 0x04, 0xbb, 0x40, 0xef, 0x1e, 0x7c, 0xd1, 0xd2, 0x1f, 0xd1, 0xd4, 0x17, 0x54, 0x6b, 0x8b, 0x70, + 0xf6, 0x24, 0xd8, 0x4b, 0xc7, 0xbc, 0xc7, 0xc5, 0x4f, 0x00, 0x00, 0x00, 0xff, 0xff, 0xbf, 0x47, + 0x15, 0xe3, 0x9e, 0x01, 0x00, 0x00, } diff --git a/app/cron/proj/graph/graph.proto b/app/cron/proj/graph/graph.proto index d5693dd..3d9bee5 100644 --- a/app/cron/proj/graph/graph.proto +++ b/app/cron/proj/graph/graph.proto @@ -1,9 +1,10 @@ +syntax = "proto3"; package graph; message SerialGraph { - repeated Node nodes = 1; - optional bool directed = 2 [default = false]; - required uint64 nodeCount = 3 [default = 0]; + map nodes = 1; + bool directed = 2; + uint64 nodeCount = 3; } enum NodeType { @@ -15,10 +16,8 @@ enum NodeType { } message Node { - required uint64 id = 1; - required string value = 2; - optional int64 weight = 3; - optional NodeType type = 4 [default = UNKNOWN]; - repeated uint64 adjacent = 5; - repeated int64 weights = 6; + string value = 1; + int64 weight = 2; + NodeType type = 3; + map adjacent = 4; } diff 
--git a/app/cron/proj/graph/load.sh b/app/cron/proj/graph/load.sh index 4ebec6a..b45f7ee 100755 --- a/app/cron/proj/graph/load.sh +++ b/app/cron/proj/graph/load.sh @@ -1,9 +1,9 @@ #!/bin/sh - +set -e +export PATH=$PATH:$GOPATH/bin if ! which proto >/dev/null; then echo "Installing proto and protoc-gen-go" go get -u github.com/golang/protobuf/{proto,protoc-gen-go} - export PATH=$PATH:$GOPATH/bin else echo "Proto and protoc-gen-go already installed" fi
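
Taken together, the series ends with a pipeline of graph build -> proto3 serialization (gzip-compressed) -> sharded datastore storage. A self-contained sketch of that round trip, using only the APIs introduced above; roundTrip is a hypothetical helper (not in the repo), the node values are made up, and the "graph" shard name is the one the /proj/graph handler writes to:

    package proj

    import (
        "appengine"

        "github.com/bign8/chive-show/app/cron/proj/graph"
        "github.com/bign8/chive-show/app/helpers/sharder"
    )

    // roundTrip is a sketch, not part of the series: build a small graph,
    // serialize it (proto3 + gzip), shard it into datastore, then read it
    // back and decode it.
    func roundTrip(c appengine.Context) error {
        g := graph.New(false)
        post := g.Add("post-guid", graph.NodeType_POST, 1)
        tag := g.Add("funny", graph.NodeType_TAG, 1)
        g.Connect(post, tag, 1) // errors only for the zero NodeID sentinel

        bits, err := g.Bytes()
        if err != nil {
            return err
        }
        if err := sharder.Writer(c, "graph", bits); err != nil {
            return err
        }

        data, err := sharder.Reader(c, "graph")
        if err != nil {
            return err
        }
        g2, err := graph.DecodeGraph(data)
        if err != nil {
            return err
        }
        c.Infof("round-tripped %d nodes", len(g2.Nodes()))
        return nil
    }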