Skip to content

Commit

Permalink
BREAKING: rewrite architecture and all code
Browse files Browse the repository at this point in the history
Why rewrite the architecture? To make it simple and powerful, and quick to get started with in any project.

1. new Middleware interface and implementations.
2. new Pipeline interface and implementations.
3. new Spider handler interface and implementations.
4. make the cookies, compression, robots.txt, and proxy middlewares the built-in middlewares used by the crawler.
5. delete the SpiderMux class; the Crawler class now implements the SpiderMux feature for different websites.
6. delete deprecated middlewares (e.g. redirect, url_dupe, delay, ...)
  • Loading branch information
zhengchun committed Nov 29, 2017
1 parent dd850bf commit ac16bea
Show file tree
Hide file tree
Showing 32 changed files with 778 additions and 1,765 deletions.
35 changes: 32 additions & 3 deletions README.md
@@ -1,7 +1,36 @@
Antch
====

[![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/antch)](https://goreportcard.com/report/github.com/antchfx/antch)
[![GoDoc](https://godoc.org/github.com/antchfx/antch?status.svg)](https://godoc.org/github.com/antchfx/antch)

Overview
===
Antch is a fast high-level scalable and extensible web crawling and web scraping framework, used to crawl websites and extract structured data from their pages.
Antch is inspired by the [Scrapy](https://scrapy.org/) project. If you're already familiar
with Scrapy, you can get started quickly.

Antch is a fast high-level web crawling & scraping framework for Go, used
to crawl websites and extract structured data from their pages.

Features
====

TODO

Requirements
====

TODO

Tutorial
====

TODO

Middlewares
====

TODO

Pipelines
====

TODO
84 changes: 84 additions & 0 deletions compression.go
@@ -0,0 +1,84 @@
package antch

import (
	"compress/gzip"
	"compress/zlib"
	"io"
	"net/http"
	"strings"
)

// decompress selects a decoding reader for the content coding called name
// and wraps rc with it.
//
// The coding name is matched case-insensitively, because HTTP content-coding
// values are case-insensitive, and "x-gzip" is accepted as an alias of
// "gzip". It returns (nil, false) when name is not a supported coding, in
// which case rc is left untouched and remains the caller's responsibility.
func decompress(name string, rc io.ReadCloser) (io.ReadCloser, bool) {
	switch strings.ToLower(name) {
	case "gzip", "x-gzip":
		return &gzipReader{rc: rc}, true
	case "deflate":
		return &deflateReader{rc: rc}, true
	}
	return nil, false
}

// compressionHandler wraps next so that every outgoing request advertises
// gzip and deflate support, and every response whose Content-Encoding is
// recognized by decompress has its body replaced with a decoding reader.
func compressionHandler(next HttpMessageHandler) HttpMessageHandler {
	send := func(req *http.Request) (*http.Response, error) {
		req.Header.Set("Accept-Encoding", "gzip, deflate")

		resp, err := next.Send(req)
		if err != nil {
			return nil, err
		}

		body, supported := decompress(resp.Header.Get("Content-Encoding"), resp.Body)
		if !supported {
			return resp, nil
		}

		// The caller now sees decoded bytes, so the headers describing the
		// encoded payload are removed and the length is marked as unknown.
		resp.Header.Del("Content-Encoding")
		resp.Header.Del("Content-Length")
		resp.Body = body
		resp.ContentLength = -1
		resp.Uncompressed = true
		return resp, nil
	}
	return HttpMessageHandlerFunc(send)
}

// gzipReader is an io.ReadCloser that decodes a gzip stream read from rc.
//
// The gzip decoder is created on the first call to Read rather than at
// construction time, so building a gzipReader never touches the stream.
type gzipReader struct {
	rr  io.Reader     // gzip decoder; nil until the first successful Read
	rc  io.ReadCloser // underlying compressed stream
	err error         // sticky error from a failed decoder initialization
}

// Read decodes gzip data from the underlying stream into p.
//
// If the gzip header cannot be parsed, the error is remembered and returned
// from every later call. The decoder is only stored once it has been created
// successfully: storing the nil *gzip.Reader returned on failure would make
// rr a non-nil interface and the next Read would dereference a nil pointer.
func (z *gzipReader) Read(p []byte) (int, error) {
	if z.err != nil {
		return 0, z.err
	}
	if z.rr == nil {
		zr, err := gzip.NewReader(z.rc)
		if err != nil {
			z.err = err
			return 0, err
		}
		z.rr = zr
	}
	return z.rr.Read(p)
}

// Close closes the underlying compressed stream.
func (z *gzipReader) Close() error {
	return z.rc.Close()
}

// deflateReader is an io.ReadCloser that decodes a zlib-wrapped deflate
// stream read from rc.
//
// The zlib decoder is created on the first call to Read rather than at
// construction time, so building a deflateReader never touches the stream.
type deflateReader struct {
	rr  io.Reader     // zlib decoder; nil until the first successful Read
	rc  io.ReadCloser // underlying compressed stream
	err error         // sticky error from a failed decoder initialization
}

// Read decodes zlib data from the underlying stream into p.
//
// If the zlib header cannot be parsed, the error is remembered and returned
// from every later call. Without that, each later Read would try to parse a
// header from the middle of the already-consumed stream and report a
// different, misleading error.
func (r *deflateReader) Read(p []byte) (int, error) {
	if r.err != nil {
		return 0, r.err
	}
	if r.rr == nil {
		zr, err := zlib.NewReader(r.rc)
		if err != nil {
			r.err = err
			return 0, err
		}
		r.rr = zr
	}
	return r.rr.Read(p)
}

// Close closes the underlying compressed stream.
func (r *deflateReader) Close() error {
	return r.rc.Close()
}

// CompressionMiddleware returns a Middleware that lets compressed
// (gzip, deflate) traffic be sent to and received from sites.
func CompressionMiddleware() Middleware {
	mw := Middleware(compressionHandler)
	return mw
}
35 changes: 35 additions & 0 deletions cookies.go
@@ -0,0 +1,35 @@
package antch

import (
"net/http"
"net/http/cookiejar"

"golang.org/x/net/publicsuffix"
)

// cookiesHandler wraps next with a cookie jar that is created once per call
// and shared by every request sent through the returned handler. Cookies
// stored for the request URL are attached before sending, and cookies set by
// the response are stored back into the jar.
func cookiesHandler(next HttpMessageHandler) HttpMessageHandler {
	// The error returned by cookiejar.New is ignored.
	store, _ := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List})

	send := func(req *http.Request) (*http.Response, error) {
		// Drop any Cookie header already on the request so that only the
		// jar's cookies are sent.
		req.Header.Del("Cookie")
		for _, c := range store.Cookies(req.URL) {
			req.AddCookie(c)
		}

		resp, err := next.Send(req)
		if err != nil {
			return nil, err
		}

		received := resp.Cookies()
		if len(received) == 0 {
			return resp, nil
		}
		store.SetCookies(req.URL, received)
		return resp, nil
	}
	return HttpMessageHandlerFunc(send)
}

// CookiesMiddleware returns an HTTP cookies Middleware that tracks cookies
// across the HTTP requests sent through it.
func CookiesMiddleware() Middleware {
	mw := Middleware(cookiesHandler)
	return mw
}

0 comments on commit ac16bea

Please sign in to comment.