Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
BREAKING: rewrite architecture and all code
Why rewrite the architecture? To make it simple and powerful, and quick to get started with in any project. 1. New Middleware interface and implementations. 2. New Pipeline interface and implementations. 3. New Spider handler interface and implementations. 4. Make the cookies, compression, robots.txt and proxy middlewares built-in middlewares used by the crawler. 5. Delete the SpiderMux class; the Crawler class now implements the SpiderMux feature for different websites. 6. Delete deprecated middlewares (e.g. redirect, url_dupe, delay...).
- Loading branch information
Showing
32 changed files
with
778 additions
and
1,765 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,36 @@ | ||
Antch | ||
==== | ||
|
||
[![Go Report Card](https://goreportcard.com/badge/github.com/antchfx/antch)](https://goreportcard.com/report/github.com/antchfx/antch) | ||
[![GoDoc](https://godoc.org/github.com/antchfx/antch?status.svg)](https://godoc.org/github.com/antchfx/antch) | ||
|
||
Overview | ||
=== | ||
Antch is a fast high-level scalable and extensible web crawling and web scraping framework, used to crawl websites and extract structured data from their pages. | ||
Antch is inspired by the [Scrapy](https://scrapy.org/) project. If you're already familiar | ||
with Scrapy, you can quickly get started. | ||
|
||
Antch is a fast high-level web crawling & scraping framework for Go, used | ||
to crawl websites and extract structured data from their pages. | ||
|
||
Features | ||
==== | ||
|
||
TODO | ||
|
||
Requirements | ||
==== | ||
|
||
TODO | ||
|
||
Tutorial | ||
==== | ||
|
||
TODO | ||
|
||
Middlewares | ||
==== | ||
|
||
TODO | ||
|
||
Pipelines | ||
==== | ||
|
||
TODO |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
package antch | ||
|
||
import (
	"compress/gzip"
	"compress/zlib"
	"io"
	"net/http"
	"strings"
)
|
||
func decompress(name string, rc io.ReadCloser) (io.ReadCloser, bool) { | ||
switch name { | ||
case "gzip": | ||
return &gzipReader{rc: rc}, true | ||
case "deflate": | ||
return &deflateReader{rc: rc}, true | ||
} | ||
return nil, false | ||
} | ||
|
||
func compressionHandler(next HttpMessageHandler) HttpMessageHandler { | ||
return HttpMessageHandlerFunc(func(req *http.Request) (*http.Response, error) { | ||
req.Header.Set("Accept-Encoding", "gzip, deflate") | ||
|
||
resp, err := next.Send(req) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if rc, ok := decompress(resp.Header.Get("Content-Encoding"), resp.Body); ok { | ||
resp.Header.Del("Content-Encoding") | ||
resp.Header.Del("Content-Length") | ||
|
||
resp.Body = rc | ||
resp.ContentLength = -1 | ||
resp.Uncompressed = true | ||
} | ||
return resp, err | ||
}) | ||
} | ||
|
||
// gzipReader is an io.ReadCloser that decompresses a gzip stream read from
// rc. The gzip decoder is created on the first call to Read rather than at
// construction time.
type gzipReader struct {
	rr  io.Reader     // gzip decoder; nil until the first successful Read setup
	rc  io.ReadCloser // underlying compressed stream
	err error         // sticky error from a failed decoder setup
}

// Read decompresses data from the underlying stream into p.
//
// If the gzip decoder cannot be created (for example because the stream does
// not start with a valid gzip header), that error is remembered and returned
// by this and every later call. The decoder is stored only after it has been
// created successfully, so a nil *gzip.Reader never ends up inside the rr
// interface value, where it would compare non-nil and be dereferenced on the
// next Read.
func (z *gzipReader) Read(p []byte) (int, error) {
	if z.err != nil {
		return 0, z.err
	}
	if z.rr == nil {
		zr, err := gzip.NewReader(z.rc)
		if err != nil {
			z.err = err
			return 0, err
		}
		z.rr = zr
	}
	return z.rr.Read(p)
}

// Close closes the underlying compressed stream.
func (z *gzipReader) Close() error {
	return z.rc.Close()
}
|
||
// deflateReader is an io.ReadCloser that decompresses a zlib-wrapped deflate
// stream read from rc. The zlib decoder is created on the first call to Read
// rather than at construction time.
type deflateReader struct {
	rr  io.Reader     // zlib decoder; nil until the first successful Read setup
	rc  io.ReadCloser // underlying compressed stream
	err error         // sticky error from a failed decoder setup
}

// Read decompresses data from the underlying stream into p.
//
// If the zlib decoder cannot be created (for example because the stream does
// not start with a valid zlib header), that error is remembered and returned
// by this and every later call. Without this, each later Read would try to
// parse a header again from a stream whose first bytes were already consumed
// and would report a different, misleading error.
func (r *deflateReader) Read(p []byte) (int, error) {
	if r.err != nil {
		return 0, r.err
	}
	if r.rr == nil {
		zr, err := zlib.NewReader(r.rc)
		if err != nil {
			r.err = err
			return 0, err
		}
		r.rr = zr
	}
	return r.rr.Read(p)
}

// Close closes the underlying compressed stream.
func (r *deflateReader) Close() error {
	return r.rc.Close()
}
|
||
// CompressionMiddleware is a middleware to allows compressed | ||
// (gzip, deflate) traffic to be sent/received from sites. | ||
func CompressionMiddleware() Middleware { | ||
return Middleware(compressionHandler) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
package antch | ||
|
||
import ( | ||
"net/http" | ||
"net/http/cookiejar" | ||
|
||
"golang.org/x/net/publicsuffix" | ||
) | ||
|
||
func cookiesHandler(next HttpMessageHandler) HttpMessageHandler { | ||
jar, _ := cookiejar.New(&cookiejar.Options{PublicSuffixList: publicsuffix.List}) | ||
return HttpMessageHandlerFunc(func(req *http.Request) (*http.Response, error) { | ||
// Delete previous cookie value before set new cookie value. | ||
req.Header.Del("Cookie") | ||
|
||
for _, cookie := range jar.Cookies(req.URL) { | ||
req.AddCookie(cookie) | ||
} | ||
|
||
resp, err := next.Send(req) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if rc := resp.Cookies(); len(rc) > 0 { | ||
jar.SetCookies(req.URL, rc) | ||
} | ||
return resp, err | ||
}) | ||
} | ||
|
||
// CookiesMiddleware is an HTTP cookies middleware to allows cookies | ||
// to tracking for each of HTTP requests. | ||
func CookiesMiddleware() Middleware { | ||
return Middleware(cookiesHandler) | ||
} |
Oops, something went wrong.