webseed/client.go

package webseed

import (
	"bytes"
	"context"
	"errors"
	"fmt"
	"io"
	"log"
	"net/http"
	"strings"

	"github.com/RoaringBitmap/roaring"

	"github.com/anacrolix/torrent/common"
	"github.com/anacrolix/torrent/metainfo"
	"github.com/anacrolix/torrent/segments"
)

// RequestSpec is an alias for segments.Extent. It is the argument type of Client.NewRequest,
// which passes it to the client's file index to locate the per-file extents it covers.
type RequestSpec = segments.Extent

// requestPartResult carries the two return values of HttpClient.Do for a single request part:
// the response (resp) and the error (err). It is sent over requestPart.result.
type requestPartResult struct {
	resp *http.Response
	err  error
}

// requestPart is one per-file piece of a Request, as produced by the Locate callback in
// Client.NewRequest.
//
// req is the HTTP request for the part and e is the extent within the file it covers. start
// launches the HTTP round trip in a new goroutine, which sends its single outcome on result.
// recvPartResult receives from result and then closes it.
type requestPart struct {
	req    *http.Request
	e      segments.Extent
	result chan requestPartResult
	start  func()
	// Wrap http response bodies for such things as download rate limiting.
	responseBodyWrapper ResponseBodyWrapper
}

// Request is the handle returned by Client.NewRequest. Result is created there with a buffer of
// one and receives a single RequestResult once all parts have been read or an error occurred.
// cancel is the function invoked by Cancel.
type Request struct {
	cancel func()
	Result chan RequestResult
}

// Cancel invokes the request's cancel function. For a Request made by Client.NewRequest this
// cancels the context that is attached to the HTTP request of every part.
func (r Request) Cancel() {
	r.cancel()
}

// Client issues ranged HTTP requests against a single webseed Url using HttpClient.
//
// fileIndex and info are populated by SetInfo and are used by NewRequest to map a requested
// extent onto per-file HTTP requests. ResponseBodyWrapper, when non-nil, wraps each response
// body before it is read. PathEscaper is passed through to newRequest when building requests.
type Client struct {
	HttpClient *http.Client
	Url        string
	fileIndex  segments.Index
	info       *metainfo.Info
	// The pieces we can request with the Url. We're more likely to ban/block at the file-level
	// given that's how requests are mapped to webseeds, but the torrent.Client works at the piece
	// level. We can map our file-level adjustments to the pieces here. This probably needs to be
	// private in the future, if Client ever starts removing pieces.
	Pieces              roaring.Bitmap
	ResponseBodyWrapper ResponseBodyWrapper
	PathEscaper         PathEscaper
}

// ResponseBodyWrapper takes an HTTP response body and returns the reader that should be read in
// its place, for such things as download rate limiting.
type ResponseBodyWrapper func(io.Reader) io.Reader

// SetInfo records the torrent info on the client, builds the file index from the info's
// (upverted) file lengths, and marks every piece of the torrent as requestable in Pieces.
//
// It does nothing when the info describes a directory but Url has no trailing slash.
func (me *Client) SetInfo(info *metainfo.Info) {
	// In my experience, a directory torrent paired with a Url lacking a trailing "/" is a
	// non-conforming webseed. For example the http://ia600500.us.archive.org/1/items URLs in
	// archive.org torrents.
	usable := strings.HasSuffix(me.Url, "/") || !info.IsDir()
	if !usable {
		return
	}
	fileLengths := common.LengthIterFromUpvertedFiles(info.UpvertedFiles())
	me.fileIndex = segments.NewIndex(fileLengths)
	me.info = info
	pieceCount := uint64(info.NumPieces())
	me.Pieces.AddRange(0, pieceCount)
}

// RequestResult is the value delivered on Request.Result. Bytes is whatever was buffered from
// the request parts; it is still populated with the data read so far when Err is non-nil.
type RequestResult struct {
	Bytes []byte
	Err   error
}

// NewRequest begins fetching the extent r from the webseed and returns a handle for it.
//
// The extent is split by the file index into one HTTP request per file it touches. All of
// them share a single cancellable context, which Request.Cancel cancels. The parts are read
// in a background goroutine, and the combined outcome is sent once on the returned Request's
// Result channel.
//
// It panics if building an HTTP request fails, or if Locate reports that r does not fall
// within the indexed files.
func (me *Client) NewRequest(r RequestSpec) Request {
	ctx, cancel := context.WithCancel(context.Background())

	var parts []requestPart
	addPart := func(fileIdx int, extent segments.Extent) bool {
		httpReq, err := newRequest(
			me.Url, fileIdx, me.info, extent.Start, extent.Length,
			me.PathEscaper,
		)
		if err != nil {
			panic(err)
		}
		httpReq = httpReq.WithContext(ctx)
		// Buffered so the round-trip goroutine can always deliver its outcome and exit.
		outcome := make(chan requestPartResult, 1)
		parts = append(parts, requestPart{
			req:                 httpReq,
			e:                   extent,
			result:              outcome,
			responseBodyWrapper: me.ResponseBodyWrapper,
			// The round trip is deferred until start is called by the reader.
			start: func() {
				go func() {
					resp, err := me.HttpClient.Do(httpReq)
					outcome <- requestPartResult{resp: resp, err: err}
				}()
			},
		})
		return true
	}
	if !me.fileIndex.Locate(r, addPart) {
		panic("request out of file bounds")
	}

	results := make(chan RequestResult, 1)
	go func() {
		data, err := readRequestPartResponses(ctx, parts)
		results <- RequestResult{Bytes: data, Err: err}
	}()
	return Request{
		cancel: cancel,
		Result: results,
	}
}

// ErrBadResponse is returned by recvPartResult for an HTTP response it cannot use: a 200 OK
// where a ranged response was required, or a status code it does not handle. Msg is the error
// text and Response is the offending response, whose Body has been set to nil by
// recvPartResult.
type ErrBadResponse struct {
	Msg      string
	Response *http.Response
}

// Error implements the error interface by returning Msg.
func (me ErrBadResponse) Error() string {
	return me.Msg
}

// recvPartResult waits for the HTTP outcome of a single request part and writes that part's
// payload to buf.
//
// The response is handled by status code:
//   - 206 Partial Content: the entire body is copied and must be exactly part.e.Length bytes.
//   - 200 OK: tolerated only when the part starts within the first 48 KiB of the file. The
//     first part.e.Start bytes of the body are discarded and the following part.e.Length
//     bytes are copied.
//   - 503 Service Unavailable: ErrTooFast is returned.
//   - anything else: an ErrBadResponse is returned.
//
// It returns the round-trip error if the request itself failed, and ctx.Err() if ctx is done
// by the time the response arrives. The response body is closed before returning.
func recvPartResult(ctx context.Context, buf io.Writer, part requestPart) error {
	result := <-part.result
	// Make sure there's no further results coming, it should be a one-shot channel.
	close(part.result)
	if result.err != nil {
		return result.err
	}
	// The deferred call binds to the current Body, so clearing the field below does not
	// prevent the close.
	defer result.resp.Body.Close()
	var body io.Reader = result.resp.Body
	if part.responseBodyWrapper != nil {
		body = part.responseBodyWrapper(body)
	}
	// Prevent further accidental use
	result.resp.Body = nil
	if err := ctx.Err(); err != nil {
		return err
	}
	switch result.resp.StatusCode {
	case http.StatusPartialContent:
		copied, err := io.Copy(buf, body)
		if err != nil {
			return err
		}
		if copied != part.e.Length {
			return fmt.Errorf("got %v bytes, expected %v", copied, part.e.Length)
		}
		return nil
	case http.StatusOK:
		// This number is based on
		// https://archive.org/download/BloodyPitOfHorror/BloodyPitOfHorror.asr.srt. It seems that
		// archive.org might be using a webserver implementation that refuses to do partial
		// responses to small files.
		if part.e.Start >= 48<<10 {
			return ErrBadResponse{"resp status ok but requested range", result.resp}
		}
		if part.e.Start != 0 {
			log.Printf("resp status ok but requested range [url=%q, range=%q]",
				part.req.URL,
				part.req.Header.Get("Range"))
		}
		// Instead of discarding, we could try receiving all the chunks present in the response
		// body. I don't know how one would handle multiple chunk requests resulting in an OK
		// response for the same file. The request algorithm might need to be smarter for that.
		discarded, err := io.CopyN(io.Discard, body, part.e.Start)
		if discarded != 0 {
			log.Printf("discarded %v bytes in webseed request response part", discarded)
		}
		if err != nil {
			// Don't fall through to the copy below: if the skip came up short, anything read
			// afterwards would come from the wrong offset in the file.
			return fmt.Errorf("discarding %v bytes preceding requested range: %w", part.e.Start, err)
		}
		_, err = io.CopyN(buf, body, part.e.Length)
		return err
	case http.StatusServiceUnavailable:
		return ErrTooFast
	default:
		return ErrBadResponse{
			fmt.Sprintf("unhandled response status code (%v)", result.resp.StatusCode),
			result.resp,
		}
	}
}

// ErrTooFast is returned by recvPartResult when the webseed answers a request part with
// 503 Service Unavailable.
var ErrTooFast = errors.New("making requests too fast")

// readRequestPartResponses runs the request parts one at a time, in order: each part is
// started and its response fully received into a shared buffer before the next part is
// started.
//
// It stops at the first failing part and returns the bytes buffered so far together with the
// error, wrapped with that part's URL and Range header. Later parts are never started.
func readRequestPartResponses(ctx context.Context, parts []requestPart) (_ []byte, err error) {
	var data bytes.Buffer
	for i := range parts {
		current := parts[i]
		current.start()
		if recvErr := recvPartResult(ctx, &data, current); recvErr != nil {
			rangeHeader := current.req.Header.Get("Range")
			return data.Bytes(), fmt.Errorf("reading %q at %q: %w", current.req.URL, rangeHeader, recvErr)
		}
	}
	return data.Bytes(), nil
}