Skip to content

Commit

Permalink
Add --cache-by-content-type flag (#1070)
Browse files Browse the repository at this point in the history
Add `--cache-by-content-type` flag
  • Loading branch information
abhinavsingh committed Jan 26, 2022
1 parent d6ad69b commit bc577f3
Show file tree
Hide file tree
Showing 19 changed files with 241 additions and 78 deletions.
16 changes: 9 additions & 7 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,17 @@ PROXYPY_CONTAINER_VERSION := latest
# Used by container build and run targets
PROXYPY_CONTAINER_TAG := $(NS)/$(IMAGE_NAME):$(PROXYPY_CONTAINER_VERSION)

HTTPS_KEY_FILE_PATH := https-key.pem
HTTPS_CERT_FILE_PATH := https-cert.pem
HTTPS_CSR_FILE_PATH := https-csr.pem
HTTPS_SIGNED_CERT_FILE_PATH := https-signed-cert.pem
CERT_DIR :=

HTTPS_KEY_FILE_PATH := $(CERT_DIR)https-key.pem
HTTPS_CERT_FILE_PATH := $(CERT_DIR)https-cert.pem
HTTPS_CSR_FILE_PATH := $(CERT_DIR)https-csr.pem
HTTPS_SIGNED_CERT_FILE_PATH := $(CERT_DIR)https-signed-cert.pem

CA_CERT_SUFFIX :=
CA_KEY_FILE_PATH := ca-key$(CA_CERT_SUFFIX).pem
CA_CERT_FILE_PATH := ca-cert$(CA_CERT_SUFFIX).pem
CA_SIGNING_KEY_FILE_PATH := ca-signing-key$(CA_CERT_SUFFIX).pem
CA_KEY_FILE_PATH := $(CERT_DIR)ca-key$(CA_CERT_SUFFIX).pem
CA_CERT_FILE_PATH := $(CERT_DIR)ca-cert$(CA_CERT_SUFFIX).pem
CA_SIGNING_KEY_FILE_PATH := $(CERT_DIR)ca-signing-key$(CA_CERT_SUFFIX).pem

# Dummy invalid hardcoded value
PROXYPY_PKG_PATH := dist/proxy.py.whl
Expand Down
13 changes: 9 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2286,8 +2286,9 @@ usage: -m [-h] [--tunnel-hostname TUNNEL_HOSTNAME] [--tunnel-port TUNNEL_PORT]
[--ca-key-file CA_KEY_FILE] [--ca-cert-dir CA_CERT_DIR]
[--ca-cert-file CA_CERT_FILE] [--ca-file CA_FILE]
[--ca-signing-key-file CA_SIGNING_KEY_FILE]
[--auth-plugin AUTH_PLUGIN] [--cache-dir CACHE_DIR]
[--cache-requests] [--proxy-pool PROXY_POOL] [--enable-web-server]
[--auth-plugin AUTH_PLUGIN] [--cache-requests]
[--cache-by-content-type] [--cache-dir CACHE_DIR]
[--proxy-pool PROXY_POOL] [--enable-web-server]
[--enable-static-server] [--static-server-dir STATIC_SERVER_DIR]
[--min-compression-length MIN_COMPRESSION_LENGTH]
[--enable-reverse-proxy] [--pac-file PAC_FILE]
Expand Down Expand Up @@ -2427,11 +2428,16 @@ options:
--auth-plugin AUTH_PLUGIN
Default: proxy.http.proxy.auth.AuthPlugin. Auth plugin
to use instead of default basic auth plugin.
--cache-requests Default: False. Whether to also write request packets
in the cache file.
--cache-by-content-type
Default: False. Whether to extract content by type
from responses. Extracted content type is written to
the cache directory e.g. video.mp4.
--cache-dir CACHE_DIR
Default: /Users/abhinavsingh/.proxy/cache. Flag only
applicable when cache plugin is used with on-disk
storage.
--cache-requests Default: False. Whether to also cache request packets.
--proxy-pool PROXY_POOL
List of upstream proxies to use in the pool
--enable-web-server Default: False. Whether to enable
Expand Down Expand Up @@ -2473,4 +2479,3 @@ options:
Proxy.py not working? Report at:
https://github.com/abhinavsingh/proxy.py/issues/new
```
``
1 change: 1 addition & 0 deletions proxy/common/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -149,6 +149,7 @@ def _env_threadless_compliant() -> bool:
DEFAULT_DATA_DIRECTORY_PATH, 'cache',
)
DEFAULT_CACHE_REQUESTS = False
DEFAULT_CACHE_BY_CONTENT_TYPE = False

# Core plugins enabled by default or via flags
DEFAULT_ABC_PLUGINS = [
Expand Down
3 changes: 2 additions & 1 deletion proxy/common/flag.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,7 +398,8 @@ def initialize(
# FIXME: Necessary here until flags framework provides a way
# for flag owners to initialize
os.makedirs(args.cache_dir, exist_ok=True)
os.makedirs(os.path.join(args.cache_dir, 'response'), exist_ok=True)
os.makedirs(os.path.join(args.cache_dir, 'responses'), exist_ok=True)
os.makedirs(os.path.join(args.cache_dir, 'content'), exist_ok=True)

return args

Expand Down
3 changes: 2 additions & 1 deletion proxy/common/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,7 @@ def build_http_response(
headers: Optional[Dict[bytes, bytes]] = None,
body: Optional[bytes] = None,
conn_close: bool = False,
no_cl: bool = False,
) -> bytes:
"""Build and returns a HTTP response packet."""
line = [protocol_version, bytes_(status_code)]
Expand All @@ -131,7 +132,7 @@ def build_http_response(
if k.lower() == b'transfer-encoding':
has_transfer_encoding = True
break
if not has_transfer_encoding:
if not has_transfer_encoding and not no_cl:
headers[b'Content-Length'] = bytes_(len(body)) if body else b'0'
return build_http_pkt(line, headers, body, conn_close)

Expand Down
2 changes: 1 addition & 1 deletion proxy/core/connection/connection.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ def connection(self) -> TcpOrTlsSocket:

def send(self, data: Union[memoryview, bytes]) -> int:
"""Users must handle BrokenPipeError exceptions"""
# logger.info(data)
# logger.info(data.tobytes())
return self.connection.send(data)

def recv(
Expand Down
32 changes: 31 additions & 1 deletion proxy/http/parser/parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
http
"""
import gzip
from typing import Dict, List, Type, Tuple, TypeVar, Optional

from ..url import Url
Expand All @@ -22,7 +23,9 @@
from ..exception import HttpProtocolException
from ..protocols import httpProtocols
from ...common.flag import flags
from ...common.utils import text_, build_http_request, build_http_response
from ...common.utils import (
text_, bytes_, build_http_request, build_http_response,
)
from ...common.constants import (
CRLF, COLON, SLASH, HTTP_1_0, HTTP_1_1, WHITESPACE, DEFAULT_HTTP_PORT,
DEFAULT_DISABLE_HEADERS, DEFAULT_ENABLE_PROXY_PROTOCOL,
Expand Down Expand Up @@ -156,6 +159,33 @@ def set_url(self, url: bytes, allowed_url_schemes: Optional[List[bytes]] = None)
)
self._set_line_attributes()

def update_body(self, body: bytes, content_type: bytes) -> None:
    """Replace the body of an already parsed HTTP packet.

    The new body is adapted to the content encoding and transfer encoding
    declared by the original packet before it is stored, and the
    ``Content-Type`` header is set to ``content_type``.
    """
    # A gzip encoded packet gets a gzip compressed body.  Any other
    # content encoding is not supported here, so its header is removed.
    encoded = self.has_header(b'content-encoding')
    if encoded and self.header(b'content-encoding') == b'gzip':
        body = gzip.compress(body)
    elif encoded:
        self.del_header(b'content-encoding')
    # Non-chunked packets advertise the length of the new body, while
    # chunked packets carry the body as chunks and no content length.
    if not self.is_chunked_encoded:
        self.add_header(b'Content-Length', bytes_(len(body)))
    else:
        body = ChunkParser.to_chunks(body)
        self.del_header(b'content-length')
    self.body = body
    self.add_header(b'Content-Type', content_type)

@property
def http_handler_protocol(self) -> int:
"""Returns `HttpProtocols` that this request belongs to."""
Expand Down
7 changes: 4 additions & 3 deletions proxy/http/responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
build_http_response(
httpStatusCodes.OK,
reason=b'Connection established',
no_cl=True,
),
)

Expand All @@ -31,6 +32,7 @@
httpStatusCodes.BAD_REQUEST,
reason=b'Unsupported protocol scheme',
conn_close=True,
no_cl=True,
),
)

Expand All @@ -44,6 +46,7 @@
},
body=b'Proxy Authentication Required',
conn_close=True,
no_cl=True,
),
)

Expand All @@ -53,7 +56,6 @@
reason=b'BAD REQUEST',
headers={
b'Server': PROXY_AGENT_HEADER_VALUE,
b'Content-Length': b'0',
},
conn_close=True,
),
Expand All @@ -65,7 +67,6 @@
reason=b'NOT FOUND',
headers={
b'Server': PROXY_AGENT_HEADER_VALUE,
b'Content-Length': b'0',
},
conn_close=True,
),
Expand All @@ -77,7 +78,6 @@
reason=b'NOT IMPLEMENTED',
headers={
b'Server': PROXY_AGENT_HEADER_VALUE,
b'Content-Length': b'0',
},
conn_close=True,
),
Expand All @@ -92,6 +92,7 @@
},
body=b'Bad Gateway',
conn_close=True,
no_cl=True,
),
)

Expand Down
2 changes: 2 additions & 0 deletions proxy/plugin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@
Cloudflare
ws
onmessage
httpbin
localhost
"""
from .cache import CacheResponsesPlugin, BaseCacheResponsesPlugin
from .shortlink import ShortLinkPlugin
Expand Down
22 changes: 22 additions & 0 deletions proxy/plugin/cache/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,34 @@

from .store.base import CacheStore
from ...http.proxy import HttpProxyBasePlugin
from ...common.flag import flags
from ...http.parser import HttpParser
from ...common.constants import (
DEFAULT_CACHE_REQUESTS, DEFAULT_CACHE_BY_CONTENT_TYPE,
)


logger = logging.getLogger(__name__)


# Register the boolean `--cache-requests` flag (off by default) on the
# shared `flags` object.
flags.add_argument(
'--cache-requests',
action='store_true',
default=DEFAULT_CACHE_REQUESTS,
help='Default: False. ' +
'Whether to also write request packets in the cache file.',
)

# Register the boolean `--cache-by-content-type` flag (off by default) on
# the shared `flags` object.
flags.add_argument(
'--cache-by-content-type',
action='store_true',
default=DEFAULT_CACHE_BY_CONTENT_TYPE,
help='Default: False. ' +
'Whether to extract content by type from responses. ' +
'Extracted content type is written to the cache directory e.g. video.mp4.',
)


class BaseCacheResponsesPlugin(HttpProxyBasePlugin):
"""Base cache plugin.
Expand Down
53 changes: 53 additions & 0 deletions proxy/plugin/cache/cache_responses.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,17 @@
:license: BSD, see LICENSE for more details.
"""
import os
import logging
import multiprocessing
from typing import Any, Dict, Optional

from .base import BaseCacheResponsesPlugin
from .store.disk import OnDiskCacheStore
from ...http.parser import HttpParser, httpParserTypes
from ...common.constants import SLASH


logger = logging.getLogger(__name__)


class CacheResponsesPlugin(BaseCacheResponsesPlugin):
Expand All @@ -39,3 +45,50 @@ def on_access_log(self, context: Dict[str, Any]) -> Optional[Dict[str, Any]]:
'cache_file_path': self.disk_store.cache_file_path,
})
return super().on_access_log(context)

def on_upstream_connection_close(self) -> None:
    """Extract cached content by type once the upstream connection closes.

    After delegating to the parent implementation, the content file is
    written only when ``--cache-by-content-type`` is enabled and the disk
    store has both a cache file path and a cache file name.
    """
    super().on_upstream_connection_close()
    if not self.flags.cache_by_content_type:
        return
    store = self.disk_store
    if not (store.cache_file_path and store.cache_file_name):
        return
    self.write_content_type(
        store.cache_file_path,
        self.flags.cache_dir,
        store.cache_file_name,
        self.flags.cache_requests,
    )

@staticmethod
def write_content_type(
        cache_file_path: str,
        cache_dir: str,
        content_file_name: str,
        cache_requests: bool,
) -> Optional[str]:
    """Extract the body of a cached response into its own content file.

    The response stored at ``cache_file_path`` is re-parsed and, when it
    carries a body, the body is written to
    ``<cache_dir>/content/<content_file_name>.<extension>``.  The extension
    is the subtype of the ``Content-Type`` header (``text/plain`` is
    assumed when the header is absent), e.g. ``video/mp4`` gives ``mp4``.

    Returns the path of the written content file, or ``None`` when nothing
    was extracted: request caching is enabled, the cached response is
    incomplete, or the response has no body.
    """
    if cache_requests:
        # The cache file also holds request packets.  The last dumped
        # packet is likely the response, but extracting it is not
        # implemented.
        return None
    parser = HttpParser(httpParserTypes.RESPONSE_PARSER)
    with open(cache_file_path, 'rb') as cache:
        data = cache.read()
        parser.parse(memoryview(data))
    # Explicit checks instead of ``assert``: assertions are stripped under
    # ``python -O`` and a truncated upstream response must not raise while
    # the connection is being torn down.
    if not parser.is_complete:
        logger.warning(
            'Skipping content extraction, incomplete response in %s',
            cache_file_path,
        )
        return None
    if not parser.body_expected or not parser.body:
        return None
    content_type = parser.header(b'content-type') \
        if parser.has_header(b'content-type') \
        else b'text/plain'
    # Keep only the media subtype: drop the "type/" prefix and any
    # parameters such as "; charset=utf-8".
    subtype = content_type.split(SLASH, maxsplit=1)[-1]
    subtype = subtype.split(b';', 1)[0].strip()
    # The header value is controlled by the upstream server.  Restrict the
    # extension to filename-safe characters so that it can never contain a
    # path separator and steer the write outside of the content directory.
    extension = ''.join(
        char for char in subtype.decode('utf-8', 'replace')
        if char.isalnum() or char in '+-_.'
    ).strip('.') or 'plain'
    content_file_path = os.path.join(
        cache_dir, 'content',
        '%s.%s' % (content_file_name, extension),
    )
    with open(content_file_path, 'wb') as content:
        content.write(parser.body)
    logger.info('Cached content file at %s', content_file_path)
    return content_file_path
18 changes: 6 additions & 12 deletions proxy/plugin/cache/store/disk.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,7 @@
from ....common.flag import flags
from ....http.parser import HttpParser
from ....common.utils import text_
from ....common.constants import (
DEFAULT_CACHE_REQUESTS, DEFAULT_CACHE_DIRECTORY_PATH,
)
from ....common.constants import DEFAULT_CACHE_DIRECTORY_PATH


logger = logging.getLogger(__name__)
Expand All @@ -32,28 +30,22 @@
'Flag only applicable when cache plugin is used with on-disk storage.',
)

flags.add_argument(
'--cache-requests',
action='store_true',
default=DEFAULT_CACHE_REQUESTS,
help='Default: False. ' +
'Whether to also cache request packets.',
)


class OnDiskCacheStore(CacheStore):

def __init__(self, uid: str, cache_dir: str, cache_requests: bool) -> None:
super().__init__(uid)
self.cache_dir = cache_dir
self.cache_requests = cache_requests
self.cache_file_name: Optional[str] = None
self.cache_file_path: Optional[str] = None
self.cache_file: Optional[BinaryIO] = None

def open(self, request: HttpParser) -> None:
self.cache_file_name = '%s-%s' % (text_(request.host), self.uid)
self.cache_file_path = os.path.join(
self.cache_dir,
'%s-%s.txt' % (text_(request.host), self.uid),
'%s.txt' % self.cache_file_name,
)
self.cache_file = open(self.cache_file_path, "wb")

Expand All @@ -69,5 +61,7 @@ def cache_response_chunk(self, chunk: memoryview) -> memoryview:

def close(self) -> None:
if self.cache_file:
self.cache_file.flush()
self.cache_file.close()
self.cache_file = None
logger.info('Cached response at %s', self.cache_file_path)
7 changes: 4 additions & 3 deletions proxy/plugin/modify_chunk_response.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,10 @@ def handle_upstream_chunk(self, chunk: memoryview) -> Optional[memoryview]:
self.response.parse(chunk)
# If response is complete, modify and dispatch to client
if self.response.is_complete:
# Avoid setting a body for responses where a body is not expected.
# Otherwise, example curl will report warnings.
if self.response.body_expected:
# Queue our custom chunk if response is chunked encoded
# otherwise queue the original response to client
if self.response.is_chunked_encoded:
self.response.body = b'\n'.join(self.DEFAULT_CHUNKS) + b'\n'
self.client.queue(memoryview(self.response.build_response()))
# Avoid returning chunk straight to client
return None
Loading

0 comments on commit bc577f3

Please sign in to comment.