
Only strip the content-encoding headers in ipro recorder if it is safe to
ignore them (e.g. don't drop Content-Encoding: gzip if the content is
actually gzipped).

fixes #1371

add two small checks to ipro recording gzipped content tests
crowell committed Aug 3, 2016
1 parent 306eebe commit 0bd84a160119f6e2dc04ec22e6b09907e875cdb9
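
Not part of the commit, but the core idea in one place: the recorder now
sniffs the recorded bytes before stripping headers. A minimal standalone
sketch, assuming GzipInflater::HasGzipMagicBytes() does no more than test
the two-byte gzip magic prefix (0x1f 0x8b):

#include <string_view>

// Sketch only: real gzip streams begin with the magic bytes 0x1f 0x8b, so a
// prefix test is enough to tell "actually gzipped" bodies apart from bodies
// that merely carry a stale Content-Encoding: gzip header.
bool LooksGzipped(std::string_view contents) {
  return contents.size() >= 2 &&
         static_cast<unsigned char>(contents[0]) == 0x1f &&
         static_cast<unsigned char>(contents[1]) == 0x8b;
}
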
@@ -925,6 +925,28 @@ NameVirtualHost localhost:@@APACHE_SECONDARY_PORT@@
AddOutputFilterByType DEFLATE text/css
</VirtualHost>

<VirtualHost localhost:@@APACHE_SECONDARY_PORT@@>
ServerName compressedcache.example.com
DocumentRoot "@@APACHE_DOC_ROOT@@"
ModPagespeedFileCachePath "@@MOD_PAGESPEED_CACHE@@"
ModPagespeed on
ModPagespeedRewriteLevel PassThrough
ModPagespeedEnableFilters rewrite_css
ModPagespeedHttpCacheCompressionLevel 9
</VirtualHost>

<VirtualHost localhost:@@APACHE_SECONDARY_PORT@@>
ServerName uncompressedcache.example.com
DocumentRoot "@@APACHE_DOC_ROOT@@"
ModPagespeedFileCachePath "@@MOD_PAGESPEED_CACHE@@"
ModPagespeed on
ModPagespeedRewriteLevel PassThrough
ModPagespeedEnableFilters rewrite_css
ModPagespeedHttpCacheCompressionLevel 0
AddOutputFilterByType DEFLATE text/css
DeflateCompressionLevel 1
</VirtualHost>

# Backend for ipro-proxy.example.com
Listen 127.0.0.1:@@APACHE_TERTIARY_PORT@@
NameVirtualHost 127.0.0.1:@@APACHE_TERTIARY_PORT@@
@@ -0,0 +1,21 @@
<?php
header("Connection: close\r\n");
header("Cache-Control: max-age=86400");
// Overwrite any default Pragma header with an empty value.
header("Pragma: ", true);
header("Content-Type: text/css");
header("Expires: " . gmdate("D, d M Y H:i:s", time() + 86400) . " GMT");
// A small rule that the rewriter can minify, padded past 10KB so that the
// uncompressed-size check in the system test has something to measure.
$data = ".peachpuff {background-color: peachpuff;}" .
    "\n" .
    "." . str_pad("", 10000, uniqid()) . " {background-color: antiquewhite;}\n";
// Hand-assemble a gzip stream: the first 8 gzip header bytes (magic, deflate
// method, flags, mtime), then gzcompress()'s zlib output with its 4-byte
// adler32 trailer dropped (its 2-byte zlib header fills the remaining XFL/OS
// header slots, which decoders ignore), then the gzip trailer: CRC32 and
// uncompressed length, both little-endian.
$output = "\x1f\x8b\x08\x00\x00\x00\x00\x00" .
    substr(gzcompress($data, 2), 0, -4) .
    pack('V', crc32($data)) .
    pack('V', mb_strlen($data, "latin1"));
header("Content-Encoding: gzip\r\n");
header("Content-Length: " . mb_strlen($output, "latin1"));
echo $output;
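
An aside, not part of the commit: the script above hand-assembles its gzip
stream because gzcompress() emits zlib framing, not gzip. zlib's C API can
emit the gzip wrapper (header, CRC32, length trailer) directly; a minimal
sketch, assuming zlib is linked, where windowBits 15 + 16 requests gzip
rather than zlib framing:

#include <zlib.h>
#include <string>

std::string GzipCompress(const std::string& in, int level) {
  z_stream strm = {};
  // 15 = maximum window size; +16 = wrap the deflate stream in gzip framing.
  deflateInit2(&strm, level, Z_DEFLATED, 15 + 16, 8, Z_DEFAULT_STRATEGY);
  std::string out(deflateBound(&strm, in.size()), '\0');
  strm.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(in.data()));
  strm.avail_in = static_cast<uInt>(in.size());
  strm.next_out = reinterpret_cast<Bytef*>(&out[0]);
  strm.avail_out = static_cast<uInt>(out.size());
  deflate(&strm, Z_FINISH);  // One shot: deflateBound() sized the buffer.
  out.resize(strm.total_out);
  deflateEnd(&strm);
  return out;
}
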
@@ -743,7 +743,7 @@ function scrape_header {
  # Extracts the value from wget's emitted headers. We use " " as a delimiter
  # here to avoid a leading space on the returned string. Note also that wget
  # always generates "name: value\r", never "name:value\r".
-  grep -i "^$1:" | cut -d' ' -f2- | tr -d '\r'
+  tr -s '\r\n' '\n'| egrep -ia "^.?$1:" | rev | cut -d' ' -f 1 | rev
}

# Scrapes HTTP headers from stdin for Content-Length and returns the value.
@@ -26,6 +26,7 @@
#include "pagespeed/kernel/http/content_type.h"
#include "pagespeed/kernel/http/http_names.h"
#include "pagespeed/kernel/http/response_headers.h"
#include "pagespeed/kernel/util/gzip_inflater.h"

namespace net_instaweb {

@@ -240,12 +241,20 @@ void InPlaceResourceRecorder::DoneAndSetHeaders(
  if (failure_) {
    num_failed_->Add(1);
  } else {
-    // We don't consider content-encoding to be valid here, since it can
-    // be captured post-mod_deflate with pre-deflate content. Also note
-    // that content-length doesn't have to be accurate either, since it can be
-    // due to compression; we do still use it for quickly reject since
-    // if gzip'd is too large uncompressed is likely too large, too.
-    response_headers->RemoveAll(HttpAttributes::kContentEncoding);
+    // We are skeptical of the correctness of the content-encoding here,
+    // since it can be captured post-mod_deflate with pre-deflate content.
+    // Also note that content-length doesn't have to be accurate either,
+    // since it can be off due to compression; we do still use it to quickly
+    // reject, since if the gzip'd size is too large the uncompressed size
+    // is likely too large, too. We sniff the content to make sure that the
+    // headers match the Content-Encoding.
+    StringPiece contents;
+    resource_value_.ExtractContents(&contents);
+    // TODO(jcrowell): remove this sniffing fix, and do a proper fix by
+    // merging the IPRO filters in mod_instaweb.cc and in ngx_pagespeed.
+    if (!GzipInflater::HasGzipMagicBytes(contents)) {
+      // Only remove these headers if the content is not gzipped.
+      response_headers->RemoveAll(HttpAttributes::kContentEncoding);
+    }
    response_headers->RemoveAll(HttpAttributes::kContentLength);
    resource_value_.SetHeaders(response_headers);
    cache_->Put(url_, fragment_, request_properties_, http_options_,
@@ -107,17 +107,15 @@ class InPlaceResourceRecorderTest : public RewriteTestBase {
        &value_out, &headers_out));
    StringPiece contents;
    EXPECT_TRUE(value_out.ExtractContents(&contents));

-    // gzip in prelim headers will cause us to decompress, gzip in final_headers
-    // will not.
-    EXPECT_EQ(header_time == kPrelimGzipHeader ? kUncompressedData : gzipped,
-              contents);
+    EXPECT_EQ(headers_out.IsGzipped() ? gzipped : kUncompressedData, contents);

    // We should not have a content-encoding header since we either decompressed
    // data ourselves or captured it before data was saved. Also no
    // content-length since we may have done gunzip'ing.
    EXPECT_FALSE(headers_out.Has(HttpAttributes::kContentEncoding));
    EXPECT_FALSE(headers_out.Has(HttpAttributes::kContentLength));
    EXPECT_TRUE(headers_out.DetermineContentType()->IsCompressible());

    // TODO(jcrowell): Add test for non-compressible type.
  }

  void CheckCacheableContentType(const ContentType* content_type) {
@@ -1318,6 +1318,56 @@ if [ "$SECONDARY_HOSTNAME" != "" ]; then
  fetch_until -save -recursive $URL \
    'fgrep -c 150x100xOptPuzzle.jpg.pagespeed.ic.' 1

  if [ -z "${DISABLE_PHP_TESTS:-}" ]; then
    function php_ipro_record() {
      local url="$1"
      local max_cache_bytes="$2"
      local cache_bytes_op="$3"
      local cache_bytes_cmp="$4"
      echo http_proxy=$SECONDARY_HOSTNAME $WGET_DUMP --save-headers $url
      OUT=$(http_proxy=$SECONDARY_HOSTNAME $WGET_DUMP --save-headers $url)
      check_from "$OUT" egrep -iq 'Content-Encoding: gzip'
      # Now check that we receive the optimized content from cache (not double
      # zipped). When the resource is optimized, "peachpuff" is rewritten to
      # its hex equivalent.
      http_proxy=$SECONDARY_HOSTNAME fetch_until $url 'fgrep -c ffdab9' 1
      # The gzipped size is 175 with the default compression. Our compressed
      # cache uses gzip -9, which will compress even better (below 150).
      OUT=$(http_proxy=$SECONDARY_HOSTNAME $WGET -O - \
        --header="Accept-Encoding: gzip" $url)
      # TODO(jcrowell): clean up when ngx_pagespeed stops chunk-encoding static
      # resources.
      bytes_from_cache=$(echo "$OUT" | wc -c)
      check [ $bytes_from_cache -lt $max_cache_bytes ]
      check [ $bytes_from_cache $cache_bytes_op $cache_bytes_cmp ]
      # Ensure that the Content-Encoding header matches the data that is sent.
      OUT=$(http_proxy=$SECONDARY_HOSTNAME $WGET_DUMP \
        --header='Accept-Encoding: gzip' $url \
        | scrape_header 'Content-Encoding')
      check_from "$OUT" fgrep -q 'gzip'
      OUT=$(http_proxy=$SECONDARY_HOSTNAME $WGET -O - $url)
      bytes_from_cache_uncompressed=$(echo "$OUT" | wc -c)
      # The data should be uncompressed, but minified, at this point.
      check [ $bytes_from_cache_uncompressed -gt 10000 ]
      check_from "$OUT" grep -q "ffdab9"
      # Ensure that the Content-Encoding header matches the data that is sent.
      # In this case we didn't send the Accept-Encoding header, so we don't
      # expect the data to have any Content-Encoding header.
      OUT=$(http_proxy=$SECONDARY_HOSTNAME $WGET_DUMP $url)
      check_not_from "$OUT" egrep -q 'Content-Encoding'
    }
    start_test Cache Compression On: PHP Pre-Gzipping does not get destroyed by the cache.
    URL="http://compressedcache.example.com/mod_pagespeed_test/php_gzip.php"
    # The gzipped size is 175 with the default compression. Our compressed
    # cache uses gzip -9, which will compress even better (below 150).
    php_ipro_record "$URL" 150 "-lt" 175
    start_test Cache Compression Off: PHP Pre-Gzipping does not get destroyed by the cache.
    # With cache compression off, we should see about 175 for both the pre and
    # post optimized resource.
    URL="http://uncompressedcache.example.com/mod_pagespeed_test/php_gzip.php"
    php_ipro_record "$URL" 200 "-gt" 150
  fi
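
An aside, not part of the commit: the size thresholds above (roughly 175
bytes at the default level, under 150 from the gzip -9 compressed cache) are
just gzip's compression-level knob at work. A standalone illustration,
assuming zlib is available; the payload is hypothetical and exact byte counts
will vary:

#include <zlib.h>
#include <cstdio>
#include <string>
#include <vector>

int main() {
  // Payload shaped like the PHP backend's CSS: one short token repeated out
  // to ~10000 bytes.
  std::string data;
  while (data.size() < 10000) data += "57a1b2c3d4e5f";
  for (int level : {Z_BEST_SPEED, Z_BEST_COMPRESSION}) {
    uLongf out_len = compressBound(data.size());
    std::vector<Bytef> out(out_len);
    // One-shot zlib-format compression; gzip framing differs by a few
    // header/trailer bytes, but the level-vs-size trend is identical.
    compress2(out.data(), &out_len,
              reinterpret_cast<const Bytef*>(data.data()), data.size(),
              level);
    std::printf("level %d -> %lu bytes\n", level,
                static_cast<unsigned long>(out_len));
  }
  return 0;
}
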

  # Verify that downstream caches and rebeaconing interact correctly for images.
  start_test lazyload_images,rewrite_images with downstream cache rebeaconing
  HOST_NAME="http://downstreamcacherebeacon.example.com"
