From 89b58139a518d561a92fdbbb21835da45aa5cc37 Mon Sep 17 00:00:00 2001
From: Andrey Savchenko
Date: Wed, 25 May 2016 17:49:34 +0300
Subject: [PATCH] Try to encode URLs in sitemap to RFC3986.

---
 inc/sitemaps/class-sitemaps-renderer.php | 90 +++++++++++++++++++++++-
 1 file changed, 88 insertions(+), 2 deletions(-)

diff --git a/inc/sitemaps/class-sitemaps-renderer.php b/inc/sitemaps/class-sitemaps-renderer.php
index 0c914026342..955249f8df4 100644
--- a/inc/sitemaps/class-sitemaps-renderer.php
+++ b/inc/sitemaps/class-sitemaps-renderer.php
@@ -192,7 +192,7 @@ public function sitemap_url( $url ) {
 		$url['loc'] = htmlspecialchars( $url['loc'] );
 
 		$output = "\t\n";
-		$output .= "\t\t" . $url['loc'] . "\n";
+		$output .= "\t\t" . $this->encode_url_rfc3986( $url['loc'] ) . "\n";
 		$output .= empty( $date ) ? '' : "\t\t" . htmlspecialchars( $date ) . "\n";
 		$output .= "\t\t" . $url['chf'] . "\n";
 		$output .= "\t\t" . str_replace( ',', '.', $url['pri'] ) . "\n";
@@ -208,7 +208,7 @@ public function sitemap_url( $url ) {
 				}
 
 				$output .= "\t\t\n";
-				$output .= "\t\t\t" . esc_html( $img['src'] ) . "\n";
+				$output .= "\t\t\t" . esc_html( $this->encode_url_rfc3986( $img['src'] ) ) . "\n";
 
 				if ( ! empty( $img['title'] ) ) {
 					$title = _wp_specialchars( html_entity_decode( $img['title'], ENT_QUOTES, $this->charset ) );
@@ -235,4 +235,90 @@ public function sitemap_url( $url ) {
 		 */
 		return apply_filters( 'wpseo_sitemap_url', $output, $url );
 	}
+
+	/**
+	 * Apply some best effort conversion to comply with RFC3986.
+	 *
+	 * URLs that already pass FILTER_VALIDATE_URL are returned unchanged. Otherwise
+	 * the path segments and the query string are percent-encoded in place, while the
+	 * remaining components (scheme, host, fragment) are left as they were.
+	 *
+	 * Pass the raw URL: the query string is split on a plain "&", so any HTML/XML
+	 * escaping has to be applied to the returned value, not before calling this.
+	 *
+	 * Known limitation: the query is round-tripped through parse_str(), which
+	 * collapses repeated parameter names and turns dots/spaces in names into "_".
+	 *
+	 * @param string $url URL to encode.
+	 *
+	 * @return string
+	 */
+	protected function encode_url_rfc3986( $url ) {
+
+		if ( filter_var( $url, FILTER_VALIDATE_URL ) ) {
+			return $url;
+		}
+
+		$path = parse_url( $url, PHP_URL_PATH );
+
+		if ( ! empty( $path ) && '/' !== $path ) {
+
+			$segments = explode( '/', $path );
+
+			foreach ( $segments as $index => $segment ) {
+				// Decode first so segments that are already percent-encoded are not encoded twice.
+				$segments[ $index ] = rawurlencode( rawurldecode( $segment ) );
+			}
+
+			$encoded_path = implode( '/', $segments );
+			$encoded_path = str_replace( '%7E', '~', $encoded_path ); // PHP <5.3.
+
+			// The path ends at the first "?" or "#" (or at the end of the URL). Swap it at that
+			// exact position: a global str_replace() would also rewrite the same text wherever
+			// else it occurs, e.g. in the host or the query string.
+			$path_end = strlen( $url );
+
+			foreach ( array( '?', '#' ) as $delimiter ) {
+				$position = strpos( $url, $delimiter );
+
+				if ( false !== $position && $position < $path_end ) {
+					$path_end = $position;
+				}
+			}
+
+			$path_start = ( $path_end - strlen( $path ) );
+
+			// Only rewrite when the parsed path really is what sits at that position.
+			if ( $path_start >= 0 && substr( $url, $path_start, strlen( $path ) ) === $path ) {
+				$url = substr( $url, 0, $path_start ) . $encoded_path . substr( $url, $path_end );
+			}
+		}
+
+		$query = parse_url( $url, PHP_URL_QUERY );
+
+		if ( ! empty( $query ) ) {
+
+			parse_str( $query, $parsed_query );
+
+			// Pass '' rather than null as the numeric prefix; null is deprecated there since PHP 8.1.
+			if ( defined( 'PHP_QUERY_RFC3986' ) ) { // PHP 5.4+.
+				$encoded_query = http_build_query( $parsed_query, '', '&', PHP_QUERY_RFC3986 );
+			}
+			else {
+				$encoded_query = http_build_query( $parsed_query, '', '&' );
+				$encoded_query = str_replace( '+', '%20', $encoded_query );
+				$encoded_query = str_replace( '%7E', '~', $encoded_query );
+			}
+
+			// The query starts right after the first "?"; replace only that occurrence.
+			$query_start = strpos( $url, '?' );
+
+			if ( false !== $query_start && substr( $url, ( $query_start + 1 ), strlen( $query ) ) === $query ) {
+				$query_end = ( $query_start + 1 + strlen( $query ) );
+				$url       = substr( $url, 0, ( $query_start + 1 ) ) . $encoded_query . substr( $url, $query_end );
+			}
+		}
+
+		return $url;
+	}
 }