Skip to content

Commit

Permalink
Sanitized wayback url from wayback machine
Browse files Browse the repository at this point in the history
fix brave/brave-browser#32395

Only allow loading a wayback url that has an http/https scheme and a
valid wayback domain.
  • Loading branch information
simonhong committed Aug 21, 2023
1 parent d56a5c8 commit 7dc3f0d
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 10 deletions.
32 changes: 29 additions & 3 deletions components/brave_wayback_machine/wayback_machine_url_fetcher.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
#include "services/network/public/cpp/simple_url_loader.h"
#include "services/network/public/mojom/fetch_api.mojom-shared.h"
#include "url/gurl.h"
#include "url/url_constants.h"

namespace {

Expand Down Expand Up @@ -57,7 +58,7 @@ WaybackMachineURLFetcher::~WaybackMachineURLFetcher() = default;

void WaybackMachineURLFetcher::Fetch(const GURL& url) {
const GURL wayback_fetch_url(std::string(kWaybackQueryURL) +
GetSanitizedURL(url).spec());
GetSanitizedInputURL(url).spec());
api_request_helper_->Request(
"GET", FixupWaybackQueryURL(wayback_fetch_url), std::string(),
"application/json",
Expand All @@ -77,15 +78,40 @@ void WaybackMachineURLFetcher::OnWaybackURLFetched(
}
auto* url_string = value_body.GetDict().FindStringByDottedPath(
"archived_snapshots.closest.url");

// Response doesn't have wayback url.
if (!url_string) {
client_->OnWaybackURLFetched(GURL::EmptyGURL());
return;
}

client_->OnWaybackURLFetched(GURL(*url_string));
client_->OnWaybackURLFetched(GetSanitizedWaybackURL(GURL(*url_string)));
}

// Vets a snapshot URL taken from the wayback API response before it is handed
// to the client. An empty GURL is returned unless |url| is valid, uses the
// http or https scheme, and its host equals kWaybackHost. An accepted http
// URL is rewritten to https; an accepted https URL is returned unchanged.
GURL WaybackMachineURLFetcher::GetSanitizedWaybackURL(const GURL& url) const {
  const bool is_acceptable = url.is_valid() && url.SchemeIsHTTPOrHTTPS() &&
                             url.host() == kWaybackHost;
  if (!is_acceptable) {
    return GURL::EmptyGURL();
  }

  // Already https (the only other scheme that can reach this point).
  if (!url.SchemeIs(url::kHttpScheme)) {
    return url;
  }

  // Plain http: swap only the scheme and keep every other component.
  GURL::Replacements https_upgrade;
  https_upgrade.SetSchemeStr(url::kHttpsScheme);
  return url.ReplaceComponents(https_upgrade);
}

GURL WaybackMachineURLFetcher::GetSanitizedURL(const GURL& url) const {
GURL WaybackMachineURLFetcher::GetSanitizedInputURL(const GURL& url) const {
GURL::Replacements replacements;
replacements.ClearRef();
replacements.ClearUsername();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ class WaybackMachineURLFetcher final {
api_request_helper::APIRequestResult api_request_result);

// Clear sensitive data such as username/password from |url|.
GURL GetSanitizedURL(const GURL& url) const;
GURL GetSanitizedInputURL(const GURL& url) const;

// Return empty GURL if |url| is invalid, is not https/http, or its host is
// not the wayback host. A valid http |url| is upgraded to https.
GURL GetSanitizedWaybackURL(const GURL& url) const;

raw_ptr<Client> client_ = nullptr;
std::unique_ptr<api_request_helper::APIRequestHelper> api_request_helper_;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -102,23 +102,59 @@ class WaybackMachineURLFetcherUnitTest : public testing::Test {

TEST_F(WaybackMachineURLFetcherUnitTest, SanitizedResponse) {
SetResponseText("");
Fetch(GURL());
Fetch(GURL::EmptyGURL());
SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"https://example.com/favicon.ico"}}})");
Fetch(GURL("https://example.com/favicon.ico"));
R"({"archived_snapshots":{"closest":{"url":"https://web.archive.org/favicon.ico"}}})");
Fetch(GURL("https://web.archive.org/favicon.ico"));
// broken json
SetResponseText(
R"(,{"archived_snapshots":{"closest":{"url":"https://example.com/favicon.ico"}}})");
Fetch(GURL());
R"(,{"archived_snapshots":{"closest":{"url":"https://web.archive.com/favicon.ico"}}})");
Fetch(GURL::EmptyGURL());
}

TEST_F(WaybackMachineURLFetcherUnitTest, InputURLSanitizeTest) {
constexpr char kInputURL[] = "http://myid:mypwd@test.com/";
constexpr char kSanitizedURL[] = "http://test.com/";
EXPECT_EQ(GURL(kSanitizedURL),
wayback_url_loader_->GetSanitizedURL(GURL(kInputURL)));
wayback_url_loader_->GetSanitizedInputURL(GURL(kInputURL)));

// Test sanitized url is passed to url loader.
TestFetchURL(GURL(kInputURL),
GURL(base::StrCat({kWaybackQueryURL, kSanitizedURL})));
}

// Feeds API responses carrying various "archived_snapshots.closest.url"
// values through the fetcher.
// NOTE(review): the argument to Fetch() appears to be the URL the client is
// expected to receive for the current response text; confirm against the
// fixture, which is not visible here.
TEST_F(WaybackMachineURLFetcherUnitTest, WaybackURLSanitizeTest) {
// Blocked non http/https scheme urls.
SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"javascript:abcd"}}})");
Fetch(GURL::EmptyGURL());

// NOTE(review): this case is identical to the one above; possibly a
// different scheme was intended here.
SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"javascript:abcd"}}})");
Fetch(GURL::EmptyGURL());

SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"chrome://abcd"}}})");
Fetch(GURL::EmptyGURL());

SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"brave://abcd"}}})");
Fetch(GURL::EmptyGURL());

SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"file://abcd"}}})");
Fetch(GURL::EmptyGURL());

// http scheme but a host other than the wayback host: expects an empty URL.
SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"http://another_archive.org/favicon.ico"}}})");
Fetch(GURL::EmptyGURL());

SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"http://web.archive.org/favicon.ico"}}})");
// Check above http url is upgraded to https.
Fetch(GURL("https://web.archive.org/favicon.ico"));

// An https url on the same host: expects the same url back.
SetResponseText(
R"({"archived_snapshots":{"closest":{"url":"https://web.archive.org/favicon.ico"}}})");
Fetch(GURL("https://web.archive.org/favicon.ico"));
}

0 comments on commit 7dc3f0d

Please sign in to comment.