Fix recursive downloads with '-c' option
* bootstrap.conf: Enable safe-read module
* src/wget.c: Fix recursive downloads with '-c' option
* tests/Makefile.am: Add test-c-r.c
* tests/test-c-r.c: Add tests
akashrawal authored and rockdaboot committed Jul 25, 2017
1 parent b3051c7 commit e0b1a6b
Showing 4 changed files with 234 additions and 27 deletions.
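As the wget.c hunks below show, the fix has two parts: when resuming an interrupted download of an HTML page with --continue, the bytes already on disk are read back into the response body before the 206 Partial Content tail is appended, so link extraction for --recursive still sees the whole document; and a Range header is only sent when the local file actually has content. A minimal sketch of the reload idea, using plain POSIX calls rather than wget2's helpers (load_existing_prefix is a made-up name for this illustration, not a function from the patch):

/* Simplified illustration, not code from the commit: read whatever part of
 * the file already exists into memory so a later parse of "prefix + newly
 * downloaded tail" covers the links saved before the interruption. */
#include <fcntl.h>
#include <stdlib.h>
#include <sys/stat.h>
#include <unistd.h>

static char *load_existing_prefix(const char *fname, size_t *len)
{
	struct stat st;
	char *buf = NULL;
	int fd = open(fname, O_RDONLY);

	*len = 0;
	if (fd < 0)
		return NULL;

	if (fstat(fd, &st) == 0 && st.st_size > 0) {
		buf = malloc((size_t) st.st_size);
		if (buf) {
			ssize_t n = read(fd, buf, (size_t) st.st_size);
			if (n > 0)
				*len = (size_t) n;
		}
	}

	close(fd);
	return buf;	/* caller frees; NULL means nothing usable on disk */
}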
1 change: 1 addition & 0 deletions bootstrap.conf
@@ -94,6 +94,7 @@ realloc-posix
recv
recvfrom
rename
safe-read
safe-write
select
send
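The safe-read gnulib module enabled above provides safe_read(), which the new code in src/wget.c uses to read the partial file back in: it behaves like read() but transparently restarts after EINTR and reports failure as SAFE_READ_ERROR. Roughly the following, sketched here with a made-up helper name (the real gnulib implementation handles a few more corner cases, such as platform limits on very large reads):

/* Rough sketch of safe_read()'s core behaviour: a read() that restarts
 * when interrupted by a signal instead of failing with EINTR. */
#include <errno.h>
#include <unistd.h>

static ssize_t read_no_eintr(int fd, void *buf, size_t count)
{
	ssize_t n;

	do
		n = read(fd, buf, count);
	while (n < 0 && errno == EINTR);

	return n;	/* >= 0 bytes read (0 at EOF), -1 on a real error */
}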
115 changes: 89 additions & 26 deletions src/wget.c
@@ -48,6 +48,7 @@
#endif

#include "timespec.h" // gnulib gettime()
#include "safe-read.h"
#include "safe-write.h"
#include "wget_main.h"
#include "wget_log.h"
@@ -95,7 +96,8 @@ typedef struct {
static _statistics_t stats;

static int G_GNUC_WGET_NONNULL((1)) _prepare_file(wget_http_response_t *resp, const char *fname, int flag,
const char *uri, const char *original_url, int ignore_patterns);
const char *uri, const char *original_url, int ignore_patterns, wget_buffer_t *partial_content,
size_t max_partial_content);

static void
sitemap_parse_xml(JOB *job, const char *data, const char *encoding, wget_iri_t *base),
@@ -1592,7 +1594,7 @@ static void process_response(wget_http_response_t *resp)
}
}

if (resp->code == 200) {
if (resp->code == 200 || resp->code == 206) {
if (config.recursive && (!config.level || job->level < config.level + config.page_requisites)) {
if (resp->content_type && resp->body) {
if (!wget_strcasecmp_ascii(resp->content_type, "text/html")) {
@@ -1630,7 +1632,7 @@ }
}
}
}
else if (resp->code == 304 && config.timestamping) { // local document is up-to-date
else if ((resp->code == 304 && config.timestamping) || resp->code == 416) { // local document is up-to-date
if (config.recursive && (!config.level || job->level < config.level + config.page_requisites) && job->local_filename) {
const char *ext;

@@ -2420,13 +2422,56 @@ static void set_file_mtime(int fd, time_t modified)
error_printf (_("Failed to set file date: %s\n"), strerror (errno));
}

// On windows, open() and fopen() return EACCES instead of EISDIR.
static int _wa_open(const char *fname, int flags, mode_t mode) {
int fd = open(fname, flags, mode);
#ifdef _WIN32
if (fd < 0 && errno == EACCES) {
DWORD attrs = GetFileAttributes(fname);
if (attrs != INVALID_FILE_ATTRIBUTES && (attrs & FILE_ATTRIBUTE_DIRECTORY))
errno = EISDIR;
}
#endif
return fd;
}

// Opens files uniquely
static int _open_unique(const char *fname, int flags, mode_t mode, int multiple, char *unique, size_t unique_len)
{
if (unique_len && unique[0]) {
return _wa_open(unique, flags, mode);
} else {
size_t fname_len, i, lim, n_digits;
int fd;

fd = _wa_open(fname, flags, mode);
if (fd >= 0)
return fd;

fname_len = strlen(fname);
if (unique_len < fname_len + 3)
return fd;

for (n_digits = unique_len - fname_len - 2, lim = 1; n_digits; n_digits--, lim *= 10)
;

for (i = 1; i < lim && fd < 0 && ((multiple && errno == EEXIST) || errno == EISDIR); i++) {
snprintf(unique, unique_len, "%s.%zu", fname, i);
fd = _wa_open(unique, flags, mode);
}

return fd;
}
}

static int G_GNUC_WGET_NONNULL((1)) _prepare_file(wget_http_response_t *resp, const char *fname, int flag,
const char *uri, const char *original_url, int ignore_patterns)
const char *uri, const char *original_url, int ignore_patterns, wget_buffer_t *partial_content,
size_t max_partial_content)
{
static wget_thread_mutex_t
savefile_mutex = WGET_THREAD_MUTEX_INITIALIZER;
char *alloced_fname = NULL;
int fd, multiple = 0, fnum, oflag = flag, maxloop;
int fd, multiple = 0, oflag = flag;
size_t fname_length;

if (!fname)
@@ -2564,38 +2609,52 @@ static int G_GNUC_WGET_NONNULL((1)) _prepare_file(wget_http_response_t *resp, co

// create the complete directory path
mkdir_path((char *) fname);
fd = open(fname, O_WRONLY | flag | O_CREAT | O_NONBLOCK | O_BINARY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);
// debug_printf("1 fd=%d flag=%02x (%02x %02x %02x) errno=%d %s\n",fd,flag,O_EXCL,O_TRUNC,O_APPEND,errno,fname);

// find a non-existing filename
char unique[fname_length + 1];
*unique = 0;
for (fnum = 0, maxloop = 999; fd < 0 && ((multiple && errno == EEXIST) || errno == EISDIR || errno == EACCES) && fnum < maxloop; fnum++) {
snprintf(unique, sizeof(unique), "%s.%d", fname, fnum + 1);
fd = open(unique, O_WRONLY | flag | O_CREAT | O_NONBLOCK | O_BINARY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH);

// Load partial content
if (partial_content) {
long long size = get_file_size(unique[0] ? unique : fname);
if (size > 0) {
fd = _open_unique(fname, O_RDONLY | O_BINARY, 0, multiple, unique, fname_length + 1);
if (fd >= 0) {
size_t rc;
if ((unsigned long long) size > max_partial_content)
size = max_partial_content;
wget_buffer_memset_append(partial_content, 0, size);
rc = safe_read(fd, partial_content->data, size);
if (rc == SAFE_READ_ERROR || (long long) rc != size) {
error_printf(_("Failed to load partial content from '%s' (errno=%d): %s\n"),
fname, errno, strerror(errno));
set_exit_status(3);
}
close(fd);
} else {
error_printf(_("Failed to load partial content from '%s' (errno=%d): %s\n"),
fname, errno, strerror(errno));
set_exit_status(3);
}
}
}

fd = _open_unique(fname, O_WRONLY | flag | O_CREAT | O_NONBLOCK | O_BINARY, S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH,
multiple, unique, fname_length + 1);
// debug_printf("1 fd=%d flag=%02x (%02x %02x %02x) errno=%d %s\n",fd,flag,O_EXCL,O_TRUNC,O_APPEND,errno,fname);

if (fd >= 0) {
ssize_t rc;

info_printf(_("Saving '%s'\n"), fnum ? unique : fname);
info_printf(_("Saving '%s'\n"), unique[0] ? unique : fname);

if (config.save_headers) {
if ((rc = write(fd, resp->header->data, resp->header->length)) != (ssize_t)resp->header->length) {
error_printf(_("Failed to write file %s (%zd, errno=%d)\n"), fnum ? unique : fname, rc, errno);
error_printf(_("Failed to write file %s (%zd, errno=%d)\n"), unique[0] ? unique : fname, rc, errno);
set_exit_status(3);
}
}
} else {
if (fd == -1) {
#ifdef _WIN32
// On windows, open() and fopen() return EACCES instead of EISDIR.
if (errno == EACCES) {
DWORD attrs = GetFileAttributes(unique);
if (attrs != INVALID_FILE_ATTRIBUTES && (attrs & FILE_ATTRIBUTE_DIRECTORY))
errno = EISDIR;
}
#endif
if (errno == EEXIST)
error_printf(_("File '%s' already there; not retrieving.\n"), fname);
else if (errno == EISDIR)
@@ -2609,7 +2668,7 @@ static int G_GNUC_WGET_NONNULL((1)) _prepare_file(wget_http_response_t *resp, co

if (config.xattr) {
FILE *fp;
fname = fnum ? unique : fname;
fname = unique[0] ? unique : fname;
if ((fp = fopen(fname, "ab"))) {
set_file_metadata(uri, original_url, resp->content_type, resp->content_type_encoding, fp);
fclose(fp);
@@ -2683,7 +2742,9 @@ static int _get_header(wget_http_response_t *resp, void *context)
resp->code == 206 ? O_APPEND : O_TRUNC,
ctx->job->iri->uri,
ctx->job->original_url->uri,
ctx->job->ignore_patterns);
ctx->job->ignore_patterns,
resp->code == 206 ? ctx->body : NULL,
ctx->max_memory);
if (ctx->outfd == -1)
ret = -1;
}
@@ -2820,9 +2881,11 @@ static wget_http_request_t *http_create_request(wget_iri_t *iri, JOB *job)
if (config.continue_download || config.timestamping) {
const char *local_filename = config.output_document ? config.output_document : job->local_filename;

if (config.continue_download)
wget_http_add_header_printf(req, "Range", "bytes=%lld-",
get_file_size(local_filename));
if (config.continue_download) {
long long file_size = get_file_size(local_filename);
if (file_size > 0)
wget_http_add_header_printf(req, "Range", "bytes=%lld-", file_size);
}

if (config.timestamping) {
time_t mtime = get_file_mtime(local_filename);
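One more detail from the wget.c hunk just above: http_create_request() now adds a Range header only when get_file_size() reports a positive size, so no Range header is sent at all for a missing or empty local file. A hypothetical, self-contained illustration of that guard (local_file_size() and the printf output stand in for wget2's request API):

/* Hypothetical illustration of the guarded Range logic, not wget2 code. */
#include <stdio.h>
#include <sys/stat.h>

static long long local_file_size(const char *path)
{
	struct stat st;
	return stat(path, &st) == 0 ? (long long) st.st_size : -1;
}

int main(void)
{
	long long size = local_file_size("index.html");

	if (size > 0)
		printf("Range: bytes=%lld-\n", size);	/* resume from this offset */
	else
		printf("no Range header, request the full document\n");

	return 0;
}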
2 changes: 1 addition & 1 deletion tests/Makefile.am
@@ -11,7 +11,7 @@ LDADD = libtest.la\
$(LIBS)

# test-iri-disabled removed since Wget2 always uses IRIs
WGET_TESTS = test-wget-1$(EXEEXT) test-restrict-ascii$(EXEEXT) test-i-http$(EXEEXT) test-i-https$(EXEEXT)\
WGET_TESTS = test-wget-1$(EXEEXT) test-c-r$(EXEEXT) test-restrict-ascii$(EXEEXT) test-i-http$(EXEEXT) test-i-https$(EXEEXT)\
test-np$(EXEEXT) test--spider-r$(EXEEXT) test-meta-robots$(EXEEXT) test-idn-robots$(EXEEXT) test-idn-meta$(EXEEXT)\
test-idn-cmd$(EXEEXT) test-iri$(EXEEXT) test-iri-percent$(EXEEXT) test-iri-list$(EXEEXT) test-iri-forced-remote$(EXEEXT)\
test-auth-basic$(EXEEXT) test-parse-rss$(EXEEXT) test--page-requisites$(EXEEXT)\
143 changes: 143 additions & 0 deletions tests/test-c-r.c
@@ -0,0 +1,143 @@
/*
* Copyright(c) 2017 Free Software Foundation, Inc.
*
* This file is part of libwget.
*
* Libwget is free software: you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation, either version 3 of the License, or
* (at your option) any later version.
*
* Libwget is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with libwget. If not, see <https://www.gnu.org/licenses/>.
*
*
* Testing recursive downloads with partial content
*
*/

#include <config.h>

#include <stdlib.h> // exit()
#include <string.h> // strlen()
#include "libtest.h"

static const char *mainpage = "\
<html>\n\
<head>\n\
<title>Main Page</title>\n\
</head>\n\
<body>\n\
<p>\n\
<a href=\"http://localhost:{{port}}/secondpage.html\">second page</a>.\n\
<a href=\"http://localhost:{{port}}/thirdpage.html\">third page</a>.\n\
</p>\n\
</body>\n\
</html>\n";

static const char *subpage = "\
<html>\n\
<head>\n\
<title>Main Page</title>\n\
</head>\n\
<body>\n\
<p>\n\
Some text\n\
</p>\n\
</body>\n\
</html>\n";

static const char *mainpage_partial = "\
<html>\n\
<head>\n\
<title>Main Page</title>\n\
</head>\n\
<body>\n\
<p>\n\
<a href=\"http://localhost:{{port}}/secondpage.html\">second page</a>.\n";

int main(void)
{
wget_test_url_t urls[]={
{ .name = "/index.html",
.code = "200 Dontcare",
.body = mainpage,
.headers = {
"Content-Type: text/html",
}
},
{ .name = "/secondpage.html",
.code = "200 Dontcare",
.body = subpage,
.headers = {
"Content-Type: text/html",
}
},
{ .name = "/thirdpage.html",
.code = "200 Dontcare",
.body = subpage,
.headers = {
"Content-Type: text/html",
}
},
{ .name = "/index_partial.html",
.code = "200 Dontcare",
.body = mainpage_partial,
.headers = {
"Content-Type: text/html",
}
}
};

wget_test_start_server
(WGET_TEST_RESPONSE_URLS, &urls, countof(urls), 0);

// test-c-r with no existing files
wget_test(
WGET_TEST_OPTIONS, "--continue --recursive --no-host-directories",
WGET_TEST_REQUEST_URL, "index.html",
WGET_TEST_EXPECTED_ERROR_CODE, 0,
WGET_TEST_EXPECTED_FILES, &(wget_test_file_t []) {
{ "index.html", urls[0].body },
{ "secondpage.html", urls[1].body },
{ "thirdpage.html", urls[2].body },
{ NULL } },
0);

// test-c-r with existing file
wget_test(
WGET_TEST_OPTIONS, "--continue --recursive --no-host-directories",
WGET_TEST_REQUEST_URL, "index.html",
WGET_TEST_EXPECTED_ERROR_CODE, 0,
WGET_TEST_EXISTING_FILES, &(wget_test_file_t []) {
{ "index.html", urls[0].body },
{ NULL } },
WGET_TEST_EXPECTED_FILES, &(wget_test_file_t []) {
{ "index.html", urls[0].body },
{ "secondpage.html", urls[1].body },
{ "thirdpage.html", urls[2].body },
{ NULL } },
0);

// test-c-r with existing partial file
wget_test(
WGET_TEST_OPTIONS, "--continue --recursive --no-host-directories",
WGET_TEST_REQUEST_URL, "index.html",
WGET_TEST_EXPECTED_ERROR_CODE, 0,
WGET_TEST_EXISTING_FILES, &(wget_test_file_t []) {
{ "index.html", urls[3].body },
{ NULL } },
WGET_TEST_EXPECTED_FILES, &(wget_test_file_t []) {
{ "index.html", urls[0].body },
{ "secondpage.html", urls[1].body },
{ "thirdpage.html", urls[2].body },
{ NULL } },
0);

exit(0);
}
