Skip to content

Commit

Permalink
{CRAWLTIMELIMIT} contains the -t arg. Need to get raw variable
Browse files Browse the repository at this point in the history
Add == before base64decode
  • Loading branch information
hieuhoang committed Nov 8, 2018
1 parent 5104317 commit 907434a
Show file tree
Hide file tree
Showing 2 changed files with 9 additions and 4 deletions.
8 changes: 6 additions & 2 deletions bitextor-identifyMIME.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,15 @@
content=fields[0]
#~Mime and encoding
m.setflags(16|1024)
magicoutput=m.buffer(base64.b64decode(content)).split(" ")
#print("content", content)
d = base64.b64decode(content + "==")
#print("d", d)

magicoutput=m.buffer(d).split(" ")
magicoutput[0]=magicoutput[0][:-1]
magicoutput.append(url)
try:
magicoutput.append(base64.b64encode(base64.b64decode(content).decode(magicoutput[1].split("=")[1].replace("unknown-8bit","iso-8859-1").replace('us-ascii','iso-8859-1')).encode("utf8")).decode("utf8"))
magicoutput.append(base64.b64encode(base64.b64decode(content + "==").decode(magicoutput[1].split("=")[1].replace("unknown-8bit","iso-8859-1").replace('us-ascii','iso-8859-1')).encode("utf8")).decode("utf8"))
print("\t".join(magicoutput))
except LookupError as e:
sys.stderr.write("Unknown character encoding in file "+url+": "+str(e)+"\n")
Expand Down
5 changes: 3 additions & 2 deletions snakemake/Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -350,11 +350,12 @@ rule httrack_download:
'{dir}'.format(dir=permanent)+'/{target}.httrack.warc.xz'
params:
url="http://{target}"
,
crawlTimeLimit = config["crawlTimeLimit"]
shell:
'mkdir -p {permanent}; '
'DIRNAME=$(mktemp -d {TMPDIR}/downloaded_websites.XXXXXX); '
'echo "{BITEXTOR}/bitextor-downloadweb.sh {params.url} $DIRNAME {CRAWLTIMELIMIT}"; '
'{BITEXTOR}/bitextor-downloadweb.sh {params.url} $DIRNAME {CRAWLTIMELIMIT}; '
'{BITEXTOR}/bitextor-downloadweb.sh {params.url} $DIRNAME {params.crawlTimeLimit}; '
'{BITEXTOR}/bitextor-webdir2warc.sh $DIRNAME | xz -c > {output}; '
'rm -rf $DIRNAME;'

Expand Down

0 comments on commit 907434a

Please sign in to comment.