Skip to content

Commit

Permalink
Merge a2b5cbc into 6d9de9a
Browse files Browse the repository at this point in the history
  • Loading branch information
dshunfen committed Dec 24, 2018
2 parents 6d9de9a + a2b5cbc commit 37a7f6b
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 27 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,6 @@
.ssh
*dsa*
.idea

Pipfile
Pipfile.lock
59 changes: 35 additions & 24 deletions puremagic/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import json
import binascii
from itertools import chain
from collections import namedtuple

__author__ = "Chris Griffith"
__version__ = "1.4"
Expand All @@ -25,6 +26,10 @@

here = os.path.abspath(os.path.dirname(__file__))

# Field names, in positional order, shared by both record types below.
MAGIC_INFO_TYPES = ('byte_match', 'offset', 'extension', 'mime_type', 'name',)
# One magic-data entry with the fields listed above.
PureMagic = namedtuple('PureMagic', MAGIC_INFO_TYPES)
# Same fields as PureMagic plus a trailing 'confidence' field.
PureMagicWithConfidence = namedtuple('PureMagicWithConfidence', (MAGIC_INFO_TYPES + ('confidence',)))


class PureError(LookupError):
"""Do not have that type of file in our databanks"""
Expand All @@ -34,20 +39,26 @@ def _magic_data(filename=os.path.join(here, 'magic_data.json')):
""" Read the magic file"""
with open(filename) as f:
data = json.load(f)
for x in data['headers']:
x[0] = binascii.unhexlify(x[0].encode('ascii'))
for x in data['footers']:
x[0] = binascii.unhexlify(x[0].encode('ascii'))
return data['headers'], data['footers']
headers = [_create_puremagic(x) for x in data['headers']]
footers = [_create_puremagic(x) for x in data['footers']]
return headers, footers


def _create_puremagic(x):
    """Build a PureMagic record from one raw magic-data row.

    ``x`` is read positionally: ``x[0]`` is the hex-encoded signature, which
    is decoded to raw bytes, and ``x[1]`` through ``x[4]`` supply the offset,
    extension, mime type and name respectively.
    """
    signature = binascii.unhexlify(x[0].encode('ascii'))
    # Positional order matches MAGIC_INFO_TYPES.
    return PureMagic(signature, x[1], x[2], x[3], x[4])


magic_header_array, magic_footer_array = _magic_data()


def _max_lengths():
    """Return the largest header and footer spans described by the magic data.

    For every entry the span is the length of its ``byte_match`` plus its
    ``offset``; footer offsets are taken as absolute values.

    Returns:
        A ``(max_header_length, max_footer_length)`` tuple.
    """
    max_header_length = max(len(x.byte_match) + x.offset
                            for x in magic_header_array)
    max_footer_length = max(len(x.byte_match) + abs(x.offset)
                            for x in magic_footer_array)
    return max_header_length, max_footer_length

Expand All @@ -56,12 +67,12 @@ def _confidence(matches, ext=None):
""" Rough confidence based on string length and file extension"""
results = []
for match in matches:
con = (0.8 if len(match[0]) > 9 else
float("0.{0}".format(len(match[0]))))
if ext == match[0]:
con = (0.8 if len(match.extension) > 9 else
float("0.{0}".format(len(match.extension))))
if ext == match.extension:
con = 0.9
results.append(match + [con])
return sorted(results, key=lambda x: x[3], reverse=True)
results.append(PureMagicWithConfidence(**match._asdict(), confidence=con))
return sorted(results, key=lambda x: x.confidence, reverse=True)


def _identify_all(header, footer, ext=None):
Expand All @@ -71,17 +82,17 @@ def _identify_all(header, footer, ext=None):
# That way we do not try to identify bytes that don't exist
matches = list()
for magic_row in magic_header_array:
start = magic_row[1]
end = magic_row[1] + len(magic_row[0])
start = magic_row.offset
end = magic_row.offset + len(magic_row.byte_match)
if end > len(header):
continue
if header[start:end] == magic_row[0]:
matches.append([magic_row[2], magic_row[3], magic_row[4]])
if header[start:end] == magic_row.byte_match:
matches.append(magic_row)

for magic_row in magic_footer_array:
start = magic_row[1]
if footer[start:] == magic_row[0]:
matches.append([magic_row[2], magic_row[3], magic_row[4]])
start = magic_row.offset
if footer[start:] == magic_row.byte_match:
matches.append(magic_row)
if not matches:
raise PureError("Could not identify file")

Expand All @@ -94,8 +105,8 @@ def _magic(header, footer, mime, ext=None):
raise ValueError("Input was empty")
info = _identify_all(header, footer, ext)[0]
if mime:
return info[1]
return info[0] if not isinstance(info[0], list) else info[0][0]
return info.mime_type
return info.extension if not isinstance(info.extension, list) else info[0].extension


def _file_details(filename):
Expand Down Expand Up @@ -128,7 +139,7 @@ def ext_from_filename(filename):
except ValueError:
return ''
ext = ".{0}".format(ext)
all_exts = [x[2] for x in chain(magic_header_array, magic_footer_array)]
all_exts = [x.extension for x in chain(magic_header_array, magic_footer_array)]

if base[-4:].startswith("."):
# For double extensions like .tar.gz
Expand Down Expand Up @@ -182,7 +193,7 @@ def magic_file(filename):
info = _identify_all(head, foot, ext_from_filename(filename))
except PureError:
info = []
info.sort(key=lambda x: x[3], reverse=True)
info.sort(key=lambda x: x.confidence, reverse=True)
return info


Expand All @@ -200,7 +211,7 @@ def magic_string(string, filename=None):
head, foot = _string_details(string)
ext = ext_from_filename(filename) if filename else None
info = _identify_all(head, foot, ext)
info.sort(key=lambda x: x[3], reverse=True)
info.sort(key=lambda x: x.confidence, reverse=True)
return info


Expand Down
6 changes: 3 additions & 3 deletions test/test_common_extensions.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def test_string_with_filename_hint(self):
def test_string_with_confidence(self):
    """String identification: magic_string |"""
    matches = puremagic.magic_string(bytes(self.mp4magic))
    # Results are namedtuples; index 0 of a result is byte_match, so the
    # extension must be read by attribute rather than by position.
    self.assertEqual(self.expect_ext, matches[0].extension)
    self.assertRaises(ValueError, puremagic.magic_string, "")

def test_magic_string_with_filename_hint(self):
Expand All @@ -65,7 +65,7 @@ def test_magic_string_with_filename_hint(self):
with open(filename, "rb") as f:
data = f.read()
ext = puremagic.magic_string(data, filename=filename)
self.assertEqual(".xlsx", ext[0][0])
self.assertEqual(".xlsx", ext[0].extension)

def test_not_found(self):
"""Bad file type via string |"""
Expand All @@ -79,7 +79,7 @@ def test_not_found(self):

def test_magic_file(self):
"""File identification with magic_file |"""
self.assertEqual(puremagic.magic_file(TGA_FILE)[0][0], ".tga")
self.assertEqual(puremagic.magic_file(TGA_FILE)[0].extension, ".tga")
open("test_empty_file", "w").close()
try:
self.assertRaises(ValueError,
Expand Down

0 comments on commit 37a7f6b

Please sign in to comment.