# Working with binary data

# Bytes and bytearray Types

In [1]:
# A str is a sequence of characters: unpacking it yields one-character strings
text = "hello"
[*text]

['h', 'e', 'l', 'l', 'o']

In [2]:
# Unpacking a bytes object yields the integer value of each byte, not characters
data = b"hello"
[*data]

[104, 101, 108, 108, 111]

In [3]:
# Prefixing a literal with b creates a bytes object directly;
# no bytes(...) wrapper is needed around it
bytestr = b'abc'
bytestr

b'abc'

In [4]:
# Indexing a bytes object gives the integer value of that byte
# (97 for the first byte of b'abc'), not a one-byte bytes object
bytestr[0]

97

In [5]:
# bytes objects are immutable, so item assignment raises TypeError.
# The error is the point of this cell: catch and print it so that
# "Restart Kernel -> Run All" does not stop here.
try:
    bytestr[0] = 98
except TypeError as exc:
    print("TypeError:", exc)

TypeError: 'bytes' object does not support item assignment

In [6]:
# Build a mutable buffer from an immutable bytes literal
mutable_bytes = bytearray(b'\x00\x0F')

# Unlike bytes, a bytearray supports item assignment and in-place growth
mutable_bytes[0] = 0xFF
mutable_bytes.append(0xFF)
print(mutable_bytes)

# Freeze the buffer into an immutable bytes copy
immutable_bytes = bytes(mutable_bytes)
print(immutable_bytes)

bytearray(b'\xff\x0f\xff')
b'\xff\x0f\xff'


# Writing Bytes to file

In [7]:
# Mode "wb" creates/truncates the file for binary writing; "ab" would append
with open("test.txt", "wb") as binary_file:
    # Text has to be encoded to bytes before it can go into a binary file
    binary_file.write("Write text by encoding\n".encode('utf8'))
    # write() returns how many bytes this particular call wrote
    num_bytes_written = binary_file.write(b'\xDE\xAD\xBE\xEF')

# Reports only the size of the last write() call, not of the whole file
print("Wrote %d bytes." % num_bytes_written)

Wrote 4 bytes.


# Reading Bytes from a file

In [8]:
with open("test.txt", "rb") as binary_file:
    # read() with no argument returns the entire file content as one bytes object
    data = binary_file.read()

# The file is already closed here; the bytes stay available in `data`
print(data)

b'Write text by encoding\n\xde\xad\xbe\xef'


# Reading file line by line

In [9]:
# The file is opened in binary mode ("rb"), so every line is a bytes object
with open("test.txt", "rb") as text_file:
    # One option is to call readline() explicitly
    # single_line = text_file.readline()

    # It is easier to use a for loop to iterate each line.
    # Each line keeps its trailing b'\n' when one is present in the file.
    for line in text_file:
        print(line)

b'Write text by encoding\n'
b'\xde\xad\xbe\xef'


# Getting size of a file

In [10]:
import os
# The st_size field of os.stat() is the file size in bytes
file_length_in_bytes = os.stat("test.txt").st_size
print(file_length_in_bytes)

27


# Seeking a specific position in a file

In [11]:
# seek() has two call forms:
#   x.seek(offset)
#   x.seek(offset, starting_point)
#
# starting_point selects what the offset is measured from:
#   0 - the beginning of the file (default)
#   1 - the current position in the file
#   2 - the end of the file (use a negative offset to land inside the file)

with open("test.txt", "rb") as binary_file:
    # Jump to an absolute position, then read N bytes from there
    binary_file.seek(0, 0)  # offset 0 from the beginning of the file
    couple_bytes = binary_file.read(2)

print(couple_bytes)

b'Wr'


# Getting system byte order

In [18]:
# Find out what byte order your system uses
# (the recorded run below reports 'little').
# Passing two arguments to print() inserts a separator space, which is why
# the output shows two spaces after the colon.
import sys
print("Native byteorder: ", sys.byteorder)

Native byteorder:  little


# Integer to Bytes conversion

In [25]:
i = 16
# Encode the integer 16 as two bytes, most significant byte first (big-endian)
two_byte = i.to_bytes(2, 'big', signed=False)
print(two_byte)

b'\x00\x10'


In [26]:
i = 16
# Encode the same integer as two bytes, least significant byte first (little-endian)
two_byte = i.to_bytes(2, 'little', signed=False)
print(two_byte)

b'\x10\x00'


In [27]:
i = -16
# Negative values need signed=True; they are stored in two's complement form
two_byte = i.to_bytes(2, 'little', signed=True)
print(two_byte)

b'\xf0\xff'


In [28]:
# bytes() accepts a list of integers, each in the range 0-255
bytes_from_list = bytes([0xFF, 0xFE, 0xFD, 0xFC])
print(bytes_from_list)

b'\xff\xfe\xfd\xfc'


In [29]:
# Parse a string of binary digits into an integer.
# The result is a plain int (240) that fits in one byte, not a bytes object.
one_byte = int('11110000', base=2)
print(one_byte)

240


# Bytes to integer conversion

In [30]:
# Decode bytes into an integer; the value is treated as unsigned by default
some_bytes = b'\x00\xF0'
i = int.from_bytes(some_bytes, 'big')
print(i)

240


In [31]:
# Decode the bytes as a signed (two's complement) integer.
# The top bit of this value is clear, so the result is positive.
i = int.from_bytes(b'\x00\x0F', 'big', signed=True)
print(i)

15


# Character Encoding

In [32]:
# Binary to text: building a str from bytes plus an encoding decodes the bytes
binary_data = b'I am text.'
text = str(binary_data, 'utf-8')
print(text)

I am text.


In [33]:
# 65, 66, 67 are the ASCII codes of A, B and C
binary_data = bytes((65, 66, 67))
text = str(binary_data, 'utf-8')
print(text)

ABC


In [34]:
# Text to binary: building bytes from a str plus an encoding encodes the text
message = "Hello"  # str
binary_message = bytes(message, 'utf-8')
print(type(binary_message))  # bytes

<class 'bytes'>


# Format Strings

In [35]:
a_byte = b'\xff'  # 255
i = ord(a_byte)   # Get the integer value of the single byte

# Do not call these variables bin/hex/oct: that would replace the builtin
# functions of the same name for every later cell run in this kernel.
bin_str = "{0:b}".format(i)  # binary: 11111111
hex_str = "{0:x}".format(i)  # hexadecimal: ff
oct_str = "{0:o}".format(i)  # octal: 377

print(i)
print(bin_str)
print(hex_str)
print(oct_str)

255
11111111
ff
377


# Bitwise Operations

In [36]:
# Three sample bit patterns, parsed from strings of binary digits
byte1 = int('11110000', base=2)  # 240
byte2 = int('00001111', base=2)  # 15
byte3 = int('01010101', base=2)  # 85

# AND: a result bit is 1 only where both inputs have a 1.
# byte1 and byte2 share no set bits, so the result is 0.
print(byte1 & byte2)

0


In [37]:
# OR: a result bit is 1 where either input has a 1
# 11110000 | 00001111 == 11111111
print(byte1 | byte2)

255


In [38]:
# XOR: a result bit is 1 where exactly one input has a 1
# 11110000 ^ 01010101 == 10100101
print(byte1 ^ byte3)

165


In [39]:
# Shifting right by n discards the n right-most bits
# 00001111 >> 3 == 00000001
print(byte2 >> 3)

1


In [40]:
# Shifting left by n appends n zero bits on the right side
# 00001111 << 1 == 00011110
print(byte2 << 1)

30


In [41]:
# See if a single bit is set.
# AND-ing with a one-bit mask gives 0 when that bit is clear and the mask
# value itself (non-zero) when it is set.
bit_mask = int('00000001', 2)  # Bit 1 (the right-most, least significant bit)
print(bit_mask & byte1)  # Is bit set in byte1?
print(bit_mask & byte2)  # Is bit set in byte2?

0
1


# Struct Module

In [42]:
import struct
# A Struct object compiles a format string once and packs values into bytes.
# "icc" describes the layout: one C int followed by two single-byte chars.
# No byte-order prefix is given, so the machine's native order and sizes apply.
record_layout = struct.Struct("icc")
# Values are supplied in the same order as the fields of the format string
binary_data = record_layout.pack(8499000, b'A', b'Z')
print(binary_data)

b'8\xaf\x81\x00AZ'


In [43]:
# When unpacking, you receive a tuple of all data in the same order.
# binary_data is the bytes object produced by struct.pack in the previous
# cell; it is unpacked with the same "icc" format string used to pack it.
tuple_of_data = struct.unpack("icc", binary_data)
print(tuple_of_data)

(8499000, b'A', b'Z')


In [46]:
import struct
# calcsize() reports how many bytes a format string occupies once packed.
# Each row pairs the printed label with the format string being measured.
for label, fmt in (
    ("The size of 3 integer is :", 'iii'),
    ("The size of 5 char is :", 'ccccc'),
    ("The total size is :", 'ffiicc'),
):
    print(label, struct.calcsize(fmt))

The size of 3 integer is : 12
The size of 5 char is : 5
The total size is : 18


# Examples

In [3]:
# diff.py - Do two files match?
import sys

# Load both files fully into memory as bytes; both handles are closed as soon
# as the with block ends
with open('BAND1.jpg', 'rb') as file1, \
        open('BAND2.jpg', 'rb') as file2:
    data1 = file1.read()
    data2 = file2.read()

# bytes equality is an exact byte-for-byte comparison, including length
if data1 == data2:
    print("Files match.")
else:
    print("Files do not match.")

Files match.


In [1]:
# is_jpeg.py - Does the file have a JPEG binary signature?
import sys
import binascii

# A JPEG stream starts with the SOI marker (FF D8), followed by the FF byte
# that introduces the next marker. The fourth byte depends on the flavour of
# the file (for example DB, E0 for JFIF, E1 for Exif), so only the first three
# bytes are compared. The previous fixed list contained FFD8FFD8, which is not
# a JPEG signature, and it rejected valid files such as those starting FFD8FFDB.
jpeg_signature_prefix = binascii.unhexlify(b'FFD8FF')

with open('BAND1.jpg', 'rb') as file:
    # Fewer than four bytes come back if the file is shorter than that
    first_four_bytes = file.read(4)

# startswith() is simply False for an empty or too-short header
if first_four_bytes.startswith(jpeg_signature_prefix):
    print("JPEG detected.")
else:
    print("File does not look like a JPEG.")

JPEG detected.
