# Python equivalents of the P4 hashing routines

In [20]:
from crccheck.crc import Crc32, CrcXmodem, Crc16Genibus
from crccheck.checksum import Checksum32
import numpy as np
import bitarray
import binascii
import socket
import binascii
from bitstring import BitArray

# NOTE(review): these look like the DNS resource-record type codes for
# A (1) and CNAME (5); neither is referenced in the code shown here — confirm.
RR_A = 1
RR_CNAME = 5
# Maximum number of characters of a label that are packed and hashed at once.
LABEL_LENGTH = 7
# Width in bits of one packed chunk (8 bits per character).
LABEL_LENGTH_BITS = LABEL_LENGTH * 8

def hex_to_binary(my_hexdata, num_of_bits):
    '''
    Convert a hexadecimal string into a string of binary digits.

    The result is left-padded with zeros up to num_of_bits characters; values
    that need more digits than that are returned unpadded and untruncated.
    '''
    hex_base = 16
    value = int(my_hexdata, hex_base)
    digits = bin(value)[2:]  # drop the leading "0b"
    return digits.zfill(num_of_bits)

def str_to_binary(label):
    '''
    Return the bits of the UTF-8 encoding of label as a list
    (eight entries per encoded byte, in bitarray's default bit order).
    '''
    encoded = label.encode('utf-8')
    bits = bitarray.bitarray()
    bits.frombytes(encoded)
    return bits.tolist()

def bitstring_to_bytes(s):
    '''
    Pack a string of '0'/'1' characters into big-endian bytes.

    The output length is len(s) rounded up to a whole number of bytes, so a
    bit string that is not a multiple of 8 is zero-extended on the left.
    '''
    value = int(s, 2)
    byte_count, leftover_bits = divmod(len(s), 8)
    if leftover_bits:
        byte_count += 1
    return value.to_bytes(byte_count, byteorder='big')
    
def calc_crc_16_IP(ip):
    '''
    Hash an IPv4 address with CRC-16/GENIBUS.

    Takes:
    ip: the address in dotted notation (such as "1.1.1.1")

    The 4 packed address bytes are left-padded with zero bytes to LABEL_LENGTH
    bytes (56 bits), i.e. the same width as a packed label, before hashing.

    Returns:
    str: the hash in hexadecimal

    Raises:
    OSError: if socket.inet_aton rejects the address string
    '''
    packed_ip = socket.inet_aton(ip)
    # Pad at byte level instead of via a bit string: the previous 20-bit pad
    # only produced 7 bytes because the byte count was rounded up afterwards.
    padded = packed_ip.rjust(LABEL_LENGTH, b"\x00")
    crc_genibus = Crc16Genibus()
    crc_genibus.process(padded)
    return crc_genibus.finalhex()


# Sample address for manually exercising calc_crc_16_IP (the call is left disabled).
ip_addr = "192.168.200.10"
# print("HASH OF IP ", ip_addr, calc_crc_16_IP(ip_addr))

# """

def calc_crc_16_P4(url):
    '''
    Hash every dot-separated label of url with CRC-16/GENIBUS.

    Each label is packed into a zero-initialised field of LABEL_LENGTH_BITS
    bits made of three slots, starting from the most significant end:
    1 character (8 bits), 2 characters (16 bits) and 4 characters (32 bits).
    A slot is filled only when the label length selects it; otherwise it stays
    zero. The layout is based on the P4 program.

    Labels longer than 7 characters are not truncated here; their length just
    fails the 2- and 4-character conditions, so at most the first character is
    packed.

    Returns:
    list: one hexadecimal hash string per label, or -1 (after printing the
    exception) if packing or hashing a label raises.
    '''
    labels = url.split(".")
    digests = []
    try:
        for label in labels:
            size = len(label)
            # (slot width in characters, whether this label fills the slot)
            slots = (
                (1, size % 2 != 0),
                (2, size in (2, 3, 6, 7)),
                (4, 4 <= size <= 7),
            )

            bits = np.zeros((LABEL_LENGTH_BITS))
            upper = LABEL_LENGTH_BITS  # exclusive upper bit index of the current slot
            consumed = 0               # characters of the label already packed
            for width, wanted in slots:
                lower = upper - 8 * width
                if wanted:
                    piece = label[consumed:consumed + width]
                    bits[lower:upper] = str_to_binary(piece)
                    consumed += width
                # The slot is skipped over even when it was left empty.
                upper = lower

            # float array -> "0101..." string -> bytes
            bit_string = ''.join(str(int(bit)) for bit in bits)
            payload = bitstring_to_bytes(bit_string)
            crc_genibus = Crc16Genibus()
            crc_genibus.process(payload)
            digests.append(crc_genibus.finalhex())
    except Exception as e:
        print(e)
        return -1

    return digests


# print("hash of abc.efg.acdefgh", calc_crc_16_P4("abc.efg.abcdefgh"))
# print()

def hash_concat_hashes(url, initial_hash_str):
    '''
    Fold all labels of a domain into a single 16-bit hash.

    Takes:
    url: the domain that needs to be hashed (per label), such as "NS2.AMERICATELNET.COM.PE"
    initial_hash_str: the initial hash in hexadecimal form (16 bit) represented in a string, such as "0000" or "14e5"

    Every label is processed in chunks of at most LABEL_LENGTH characters.
    For each chunk, the 16-bit running hash and the chunk's 16-bit hash from
    calc_crc_16_P4 are concatenated into 4 bytes and hashed again with
    CRC-16/GENIBUS; the result becomes the new running hash. An empty label
    still contributes one (all-zero) chunk.

    Returns:
    str: the final running hash in hexadecimal, or "-1" if calc_crc_16_P4
    fails on a chunk (same convention as hash_last_label).

    Raises:
    ValueError: if initial_hash_str is not a hexadecimal number
    OverflowError: if initial_hash_str does not fit in 16 bits
    '''
    running_hash = initial_hash_str

    for label in url.split("."):
        while True:
            chunk, label = label[:LABEL_LENGTH], label[LABEL_LENGTH:]

            chunk_hashes = calc_crc_16_P4(chunk)
            if chunk_hashes == -1:
                # calc_crc_16_P4 has already printed the reason.
                return "-1"

            # 32-bit input: running hash in the high 16 bits, chunk hash in the low 16.
            payload = (int(running_hash, 16).to_bytes(2, byteorder='big')
                       + int(chunk_hashes[0], 16).to_bytes(2, byteorder='big'))
            crc_genibus = Crc16Genibus()
            crc_genibus.process(payload)
            running_hash = crc_genibus.finalhex()

            if not label:
                break

    return running_hash


# print("Hash concat Hashes", hash_concat_hashes("abc.googleee.doodleee", "0000"))

def hash_last_label(last_label_str):
    '''
    Fold a single label into one 16-bit hash, LABEL_LENGTH characters at a time.

    Takes:
    last_label_str: the label to hash, such as "googleee"

    Starting from the running hash "0000", each chunk of at most LABEL_LENGTH
    characters is hashed with calc_crc_16_P4; the running hash and the chunk
    hash are concatenated (32 bits) and hashed again with CRC-16/GENIBUS.

    Returns:
    str: the final hash in hexadecimal ("0000" for an empty label), or "-1"
    if calc_crc_16_P4 fails on a chunk.
    '''
    running_hash = "0000"
    remaining = last_label_str

    while len(remaining) != 0:
        chunk_hashes = calc_crc_16_P4(remaining[:LABEL_LENGTH])
        if chunk_hashes == -1:
            return "-1"

        # Running hash in the high 16 bits, chunk hash in the low 16 bits.
        bits = hex_to_binary(running_hash, 16) + hex_to_binary(chunk_hashes[0], 16)
        crc_genibus = Crc16Genibus()
        crc_genibus.process(bitstring_to_bytes(bits))
        running_hash = crc_genibus.finalhex()

        remaining = remaining[LABEL_LENGTH:]

    return running_hash

# print("Hash_last_label", hash_last_label("googleee"))

# abc
# print("a".encode('utf-8').hex())
# print("bc".encode('utf-8').hex())
# print("ab".encode('utf-8').hex())

# # googleee
# print("g".encode('utf-8').hex())
# print("go".encode('utf-8').hex())
# print("oo".encode('utf-8').hex())
# print("og".encode('utf-8').hex())
# print("gl".encode('utf-8').hex())
# print("le".encode('utf-8').hex())

# e
# print("e".encode('utf-8').hex())

# Smoke check on a 13-character label: it is longer than LABEL_LENGTH, so it is
# hashed in two chunks.
print(hash_last_label("xn--ygbi2ammx"))
# """

5d19


# Calculating the P4 hash (in hex) of each TLD

In [24]:
# Load the TLD list: one TLD per line, lower-cased and stripped of whitespace.
valid_tlds = []
with open('tlds-alpha-by-domain.txt') as f_r:
    # The first line of the file is not a TLD and is skipped.
    next(f_r, None)
    for l in f_r:
        tld = l.lower().strip()
        valid_tlds.append(tld)
        # Report TLDs that do not fit into a single hashed chunk.
        if len(tld) > LABEL_LENGTH:
            print(tld)

# Hash every TLD and write the mapping as "tld,hash" lines.
# The with-block guarantees the file is flushed and closed.
tld_to_P4hex = {}
with open("tlds_to_P4hex.txt", 'w') as f_w:
    for tld in valid_tlds:
        tld_to_P4hex[tld] = hash_last_label(tld)
        f_w.write(tld + "," + tld_to_P4hex[tld] + "\n")

abudhabi
accenture
accountant
accountants
airforce
alfaromeo
allfinanz
allstate
americanexpress
americanfamily
amsterdam
analytics
apartments
aquarelle
associates
attorney
bananarepublic
barcelona
barclaycard
barclays
barefoot
bargains
baseball
basketball
blackfriday
blockbuster
bloomberg
bnpparibas
boehringer
boutique
bradesco
bridgestone
broadway
brussels
builders
business
calvinklein
capetown
capitalone
catering
catholic
christmas
cipriani
cityeats
cleaning
clinique
clothing
commbank
community
computer
construction
consulting
contractors
cookingchannel
creditcard
creditunion
cuisinella
delivery
deloitte
democrat
diamonds
directory
discount
discover
download
education
engineer
engineering
enterprises
equipment
ericsson
etisalat
eurovision
exchange
extraspace
fairwinds
feedback
fidelity
financial
firestone
firmdale
foodnetwork
football
foundation
fresenius
frontdoor
frontier
furniture
goldpoint
goodyear
grainger
graphics
guardian
hdfcbank
healthcare
helsinki
hisamitsu
holdings
homedep

In [22]:
# Dump the TLD -> hash mapping built from the TLD list.
print(tld_to_P4hex)
# Per-label hashes of a two-label domain (calc_crc_16_P4 returns one entry per label).
print(calc_crc_16_P4("xn--tiq49xqyj.com"))

{'aaa': '4ef2', 'aarp': '0631', 'abarth': '4224', 'abb': '3bfc', 'abbott': '15d7', 'abbvie': 'f33f', 'abc': '4d48', 'able': '9232', 'abogado': '6958', 'abudhabi': '5cf7', 'ac': '0fc1', 'academy': 'ffd3', 'accenture': 'a9c1', 'accountant': '7746', 'accountants': '4b99', 'aco': 'a82b', 'actor': '4f36', 'ad': '5eec', 'adac': '7512', 'ads': '9b6a', 'adult': 'ec2d', 'ae': '2858', 'aeg': 'e06d', 'aero': 'd6db', 'aetna': '79bb', 'af': 'b384', 'afl': '10a0', 'africa': '7ba9', 'ag': 'c530', 'agakhan': '384b', 'agency': '3f96', 'ai': '676a', 'aig': '6b46', 'airbus': '7e72', 'airforce': '2a3a', 'airtel': '2bf1', 'akdn': '9b34', 'al': 'db2f', 'alfaromeo': '071b', 'alibaba': 'd723', 'alipay': '57df', 'allfinanz': '0122', 'allstate': '24e5', 'ally': '1d44', 'alsace': 'e05a', 'alstom': 'ca79', 'am': 'ad9b', 'amazon': '79a5', 'americanexpress': '9302', 'americanfamily': '92c6', 'amex': '34fb', 'amfam': '635c', 'amica': 'd19e', 'amsterdam': 'cf8e', 'analytics': 'e8ab', 'android': '6fe7', 'anquan': '5ac