In [42]:
import socket
import re
import sys
import signal

In [44]:
class WebProxy:
    """Blocking TCP listener that records every received chunk and echoes it back.

    Connections are served one at a time on the calling thread. Each non-empty
    chunk read from a client is appended to ``data_list`` and sent straight
    back to that client.

    Args:
        host: Interface address to bind to.
        port: TCP port to listen on.

    Attributes:
        data_list: Raw ``bytes`` chunks received so far, in arrival order.
        running: Loop flag for ``run``; cleared by ``signal_handler``.
        socket: The listening socket, or ``None`` until ``start_listener``
            has been called successfully.
    """

    def __init__(self, host, port):
        self.host = host
        self.port = port
        self.data_list = list()
        self.running = True

        # reserve namespace
        self.socket = None

    def signal_handler(self, sig, frame):
        """SIGINT handler: stop the accept loop, close the listener and exit.

        Args:
            sig: Signal number supplied by the ``signal`` module (unused).
            frame: Interrupted stack frame (unused).

        Raises:
            SystemExit: Always, with exit code 0.
        """
        print('kill caught, closing socket..!')
        # Clear the flag first so anything observing the closed socket can
        # tell that the shutdown was requested rather than a failure.
        self.running = False
        # The handler is registered before the socket exists, so an early
        # interrupt must not trip over the ``None`` placeholder.
        if self.socket is not None:
            self.socket.close()
        sys.exit(0)

    def start_listener(self):
        """Install the SIGINT handler, then bind the socket and start listening.

        Raises:
            OSError: If the address cannot be bound or listened on. The
                half-initialised socket is closed and ``self.socket`` is
                reset to ``None`` before the error propagates.
        """
        # register interrupt handler
        signal.signal(signal.SIGINT, self.signal_handler)

        # bind socket to port and start listening
        self.socket = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            # Allow an immediate re-bind after a previous run was interrupted;
            # otherwise the port can stay unavailable for a while.
            self.socket.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
            self.socket.bind((self.host, self.port))
            self.socket.listen()
        except OSError:
            # Do not leak the descriptor when setup fails part-way.
            self.socket.close()
            self.socket = None
            raise
        print(f"process listening on port {self.port}...")

    def run(self):
        """Accept clients one at a time, recording and echoing their data.

        Loops until ``running`` is cleared. For every connection, chunks of up
        to 1024 bytes are read until the peer closes its side (``recv``
        returns ``b''``). The empty end-of-stream marker is neither stored nor
        echoed.

        Raises:
            OSError: If ``accept`` fails while the proxy is still meant to be
                running. A failure seen after ``running`` was cleared is
                treated as a normal shutdown.
        """
        while self.running:
            try:
                connection, address = self.socket.accept()
            except OSError:
                # A closed listener makes accept() fail; that is expected once
                # a shutdown has been requested.
                if not self.running:
                    break
                raise

            with connection as c:
                print(f"connected established by {address}..!")
                while True:
                    data = c.recv(1024)
                    # b'' means the peer closed the connection: stop before
                    # recording it so data_list only holds real payloads.
                    if not data:
                        break

                    self.data_list.append(data)
                    print(f"data received:\n{data}\n")

                    c.sendall(data)

class HttpParser:
    """Parse the request line and header fields of a raw HTTP request.

    Args:
        data: The raw request as ``bytes``.

    Attributes:
        raw: The bytes passed in.
        string: ``raw`` decoded to ``str`` (ISO-8859-1 by default).
        dict: Parsed fields. Holds ``'Method'``, ``'Path'`` and ``'Version'``
            from the request line plus one entry per header field; a repeated
            header name keeps the last value seen. Left empty when the input
            contains no request line.
        body: Everything after the first blank line, decoded like ``string``;
            ``""`` when there is none.

    Raises:
        ValueError: If the request line does not consist of exactly three
            whitespace-separated parts.
    """

    def __init__(self, data):
        self.raw = data
        self.dict = dict()
        self.body = ""

        # decode raw byte string using iso-8859-1 encoding
        self.string = self._decode_bytes()
        self._parse_string()

    def _decode_bytes(self, encoding: str = "iso-8859-1"):
        """Return ``self.raw`` decoded with ``encoding``."""
        return self.raw.decode(encoding)

    def _parse_string(self):
        """Split the decoded text into head and body and parse the head."""
        # Tolerate empty lines sent ahead of the request line.
        text = self.string.lstrip("\r\n")

        # The first blank line ends the header section; whatever follows is
        # the message body and must not be interpreted as header fields.
        separator = re.search(r"\r?\n\r?\n", text)
        if separator:
            head = text[:separator.start()]
            self.body = text[separator.end():]
        else:
            head = text

        # Split on CRLF / LF only. str.splitlines() would also break on
        # characters such as '\x85' or '\x0c', which can legitimately occur
        # inside an ISO-8859-1 decoded header value.
        data_line_list = re.split(r"\r?\n", head)

        # extract the first line of the request and parse it separately
        header_line = data_line_list.pop(0)
        if not header_line.strip():
            # Nothing to parse (e.g. an empty chunk); leave the dict empty.
            return

        # parse the data
        self._parse_header(header_line)
        self._parse_body(data_line_list)

    def _parse_header(self, header_line: str):
        """Store method, path and version from the request line.

        Raises:
            ValueError: If ``header_line`` does not split into three parts.
        """
        # split the header on whitespace
        parts = header_line.split()
        if len(parts) != 3:
            raise ValueError(f"malformed HTTP request line: {header_line!r}")
        method, path, version = parts

        self.dict['Method'] = method
        self.dict['Path'] = path
        self.dict['Version'] = version

    def _parse_body(self, data_line_list: list):
        """Store ``name: value`` header lines in ``self.dict``.

        Parsing stops at the first empty line. Lines without a colon are
        skipped instead of aborting the whole parse.
        """
        for data_line in data_line_list:
            if not data_line:
                # An empty line marks the end of the header fields.
                break

            # Split on the first colon only: values may contain colons
            # themselves (e.g. "Host: 127.0.0.1:7713").
            field_name, colon, field_val = data_line.partition(':')
            if not colon:
                continue

            # strip both of leading/trailing whitespace and add to the dict
            self.dict[field_name.strip()] = field_val.strip()
            
# Loopback address and port the demo listener binds to.
HOST, PORT = "127.0.0.1", 7713

# Build the proxy, bind its socket, then block in the accept loop.
web_proxy = WebProxy(host=HOST, port=PORT)
web_proxy.start_listener()
web_proxy.run()

process listening on port 7713...
connected established by ('127.0.0.1', 58051)..!
data received:
b'GET /google.com HTTP/1.1\r\nHost: 127.0.0.1:7713\r\nUser-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate, br\r\nDNT: 1\r\nSec-GPC: 1\r\nConnection: keep-alive\r\nUpgrade-Insecure-Requests: 1\r\nSec-Fetch-Dest: document\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-Site: none\r\nSec-Fetch-User: ?1\r\n\r\n'

kill caught, closing socket..!


SystemExit: 0

In [45]:
# Parse the first chunk captured by the proxy and display the resulting
# field dictionary as the cell output.
test_data = web_proxy.data_list[0]
test_parsed_data = HttpParser(test_data)
test_parsed_data.dict

{'Method': 'GET',
 'Path': '/google.com',
 'Version': 'HTTP/1.1',
 'Host': '127.0.0.1:7713',
 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0',
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 'Accept-Language': 'en-US,en;q=0.5',
 'Accept-Encoding': 'gzip, deflate, br',
 'DNT': '1',
 'Sec-GPC': '1',
 'Connection': 'keep-alive',
 'Upgrade-Insecure-Requests': '1',
 'Sec-Fetch-Dest': 'document',
 'Sec-Fetch-Mode': 'navigate',
 'Sec-Fetch-Site': 'none',
 'Sec-Fetch-User': '?1'}

In [6]:
# Keep only the non-empty chunks captured by the proxy. The data is read from
# the proxy instance so the cell does not rely on a leftover global name.
valid_data_list = [d for d in web_proxy.data_list if d]
valid_data_list

[b'GET /google.com HTTP/1.1\r\nHost: 127.0.0.1:7713\r\nUser-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate, br\r\nDNT: 1\r\nSec-GPC: 1\r\nConnection: keep-alive\r\nUpgrade-Insecure-Requests: 1\r\nSec-Fetch-Dest: document\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-Site: none\r\nSec-Fetch-User: ?1\r\n\r\n']

In [12]:
# Take the first captured request for manual inspection.
test_get = valid_data_list[0]
# Decode as ISO-8859-1: it maps every byte value to a character, so the
# decode cannot fail on arbitrary request bytes.
test_str = test_get.decode("iso-8859-1")

In [13]:
# Print rather than echo the value so the line breaks are rendered.
print(test_str)

GET /google.com HTTP/1.1
Host: 127.0.0.1:7713
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
DNT: 1
Sec-GPC: 1
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: none
Sec-Fetch-User: ?1




In [33]:
# Build a parser from the captured request bytes and display the object.
data = HttpParser(test_get)
data

<__main__.HttpData at 0x7fc2b952e2e0>

['GET', '/google.com', 'HTTP/1.1']