In [1]:
import socket
# import threading
import re
import sys
import signal
from dataclasses import dataclass

In [8]:
class WebProxy:
    def __init__(self, proxy_host, proxy_port, max_clients: int = 1):
        self.proxy_host = proxy_host
        self.proxy_port = proxy_port

    def start(self):
        # bind socket to port and start listening
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            s.bind((self.proxy_host, self.proxy_port))
            s.listen()
            print(f"listening on port {self.proxy_port}...")

            while True:
                # wait for a connection
                connection, address = s.accept()
                print(f"client {address} connected...")

                with connection as c:
                    while True:
                        # receive data from the connection
                        data = c.recv(1024)

                        # if an empty byte-string was received, end the loop
                        if not data:
                            print("closing connection...")
                            break

                        parsed_data = HttpParser(data)
                        print(f"data received:\n{parsed_data.dict}\n")

                        parsed_data.reformat()


class HttpParser:
    def __init__(self, data):
        self.raw = data
        self.dict = dict()

        # decode raw byte string using iso-8859-1 encoding
        self.string = self._decode_bytes()
        print(data)
        self._parse_string()

    def _decode_bytes(self, encoding: str = "iso-8859-1"):
        return self.raw.decode(encoding)
    
    def _parse_string(self):
        # split the decoded string along line breaks
        data_line_list = self.string.splitlines()
        
        # extract the first line of the request and parse it separately
        header_line = data_line_list.pop(0)

        # parse the data
        self._parse_header(header_line)
        self._parse_body(data_line_list)

    def _parse_header(self, header_line: str):
        # split the header on whitespace
        method, path, version = header_line.split()

        self.dict['Method'] = method
        self.dict['Path'] = path
        self.dict['Version'] = version

    def _parse_body(self, data_line_list: list):
        for data_line in data_line_list:
            # if line has contents
            if data_line:
                field_name, field_val = re.split(':', data_line, 1)
                # strip both of leading/trailing whitespace and add to the dict
                self.dict[field_name.strip()] = field_val.strip()

    def reformat(self):
        data_dict = self.dict.copy()

        header_template = "{} {} {}\r\n"
        data_line_template = "{}: {}\r\n"

        # remove header values from the data_dict
        method = data_dict.pop('Method')
        old_path = data_dict.pop('Path')
        version = data_dict.pop('Version')

        # get the new path and host and update the dict with them
        new_path, new_host = self._reformat_path_host(old_path)
        data_dict['Host'] = new_host

        # format new header
        out_str = header_template.format(
            method, 
            new_path, 
            version
        )
        for k, v in data_dict.items():
            # format the current data line and append it to the string
            data_line = data_line_template.format(k, v)
            out_str += data_line

        # append the final \r\n to the string and format it as a bytestring
        out_str += "\r\n"
        out_byte_str = out_str.encode()

        print("out:\n",out_byte_str)

    def _reformat_path_host(self, old_path):
        if old_path == '/':
            # raise some exception/return a message asking the user to specify a URL
            print('implement this :)')

        # add a slash to the end if it's missing
        if old_path[-1] != "/":
            old_path += "/"

        # find the index of the first "/"
        split_index = old_path.find('/', 1)

        # split the string on the split index
        new_host = old_path[:split_index]
        new_path = old_path[split_index:]
        return new_path, new_host
            
HOST = "127.0.0.1"
PORT = 7713
web_proxy = WebProxy(HOST, PORT)





In [9]:
web_proxy.start()

listening on port 7713...
client ('127.0.0.1', 64601) connected...
b'GET /google.com HTTP/1.1\r\nHost: 127.0.0.1:7713\r\nUser-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate, br\r\nDNT: 1\r\nSec-GPC: 1\r\nConnection: keep-alive\r\nUpgrade-Insecure-Requests: 1\r\nSec-Fetch-Dest: document\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-Site: none\r\nSec-Fetch-User: ?1\r\n\r\n'
data received:
{'Method': 'GET', 'Path': '/google.com', 'Version': 'HTTP/1.1', 'Host': '127.0.0.1:7713', 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8', 'Accept-Language': 'en-US,en;q=0.5', 'Accept-Encoding': 'gzip, deflate, br', 'DNT': '1', 'Sec-GPC': '1', 'Connect

KeyboardInterrupt: 

In [45]:
test_data = web_proxy.data_list[0]
test_parsed_data = HttpParser(test_data)
test_parsed_data.dict

{'Method': 'GET',
 'Path': '/google.com',
 'Version': 'HTTP/1.1',
 'Host': '127.0.0.1:7713',
 'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0',
 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8',
 'Accept-Language': 'en-US,en;q=0.5',
 'Accept-Encoding': 'gzip, deflate, br',
 'DNT': '1',
 'Sec-GPC': '1',
 'Connection': 'keep-alive',
 'Upgrade-Insecure-Requests': '1',
 'Sec-Fetch-Dest': 'document',
 'Sec-Fetch-Mode': 'navigate',
 'Sec-Fetch-Site': 'none',
 'Sec-Fetch-User': '?1'}

In [5]:
import threading
import time
import random

def foo(a, b):
    _ = a + b
    time.sleep(random.randint(1, 5))
    thread_list.remove(threading.current_thread())

thread_list = []
a = 2
b = 2

for i in range(20):
    t = threading.Thread(target=foo, args=[a, b])
    thread_list.append(t)

for t in thread_list:
    t.start()

while thread_list:
    print(len(thread_list))
    time.sleep(0.1)

20
20
20
20
20
20
20
20
20
20
16
16
16
16
16
16
16
16
16
16
12
12
12
12
12
12
12
12
12
7
7
7
7
7
7
7
7
7
7
1
1
1
1
1
1
1
1
1


In [19]:
test_path = "/google.com"

# add a slash to the end if it's missing
if test_path[-1] != "/":
    test_path += "/"

# find the index of the first "/"
split_index = test_path.find('/', 1)

# split the string on the split index
new_host = test_path[:split_index]
new_path = test_path[split_index:]

print(new_host)
print(new_path)

/google.com
/


In [6]:
valid_data_list = [d for d in data_list if d]
valid_data_list

[b'GET /google.com HTTP/1.1\r\nHost: 127.0.0.1:7713\r\nUser-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0\r\nAccept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8\r\nAccept-Language: en-US,en;q=0.5\r\nAccept-Encoding: gzip, deflate, br\r\nDNT: 1\r\nSec-GPC: 1\r\nConnection: keep-alive\r\nUpgrade-Insecure-Requests: 1\r\nSec-Fetch-Dest: document\r\nSec-Fetch-Mode: navigate\r\nSec-Fetch-Site: none\r\nSec-Fetch-User: ?1\r\n\r\n']

In [12]:
test_get = valid_data_list[0]
# http is encoded with iso-8859-1
test_str = test_get.decode("iso-8859-1")

In [13]:
print(test_str)

GET /google.com HTTP/1.1
Host: 127.0.0.1:7713
User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:124.0) Gecko/20100101 Firefox/124.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
DNT: 1
Sec-GPC: 1
Connection: keep-alive
Upgrade-Insecure-Requests: 1
Sec-Fetch-Dest: document
Sec-Fetch-Mode: navigate
Sec-Fetch-Site: none
Sec-Fetch-User: ?1




In [33]:
data = HttpParser(test_get)
data

<__main__.HttpData at 0x7fc2b952e2e0>

['GET', '/google.com', 'HTTP/1.1']