SSH - ssh -i ~/.ssh/loan-parser-pem.pem ubuntu@3.16.113.93
Local Server - python -m http.server 8001


# EC2 Setup - (Update System)
- sudo apt update
- sudo apt upgrade -y
- sudo add-apt-repository ppa:deadsnakes/ppa -y :  Personal Package Archive -> Sometimes Ubuntu doesn't ship the Python version we need; this PPA gives access to newer Python versions
```bash
# Add deadsnakes PPA (provides Python 3.12)
sudo add-apt-repository ppa:deadsnakes/ppa -y

# Update package list
sudo apt update

# Install Python 3.12
sudo apt install python3.12 python3.12-venv python3.12-dev -y

# Verify
python3.12 --version
```

- Install Poetry:
```bash
# Install Poetry
curl -sSL https://install.python-poetry.org | python3.12 -

# Add to PATH
echo 'export PATH="/home/ubuntu/.local/bin:$PATH"' >> ~/.bashrc
source ~/.bashrc

# Verify
poetry --version
```

- Install Git: sudo apt install git -y
- Screen (For running processes in background): sudo apt install screen -y
- HTOP(monitor system resources): sudo apt install htop -y
- Copy files via SSH:
```bash
# On your local machine (new terminal)
# Create tar of your project (excluding venv, __pycache__)
cd /path/to/legal-contract-parser
tar -czf legal-contract-parser.tar.gz \
    --exclude='.venv' \
    --exclude='__pycache__' \
    --exclude='*.pyc' \
    --exclude='.git' \
    .

# Upload to EC2
scp -i ~/Downloads/my-key.pem legal-contract-parser.tar.gz ubuntu@YOUR-EC2-IP:~

# Back on EC2 (run `mkdir -p legal-contract-parser` first: tar -C fails if the target directory does not exist)
cd ~
tar -xzf legal-contract-parser.tar.gz -C legal-contract-parser
cd legal-contract-parser
```

Tmux:
```bash
# Create session for FastAPI
tmux new -s fastapi
# Inside: run your FastAPI server
cd ~/legal-contract-parser
poetry run uvicorn src.api.main:app --host 0.0.0.0 --port 8000

# Detach: Ctrl+B, then D

# Create session for Streamlit
tmux new -s streamlit
# Inside: run Streamlit
cd ~/legal-contract-parser
poetry run streamlit run streamlit_app.py --server.port 8501 --server.address 0.0.0.0

# Detach: Ctrl+B, then D

# List sessions
tmux ls

# Reattach to see logs
tmux attach -t fastapi
tmux attach -t streamlit

# Kill a session
tmux kill-session -t fastapi

#Cool tmux Features:

#Split screen horizontally:
Ctrl+B, then "

#Split screen vertically:
Ctrl+B, then %

#Switch between panes:
Ctrl+B, then arrow keys
```

In [6]:
from pathlib import Path
from random import sample

from src.parsers.pdf_parser import parse_pdf, PDFParser
from src.utils.dataset import CUADDataset

In [2]:
import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s"
)

In [3]:
# create dataset instance
cuad = CUADDataset()

sample_pdf = cuad.pdf_dir / cuad.get_sample_contracts(n=1)[0]

print(sample_pdf)

/Users/angadb/Documents/Angad - Personal/Projects/Generative AI/CUAD_v1/full_contract_pdf/CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf


In [7]:
# instantiate the parser
parser = PDFParser(extract_tables=True)

In [9]:
parser_doc = parser.parse(sample_pdf)

2025-11-25 15:03:45,878 - src.parsers.pdf_parser - INFO - Parsing PDF: CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf
2025-11-25 15:03:46,292 - src.parsers.pdf_parser - INFO - Successfully parsed CybergyHoldingsInc_20140520_10-Q_EX-10.27_8605784_EX-10.27_Affiliate Agreement.pdf: 16 pages, 16 tables


In [12]:
print(parser_doc.num_pages)
# print(parser_doc.tables)
print(parser_doc.metadata)

16
{'title': '', 'author': '', 'subject': '', 'creator': 'Aspose Ltd.', 'producer': 'Aspose.Pdf for .NET 17.6', 'creation_date': '', 'modification_date': "D:20200127152739-05'00'"}


In [13]:
text_only = parser.parse_text_only(sample_pdf)

In [16]:
text_only[:500]

'Exhibit 10.27\nMARKETING AFFILIATE AGREEMENT\nBetween:\nBirch First Global Investments Inc.\nAnd\nMount Knowledge Holdings Inc.\nDated: May 8, 2014\n1\nSource: CYBERGY HOLDINGS, INC., 10-Q, 5/20/2014\n\n\n1.\n2.\n2.1\n2.2\nThis\xa0Marketing\xa0Affiliate\xa0Agreement\xa0(the\xa0“Agreement”)\xa0is\xa0entered\xa0into this 8th day of May\n2014, by and between BIRCH FIRST GLOBAL INVESTMENTS INC., a corporation incorporated\nin the U.S. Virgin Islands, with its main place of business located 9100 Havensight, Port of Sale, Ste.\n15/16,\xa0St.\xa0Tho'

In [17]:
# Test smaller pieces/class functions manually

metadata = parser._extract_metadata(sample_pdf)
pages = parser._parse_pages(sample_pdf)

print(metadata)
print(len(pages))
print(pages[0].text[:200])


{'title': '', 'author': '', 'subject': '', 'creator': 'Aspose Ltd.', 'producer': 'Aspose.Pdf for .NET 17.6', 'creation_date': '', 'modification_date': "D:20200127152739-05'00'"}
16
Exhibit 10.27
MARKETING AFFILIATE AGREEMENT
Between:
Birch First Global Investments Inc.
And
Mount Knowledge Holdings Inc.
Dated: May 8, 2014
1
Source: CYBERGY HOLDINGS, INC., 10-Q, 5/20/2014

