In [1]:
import re
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup

In [2]:
def create_slug(product_name):
    """Build the URL slug for a product name.

    The name is lowercased, dimension separators are normalised, spaces
    become hyphens, and every character outside ``a-z``, ``0-9`` and ``-``
    is dropped.

    Example: ``"8′ x 16′ Mille Lacs"`` -> ``"8-x-16-mille-lacs"``.

    Args:
        product_name: Product name as a string.

    Returns:
        The slug, with hyphen runs collapsed and no leading or trailing
        hyphen.
    """
    # Lowercase, and turn decimal points into hyphens ("6.5" -> "6-5").
    slug = product_name.lower().replace('.', '-')

    # A prime mark followed by "x" (e.g. "8′ x 16") becomes "-x-".
    slug = re.sub(r"′\s*x\s*", '-x-', slug)

    # Plain ASCII apostrophes are dropped outright.
    slug = slug.replace("'", '')

    # A whitespace-delimited "x" is compacted to a bare "x"; any spaces
    # that remain afterwards become hyphens.
    slug = re.sub(r'\s+x\s+', 'x', slug).replace(' ', '-')

    # Final clean-up passes, applied in order: strip anything that is not
    # a lowercase letter, digit or hyphen, then collapse hyphen runs.
    for pattern, replacement in (
        (r'[^a-z0-9\-]', ''),
        (r'-{2,}', '-'),
    ):
        slug = re.sub(pattern, replacement, slug)

    # strip() trims hyphens from both ends of the slug.
    return slug.strip('-')

# Try create_slug() on a product name read from standard input.
# NOTE(review): input() waits for the user, so this cell cannot run
# unattended (e.g. Restart Kernel -> Run All).
product1 = input()

slug1 = create_slug(product1)

print(f"URL 1: https://icecastlefh.com/product/{slug1}")

8′ x 16′ Mille Lacs
URL 1: https://icecastlefh.com/product/8-x-16-mille-lacs


In [3]:
# Product page URL for the slug computed in the slug cell (slug1).
product_url = f"https://icecastlefh.com/product/{slug1}"

# Reset first: if the lookup below fails, later cells must not silently
# pick up a pdf_url left in the kernel by an earlier run of this cell.
pdf_url = None

# Fetch the product page. The timeout keeps the cell from hanging forever
# on a stalled connection; raise_for_status() turns HTTP errors into
# exceptions.
response = requests.get(product_url, timeout=30)
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.text, 'html.parser')

# First <a> whose href contains the uploads path and whose text contains
# "Floor Plan".
pdf_link_tag = soup.find(
    'a',
    href=lambda href: href and '/wp-content/uploads/' in href,
    string=lambda text: text and 'Floor Plan' in text,
)

if pdf_link_tag is not None:
    # The href filter above only matches tags that carry a non-empty href,
    # so no separate attribute check is needed. urljoin() returns absolute
    # URLs unchanged and resolves relative ones against the page URL.
    pdf_url = urljoin(product_url, pdf_link_tag['href'])
    print(f"PDF URL: {pdf_url}")
else:
    print("PDF link not found.")



PDF URL: https://icecastlefh.com/wp-content/uploads/2024/01/8-x-16-Mille-Lacs_2023.pdf


Way 1: OCR the rendered PDF pages (pdf2image + pytesseract)

In [4]:
import requests
import io
from pdf2image import convert_from_bytes
import pytesseract
from PIL import Image

# pdf_url is produced by the floor-plan lookup cell earlier in the notebook.

# Download the PDF into memory. The timeout keeps the cell from hanging
# forever on a stalled connection.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()

# Convert the PDF bytes to images; the result is iterated one page at a time.
pages = convert_from_bytes(response.content)

# Run Tesseract OCR (English) on each page image and print the text.
for i, page in enumerate(pages):
    text = pytesseract.image_to_string(page, lang='eng')
    print(f"--- Page {i + 1} ---")
    print(text)
    print("\n")

--- Page 1 ---
Dealer:
Siding Color:
Decal Color:

S.G. Color:
Customer:
Order Date:

Tank Light
y

68" Jack Knife Sofa
&
30” Upper Bunk

110V >

Tracker Cabinet

Production #

Folding Door

&

30” Upper Bunk

Bay Window

48” Dinette

Frame Options
MFG:

Galvanized

No:[_]

Yes:[]

doyia}uno> papunoy

syauiqe> addy

A140} 83 M

700g esnoH Usi4

8 X 16V Mille Lacs
~ 2023

Single Axle Frame

Hand Crank Winch System
6 Holes with Lights

Cedar Interior

Flat Ceiling

Carpet

A/C

LED Package

Wire & Brace for TV
Digital TV Antenna

Wire for Portable Satellite
Stereo w/4 speakers

12V Recepts

110V Recepts

Bath w/toilet seat

Tracker Cabinets

Rounded wheel well countertop
3 Burner cook-top w/oven - Glass Door
Range hood w/vent

Small stainless sink w/cover

Flat panel doors w/New Hinges
Camo cushions & curtains
35,000 BTU Furnace

2 ~ 30” Bunks

48” Dinette

68” Jack-knife sofa

Bay Window w/Cushion

New Countertops






Way 2: Extract the embedded text with PyPDF2

In [4]:
import requests
import PyPDF2
import io

In [5]:
# Download the PDF into memory. The timeout keeps the cell from hanging
# forever on a stalled connection.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()

# Wrap the downloaded bytes in a file-like object so PyPDF2 can read the
# PDF without writing it to disk, then print each page's extracted text.
with io.BytesIO(response.content) as pdf_file:
    reader = PyPDF2.PdfReader(pdf_file)
    for page in reader.pages:
        text = page.extract_text()
        print(text)  # Print or process the text as needed

6.5 X 10V Grandpa’s 
Hideout ~ 2023
Production #Single Axle FrameHand Crank Winch System4 HolesFactory Select PanelingCedar TrimFlat CeilingCarpet
LED Package
12V Recept110V Recepts
6 ‘ Upper cabinet
Flat Panel Doors w/New Hinges20,000 BTU Furnace68” Jack-knife sofaNew CountertopsDealer:
Siding Color:Decal Color:S.G. Color:Customer:Order Date:
Fish House Door
68” Jack Knife Sofa
30 X 1430 X 42 Egress14 X 27V12VV110V
6’ Upper CabinetConverter
20,000 BTU
Furnace
V
V
EXT 110VTank Light
V 12v Switches


Way 3: Extract text with pdfplumber and split it into LangChain documents

In [24]:
import requests
import io
import pdfplumber
from langchain.schema import Document
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Download the PDF into memory. The timeout keeps the cell from hanging
# forever on a stalled connection.
response = requests.get(pdf_url, timeout=30)
response.raise_for_status()

# Extract text with pdfplumber straight from the in-memory bytes.
# Each page's text is followed by a blank line. `or ""` guards against
# extract_text() returning None for a page without extractable text
# (NOTE(review): depends on the installed pdfplumber version), which
# would otherwise raise TypeError on concatenation.
with pdfplumber.open(io.BytesIO(response.content)) as pdf:
    full_text = "".join(
        (page.extract_text() or "") + "\n\n" for page in pdf.pages
    )

# Create a LangChain Document object
document = Document(page_content=full_text)

# Split the text into overlapping chunks for downstream processing.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
docs = text_splitter.split_documents([document])

# Print the processed documents
for doc in docs:
    print(doc.page_content)


Dealer:
6.5 X 10V Grandpa’s
Siding Color:
Decal Color: Hideout ~ 2023
S.G. Color:
Single Axle Frame
Customer:
Hand Crank Winch System
Order Date:
4 Holes
Factory Select Paneling
Fi
s Cedar Trim
h
H Flat Ceiling
o
u Carpet
s
e
D
o
LED Package
o
r
12V Recept
110V Recepts
6 ‘ Upper cabinet
Flat Panel Doors w/New Hinges
20,000 BTU Furnace
68” Jack-knife sofa
New Countertops
68” Jack Knife Sofa
Production #
30 X 14
ssergE
24
X
03
V
14
X
27
12V
V
Tank Light
V
EXT 110V
V
Converter
20,000 BTU
Furnace
110V
6’ Upper Cabinet
V 12v Switches
