In [3]:
pip install opencv-python easyocr pandas matplotlib numpy

Collecting easyocr
  Downloading easyocr-1.7.2-py3-none-any.whl.metadata (10 kB)
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.6.7-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.9 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (8.6 kB)
Collecting ninja (from easyocr)
  Downloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (5.1 kB)
Downloading easyocr-1.7.2-py3-none-any.whl (2.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m65.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ninja-1.13.0-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (180 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m180.7/180.7 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyclipper-1.4.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (978 kB)
[2K   

In [4]:
!pip install pdf2image poppler-utils

Collecting pdf2image
  Downloading pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Collecting poppler-utils
  Downloading poppler_utils-0.1.0-py3-none-any.whl.metadata (883 bytes)
Downloading pdf2image-1.17.0-py3-none-any.whl (11 kB)
Downloading poppler_utils-0.1.0-py3-none-any.whl (9.2 kB)
Installing collected packages: poppler-utils, pdf2image
Successfully installed pdf2image-1.17.0 poppler-utils-0.1.0


In [5]:
!apt-get install poppler-utils

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
  poppler-utils
0 upgraded, 1 newly installed, 0 to remove and 2 not upgraded.
Need to get 186 kB of archives.
After this operation, 697 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 poppler-utils amd64 22.02.0-2ubuntu0.12 [186 kB]
Fetched 186 kB in 0s (961 kB/s)
Selecting previously unselected package poppler-utils.
(Reading database ... 117540 files and directories currently installed.)
Preparing to unpack .../poppler-utils_22.02.0-2ubuntu0.12_amd64.deb ...
Unpacking poppler-utils (22.02.0-2ubuntu0.12) ...
Setting up poppler-utils (22.02.0-2ubuntu0.12) ...
Processing triggers for man-db (2.10.2-1) ...


In [6]:
import cv2
import re
import easyocr
import numpy as np
import pandas as pd
from pdf2image import convert_from_path

In [7]:
# --- Render the PDF, OCR each page once, classify the document --------------
pdf_path = "/content/UCS-test.pdf"
pages = convert_from_path(pdf_path)

# Persist every rendered page as a PNG so OpenCV can load it back by path.
page_images = []
for i, p in enumerate(pages):
    fname = f"/content/page_{i}.png"
    p.save(fname, "PNG")
    page_images.append(fname)

reader = easyocr.Reader(["en"], gpu=False)

# OCR is the slowest step (CPU-only reader), so run it exactly once per page
# and reuse the recognised fragments for both the document-level and the
# page-level keyword checks below.
page_fragments = [reader.readtext(cv2.imread(path), detail=0) for path in page_images]

# Flat list of every recognised fragment, in page order.
all_text = [fragment for fragments in page_fragments for fragment in fragments]
full_text = " ".join(all_text).lower()

# A document is treated as a lab report if any of these phrases appears
# anywhere in it; otherwise it is treated as a borehole log.
is_lab = any(k in full_text for k in ["sample data", "diameter", "axial deformation", "corrected area"])
print("Document type:", "LAB UCS REPORT" if is_lab else "BOREHOLE UCS LOG")

# Indices of pages whose own text mentions a stress/strain related keyword.
ucs_pages = []
for i, fragments in enumerate(page_fragments):
    t = " ".join(fragments).lower()
    if any(k in t for k in ["stress", "strain", "compressive", "ucs"]):
        ucs_pages.append(i)
def extract_curve(img_path, max_stress=0.188, max_strain=15):
    """Estimate peak stress and the strain at that peak from a plotted curve image.

    The image is edge-detected (Gaussian blur + Canny), the contour with the
    most points is taken as the curve, and its pixel coordinates are scaled
    linearly so that the full image width maps to ``max_strain`` and the full
    image height maps to ``max_stress``.

    NOTE(review): the scaling uses the whole image as the plot area and the
    longest contour as the curve; if the page contains axis frames, margins or
    other long outlines the result may not correspond to the real curve --
    confirm against a known sample.

    Parameters
    ----------
    img_path : str
        Path of the page image to analyse.
    max_stress : float, optional
        Stress value assigned to the top edge of the image. The returned
        strength is this scale multiplied by 1000.
    max_strain : float, optional
        Strain value assigned to the right edge of the image.

    Returns
    -------
    tuple
        ``(UCS, fs)`` -- peak scaled stress times 1000 and the strain at the
        first point where that peak occurs -- or ``(None, None)`` when the
        image cannot be read or no contour is found.
    """
    img = cv2.imread(img_path)
    # cv2.imread signals a missing/unreadable file by returning None rather
    # than raising; bail out instead of crashing inside cvtColor.
    if img is None:
        return None, None

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    edges = cv2.Canny(blur, 50, 150)
    cnts, _ = cv2.findContours(edges, cv2.RETR_LIST, cv2.CHAIN_APPROX_NONE)

    if not cnts:
        return None, None

    # CHAIN_APPROX_NONE keeps every boundary pixel, so "most points" means
    # the longest traced outline.
    curve = max(cnts, key=len)
    pts = curve.reshape(-1, 2)  # rows of (x, y) pixel coordinates

    h, w = gray.shape

    # Vectorised pixel -> axis mapping; image y grows downwards, hence (h - y).
    strain = (pts[:, 0] / w) * max_strain
    stress = ((h - pts[:, 1]) / h) * max_stress

    # argmax returns the first index of the maximum, matching a first-hit
    # lookup of the peak.
    peak = int(np.argmax(stress))
    UCS = stress[peak] * 1000
    fs = strain[peak]
    return UCS, fs
# --- Build the results table for the detected document type ------------------
if is_lab:
    # Prefer the strength stated in the report text ("qu = 188 kPa").
    # Decimals are accepted so a value such as "qu = 188.5 kpa" is not skipped.
    ucs_match = re.search(r"qu\s*=\s*(\d+\.?\d*)\s*kpa", full_text)

    # Default failure strain reported when it cannot be read from a curve.
    # NOTE(review): 15 is a fixed placeholder, not a value parsed from the
    # document -- confirm this is the intended convention.
    fs = 15

    if ucs_match:
        UCS = float(ucs_match.group(1))
    elif ucs_pages:
        # No stated value: fall back to the first stress/strain page and use
        # both numbers the curve extraction provides.
        UCS, curve_fs = extract_curve(page_images[ucs_pages[0]])
        if curve_fs is not None:
            fs = curve_fs
    else:
        # Neither a stated value nor a candidate curve page exists.
        UCS = None

    wc = re.search(r"(w\s*/?\s*c|water\s*content)[^0-9]{0,20}(\d+\.?\d*)\s*%", full_text)
    water = float(wc.group(2)) if wc else None
    dd = re.search(r"(\d+\.?\d*)\s?g/cc", full_text)
    density = float(dd.group(1)) if dd else None

    final = pd.DataFrame([[UCS, fs, water, density]],
                         columns=["UCS (kPa)", "Failure Strain (%)", "Water Content (%)", "Dry Density (g/cc)"])
else:
    depth_pages = []
    depth_map = {}
    for i, p in enumerate(page_images):
        # One OCR pass per page: the detailed result (bbox, text, confidence)
        # serves both the page-level keyword test and the per-fragment search.
        detections = reader.readtext(cv2.imread(p))
        t = " ".join(txt for _, txt, _ in detections).lower()
        if "depth" not in t and "borehole" not in t:
            continue
        depth_pages.append(i)

        for _, txt, _ in detections:
            m = re.search(r"(depth|bh)[^0-9]{0,10}(\d+\.?\d*)\s?m", txt.lower())
            if m:
                # A later match on the same page overwrites an earlier one.
                depth_map[i] = float(m.group(2))

    rows = []
    for i in ucs_pages:
        # Attribute each curve to the closest depth found on this page or on
        # an earlier one; None when no depth precedes it.
        prev = [page for page in depth_map if page <= i]
        depth = depth_map[max(prev)] if prev else None
        UCS, fs = extract_curve(page_images[i])
        rows.append([depth, UCS, fs])

    final = pd.DataFrame(rows, columns=["Depth (m)", "UCS (kPa)", "Failure Strain (%)"])

final.to_csv("/content/ucs_output.csv", index=False)
print(final)








Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |--------------------------------------------------| 0.0% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.1% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.2% CompleteProgress: |--------------------------------------------------| 0.3% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.4% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.5% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.6% CompleteProgress: |--------------------------------------------------| 0.7% Complet



Document type: LAB UCS REPORT
   UCS (kPa)  Failure Strain (%)  Water Content (%)  Dry Density (g/cc)
0      188.0                  15              19.16               1.589
