In [156]:
import pytesseract
import cv2
import numpy as np
import requests

In [157]:
def load_image_from_url(image_url, timeout=10):
    """Download an image from a URL and decode it into an OpenCV image.

    Args:
        image_url: URL of the image to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests.get`` may wait on a stalled server forever.

    Returns:
        The image decoded by ``cv2.imdecode`` with ``cv2.IMREAD_COLOR``, or
        ``None`` if the download or the decoding failed. Failures are printed
        rather than raised.
    """
    try:
        # The full body is needed for decoding, so a plain (non-streaming)
        # request is sufficient; the timeout keeps a dead server from
        # blocking the kernel.
        response = requests.get(image_url, timeout=timeout)
        response.raise_for_status()  # Turn HTTP 4xx/5xx into an exception

        # View the downloaded bytes as a flat uint8 buffer (no extra copy)
        image_array = np.frombuffer(response.content, dtype=np.uint8)
        if image_array.size == 0:
            raise ValueError("Downloaded image is empty")

        # imdecode signals an unsupported/corrupt payload by returning None
        image = cv2.imdecode(image_array, cv2.IMREAD_COLOR)
        if image is None:
            raise ValueError("Failed to decode the image")
        return image

    except Exception as e:
        # Best-effort loader: report the problem and signal failure with None
        print(f"Error opening image: {e}")
        return None

In [158]:
def image_segmentation(image, min_height=10, min_width=50):
    """Detect text regions in an image, OCR each one and outline it.

    Pipeline: grayscale -> Gaussian blur -> inverted Otsu threshold ->
    dilation (merges nearby glyphs into blobs) -> external contours ->
    Tesseract OCR on every bounding box that passes the size filter.

    Args:
        image: Colour image in OpenCV's BGR layout (it is converted with
            ``cv2.COLOR_BGR2GRAY``). The input array is left unmodified.
        min_height: A bounding box must be taller than this many pixels to
            be OCR'd.
        min_width: A bounding box must be wider than this many pixels to be
            OCR'd.

    Returns:
        A tuple ``(annotated, results)``: ``annotated`` is a copy of
        ``image`` with a rectangle drawn around every accepted region, and
        ``results`` is the list of OCR text lines, with regions visited in
        order of their left edge.
    """
    # Convert to grayscale and smooth so Otsu picks a stable threshold
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (11, 11), 0)

    # Inverted binarization: the thresholded foreground becomes white
    _, thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)

    # Dilate with a rectangular kernel to connect neighbouring text
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 9))
    dilate = cv2.dilate(thresh, kernel, iterations=5)

    # One bounding box per outer contour, ordered left to right
    cnts, _ = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
    boxes = sorted((cv2.boundingRect(c) for c in cnts), key=lambda box: box[0])

    # Draw on a copy: ROIs are cropped from the pristine input, so boxes drawn
    # for earlier regions never leak into later crops, and the caller's image
    # stays clean for re-runs.
    annotated = image.copy()
    results = []

    for x, y, w, h in boxes:
        if h > min_height and w > min_width:
            roi = image[y:y + h, x:x + w]

            ocr_result = pytesseract.image_to_string(roi)
            results.extend(ocr_result.split("\n"))

            cv2.rectangle(annotated, (x, y), (x + w, y + h), (36, 255, 12), 2)

    return annotated, results


In [159]:
# NOTE(review): the `e=` query parameter looks like an expiry timestamp, so this
# signed URL may stop working over time — confirm, and swap in a stable image if so.
url = "https://media.licdn.com/dms/image/v2/D5622AQFDypGK7vZzpw/feedshare-shrink_800/feedshare-shrink_800/0/1729678450155?e=1732752000&v=beta&t=0TUW9YEO_sKRUew98QHR04eswM132heVgEE3E3q1moI"
image = load_image_from_url(url)

# load_image_from_url signals failure by returning None; stop here with a clear
# message instead of failing later inside OpenCV.
if image is None:
    raise RuntimeError(f"Could not load image from {url}")

In [160]:
# Run text-region detection and OCR on the loaded image; this yields the image
# with the detected regions outlined and the list of recognised text lines.
processed_image, text_results = image_segmentation(image)



In [161]:
# Show the annotated image in a native OpenCV window (not inline in the notebook).
# NOTE(review): this needs a GUI-capable environment; on a headless or remote
# kernel it will likely fail — confirm where this notebook is meant to run.
cv2.imshow("Processed Image", processed_image)
cv2.waitKey(0)  # Delay 0 = wait indefinitely for a key press in the window
cv2.destroyAllWindows()  # Close the window once a key has been pressed

In [162]:
# Print every OCR result on its own row; empty strings in the list (produced by
# splitting the OCR text on "\n") show up as blank lines in the output.
for line in text_results:
    print(line)

¢ 0-2 Years Experience
¢ Mumbai, Maharashtra
¢ Fulltime

Apply Now
jobs.abekus.ai


New Job Alert

° 8LPA
=2 Years Experience.
¢ Mumbai, Maharashtra suet

¢ Fulltime
4

Company

SMEST Capital Pvt. Ltd.



