## Google API

In [None]:
import pathlib
import textwrap
from dotenv import load_dotenv
import os

# Load variables from a local .env file into the process environment so the
# os.getenv() lookup in the next cell can find the API key.
load_dotenv()

In [19]:
import google.generativeai as genai
from IPython.display import display
from IPython.display import Markdown

def to_markdown(text):
  """Render ``text`` as a Markdown block quote.

  Bullet characters ('•') are rewritten as indented Markdown list markers and
  every line, blank ones included, is prefixed with '> '.
  """
  bulleted = text.replace('•', '  *')
  quoted = textwrap.indent(bulleted, '> ', predicate=lambda _line: True)
  return Markdown(quoted)

# Read the API key from the environment (None when the variable is unset) and
# hand it to the generative-AI client.
# NOTE(review): no other visible cell calls `genai` - confirm it is still needed.
GOOGLE_API_KEY = os.environ.get('GOOGLE_API_KEY')

genai.configure(api_key=GOOGLE_API_KEY)

![Street neon sign sample image](./Street_1330_2000x.jpg)

In [20]:
from typing import Sequence

from google.cloud import vision


def analyze_image_from_uri(
    image_uri: str,
    feature_types: Sequence,
) -> vision.AnnotateImageResponse:
    """Annotate a remotely hosted image with the Cloud Vision client.

    Args:
        image_uri: URI of the image to analyse.
        feature_types: Feature types to request; one ``vision.Feature`` is
            built per entry.

    Returns:
        The response object returned by ``annotate_image``, unmodified.
    """
    vision_client = vision.ImageAnnotatorClient()

    remote_image = vision.Image()
    remote_image.source.image_uri = image_uri

    requested_features = []
    for kind in feature_types:
        requested_features.append(vision.Feature(type_=kind))

    annotate_request = vision.AnnotateImageRequest(
        image=remote_image,
        features=requested_features,
    )

    # NOTE(review): per-image failures are presumably reported on the response
    # (response.error) rather than raised - confirm and check it in callers.
    return vision_client.annotate_image(request=annotate_request)


def print_labels(response: vision.AnnotateImageResponse):
    """Print a separator rule, then one ``score | description`` row per label annotation."""
    print("=" * 80)
    for annotation in response.label_annotations:
        score_cell = f"{annotation.score:4.0%}"
        description_cell = f"{annotation.description:5}"
        print(" | ".join((score_cell, description_cell)))
        

In [None]:
# Run label detection on the remote sample image and print the scored labels.
image_uri = "https://www.neoncreations.co.uk/cdn/shop/products/Street_1330_2000x.jpg"
features = [vision.Feature.Type.LABEL_DETECTION]

response = analyze_image_from_uri(image_uri, features)
print_labels(response)

In [5]:
def print_text(response: vision.AnnotateImageResponse):
    """Print a separator rule, then each text annotation with its bounding-box vertices.

    Every row shows the repr of the annotation text (padded to 42 columns)
    followed by the comma-joined ``(x,y)`` vertices of its bounding polygon.
    """
    print("=" * 80)
    for annotation in response.text_annotations:
        corners = ",".join(
            f"({vertex.x},{vertex.y})" for vertex in annotation.bounding_poly.vertices
        )
        print(f"{annotation.description!r:42} | {corners}")
        

In [None]:
# Run text detection on the street-sign sample image and print every annotation.
# `response` from this cell is what the language-detection and translation
# cells below read.
image_uri = "https://static.easycanvasprints.com/Upload/mkt/PLA/ECP/BAS_SEM_20170824_MetalStretSigns_2Up_Green_Texthere.jpg"
features = [vision.Feature.Type.TEXT_DETECTION]

response = analyze_image_from_uri(image_uri, features)
print_text(response)

In [7]:
def detect_language(text: str) -> dict:
    """Detect the language of ``text`` with the Cloud Translation (v2) client.

    Prints the input followed by the reported confidence and language, then
    returns the detection result as received from the client.
    """
    from google.cloud import translate_v2 as translate

    # A sequence of strings may also be passed, in which case the client
    # returns one result per string.
    detection = translate.Client().detect_language(text)

    print(f"Text: {text}")
    print(f"Confidence: {detection['confidence']}")
    print(f"Language: {detection['language']}")

    return detection

In [None]:
# Detect the language of every annotation except the first one.
# NOTE(review): the first annotation presumably holds the full detected text
# block, which is why it is skipped - confirm.
[detect_language(annotation.description) for annotation in list(response.text_annotations)[1:]]

In [9]:
def translate_text(target: str, text: str) -> dict:
    """Translate ``text`` into the target language with the Cloud Translation (v2) client.

    Args:
        target: ISO 639-1 code of the language to translate into. See
            https://g.co/cloud/translate/v2/translate-reference#supported_languages
        text: Text to translate. ``bytes`` input is decoded as UTF-8 first.

    Returns:
        The result dict from the client. The keys ``input``,
        ``translatedText`` and ``detectedSourceLanguage`` are printed.
    """
    from google.cloud import translate_v2 as translate

    translate_client = translate.Client()

    if isinstance(text, bytes):
        text = text.decode("utf-8")

    # The input here is plain OCR text, not HTML. Requesting the "text" format
    # keeps characters such as apostrophes and ampersands from coming back as
    # HTML entities (e.g. "&#39;") in the translated string.
    # A sequence of strings may also be passed, in which case the client
    # returns one result per string.
    result = translate_client.translate(text, target_language=target, format_="text")

    print("Text: {}".format(result["input"]))
    print("Translation: {}".format(result["translatedText"]))
    print("Detected source language: {}".format(result["detectedSourceLanguage"]))

    return result


In [None]:
# Translate every annotation except the first one into Korean.
# NOTE(review): this rebinds `response` from the Vision response to a list of
# translation dicts, so re-running this cell (or the language-detection cell)
# without re-running text detection first will fail.
response = [translate_text("ko", annotation.description) for annotation in list(response.text_annotations)[1:]]

In [None]:
# Show just the translated strings from the translation results.
print([entry['translatedText'] for entry in response])

In [19]:
# Keep only the translated strings for writing to disk below.
translated_text = [entry['translatedText'] for entry in response]

In [None]:
# Display the collected translations as the cell output.
translated_text

In [26]:
# Write one translation per line. The encoding is pinned to UTF-8 because the
# translations are Korean and the platform default encoding may not be able to
# represent them.
with open('image_translated.txt', 'w', encoding='utf-8') as f:
    f.writelines(f"{line}\n" for line in translated_text)

## 로컬

In [55]:
from PIL import Image, ImageFilter, ImageEnhance
import pytesseract
import numpy as np
import cv2
from typing import List, Tuple
import torch
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Object class names, 80 entries.
# NOTE(review): no visible cell reads CLASSES; detections are labelled through
# model.names instead. Several spellings here ("motorbike", "aeroplane", "sofa",
# "pottedplant", "diningtable", "tvmonitor") presumably differ from the names
# the YOLOv5 hub model reports - confirm before comparing against them.
CLASSES = ["person", "bicycle", "car", "motorbike", "aeroplane", "bus",
           "train", "truck", "boat", "traffic light", "fire hydrant",
           "stop sign", "parking meter", "bench", "bird", "cat", "dog",
           "horse", "sheep", "cow", "elephant", "bear", "zebra",
           "giraffe", "backpack", "umbrella", "handbag", "tie",
           "suitcase", "frisbee", "skis", "snowboard", "sports ball",
           "kite", "baseball bat", "baseball glove", "skateboard",
           "surfboard", "tennis racket", "bottle", "wine glass", "cup",
           "fork", "knife", "spoon", "bowl", "banana", "apple",
           "sandwich", "orange", "broccoli", "carrot", "hot dog",
           "pizza", "donut", "cake", "chair", "sofa", "pottedplant",
           "bed", "diningtable", "toilet", "tvmonitor", "laptop",
           "mouse", "remote", "keyboard", "cell phone", "microwave",
           "oven", "toaster", "sink", "refrigerator", "book", "clock",
           "vase", "scissors", "teddy bear", "hair drier", "toothbrush"]

# Model cache: maps a model-size string to the loaded YOLOv5 model
# (filled by get_yolov5_model).
model_cache = {}

In [56]:
def get_yolov5_model(model_size: str='yolov5x'):
    """
    Return the YOLOv5 model of the requested size, loading it at most once.

    Args:
        model_size (str): Name of the YOLOv5 variant to load via torch.hub.

    Returns:
        The loaded model; repeated calls with the same size return the
        instance stored in ``model_cache``.
    """
    try:
        return model_cache[model_size]
    except KeyError:
        # First request for this size: load it and remember it.
        loaded = torch.hub.load('ultralytics/yolov5', model_size, pretrained=True)
        model_cache[model_size] = loaded
        return loaded


In [57]:
def upscale_image(image_path: str, scale: float=1.5) -> np.ndarray:
    """
    Read an image from disk and enlarge it by ``scale`` using bicubic interpolation.

    Args:
        image_path (str): Local path of the image file.
        scale (float): Factor applied to both width and height.

    Returns:
        np.ndarray: The resized image as read by ``cv2.imread``.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread signals failure by returning None instead of raising; fail
    # here with a clear message rather than inside cv2.resize.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path!r}")
    return cv2.resize(image, None, fx=scale, fy=scale, interpolation=cv2.INTER_CUBIC)


In [58]:
def enhance_contrast(image: np.ndarray, clip_limit: float=2.0, tile_grid_size: Tuple[int, int]=(8, 8)) -> np.ndarray:
    """
    Boost local contrast by applying CLAHE to the lightness channel.

    The image is converted from BGR to LAB, CLAHE is applied to the L channel
    only, and the result is converted back to BGR.

    Args:
        image (np.ndarray): Input image; it is converted with COLOR_BGR2LAB,
            so a 3-channel BGR array is expected.
        clip_limit (float): CLAHE clip limit (defaults to 2.0).
        tile_grid_size (Tuple[int, int]): CLAHE tile grid size (defaults to (8, 8)).

    Returns:
        np.ndarray: Contrast-enhanced image in BGR order.
    """
    lightness, chroma_a, chroma_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_BGR2LAB))
    clahe = cv2.createCLAHE(clipLimit=clip_limit, tileGridSize=tile_grid_size)
    # Only the lightness channel is equalised; the colour channels pass through.
    equalized_lab = cv2.merge((clahe.apply(lightness), chroma_a, chroma_b))
    return cv2.cvtColor(equalized_lab, cv2.COLOR_LAB2BGR)


In [59]:
def remove_noise(image: np.ndarray) -> np.ndarray:
    """
    Denoise an image with a median blur.

    Args:
        image (np.ndarray): Input image.

    Returns:
        np.ndarray: Result of ``cv2.medianBlur`` with an aperture size of 3.
    """
    return cv2.medianBlur(image, 3)


In [60]:
def preprocess_image_full(image_path: str, scale: float=1.5) -> np.ndarray:
    """
    Run the full preprocessing chain: upscale, enhance contrast, then denoise.

    Args:
        image_path (str): Local path of the image file.
        scale (float): Upscaling factor passed to ``upscale_image``.

    Returns:
        np.ndarray: The preprocessed image.
    """
    upscaled = upscale_image(image_path, scale)
    contrasted = enhance_contrast(upscaled)
    return remove_noise(contrasted)


In [76]:
def analyze_image_labels_yolov5_optimized(image_path: str, model_size: str='yolov5x', confidence_threshold: float=0.3, iou_threshold: float=0.4) -> List[Tuple[str, float]]:
    """
    Detect objects in an image with YOLOv5 and return one averaged score per label.

    Args:
        image_path (str): Local path of the image file.
        model_size (str): YOLOv5 variant to use.
        confidence_threshold (float): Confidence threshold applied to detections.
        iou_threshold (float): IoU threshold used for non-maximum suppression.

    Returns:
        List[Tuple[str, float]]: ``(label, mean confidence)`` pairs, one per
        distinct label, sorted by mean confidence in descending order.
    """
    model = get_yolov5_model(model_size)

    # Apply the requested thresholds for this call only. The model instance is
    # shared through the cache, so the previous settings are restored afterwards.
    previous_thresholds = (model.conf, model.iou)
    model.conf, model.iou = confidence_threshold, iou_threshold
    try:
        results = model(image_path)
    finally:
        model.conf, model.iou = previous_thresholds

    # Group the confidences of all detections that share a label.
    confidences_by_label = {}
    for *_box, conf, cls in results.xyxy[0]:
        confidences_by_label.setdefault(model.names[int(cls)], []).append(conf.item())

    # Collapse duplicates to their mean confidence, highest first.
    final_labels = [
        (label, sum(confs) / len(confs))
        for label, confs in confidences_by_label.items()
    ]
    final_labels.sort(key=lambda pair: pair[1], reverse=True)

    return final_labels

In [62]:
def filter_specific_classes(labels: List[Tuple[str, float]], specific_classes: List[str]) -> List[Tuple[str, float]]:
    """
    Keep only the entries whose label is one of the requested classes.

    Args:
        labels (List[Tuple[str, float]]): ``(label, confidence)`` pairs.
        specific_classes (List[str]): Class names to keep.

    Returns:
        List[Tuple[str, float]]: The matching pairs, in their original order.
    """
    kept = []
    for entry in labels:
        class_name = entry[0]
        if class_name in specific_classes:
            kept.append(entry)
    return kept

In [69]:
# Readers already built, keyed by their language tuple, so repeated OCR calls
# do not reload the models each time.
_easyocr_readers = {}


def analyze_image_easyocr(image_path: str, languages: Tuple[str, ...]=('en',)) -> str:
    """
    Extract text from an image with EasyOCR.

    Args:
        image_path (str): Local path of the image file.
        languages (Tuple[str, ...]): Language codes handed to ``easyocr.Reader``
            (defaults to English only).

    Returns:
        str: The recognised paragraphs joined with newlines.
    """
    import easyocr

    cache_key = tuple(languages)
    if cache_key not in _easyocr_readers:
        _easyocr_readers[cache_key] = easyocr.Reader(list(cache_key))
    reader = _easyocr_readers[cache_key]

    # detail=0 makes readtext return plain strings; paragraph=True groups them.
    paragraphs = reader.readtext(image_path, detail=0, paragraph=True)
    return '\n'.join(paragraphs)


In [64]:
def print_labels(labels: List[Tuple[str, float]], max_labels: int=10):
    """
    Print at most ``max_labels`` predicted labels between two separator rules.

    Args:
        labels (List[Tuple[str, float]]): ``(label, confidence)`` pairs.
        max_labels (int): Maximum number of rows to print.
    """
    rule = "=" * 80
    # Slicing copes with lists shorter than max_labels.
    rows = [f"{score*100:5.1f}% | {name:20}" for name, score in labels[:max_labels]]
    print(rule, *rows, rule, sep="\n")
        

In [65]:
def print_text(text: str):
    """
    Print the extracted text framed by two separator rules.

    Args:
        text (str): Extracted text.
    """
    rule = "=" * 80
    print(rule, text, rule, sep="\n")

In [66]:
def visualize_labels(image_path: str, labels: List[Tuple[str, float]], confidence_threshold: float=0.3):
    """
    Run YOLOv5 on the image and display it with the detected boxes and labels drawn on.

    Args:
        image_path (str): Local path of the image file.
        labels (List[Tuple[str, float]]): ``(label, confidence)`` pairs. Not read
            here - detection is re-run on the image; kept for interface
            compatibility.
        confidence_threshold (float): Minimum confidence for a box to be drawn.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread returns None instead of raising when the file cannot be read.
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path!r}")

    model = get_yolov5_model('yolov5x')

    # The thresholds are attributes of the hub model, not call arguments. The
    # model is shared through the cache, so restore the previous settings after
    # this call.
    previous_thresholds = (model.conf, model.iou)
    model.conf, model.iou = confidence_threshold, 0.4
    try:
        results = model(image_path)
    finally:
        model.conf, model.iou = previous_thresholds

    for *box, conf, cls in results.xyxy[0]:
        if conf < confidence_threshold:
            continue
        label = model.names[int(cls)]
        x1, y1, x2, y2 = map(int, box)
        cv2.rectangle(image, (x1, y1), (x2, y2), (0, 255, 0), 2)
        cv2.putText(image, f"{label} {float(conf):.2f}", (x1, y1 - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36,255,12), 2)

    plt.figure(figsize=(12, 8))
    # The image was loaded by OpenCV in BGR order; matplotlib expects RGB.
    plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()

In [67]:
# NOTE(review): absolute, machine-specific path - consider a path relative to
# the notebook so the cell runs elsewhere.
image_path="/Users/janghyeonbin/OCR/notebook/Street_1330_2000x.jpg"

preprocessed_image = preprocess_image_full(image_path, scale=1.5)
temp_preprocessed_path = "temp_preprocessed_image.jpg"
# The preprocessing chain starts from cv2.imread, so the array is in BGR order,
# which is also what cv2.imwrite expects. It is written as-is, because an
# RGB-to-BGR conversion here would swap the red and blue channels in the file.
cv2.imwrite(temp_preprocessed_path, preprocessed_image)

True

In [70]:
# OCR the preprocessed copy that was written to disk above.
extracted_text_easyocr = analyze_image_easyocr(temp_preprocessed_path)
print_text(extracted_text_easyocr)

0sc
U
IL EE


In [71]:
# OCR the original, unprocessed image for comparison with the run above.
extracted_text_easyocr = analyze_image_easyocr(image_path)
print_text(extracted_text_easyocr)

STREET


In [81]:
# Detect objects in the current image with the largest YOLOv5 variant.
labels_yolov5 = analyze_image_labels_yolov5_optimized(
    image_path,
    model_size='yolov5x',
    confidence_threshold=0.3,
    iou_threshold=0.4,
)

  with amp.autocast(autocast):


In [82]:
# Report how many distinct labels were detected, then the labels themselves.
print(f"인식된 라벨의 수: {len(labels_yolov5)}")
print(f"인식된 라벨: {labels_yolov5}")

인식된 라벨의 수: 0
인식된 라벨: []


In [83]:
# Narrow the detections down to the classes of interest.
specific_classes = ['person', 'car', 'bicycle']  # classes of interest
labels_filtered = filter_specific_classes(labels_yolov5, specific_classes)

In [84]:
# Print up to five of the filtered labels.
print_labels(labels_filtered, max_labels=5)



In [85]:
# Switch to the street-sign sample image for the OCR and translation cells below.
# NOTE(review): absolute, machine-specific path - consider a path relative to the notebook.
image_path="/Users/janghyeonbin/OCR/notebook/BAS_SEM_20170824_MetalStretSigns_2Up_Green_Texthere.jpg"

In [87]:
# OCR the street-sign image; the extracted text is translated further down.
extracted_text_easyocr = analyze_image_easyocr(image_path)
print_text(extracted_text_easyocr)

AVE YOUR TEXT HERE BLVD


In [90]:
# Tokenizer/model pairs already loaded, keyed by model name, so repeated
# translations do not reload the weights.
_m2m100_cache = {}


def _load_m2m100(model_name: str):
    """Return the (tokenizer, model) pair for ``model_name``, loading it on first use."""
    if model_name not in _m2m100_cache:
        from transformers import M2M100ForConditionalGeneration, M2M100Tokenizer

        _m2m100_cache[model_name] = (
            M2M100Tokenizer.from_pretrained(model_name),
            M2M100ForConditionalGeneration.from_pretrained(model_name),
        )
    return _m2m100_cache[model_name]


def translate_text_multilingual(source: str, target: str, text: str) -> str:
    """
    Translates text from source to target language using HuggingFace M2M100 model.

    Args:
        source (str): Source language code (ISO 639-1, e.g., 'en' for English).
        target (str): Target language code (ISO 639-1, e.g., 'ko' for Korean).
        text (str): Text to translate.

    Returns:
        str: Translated text.
    """
    tokenizer, model = _load_m2m100("facebook/m2m100_418M")

    # The tokenizer is shared between calls, so the source language is set on
    # every call before tokenizing.
    tokenizer.src_lang = source

    # Tokenize the input text.
    encoded = tokenizer(text, return_tensors="pt")

    # Generate the translation, forcing the target language as the first token.
    generated_tokens = model.generate(**encoded, forced_bos_token_id=tokenizer.get_lang_id(target))

    # Decode the generated tokens back into text.
    translated_text = tokenizer.batch_decode(generated_tokens, skip_special_tokens=True)[0]

    print(f"Text: {text}")
    print(f"Translation: {translated_text}")

    return translated_text

In [92]:
# Translate the OCR result from English to Korean with the local M2M100 model.
# NOTE(review): `translated_text` held a list of strings in the Google API
# section above; here it is rebound to a single string.
translated_text=translate_text_multilingual("en", "ko", extracted_text_easyocr)

tokenizer_config.json:   0%|          | 0.00/298 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/3.71M [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.14k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/908 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.94G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/233 [00:00<?, ?B/s]

Text: AVE YOUR TEXT HERE BLVD
Translation: 당신의 글은 여기 BLVD
