In [51]:
import pdfplumber
import pdf2image
import pytesseract

pdf_path = "Resume_UCSC.pdf"

with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text:  # 如果能直接提取文字
            print(f"第 {i+1} 頁的文字（直接擷取）：\n{text}")
        else:  # 如果不能提取文字，則使用 OCR
            image = pdf2image.convert_from_path(pdf_path, first_page=i+1, last_page=i+1)[0]
            text = pytesseract.image_to_string(image, lang="eng")
            print(f"第 {i+1} 頁的文字（OCR 擷取）：\n{text}")


第 1 頁的文字（直接擷取）：
Jou-Yi Lee
65RioRoblesE,SanJose,CA95134•zoelee19991226@gmail.com•+1408-618-9437•+886978-716-05
ABOUTME
Aspiringcomputerscientistwithastrongbackgroundinengineeringandresearch,aimingtoleverageskillsinmachine
learning,robotics,andsoftwaredevelopmentforinnovativeprojectsinnaturallanguageprocessing.
SKILLS
TechnicalSkills
✓ ProgrammingLanguages: C++,Python,MATLAB
✓ SoftwareandTools: PyTorch,Selenium,Scrapy,OpenCV,SolidWorks,AutoCAD,Ansys
✓ MachineLearningandAI:PyTorch,TensorFlow,Scikit-learn,NLTK,SpaCy
✓ EmbeddedSystems: RaspberryPi,Arduino
✓ MechanicalDesign: 3DPrinting,PlasticandMetalPartsDesign
SoftSkills
✓ Self-drivenlearning,Innovation,Problem-solving,Communication
RESEARCHPROJECTS
FeatureEngineeringandModelEvaluationforE-commerce Sep2024-Nov2024
Advisor: JalalMahmud,R&D
✓ Optimizedclassifiersforsentimentanalysisandproductcategorization.
RelationExtractionfromNaturalLanguage Sep2024-Nov2024
Advisor: AmitaMisra,Amazon
✓ Builtdeeplearningmodelsforrelationextractioninconve

In [9]:
from langdetect import detect

def detect_language(text):
    try:
        lang = detect(text)
        return lang
    except:
        return "unknown"

sample_text = "你好 ?"
detected_lang = detect_language(sample_text)
print(f"偵測語言：{detected_lang}")


偵測語言：zh-cn


In [28]:
import pdfplumber
import pdf2image
import pytesseract
import re
from langdetect import detect
from easyocr import Reader
from transformers import TrOCRProcessor, VisionEncoderDecoderModel
from PIL import Image

# TrOCR 模型（手寫辨識）
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten", ignore_mismatched_sizes=True)

# EasyOCR（單獨處理不同語言）
reader_chinese_tra = Reader(['ch_tra', 'en'])
reader_chinese_sim = Reader(['ch_sim', 'en'])
reader_japanese = Reader(['ja', 'en'])
reader_korean = Reader(['ko', 'en'])
reader_russian = Reader(['ru', 'en'])
reader_arabic = Reader(['ar', 'en'])

def detect_language(text):
    """偵測語言"""
    try:
        return detect(text)
    except:
        return "unknown"

def easyocr_multilang(image):
    """使用 EasyOCR，強制加入單詞間距"""
    final_text = []
    
    def process_reader(reader, lang_name):
        print(f"正在使用 {lang_name} OCR...")
        results = reader.readtext(image)
        text = " ".join([res[1] for res in results])  # 強制加入空格
        final_text.append(text)

    process_reader(reader_chinese_tra, "繁體中文")
    process_reader(reader_chinese_sim, "簡體中文")
    process_reader(reader_japanese, "日文")
    process_reader(reader_korean, "韓文")
    process_reader(reader_russian, "俄文")
    process_reader(reader_arabic, "阿拉伯文")

    return " ".join(final_text)

def fix_spacing(text):
    """自動修正沒有空格的單詞"""
    text = re.sub(r'([a-z])([A-Z])', r'\1 \2', text)  # 小寫 + 大寫之間補上空格
    text = re.sub(r'(\w)([,.!?])', r'\1 \2', text)  # 單詞與標點符號之間加入空格
    text = re.sub(r'(\D)(\d)', r'\1 \2', text)  # 非數字+數字之間加入空格
    return text

def ocr_pipeline(pdf_path):
    """OCR 處理流程"""
    extracted_text = ""

    with pdfplumber.open(pdf_path) as pdf:
        for i, page in enumerate(pdf.pages):
            text = page.extract_text()
            if text:  # ✅ 內嵌文字可直接提取
                text = text.replace("\n", " ").replace("•", " ")  # 修正 PDF 內嵌格式問題
                detected_lang = detect_language(text)
                print(f"第 {i+1} 頁 - 語言：{detected_lang}（內嵌文字）")
                extracted_text += text + "\n"
            else:
                # 🚀 PDF 轉換為圖片
                image = pdf2image.convert_from_path(pdf_path, first_page=i+1, last_page=i+1)[0]

                # ✅ 使用 Tesseract OCR，確保單詞之間有間格
                text_ocr = pytesseract.image_to_string(
                    image, lang="eng+chi_tra+jpn+kor+ara+rus",
                    config="--oem 3 --psm 6 -c preserve_interword_spaces=1"
                )
                detected_lang = detect_language(text_ocr)

                # ✅ 如果 Tesseract OCR 結果太短，改用 EasyOCR
                if len(text_ocr.strip()) < 10:
                    text_easyocr = easyocr_multilang(image)
                    detected_lang = detect_language(text_easyocr)
                    text_ocr = text_easyocr

                # ✅ 如果是手寫字或難識別內容，改用 TrOCR
                if detected_lang in ["en", "zh-cn", "zh-tw", "ja"]:
                    image = image.convert("RGB")
                    pixel_values = processor(image, return_tensors="pt").pixel_values
                    generated_ids = model.generate(pixel_values)
                    text_trocr = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
                    text_ocr = text_trocr

                print(f"第 {i+1} 頁 - 語言：{detected_lang}")
                extracted_text += text_ocr + "\n"

    # 修正間隔問題
    extracted_text = fix_spacing(extracted_text)
    return extracted_text

# 測試 OCR
pdf_path = "Resume_UCSC.pdf"
text_output = ocr_pipeline(pdf_path)

print("\n📝 最終 OCR 輸出：")
print(text_output)


Some weights of VisionEncoderDecoderModel were not initialized from the model checkpoint at microsoft/trocr-base-handwritten and are newly initialized: ['encoder.pooler.dense.bias', 'encoder.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


第 1 頁 - 語言：en（內嵌文字）
第 2 頁 - 語言：en（內嵌文字）

📝 最終 OCR 輸出：
Jou-Yi Lee  65Rio Robles E ,San Jose ,CA 95134 zoelee 19991226@gmail .com + 1408- 618- 9437 + 886978- 716- 05 ABOUTME Aspiringcomputerscientistwithastrongbackgroundinengineeringandresearch ,aimingtoleverageskillsinmachine learning ,robotics ,andsoftwaredevelopmentforinnovativeprojectsinnaturallanguageprocessing . SKILLS Technical Skills ✓ Programming Languages: C++,Python ,MATLAB ✓ Softwareand Tools: Py Torch ,Selenium ,Scrapy ,Open CV ,Solid Works ,Auto CAD ,Ansys ✓ Machine Learningand AI:Py Torch ,Tensor Flow ,Scikit-learn ,NLTK ,Spa Cy ✓ Embedded Systems: Raspberry Pi ,Arduino ✓ Mechanical Design:  3DPrinting ,Plasticand Metal Parts Design Soft Skills ✓ Self-drivenlearning ,Innovation ,Problem-solving ,Communication RESEARCHPROJECTS Feature Engineeringand Model Evaluationfor E-commerce Sep 2024-Nov 2024 Advisor: Jalal Mahmud ,R&D ✓ Optimizedclassifiersforsentimentanalysisandproductcategorization . Relation Extractionfrom Natural 

In [67]:
import pdfplumber
import pdf2image
import pytesseract
import re
from wordsegment import load, segment

# 加載 wordsegment 的統計模型
load()

def fix_spacing_with_regex(text):
    """使用 Regular Expression 修正標點、數字與文字格式（不影響 Gmail & Phone）"""
    
    # **先找到 Email 和電話號碼**
    email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    phone_pattern = r'(\+?\d{1,4}[-.\s]?\d{2,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4})'
    
    all_matches = []
    for match in re.finditer(email_pattern, text):
        all_matches.append((match.start(), match.end(), match.group()))
    for match in re.finditer(phone_pattern, text):
        all_matches.append((match.start(), match.end(), match.group()))
    
    all_matches.sort(key=lambda x: x[0])

    fixed_parts = []
    last_end = 0

    for start, end, value in all_matches:
        normal_text = text[last_end:start]
        if normal_text.strip():  
            # **修正格式**
            normal_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', normal_text)  
            normal_text = re.sub(r'(\w)([,.!?])', r'\1 \2', normal_text)  

            # **核心修正點**
            normal_text = re.sub(r'([a-zA-Z]+)(\d+)([a-zA-Z]+)(\d+)', r'\1 \2 \3 \4', normal_text)  # **處理「文字+數字+文字+數字」**
            normal_text = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', normal_text)  # **處理「文字+數字」**
            normal_text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', normal_text)  # **處理「數字+文字」**
            normal_text = re.sub(r'(\D)(\d)', r'\1 \2', normal_text)  # **處理「非數字+數字」**
            normal_text = re.sub(r'(\d)(\D)', r'\1 \2', normal_text)  # **處理「數字+非數字」**

            fixed_parts.append(normal_text)

        # **直接添加原始 Email 和 Phone**
        fixed_parts.append(value)
        last_end = end

    if last_end < len(text):
        remaining_text = text[last_end:]
        remaining_text = re.sub(r'([a-z])([A-Z])', r'\1 \2', remaining_text)  
        remaining_text = re.sub(r'(\w)([,.!?])', r'\1 \2', remaining_text)  

        # **核心修正點**
        remaining_text = re.sub(r'([a-zA-Z]+)(\d+)([a-zA-Z]+)(\d+)', r'\1 \2 \3 \4', remaining_text)  # **處理「文字+數字+文字+數字」**
        remaining_text = re.sub(r'([a-zA-Z]+)(\d+)', r'\1 \2', remaining_text)  # **處理「文字+數字」**
        remaining_text = re.sub(r'(\d+)([a-zA-Z]+)', r'\1 \2', remaining_text)  # **處理「數字+文字」**
        remaining_text = re.sub(r'(\D)(\d)', r'\1 \2', remaining_text)  # **處理「非數字+數字」**
        remaining_text = re.sub(r'(\d)(\D)', r'\1 \2', remaining_text)  # **處理「數字+非數字」**

        fixed_parts.append(remaining_text)

    return " ".join(fixed_parts)


def preserve_format(text):
    """保留 Gmail 和電話號碼，對其他部分使用 wordsegment 修正單詞間距"""
    
    # **更嚴謹的 Gmail 和電話號碼匹配**
    email_pattern = r'([a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,})'
    phone_pattern = r'(\+?\d{1,4}[-.\s]?\d{2,4}[-.\s]?\d{3,4}[-.\s]?\d{3,4})'

    # **先找到 Email 和電話號碼**
    all_matches = []
    
    for match in re.finditer(email_pattern, text):
        all_matches.append((match.start(), match.end(), match.group()))

    for match in re.finditer(phone_pattern, text):
        all_matches.append((match.start(), match.end(), match.group()))

    # **依照出現順序排序**
    all_matches.sort(key=lambda x: x[0])

    # **分割文本，確保 Email 和 Phone 不進入 `wordsegment`**
    segmented_parts = []
    last_end = 0

    for start, end, value in all_matches:
        # 取出上次匹配後的普通文字部分，並用 wordsegment 處理
        normal_text = text[last_end:start]
        if normal_text.strip():  # 避免多餘的空白段落
            segmented_parts.append(" ".join(segment(normal_text)))

        # 直接添加原始的 Email 或 Phone，不做任何處理
        segmented_parts.append(value)

        last_end = end

    # **處理最後一段文字（如果有的話）**
    if last_end < len(text):
        remaining_text = text[last_end:]
        segmented_parts.append(" ".join(segment(remaining_text)))

    # **組合回原始文本**
    return " ".join(segmented_parts)

pdf_path = "Resume_UCSC.pdf"

with pdfplumber.open(pdf_path) as pdf:
    for i, page in enumerate(pdf.pages):
        text = page.extract_text()
        if text:  
            # **先修正標點與數字格式**
            fixed_text = fix_spacing_with_regex(text)

            # **再修正單詞間距**
            fixed_text = preserve_format(fixed_text)

            print(f"第 {i+1} 頁的文字（直接擷取 + 修正空格）：\n{fixed_text}")
        else:  
            # **如果無法解析，則使用 OCR**
            image = pdf2image.convert_from_path(pdf_path, first_page=i+1, last_page=i+1)[0]
            text = pytesseract.image_to_string(image, lang="eng")

            # **先修正標點與數字格式**
            fixed_text = fix_spacing_with_regex(text)

            # **再修正單詞間距**
            fixed_text = preserve_format(fixed_text)

            print(f"第 {i+1} 頁的文字（OCR 擷取 + 修正空格）：\n{fixed_text}")


第 1 頁的文字（直接擷取 + 修正空格）：
jouyilee65rioroblese san jose ca95134 zoelee19991226@gmail.com  +1408-618-9437  +886978-716 05 about me aspiring computer scientist with a strong background in engineering and research aiming to leverage skills in machine learning robotics and software development for innovative projects in natural language processing skills technical skills programming languages cpython matlab software and tools py torch selenium scrap y opencv solidworks autocad ansys machine learning and a ipy torch tensor flow sci kit learn nl tk spacy embedded systems raspberry pi arduino mechanical design 3d printing plastic and metal parts design soft skills self driven learning innovation problem solving communication research projects feature engineering and model evaluation for ecommerce sep2024nov2024 advisor jalal mahmud rd optimized classifiers for sentiment analysis and product categorization relation extraction from natural language sep2024nov2024 advisor amita misra amazon built d