In [1]:
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine

text="My phone number is 212-555-5555"

# Set up the engine, loads the NLP module (spaCy model by default) 
# and other PII recognizers
analyzer = AnalyzerEngine()

# Call analyzer to get results
results = analyzer.analyze(text=text,
                           entities=["PHONE_NUMBER"],
                           language='en')
print(results)

# Analyzer results are passed to the AnonymizerEngine for anonymization

anonymizer = AnonymizerEngine()

anonymized_text = anonymizer.anonymize(text=text,analyzer_results=results)

print(anonymized_text)

[type: PHONE_NUMBER, start: 19, end: 31, score: 0.75]
text: My phone number is <PHONE_NUMBER>
items:
[
    {'start': 19, 'end': 33, 'entity_type': 'PHONE_NUMBER', 'text': '<PHONE_NUMBER>', 'operator': 'replace'}
]



In [2]:
# 直接把某些词定义为某个类型的实体
from presidio_analyzer import AnalyzerEngine, PatternRecognizer

# 1. 创建 deny-list
company_list = ["Presidio", "OpenAI"]

# 2. 新建一个 PatternRecognizer，把 deny-list 关联到 COMPANY_NAME 实体
company_recognizer = PatternRecognizer(supported_entity="COMPANY_NAME", deny_list=company_list)

# 3. 创建分析引擎，并注册自定义识别器
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(company_recognizer)

# 4. 输入要分析的文本
text = "OpenAI 和 Presidio 都是做人工智能相关的软件公司。"

# 5. 执行分析
results = analyzer.analyze(text=text, language="en")  # 英文检测效果最佳

# 6. 输出结果
print("被识别的敏感词：")
for res in results:
    print(f"{text[res.start:res.end]} -> {res.entity_type} (置信度: {res.score:.2f})")


被识别的敏感词：
OpenAI -> COMPANY_NAME (置信度: 1.00)
Presidio -> COMPANY_NAME (置信度: 1.00)


In [3]:
# 使用正则表达式检测
from presidio_analyzer import AnalyzerEngine, PatternRecognizer, Pattern

# 1. 定义员工号的正则表达式
employee_id_pattern = Pattern(name="employee_id_pattern", regex=r"EMP-\d{6}", score=0.8)
employee_id_recognizer = PatternRecognizer(
    supported_entity="EMPLOYEE_ID",
    patterns=[employee_id_pattern]
)

# 2. 定义自定义邮箱正则（比如只识别 mycompany.com 的邮箱）
custom_email_pattern = Pattern(name="my_email_pattern", regex=r"\b[A-Za-z0-9._%+-]+@mycompany\.com\b", score=0.9)
custom_email_recognizer = PatternRecognizer(
    supported_entity="CUSTOM_EMAIL",
    patterns=[custom_email_pattern]
)

# 3. 创建 Presidio 引擎并注册两个自定义识别器
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(employee_id_recognizer)
analyzer.registry.add_recognizer(custom_email_recognizer)

# 4. 测试文本
text = "员工号为EMP-123456，邮箱是 zhangsan@mycompany.com 或 test@gmail.com，工号 EMP-654321 也有效。"

# 5. 分析
results = analyzer.analyze(text=text, language="en")  # 英文环境即可

# 6. 输出识别结果
print("被识别的敏感信息：")
for res in results:
    print(f"{text[res.start:res.end]} -> {res.entity_type} (置信度: {res.score:.2f})")


被识别的敏感信息：
zhangsan@mycompany.com -> EMAIL_ADDRESS (置信度: 1.00)
test@gmail.com -> EMAIL_ADDRESS (置信度: 1.00)
zhangsan@mycompany.com -> CUSTOM_EMAIL (置信度: 0.90)
EMP-123456 -> EMPLOYEE_ID (置信度: 0.80)
EMP-654321 -> EMPLOYEE_ID (置信度: 0.80)
mycompany.com -> URL (置信度: 0.50)
gmail.com -> URL (置信度: 0.50)
123456 -> US_DRIVER_LICENSE (置信度: 0.01)
654321 -> US_DRIVER_LICENSE (置信度: 0.01)


In [4]:
# 自定义识别器示例：识别数字
from typing import List
from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from presidio_analyzer import AnalyzerEngine


class NumbersRecognizer(EntityRecognizer):

    expected_confidence_level = 0.7  # expected confidence level for this recognizer

    def load(self) -> None:
        """No loading is required."""
        pass

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        Analyzes test to find tokens which represent numbers (either 123 or One Two Three).
        """
        results = []

        # iterate over the spaCy tokens, and call `token.like_num`
        for token in nlp_artifacts.tokens:
            if token.like_num:
                result = RecognizerResult(
                    entity_type="NUMBER",
                    start=token.idx,
                    end=token.idx + len(token),
                    score=self.expected_confidence_level,
                )
                results.append(result)
        return results


# Instantiate the new NumbersRecognizer:
new_numbers_recognizer = NumbersRecognizer(supported_entities=["NUMBER"])

text3 = "Roberto lives in Five 10 Broad st."
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(new_numbers_recognizer)

numbers_results2 = analyzer.analyze(text=text3, language="en")
print("Results:")
print("\n".join([str(res) for res in numbers_results2]))

Results:
type: PERSON, start: 0, end: 7, score: 0.85
type: DATE_TIME, start: 17, end: 24, score: 0.85
type: NUMBER, start: 17, end: 21, score: 0.7
type: NUMBER, start: 22, end: 24, score: 0.7


In [None]:
# 自定义邮箱识别器示例：识别邮箱地址
from typing import List
from presidio_analyzer import EntityRecognizer, RecognizerResult
from presidio_analyzer.nlp_engine import NlpArtifacts
from presidio_analyzer import AnalyzerEngine
import re

class EmailRecognizer(EntityRecognizer):
    # 设定邮箱识别的置信分数
    expected_confidence_level = 0.85

    def load(self) -> None:
        """邮箱识别器无需加载外部模型，直接跳过。"""
        pass

    def analyze(
        self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts
    ) -> List[RecognizerResult]:
        """
        检查文本中所有匹配邮箱正则表达式的部分，标记为 EMAIL。
        """
        results = []

        # 定义简单的邮箱正则表达式（可根据实际需求增强）
        pattern = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")
        # 遍历所有匹配项
        for match in pattern.finditer(text):
            start = match.start()
            end = match.end()
            # 记录识别结果
            result = RecognizerResult(
                entity_type="EMAIL",
                start=start,
                end=end,
                score=self.expected_confidence_level,
            )
            results.append(result)
        return results

# 实例化邮箱识别器，并注册到Presidio分析引擎
new_email_recognizer = EmailRecognizer(supported_entities=["EMAIL"])

text4 = "请联系 john.doe@example.com 或 support@openai.com 获取帮助。"
analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(new_email_recognizer)

email_results = analyzer.analyze(text=text4, language="en")

print("识别结果：")
for res in email_results:
    print(f"{text4[res.start:res.end]} -> {res.entity_type} (置信度: {res.score:.2f})")


识别结果：
john.doe@example.com -> EMAIL_ADDRESS (置信度: 1.00)
support@openai.com -> EMAIL_ADDRESS (置信度: 1.00)
john.doe@example.com -> EMAIL (置信度: 0.85)
support@openai.com -> EMAIL (置信度: 0.85)
john.do -> URL (置信度: 0.50)
example.com -> URL (置信度: 0.50)
openai.com -> URL (置信度: 0.50)


In [None]:
# 在特定上下文中识别邮箱地址
from presidio_analyzer import PatternRecognizer, Pattern, AnalyzerEngine

# 定义邮箱正则和上下文关键词
email_pattern = Pattern("email_pattern", r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+\.[a-zA-Z0-9-.]+", 0.8)
context = ["email", "mail"]  # 只在这些关键词附近触发

email_recognizer = PatternRecognizer(
    supported_entity="EMAIL",
    patterns=[email_pattern],
    context=context
)

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(email_recognizer)

text = "My email is john.doe@example.com. You can also find my address somewhere else."
results = analyzer.analyze(text=text, language="en")
for res in results:
    print(text[res.start:res.end], "->", res.entity_type, res.score)


john.doe@example.com. -> EMAIL 1.0
john.doe@example.com -> EMAIL_ADDRESS 1.0
john.do -> URL 0.5
example.com -> URL 0.5
