In [38]:
from langchain_community.chat_models import ChatOllama
import os

from langchain_unstructured import UnstructuredLoader
from langchain_community.embeddings.huggingface import HuggingFaceBgeEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
# Chat model used by the chain further down; served by Ollama under the
# "llama3.1" model tag.
llm = ChatOllama(model="llama3.1")

In [39]:
import torch

In [40]:
import csv
import sys
# Read the registered-trademark CSV and pack its rows into newline-joined
# blocks of 20 rows each; a final, shorter block holds any leftover rows.
# NOTE(review): `texts` is not referenced by any other cell visible in this
# notebook - confirm whether it was meant to be indexed alongside the PDF.
file_path = 'data.csv'
texts = []
buffer = []
# Raise the per-field size cap so very long cells do not abort the parse.
csv.field_size_limit(sys.maxsize)

with open(file_path, newline='', encoding='UTF-8-sig') as f:
    for row_number, row in enumerate(csv.DictReader(f, delimiter=','), start=1):
        # NOTE(review): '유사군' appears twice in this line - confirm whether
        # the last field was meant to be a different column.
        buffer.append(
            f"{row['상표한글명']}, {row['상표영문명']}, {row['상품류별버전']}, {row['류']}, {row['유사군']}, {row['지정상품한글명']}, {row['유사군']}"
        )
        if row_number % 20:
            continue
        # Every 20th row closes the current block.
        texts.append("\n".join(buffer))
        buffer = []
    if buffer:
        texts.append("\n".join(buffer))
# Debug aid (disabled): print each block with its 1-based index.
# for index, block in enumerate(texts):
#     print(f"Block {index + 1}:\n{block}\n")

In [41]:
# Point `file_path` at the PDF (it previously held the CSV path) and show
# where it resolves to. The printed label is Chinese for "File path".
file_path = "test.pdf"
resolved_pdf_path = os.path.abspath(file_path)
print("文件路径:", resolved_pdf_path)


文件路径: /home/wangao/RAG/test.pdf


In [42]:

from langchain.schema import Document
import pdfplumber
from langchain.text_splitter import CharacterTextSplitter

# Extract the PDF text page by page and cut long pages into chunks of roughly
# 500 characters, remembering the 1-based page number of every chunk.
loader = UnstructuredLoader(file_path)
# NOTE(review): `docs` is not referenced by any other cell visible in this
# notebook; if nothing else needs it, this second parse of the PDF can go.
docs = loader.load()
page_docs = []

max_chunk_chars = 500
# CharacterTextSplitter only cuts on one separator ("\n\n" by default). Page
# text from pdfplumber usually has no blank lines, so long pages came back as
# a single oversized chunk. The recursive splitter falls back to finer
# separators until the chunks fit the size limit.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=max_chunk_chars,
    chunk_overlap=50,
)

with pdfplumber.open(file_path) as pdf:
    for page_number, page in enumerate(pdf.pages, start=1):
        page_content = page.extract_text()
        if not page_content:
            # Nothing extractable on this page (e.g. image-only); skip it.
            continue
        if len(page_content) > max_chunk_chars:
            chunks = text_splitter.split_text(page_content)
        else:
            # Short pages are kept verbatim as a single chunk.
            chunks = [page_content]
        page_docs.extend(
            {"page_number": page_number, "content": chunk}
            for chunk in chunks
        )







In [43]:
from langchain.vectorstores import Chroma
from langchain.vectorstores import FAISS

# Wrap every extracted chunk in a LangChain Document, embed the chunks with
# BAAI/bge-m3 and index them in an in-memory FAISS store.
documents = [
    Document(page_content=doc["content"], metadata={"page_number": doc["page_number"]})
    for doc in page_docs
]
if not documents:
    # Fail with a clear message instead of an opaque error from inside FAISS
    # when the PDF yielded no text.
    raise ValueError(
        "No text chunks were extracted from the PDF; cannot build the FAISS index."
    )

model_name = "BAAI/bge-m3"
# Use the GPU when one is present, otherwise fall back to the CPU so the cell
# still runs on machines without CUDA.
model_kwargs = {"device": "cuda" if torch.cuda.is_available() else "cpu"}
# Ask the encoder to normalise its output vectors.
encode_kwargs = {"normalize_embeddings": True}
embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

vector_store = FAISS.from_documents(documents, embeddings)




INFO: Load pretrained SentenceTransformer: BAAI/bge-m3


In [44]:
# Retriever over the FAISS index: at most 5 hits per query, and only hits
# whose relevance score reaches the 0.1 threshold.
retrieval_settings = {"k": 5, "score_threshold": 0.1}
retriever = vector_store.as_retriever(
    search_type="similarity_score_threshold",
    search_kwargs=retrieval_settings,
)

In [54]:
from langchain.prompts import PromptTemplate

# Prompt that asks the model for a bare Y/N verdict on one trademark.
# NOTE(review): the only placeholder is {trademark}, so neither retrieved PDF
# passages nor CSV rows reach the model through this template.
# NOTE(review): the text names `example.pdf`, while the notebook loads
# "test.pdf" - confirm which is intended.
template_text = """
You are an expert in determining whether a trademark is suspected of plagiarism and meets qualification standards. Only answer "Y" or "N" based on the provided trademark law document (`example.pdf`) and the registered data (`data.csv`). Do not provide explanations or any additional information.
**Important: Do not provide any other information or explanations. Only output `Y` for available or `N` for not available.**

Trademark: {trademark}
Availability:
"""
prompt = PromptTemplate.from_template(template_text)

In [55]:
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser

# Pipeline: raw input -> prompt -> LLM -> plain string.
# The first step forwards whatever is passed to `chain.invoke(...)` unchanged
# into the prompt's {trademark} slot, so callers pass the bare trademark text.
# NOTE(review): `retriever` is not part of this pipeline, so no retrieved
# context is supplied to the model here.
trademark_input = {"trademark": RunnablePassthrough()}
chain = trademark_input | prompt | llm | StrOutputParser()

In [58]:
# Batch-classify every trademark in the test set and write one verdict per
# input line to the results file.
# NOTE(review): these are absolute, machine-specific paths; `file_path2`
# duplicates `input_file_path` and is not used in this cell.
file_path2 = "/home/wangao/RAG/TB_KT10_bulk_testset_26.txt"
input_file_path = "/home/wangao/RAG/TB_KT10_bulk_testset_26.txt"
output_file_path = "/home/wangao/RAG/trademark_availability_results.txt"


def _normalize_answer(raw_answer):
    """Reduce a raw model reply to "Y" or "N".

    Surrounding whitespace, backticks, quotes, asterisks and periods are
    removed and the remainder is upper-cased, so replies such as "Y\n",
    "`y`" or "N." are recognised. Anything else falls back to "N".
    """
    cleaned = raw_answer.strip().strip("`'\"*. ").upper()
    return cleaned if cleaned in ("Y", "N") else "N"


# Open the input and output files
with open(input_file_path, "r", encoding="utf-8") as input_file, open(output_file_path, "w", encoding="utf-8") as output_file:
    for line in input_file:
        trademark = line.strip()  # Remove any extra whitespace/newlines
        # The chain's first step already maps the raw input to the
        # {trademark} slot, so pass the bare string. Passing a dict here
        # would render as "{'trademark': ...}" inside the prompt.
        result = chain.invoke(trademark)

        # Ensure only "Y" or "N" is written, or default to "N"
        availability = _normalize_answer(result)

        # Write the result to the output file
        output_file.write(f" {availability}\n")


In [None]:
# Ad-hoc probe: send a free-form question through the same chain and print
# the raw reply.
probe_question = """is "심판청구서 등의 각하에 관한 적용례" in the list?"""
reponse2 = chain.invoke(probe_question)
print(reponse2)

Yes

The context contains a series of documents with metadata and page content related to patent law, specifically regarding trademark registration and dispute resolution. In one of the documents (Document(metadata={'page_number': 38}), page_content='...'), there is a section on "심판청구서 등의 각하에 관한 적용례" which translates to "Application examples for dismissal of petitions, etc.".

This suggests that the context indeed contains information related to "심판청구서 등의 각하에 관한 적용례".
