In [1]:
import argparse
import json
import re
from pathlib import Path
from typing import Literal
from tqdm import tqdm
from langchain_ollama import ChatOllama
from pydantic import BaseModel, Field

In [4]:
# Create Pydantic model for response object
# Schema for a title-check answer: a free-text justification plus a boolean verdict.
# NOTE(review): documented with `#` comments rather than a class docstring on purpose --
# Pydantic includes a model docstring in the generated JSON schema, which would change
# what is passed to with_structured_output.
class TitleCheckResponse(BaseModel):
    # Required (`...` = no default). Declared before `is_match`;
    # NOTE(review): presumably so the model explains itself before committing to a verdict -- confirm.
    reasoning: str = Field(..., description="The reasoning behind the title match decision.")
    # Required boolean verdict on whether the two titles match.
    is_match: bool = Field(..., description="Indicates whether the titles match or not.")

In [None]:
# Prompt for the title-matching check. It is rendered with str.format(), which fills
# the two placeholders {dataset_title} and {candidate_title}; any literal curly brace
# added to this text later must be doubled ({{ }}) or .format() will raise.
# The prompt asks for a JSON answer with the keys 'reasoning' and 'is_match'.
# NOTE(review): a prompt file is also referenced further down in this notebook
# (reference_title_check.txt); if it duplicates this text, keep the two in sync.
prompt_template = """You are an expert researcher and data analyst. Your task is to determine if the following titles refer to the same academic paper.

## Understanding
- A noisy dataset contained titles of academic papers that often contained typos, formatting issues, author names, or other variations making the title different than the canonical title in the publisher's database
- Fuzzy matching attempted to resolve the canonical title for each dataset title

## Task
Given a dataset title and a candidate canonical title, determine if they refer to the same academic paper or not.

## Examples
- Dataset Title: "A NeuralProbabilistic Language ModelSmith,Joe et al"
  Candidate Title: "A Neural Probabilistic Language Model"
  reasoning: ignoring the author info in the dataset title, it almost exactly matches the candidate title
  is_match: true

- Dataset Title: ".]"
  Candidate Title: "New Advances in Computational Linguistics"
  reasoning: the dataset title is mis-encoded since '.]' could not realistically be a title for a paper. Impossible to tell if the candidate title refers to the same intended paper
  is_match: false

- Dataset Title: "Natural Language Processing"
  Candidate Title: "Natural Language Processing Benefits from Attention Mechanisms"
  reasoning: the dataset title is extremely generic so I cannot confidently say that the candidate title is a match
  is_match: false

## Input
- Dataset Title: "{dataset_title}"
- Candidate Title: "{candidate_title}"

Write your answer in JSON format with the keys 'reasoning' for your reasoning and 'is_match' for your determination (true if they match, false otherwise).
"""

In [11]:
# Ollama model tag; reused further down when constructing the LLMFunction wrapper.
model_name = "mistral:7b"

# Build the chat client with temperature fixed at 0.0, then bind the
# TitleCheckResponse schema so calls go through structured output.
chat_model = ChatOllama(model=model_name, temperature=0.0)
llm = chat_model.with_structured_output(TitleCheckResponse)

In [7]:
# Sample pair: the dataset title carries a "CoCoCo: " prefix that the candidate title lacks.
dataset_title = "CoCoCo: Online Extraction of Russian Multiword Expressions"
candidate_title = "Online Extraction of Russian Multiword Expressions"

# Fill in the template first, then send the rendered prompt to the model.
prompt = prompt_template.format(dataset_title=dataset_title, candidate_title=candidate_title)
response = llm.invoke(prompt)

In [24]:
# Parse a "Multiple matches" warning from the log. `re` is already imported in the
# notebook's first cell, so it is not re-imported here.
#   group 1: the quoted dataset title (lazy, so it stops at the first "'; choosing ")
#   group 2: the candidate that was chosen (everything up to end of line)
pattern = re.compile(r"WARNING - Multiple matches on '(.+?)'; choosing (.+?)$")
line = "2025-12-19 12:38:09,012 - WARNING - Multiple matches on 'Cb or not Cb? Centering theory applied to NLGRodger KibbleInformation Technology Research InstituteUniversity of BrightonLewes RoadBrighton BN2 4GJ'; choosing Proceedings of the First Workshop on Language-driven Deliberation Technology (DELITE) @ LREC-COLING 2024"
match = pattern.search(line)
# search() returns None when nothing matches; fail with a readable message instead of
# an AttributeError on `.groups()` if the pattern or the sample line is edited later.
if match is None:
    raise ValueError("sample log line did not match the 'Multiple matches' warning pattern")
print(f"Number of match groups: {len(match.groups())}")

Number of match groups: 2


In [29]:
# Show the structured response as a plain dict; the recorded output has the keys
# 'reasoning' and 'is_match'.
# NOTE(review): `response` is assigned in two cells (llm.invoke and titlechecker), so
# which one is displayed depends on execution order -- confirm on a fresh Run All.
response.model_dump()

{'reasoning': "The dataset title contains an extra 'CoCo' at the beginning which can be considered a typo or formatting issue. After removing the extra 'CoCo', the remaining titles are identical.",
 'is_match': True}

In [14]:
from citeline.llm.llm_function import LLMFunction

# Same model tag and response schema as the direct ChatOllama cell, but wired through
# the project's LLMFunction wrapper with a prompt file path (relative to this notebook).
reference_prompt_path = "../src/citeline/llm/prompts/reference_title_check.txt"
titlechecker = LLMFunction(
    model_name=model_name,
    prompt_path=reference_prompt_path,
    output_model=TitleCheckResponse,
)

In [15]:
# Call the wrapper with the same sample pair, keyed by the prompt's placeholder names.
title_pair = {"dataset_title": dataset_title, "candidate_title": candidate_title}
response = titlechecker(title_pair)

LLM response: reasoning="The dataset title contains an extra 'CoCo' at the beginning which can be considered a typo or formatting issue. After removing the extra 'CoCo', the remaining titles are identical." is_match=True
