In [None]:
from transformers import AutoTokenizer, AutoModel

# 加载原始 tokenizer 和模型
model_name = "sentence-transformers/all-distilroberta-v1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# 常见的 LaTeX 数学符号和命令（可以根据需要扩展）
latex_tokens = [
    # 基本运算符
    "\\frac", "\\sqrt", "\\cdot", "\\times", "\\div", "\\pm", "\\mp", "\\ast", "\\star",

    # 关系符
    "\\leq", "\\geq", "\\neq", "\\approx", "\\equiv", "\\sim", "\\propto",

    # 箭头符号
    "\\rightarrow", "\\leftarrow", "\\Rightarrow", "\\Leftarrow", "\\leftrightarrow",

    # 极限与积分
    "\\sum", "\\prod", "\\int", "\\lim", "\\infty", "\\partial", "\\nabla", "\\oint",

    # 集合与逻辑
    "\\in", "\\notin", "\\subset", "\\supset", "\\subseteq", "\\supseteq",
    "\\cup", "\\cap", "\\exists", "\\forall", "\\neg", "\\land", "\\lor",

    # 函数与运算符
    "\\sin", "\\cos", "\\tan", "\\csc", "\\sec", "\\cot",
    "\\log", "\\ln", "\\exp",

    # 希腊字母（小写）
    "\\alpha", "\\beta", "\\gamma", "\\delta", "\\epsilon", "\\zeta", "\\eta",
    "\\theta", "\\iota", "\\kappa", "\\lambda", "\\mu", "\\nu", "\\xi",
    "\\pi", "\\rho", "\\sigma", "\\tau", "\\upsilon", "\\phi", "\\chi", "\\psi", "\\omega",

    # 希腊字母（大写）
    "\\Gamma", "\\Delta", "\\Theta", "\\Lambda", "\\Xi", "\\Pi", "\\Sigma", "\\Phi", "\\Psi", "\\Omega",

    # 括号
    "\\left(", "\\right)", "\\left[", "\\right]", "\\left\\{", "\\right\\}",

    # 其他
    "\\dots", "\\ldots", "\\cdots", "\\vdots", "\\ddots",
    "\\text", "\\mathrm", "\\mathbb", "\\mathbf", "\\mathcal",
]

# 添加 token
num_added = tokenizer.add_tokens(latex_tokens)
print(f"共添加了 {num_added} 个 LaTeX token")

# 调整模型的 embedding 层
model.resize_token_embeddings(len(tokenizer))
print("模型 embedding 已调整为新词表大小。")


In [None]:
from sentence_transformers import InputExample
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer

# model = AutoModel.from_pretrained("./output/latex-finetuned/checkpoint-1101")
# org_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-distilroberta-v1")
# tokenizer = AutoTokenizer.from_pretrained("./init_model")

ss = "\\frac{a}{b} + \\sqrt{x}"
sy = "\\frac{x}{y} = c"         
f1 = "\\frac{a}{b} + \\sqrt{x}"
f2 = "\\sqrt{y} + \\frac{1}{x}"                                                                                    
# print(model(org_tokenizer(ss, return_tensors="pt")))
model = SentenceTransformer("./output/right_model/checkpoint-1101")
org_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")
# smodel.fit()
# print(model.tokenizer.tokenize("\\frac{a}{b} + \\sqrt{x}"))
org_embeddings = org_model.encode([f1, f2])
print(org_model.similarity(org_embeddings[0], org_embeddings[1]))
embeddings = model.encode([f1, f2], convert_to_tensor=True)
# print(embeddings)
text = "We consider the task of determining a soccer player’s ability for a given event type, for example, scoring a goal. We propose an interpretable Bayesian inference approach that centres on variational inference methods. We implement a Poisson model to capture occurrences of event types, from which we infer player abilities. Our approach also allows the visualisation of differences between players, for a specific ability, through the marginal posterior variational densities. We then use these inferred player abilities to extend the Bayesian hierarchical model of Baio and Blangiardo (2010), which captures a team’s scoring rate (the rate at which they score goals). We apply the resulting scheme to the English Premier League, capturing player abilities over the 2013/2014 season, before using output from the hierarchical model to predict whether over or under 2.5 goals will be scored in a given fixture or not in the 2014/2015 season."
print(len(model.tokenizer.tokenize(text)))
# print(model.similarity(embeddings[0], embeddings[1]))

In [None]:
from bs4 import BeautifulSoup
import os
import json

def parse_paper(html_path):
    """
    解析单篇 HTML，提取标题、摘要、正文等，返回一个 dict。
    """
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html5lib')

    # 1. 提取标题
    title_tag = soup.find('h1', class_='ltx_title_document')
    title = title_tag.get_text(strip=True) if title_tag else ''

    # 2. 提取摘要
    abstract_div = soup.find('div', class_='ltx_abstract')
    abstract = ''
    if abstract_div:
        p = abstract_div.find('p')
        abstract = p.get_text(strip=True) if p else ''

    # 3. 提取正文：取正文容器下所有段落，过滤掉摘要相同内容
    body_div = soup.find('div', class_='ltx_page_content')
    paragraphs = []
    if body_div:
        for p in body_div.find_all('p', class_='ltx_p'):
            text = p.get_text(strip=True)
            if text and text != abstract:
                paragraphs.append(text)
    body = '\n\n'.join(paragraphs)

    return {
        'title': title,
        'abstract': abstract,
        'body': body
    }

if __name__ == '__main__':
    # 指定当前目录下的唯一 HTML 文件名
    html_file = '2209.00010.html'
    if not os.path.isfile(html_file):
        raise FileNotFoundError(f"未找到文件: {html_file}")

    # 解析并打印结果
    result = parse_paper(html_file)
    print(json.dumps(result, ensure_ascii=False, indent=2))


In [None]:
import json

data = []
with open("./latex_pairs_20000.json", "r") as fr:
    data = json.load(fr)

with open("./latex_pairs_200001.jsonl", "w") as fw:
    for d in data:
        temp = {"text1": d["texts"][0], "text2": d["texts"][1], "label": d["label"]}
        fw.write(json.dumps(temp))
        fw.write("\n")

In [None]:
from sentence_transformers import SentenceTransformer
import os
import json
from tqdm import tqdm
import torch
import pickle


model = SentenceTransformer("sentence-transformers/allenai-specter")

In [None]:
from arxiv2text import arxiv2text

url = "https://arxiv.org/pdf/2209.00010.pdf"
text = arxiv2text(url)
print(text)

In [None]:
import re
import unicodedata
import html
from bs4 import BeautifulSoup
from bs4 import NavigableString
import html
import unicodedata
import re

def clean_text(text):
    # 清洗函数：保留 Unicode，去格式标记、转义字符串等
    text = html.unescape(text)
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'[\ud800-\udfff]', '', text)
    text = re.sub(r'[\x00-\x1F\x7F]', '', text)
    for ch in ['\u200b', '\u200c', '\u200d', '\ufeff',
               '\u202a', '\u202b', '\u202c', '\u202d', '\u202e']:
        text = text.replace(ch, '')
    text = re.sub(r'start_[A-Z_]+', '', text)
    text = re.sub(r'end_[A-Z_]+', '', text)
    text = re.sub(r'\b(?:italic|bold|math|symbol|script|cal|frak|roman)_([a-zA-Z0-9Δ-]+)\b', r'\1', text)
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
    text = re.sub(r'\\U[0-9a-fA-F]{8}', '', text)
    return ' '.join(text.split())

def parse_paper(html_path):
    with open(html_path, 'r', encoding='utf-8') as f:
        text = f.read()
        soup = BeautifulSoup(text, 'html5lib')
    body = soup.get_text()
    intro_index = body.find('Introduction')
    ref_index = body.find('Reference')
    if intro_index != -1 and ref_index != -1 and intro_index < ref_index:
        body = body[intro_index:ref_index].strip()
    # print(intro_index, ref_index)
    text = re.sub(r'\n{2,}', '[PARA]', body)
    text = text.replace('\n', '')

    # 第三步：把段落标记还原为一个换行符
    cleaned_text = text.replace('[PARA]', '\n')
    return cleaned_text
    # text_list = [str(s) for s in soup.descendants if isinstance(s, NavigableString)]

    # # 拼接成一个完整的字符串
    # full_text = ''.join(text_list).strip()

    # print()
    # return full_text

text = parse_paper(r"C:\Users\86159\Downloads\ar5iv_1710-2209\ar5iv\2209\2209.00009.html")
print(text)


In [6]:

from bs4 import BeautifulSoup
import os
import json
from tqdm import tqdm
from transformers import AutoTokenizer
import re
import unicodedata
import html

def clean_text(text):
    # 清洗函数：保留 Unicode，去格式标记、转义字符串等
    text = html.unescape(text)
    text = unicodedata.normalize("NFKC", text)
    text = re.sub(r'[\x00-\x1F\x7F]', '', text)
    text = re.sub(r'start_[A-Z_]+', '', text)
    text = re.sub(r'end_[A-Z_]+', '', text)
    text = re.sub(r'\b(?:italic|bold|math|symbol|script|cal|frak|roman)_([a-zA-Z0-9Δ-]+)\b', r'\1', text)
    text = re.sub(r'\\u[0-9a-fA-F]{4}', '', text)
    return ' '.join(text.split())

def parse_paper(html_path):
    with open(html_path, 'r', encoding='utf-8') as f:
        soup = BeautifulSoup(f, 'html5lib')

    body = soup.get_text()
    intro_index = body.find('Introduction')
    ref_index = body.find('Reference')
    if intro_index != -1 and ref_index != -1 and intro_index < ref_index:
        body = body[intro_index:ref_index].strip()

    text = re.sub(r'\n{2,}', '[PARA]', body)
    text = text.replace('\n', '')
    cleaned_text = text.replace('[PARA]', '\n')
    cleaned_text = clean_text(cleaned_text)
    return cleaned_text

text = parse_paper(r"C:\Users\86159\Downloads\ar5iv_1710-2209\ar5iv\1710\1710.00001.html")
print(text)


IntroductionWithin this paper we look to determine the ability of those playerswho play in the English Premier League. The Premier League is anannual soccer league established in 1992 and is the mostwatched soccer league in the world (Yueh, 2014; Curley and Roeder, 2016).It is made up of 20 teams, who, over the course of a season, play everyother team twice (both home and away), giving a total of 380 fixtureseach year. It is the top division of English soccer, and every yearthe bottom 3 teams are relegated to be replaced by 3 teams from thenext division down (the Championship). In recent times the PremierLeague has also become known as the richest league in the world(Deloitte, 2016), through both foreign investment and alucrative deal for television rights (Cave and Miller, 2016; Rumsby, 2016; BBC Business, 2016).Whilst there is growing financial competition from China, the Premierleague arguably still attracts some of the best players in the world.Staying in the Premier league (by avo

In [8]:
import json

with open(r"C:\vscode_project\latex_tokenizer\discrete_data\all_paper_info.jsonl", "r") as fr:
    org_set = set([json.loads(line)["id"] for line in fr.readlines()])

with open(r"C:\vscode_project\latex_tokenizer\paper_body\paper_body_part1old.jsonl", "r") as fr:
    new_set = set([json.loads(line)["id"] for line in fr.readlines()])

print(new_set.difference(org_set))

set()


In [None]:
import json

with open(r"C:\vscode_project\latex_tokenizer\discrete_data\paper_body_part1old.jsonl", "r") as fr:
    data = [json.loads(line) for line in fr.readlines()]

now_id = data[0]["id"]
temp = []
file_index = 1
for item in data:
    if now_id == item["id"]:
        temp.append(item)
    else:
        if len(temp) > 20000:
            with open(r"C:\vscode_project\latex_tokenizer\paper_body\paper_body_part" + str(file_index) + ".jsonl", "a") as fw:
                for t in temp:
                    fw.write(json.dumps(t))
                    fw.write("\n")
            temp = []
            file_index += 1
        now_id = item["id"]
        temp.append(item)

with open(r"C:\vscode_project\latex_tokenizer\paper_body\paper_body_part" + str(file_index) + ".jsonl", "a") as fw:
    for t in temp:
        fw.write(json.dumps(t))
        fw.write("\n")



FileNotFoundError: [Errno 2] No such file or directory: 'C:\\vscode_project\\latex_tokenizer\\paper_body\\paper_body_part1old.jsonl'

In [17]:
from sentence_transformers import SentenceTransformer
from transformers import AutoModel, AutoTokenizer
import json
ss = "\\frac{a}{b} + \\sqrt{x}"
sy = "\\frac{x}{y} = c"         
f1 = "\\frac{a}{b} + \\sqrt{x}"
f2 = "\\sqrt{y} + \\frac{1}{x}"                                                                                    

model = SentenceTransformer("sentence-transformers/allenai-specter")
org_model = SentenceTransformer("sentence-transformers/all-distilroberta-v1")

# paragraphs = [
#     "Deep learning has revolutionized NLP in recent years.",
#     "Transformers are the current state-of-the-art in many NLP tasks.",
#     "This paper proposes a novel architecture that builds on BERT.",
#     "I'm looking for a paper about Newton's Third law",
# ]
with open(r"C:\vscode_project\latex_tokenizer\discrete_data\all_paper_info.jsonl", "r") as fr:
    data = [json.loads(line) for line in fr.readlines()]

paragraphs = []
for i in range(5):
    paragraphs.append(data[i]["abstract"])
org_embeddings = org_model.encode(paragraphs)
print(org_model.similarity(org_embeddings, org_embeddings))
embeddings = model.encode(paragraphs)
print(model.similarity(embeddings, embeddings))

tensor([[ 1.0000,  0.0672,  0.0900,  0.1508,  0.1351],
        [ 0.0672,  1.0000, -0.0577,  0.1068, -0.0119],
        [ 0.0900, -0.0577,  1.0000, -0.0062,  0.1532],
        [ 0.1508,  0.1068, -0.0062,  1.0000,  0.0119],
        [ 0.1351, -0.0119,  0.1532,  0.0119,  1.0000]])
tensor([[1.0000, 0.5937, 0.4557, 0.6475, 0.6362],
        [0.5937, 1.0000, 0.5404, 0.7036, 0.6582],
        [0.4557, 0.5404, 1.0000, 0.5765, 0.6901],
        [0.6475, 0.7036, 0.5765, 1.0000, 0.7137],
        [0.6362, 0.6582, 0.6901, 0.7137, 1.0000]])
