In [1]:
!nvidia-smi

Sun Mar  9 22:34:18 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       On  |   00000000:00:04.0 Off |                    0 |
| N/A   59C    P8             11W /   70W |       3MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [5]:
import os
import re
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [6]:
# API server address
API_URL = "http://localhost:11434/api/generate"

In [7]:
import chardet

def extract_pre_abstract_content(tex_file_path):
    """
    Parses a .tex file:
    1. Removes LaTeX comments (everything after '%').
    2. Prioritizes extracting content from institution-related tags.
    3. If no institution tags are found, extracts content before the abstract.
    4. If there is no abstract, returns the first one-third of the document.
    """
    try:
        with open(tex_file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except UnicodeDecodeError:
        with open(tex_file_path, "rb") as f:
            raw_data = f.read(10000)  
            result = chardet.detect(raw_data)
            detected_encoding = result["encoding"]
        
        print(f"Detected encoding: {detected_encoding} for file {tex_file_path}")
        try:
            with open(tex_file_path, "r", encoding=detected_encoding, errors="replace") as f:
                content = f.read()
        except Exception as e:
            print(f"Failed to read {tex_file_path} with detected encoding {detected_encoding}: {e}")
            return ""  

    #  Remove LaTeX comments
    content = re.sub(r"(?<!\\)%.*", "", content)

    institution_patterns = [
        r"\\affiliation{([^}]*)}",
        r"\\institute{([^}]*)}",
        r"\\address{([^}]*)}",
        r"\\inst{([^}]*)}",
        r"\\author\s*{[^}]+}{([^}]+)}"
    ]

    extracted_institutions = []
    for pattern in institution_patterns:
        matches = re.findall(pattern, content)
        if matches:
            extracted_institutions.extend(matches)

    if extracted_institutions:
        return "\n".join(set(extracted_institutions)).strip()

    # Extract the content before abstract
    match = re.split(r"\\begin\s*{\s*abstract\s*}|\\s*\\section\s*{\s*Abstract\s*}", content, maxsplit=1, flags=re.IGNORECASE)

    if len(match) > 1:
        return match[0].strip()

    # Return 1/3 content
    content_length = len(content)
    if content_length > 0:
        one_third_length = content_length // 3
        return content[:one_third_length].strip()

    return ""


In [111]:
match_txt = extract_pre_abstract_content('2311_tex/2311.05478/Submission4-v3.tex')
print(match_txt)

\documentclass[conference,9pt]{IEEEtran}
\IEEEoverridecommandlockouts

\usepackage{cite}
\usepackage{amsmath,amssymb,amsfonts}
\usepackage{algorithmic}
\usepackage{graphicx}
\usepackage{textcomp}
\usepackage{xcolor}
\usepackage{algorithm}
\usepackage{booktabs}
\usepackage{multirow}
\usepackage{array}
\usepackage{colortbl}
\usepackage{makecell}
\usepackage{pifont}
\usepackage{makecell}
\usepackage{xcolor,colortbl}

\def\BibTeX{{\rm B\kern-.05em{\sc i\kern-.025em b}\kern-.08em
    T\kern-.1667em\lower.7ex\hbox{E}\kern-.125emX}}


    \colorlet{mygreen}{green!60!gray}

    \newif\ifcomments


\commentstrue


\ifcomments

    \newcommand{\BTcomm}[1]{\textcolor{mygreen}{{#1}}}
    \newcommand{\MB}[1]{\textcolor{magenta}{{MB: #1}}}
    \newcommand{\fjwresp}[1]{\textcolor{blue}{#1}}
    \newcommand{\fjwrespv}[1]{\textcolor{red}{#1}}

\else
    \newcommand{\BTcomm}[1]{}
    \newcommand{\MB}[1]{}
    \newcommand{\fjwresp}[1]{}
    \newcommand{\fjwrespv}[1]{}

\fi







\begin{document}

\title

### ATTENTION: FUNCTION preprocess_tex_content DO NOT USE ANY MORE

In [6]:
def preprocess_tex_content(content):
    """
    Preprocesses LaTeX text by removing elements that may mislead an LLM, 
    such as section titles, headings, and figure captions.
    """
    # Remove \title{} to prevent misinterpretation
    content = re.sub(r"\\title{.*?}", "", content, flags=re.DOTALL)
    
    # Remove \section{} and \subsection{} to avoid incorrect classification
    content = re.sub(r"\\(sub)*section{.*?}", "", content, flags=re.DOTALL)
    
    # Remove \caption{} to prevent confusion with institution names
    content = re.sub(r"\\caption{.*?}", "", content, flags=re.DOTALL)
    
    # Remove LaTeX comments
    content = re.sub(r"%.*", "", content)

    return content.strip()


In [109]:
txt = preprocess_tex_content(match_txt)
print(txt)

\usepackage[utf8]{inputenc}
\usepackage{latexsym}
\usepackage{amscd, amsfonts,
  eucal, mathrsfs, amsmath, amssymb, amsthm}
\input xy
\xyoption{all}
\def\li{\baselineskip}



\usepackage{fullpage}
\usepackage{tikz-cd}
\usepackage{extarrows}
\tikzset{
  labl/.style={anchor=south, rotate=90, inner sep=.5mm}
}
\tikzset{
  labr/.style={anchor=north, rotate=90, inner sep=1mm}
}

\usepackage{mathtools}
\usepackage{bm}

\usepackage{array}
\setlength{\extrarowheight}{4pt}

\setlength{\multlinegap}{50pt}



\usepackage{color}


\usepackage[cal=boondoxo, scr=boondoxo]{mathalfa}

\usepackage[sort,nocompress]{cite}




\usepackage{hyperref}




\frenchspacing

\newcommand{\perf}{\text{\rm perf}}

\DeclareMathOperator{\BaseLocus}{Bs}
\DeclareMathOperator{\End}{End}

\newcommand{\Comp}{\operatorname{\text{\rm Comp}}}
\newcommand{\ConnComp}{\operatorname{\text{\rm ConnComp}}}


\newcommand{\pic}[0]{\operatorname{Pic}}
\newcommand{\chow}[0]{\operatorname{Chow}}
\newcommand{\tsum}[0]{\textstyle{\sum}}


In [129]:
import re

def clean_deepseek_response(response_text):
    # print("🔥 Original LLM Response:")
    # print(response_text)
    # print("=" * 50)
    
    # 1) Remove the <think>...</think> block
    cleaned_text = re.sub(r"<think>.*?</think>", "", response_text, flags=re.DOTALL).strip()
    
    # 2) Split by lines and remove empty lines
    lines = cleaned_text.splitlines()
    lines = [line.strip() for line in lines if line.strip()]
    
    # 3) (Optional) Remove numbering at the beginning such as "1. "
    #    If you want to keep the original lines, skip this step
    institution_lines = [re.sub(r"^\d+\.\s*", "", line) for line in lines]
    
    # 4) If there are no lines left, return "null"
    extracted_institutions = "\n".join(institution_lines) if institution_lines else "null"
    
    print("✅ Extracted Institution Names:")
    print(extracted_institutions)
    print("=" * 50)
    
    return extracted_institutions


### THIS PROMPT IS FOR THE DEEPSEEK 7b

In [9]:
PROMPT_TEMPLATE = (
    "Extract the academic institutions that the authors of the following LaTeX document are affiliated with.\n\n"
    "### STRICT OUTPUT REQUIREMENTS:\n"
    "1. Extract ONLY the academic institution names associated with the authors, with no additional text.\n"
    "2. Ignore research collaborations, projects, or experiments. \n"
    "3. Format: Each academic institution must be numbered on a new line, exactly as follows:\n"
    "   1. Institution Name 1\n"
    "   2. Institution Name 2\n"
    "4. If NO academic institutions can be found, return exactly:\n"
    "   null\n"
    "5. DO NOT include explanations, descriptions, or any other text—ONLY the numbered list or 'null'.\n\n"
    "### INPUT TEXT:\n"
    "{input_text}\n\n"
)


### THIS PROMPT IS FOR THE DEEPSEEK 1.5B

In [139]:
PROMPT_TEMPLATE = (
    "### TASK:\n"
    "Extract the name of the institution (university or institute, etc.) to which the author belongs in the given article.\n"
    "If no institution is found, output exactly `null`.\n\n"
    "There is no need to consider the correspondence between the author and the institution, only the name of the institution needs to be extracted."

    "### RULES:\n"
    "1. Do not include author names, emails, or any other text in your final answer.\n"
    "2. Output each institution on a new line, numbered sequentially."
    "3. The only two possibilities for the output are null or the name of the organization."

    "### Article :\n"
    "{input_text}\n\n"
     "Once you find all the author and their institutions, just stop thinking and output the institutino. Do Not think too much"
)


In [140]:
import requests
import time
def query_deepseek_api(input_text):
    """
    Sends a request to the DeepSeek API to extract potential institution names.
    """
    # input_text = preprocess_tex_content(input_text)
    prompt = PROMPT_TEMPLATE.format(input_text=input_text)
    start_time = time.time()  

    payload = {
        "model": "deepseek-r1:7b",
        "prompt": prompt,
        "stream": False,  # Retrieve the full response
        "temperature": 0.1,
        "top_k": 40,
        "top_p": 1.0,
        "repeat_penalty": 1.1,
        "seed": 42,
    }
    
    response = requests.post(API_URL, json=payload)
    end_time = time.time()
    
    timecost = end_time - start_time

    if response.status_code == 200:
        data = response.json()
        clean_response = clean_deepseek_response(data.get("response", ""))
        print(f"Execution time: {timecost:.4f} seconds")
        return clean_response
    else:
        print("API request failed:", response.status_code, response.text)
        return None


In [141]:
query_deepseek_api(match_txt)

✅ Extracted Institution Names:
Jinan University
University of Siena
Execution time: 6.8690 seconds


'Jinan University\nUniversity of Siena'

In [94]:
# import os

# def process_tex_files(root_dir, max_files=5):
#     """
#     Processes multiple .tex files and store the results up to max_files directories).
#     """
#     results = []
#     counter = 0  # Counter to limit processing to max_files directories

#     for folder in os.listdir(root_dir):
#         folder_path = os.path.join(root_dir, folder)
#         if not os.path.isdir(folder_path):
#             continue  # Skip non-directory entries

#         tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]

#         if not tex_files:
#             continue  # Skip directories without .tex files

#         # Limit processing to max_files directories
#         if counter >= max_files:
#             break

#         counter += 1

#         # Select the .tex file(s) to process
#         if "main.tex" in tex_files:
#             tex_files = ["main.tex"]  # Prioritize processing main.tex
#         else:
#             tex_files = tex_files  # Process all .tex files

#         found_institutions = False  # Flag to track if any institution names were found

#         for tex_file in tex_files:
#             tex_path = os.path.join(folder_path, tex_file)
#             print(f"Processing: {tex_path}")

#             # Extract relevant content from the .tex file
#             extracted_text = extract_pre_abstract_content(tex_path)
#             if not extracted_text:
#                 continue

#             # Query the DeepSeek API
#             institutions = query_deepseek_api(extracted_text)

#             # Split response into lines and remove empty entries
#             if institutions and institutions.strip().lower() != "null":
#                 for institution in institutions.split("\n"):
#                     clean_name = institution.strip()
#                     if clean_name:  # Ensure only non-empty names are stored
#                         results.append((folder, clean_name))
#                         found_institutions = True  # Mark that institutions were found
                        
#         # If no institution names were found in any .tex file, store "null"
#         if not found_institutions:
#             results.append((folder, "null"))
#         print(results)

#     return results


In [149]:
def process_single_folder(folder_path, folder):
    """
    step：
    1. If there is a main.tex file in the folder, only process main.tex.
    2. If there is no main.tex, then as soon as an institution name is successfully extracted,
       stop processing subsequent .tex files.
    """
    results = []
    tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]
    if "main.tex" in tex_files:
        tex_files = ["main.tex"]

    found_institutions = False

    for tex_file in tex_files:
        if found_institutions:
            break

        tex_path = os.path.join(folder_path, tex_file)
        print(f"[Thread] Processing: {tex_path}")
        
        extracted_text = extract_pre_abstract_content(tex_path)
        if not extracted_text:
            continue

        institutions = query_deepseek_api(extracted_text)
        if institutions and institutions.strip().lower() != "null":
            for institution in institutions.split("\n"):
                clean_name = institution.strip()
                if clean_name:
                    results.append((folder, clean_name))
                    found_institutions = True
            # Once the first file with an institution is found, break immediately
            break

    if not found_institutions:
        results.append((folder, "null"))

    return results


In [153]:
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

def process_tex_files(root_dir, max_files=5, max_workers=4):

    start_time = time.time() 

    # Identify folders that contain .tex files
    valid_folders = []
    for folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, folder)
        if os.path.isdir(folder_path):
            tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]
            if tex_files:
                valid_folders.append(folder)

    # Limit the maximum number of folders to process
    valid_folders = valid_folders[:max_files]

    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_folder = {
            executor.submit(process_single_folder, os.path.join(root_dir, folder), folder): folder
            for folder in valid_folders
        }

        for future in as_completed(future_to_folder):
            folder = future_to_folder[future]
            try:
                folder_results = future.result()
                results.extend(folder_results)
            except Exception as e:
                print(f"Error processing folder '{folder}': {e}")

    total_time = time.time() - start_time
    print(f"🤔🤔Total processing time: {total_time:.2f} seconds")

    return results


In [154]:
def save_to_excel(results, output_file):
    """
    Save the results to an Excel file.
    """
    df = pd.DataFrame(results, columns=["File ID", "Institution"])
    df.to_excel(output_file, index=False)
    print(f"Results have been saved to {output_file}")


In [152]:
# def save_to_xml(results, output_file):
#     """ XML """
#     root = ET.Element("Institutions")

#     for file_id, institution in results:
#         # make sure institution is not empty
#         if not institution.strip():
#             continue

#         entry = ET.SubElement(root, "Entry")
#         ET.SubElement(entry, "FileID").text = file_id
#         ET.SubElement(entry, "Institution").text = institution

#     tree = ET.ElementTree(root)
#     tree.write(output_file, encoding="utf-8", xml_declaration=True)
#     print(f"The result has been saved to {output_file}")


In [157]:


root_directory = "2311_tex"
output_excel = "affiliations_test.xlsx"


results = process_tex_files(root_directory, max_files=4, max_workers=4)
save_to_excel(results, output_excel)


[Thread] Processing: 2311_tex/2311.06812/final.tex
[Thread] Processing: 2311_tex/2311.14992/final_version.tex
[Thread] Processing: 2311_tex/2311.12617/main.tex
[Thread] Processing: 2311_tex/2311.17452/preamble.tex
Detected encoding: GB2312 for file 2311_tex/2311.14992/final_version.tex
✅ Extracted Institution Names:
null
Execution time: 21.2317 seconds
✅ Extracted Institution Names:
null
Execution time: 33.0065 seconds
[Thread] Processing: 2311_tex/2311.17452/abel_nat.tex
✅ Extracted Institution Names:
Shenzhen Future Network of Intelligence Institute (FNii-Shenzhen)
School of Science and Engineering (SSE), The Chinese University of Hong Kong, China
Shenzhen International Graduate School, Tsinghua University, China
Simon Fraser University
Guangdong Provincial Key Laboratory of Future Networks of Intelligence
Execution time: 37.7870 seconds
✅ Extracted Institution Names:
College of New Energy, China University of Petroleum (East China)
College of New Energy, China University of Petroleu

In [158]:
import pandas as pd

df = pd.read_excel("affiliations_test.xlsx")
df

Unnamed: 0,File ID,Institution
0,2311.12617,
1,2311.06812,Shenzhen Future Network of Intelligence Instit...
2,2311.06812,"School of Science and Engineering (SSE), The C..."
3,2311.06812,"Shenzhen International Graduate School, Tsingh..."
4,2311.06812,Simon Fraser University
5,2311.06812,Guangdong Provincial Key Laboratory of Future ...
6,2311.14992,"College of New Energy, China University of Pet..."
7,2311.14992,"College of New Energy, China University of Pet..."
8,2311.14992,"College of New Energy, China University of Pet..."
9,2311.14992,Department of Electronic and Computer Engineer...


In [161]:
match_txt = extract_pre_abstract_content('2311_tex/2311.12617/main.tex')
print(match_txt)

$^{1


In [8]:
match_txt = extract_pre_abstract_content('2311_tex/2311.17452/abel_nat.tex')
print(match_txt)

\documentclass[12pt, a4paper]{amsart}
\input{preamble.tex}


\author{Yuya Sasaki}
\title{Nonnatural automorphisms of the Hilbert scheme of two points of
some simple abelian variety}

\begin{document}
