In [14]:
import os
import re
import requests
import pandas as pd
import xml.etree.ElementTree as ET
import time
from concurrent.futures import ThreadPoolExecutor, as_completed

In [15]:
import chardet
from tqdm import tqdm

def extract_pre_abstract_content(tex_file_path):
    """
    Parses a .tex file:
    - Removes LaTeX comments
    - Extracts institution names
    - Extracts text before the abstract
    """
    try:
        with open(tex_file_path, "r", encoding="utf-8") as f:
            content = f.read()
    except UnicodeDecodeError:
        with open(tex_file_path, "rb") as f:
            raw_data = f.read(10000)  
            result = chardet.detect(raw_data)
            detected_encoding = result["encoding"]
        try:
            with open(tex_file_path, "r", encoding=detected_encoding, errors="replace") as f:
                content = f.read()
        except Exception as e:
            tqdm.write(f"❌ Failed to read {tex_file_path} with detected encoding {detected_encoding}: {e}")
            return ""  

    # Remove LaTeX comments
    content = re.sub(r"(?<!\\)%.*", "", content)

    # Extract institution names
    institution_patterns = [
        r"\\affiliation{([^}]*)}",
        r"\\institute{([^}]*)}",
        r"\\address{([^}]*)}",
        r"\\inst{([^}]*)}",
        r"\\author\s*{[^}]+}{([^}]+)}"
    ]
    

    extracted_institutions = []
    for pattern in institution_patterns:
        matches = re.findall(pattern, content)
        if matches:
            extracted_institutions.extend(matches)

    if extracted_institutions:
        return "\n".join(set(extracted_institutions)).strip()

    # Extract text before the abstract
    match = re.split(r"\\begin\s*{\s*abstract\s*}|\\s*\\section\s*{\s*Abstract\s*}", content, maxsplit=1, flags=re.IGNORECASE)
    if len(match) > 1:
        return match[0].strip()

    # Return the first third of the document as a fallback
    content_length = len(content)
    if content_length > 0:
        one_third_length = content_length // 3
        return content[:one_third_length].strip()

    return ""

In [32]:
import os

def list_files(directory="."):
    return [f for f in os.listdir(directory) if os.path.isfile(os.path.join(directory, f))]

files = list_files("2311_tex/2311.12617")
print(files)


['main.tex']


In [33]:
match_txt = extract_pre_abstract_content('2311_tex/2311.12617/main.tex')
print(match_txt)

$^{1


In [17]:
PROMPT_TEMPLATE = (
    "Extract the institutions that the authors of the following LaTeX document are affiliated with.\n\n"
    "### STRICT OUTPUT REQUIREMENTS:\n"
    "1. Extract ONLY the institution names associated with the authors, with no additional text.\n"
    "2. Ignore research collaborations, projects, or experiments. \n"
    "3. Format: Each academic institution must be numbered on a new line, exactly as follows:\n"
    "   1. Institution Name 1\n"
    "   2. Institution Name 2\n"
    "4. If NO institutions can be found, return exactly:\n"
    "   null\n"
    "5. DO NOT include explanations, descriptions, or any other text—ONLY the numbered list or 'null'.\n\n"
    "### INPUT TEXT:\n"
    "{input_text}\n\n"
)


In [18]:
import time
import vertexai
from vertexai.generative_models import GenerativeModel

PROJECT_ID = "arxiv-development" 
vertexai.init(project=PROJECT_ID, location="us-central1")

model = GenerativeModel("gemini-1.5-flash-002")

def query_gemini_api(input_text):
    """
    Sends a request to the Gemini API to extract potential institution names.
    """
    prompt = PROMPT_TEMPLATE.format(input_text=input_text)
    
    start_time = time.time()  
    response = model.generate_content(prompt)  
    end_time = time.time()  
    
    timecost = end_time - start_time 

    if response and response.text:
        clean_response = response.text  
        # print(f"Execution time: {timecost:.4f} seconds")
        return clean_response
    else:
        print("API request failed or empty response")
        return None


In [11]:
query_gemini_api(match_txt)

NameError: name 'match_txt' is not defined

In [12]:
import os
import vertexai
from vertexai.generative_models import GenerativeModel

PROJECT_ID = "arxiv-development"
vertexai.init(project=PROJECT_ID, location="us-central1")

model = GenerativeModel("gemini-1.5-flash-002")

def estimate_total_tokens(root_dir, max_files=500):
    """
     Estimates the total number of tokens for the max_files file
    """
    total_tokens = 0
    valid_folders = []
    
    for folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, folder)
        if os.path.isdir(folder_path):
            tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]
            if tex_files:
                valid_folders.append(folder)

    valid_folders = valid_folders[:max_files]

    for folder in valid_folders:
        folder_path = os.path.join(root_dir, folder)
        tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]

        if "main.tex" in tex_files:
            tex_files = ["main.tex"]

        for tex_file in tex_files:
            tex_path = os.path.join(folder_path, tex_file)
            extracted_text = extract_pre_abstract_content(tex_path)
            if not extracted_text:
                continue
            
            response = model.count_tokens(extracted_text)
            total_tokens += response.total_tokens

    print(f"Estimated Total Tokens for {max_files} folders: {total_tokens}")
    return total_tokens

root_directory = "2311_tex"
estimated_tokens = estimate_total_tokens(root_directory, max_files=1000)


Estimated Total Tokens for 1000 folders: 1876526


In [19]:
def process_single_folder(folder_path, folder, pbar, lock):
    results = []
    tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]
    if "main.tex" in tex_files:
        tex_files = ["main.tex"]

    found_institutions = False

    for tex_file in tex_files:
        if found_institutions:
            break  

        tex_path = os.path.join(folder_path, tex_file)
        
        extracted_text = extract_pre_abstract_content(tex_path)
        if not extracted_text:
            with lock:
                pbar.update(1)  
            continue

        institutions = query_gemini_api(extracted_text)
        if institutions and institutions.strip().lower() != "null":
            for institution in institutions.split("\n"):
                clean_name = institution.strip()
                if clean_name:
                    results.append((folder, clean_name))
                    found_institutions = True
            break  

        with lock:
            pbar.update(1) 

    if not found_institutions:
        results.append((folder, "null"))

    return results


In [20]:
import os
import time
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
import threading  

def process_tex_files(root_dir, max_files=500, max_workers=5):
    start_time = time.time()
    valid_folders = []

    for folder in os.listdir(root_dir):
        folder_path = os.path.join(root_dir, folder)
        if os.path.isdir(folder_path):
            tex_files = [f for f in os.listdir(folder_path) if f.endswith(".tex")]
            if tex_files:
                valid_folders.append(folder)  
    valid_folders = valid_folders[:max_files] 
    estimated_tex_files = len(valid_folders)  

    results = []
    lock = threading.Lock() 

    with ThreadPoolExecutor(max_workers=max_workers) as executor, tqdm(total=estimated_tex_files, desc="Processing .tex files") as pbar:
        future_to_folder = {
            executor.submit(process_single_folder, os.path.join(root_dir, folder), folder, pbar, lock): folder
            for folder in valid_folders
        }

        for future in as_completed(future_to_folder):
            folder = future_to_folder[future]
            try:
                folder_results = future.result()
                results.extend(folder_results)
            except Exception as e:
                print(f"❌ Error processing folder '{folder}': {e}")

    total_time = time.time() - start_time
    print(f"✅ Total processing time: {total_time:.2f} seconds")

    return results


In [21]:
def save_to_excel(results, output_file):
    """
    Save the results to an Excel file.
    """
    df = pd.DataFrame(results, columns=["File ID", "Institution"])
    df.to_excel(output_file, index=False)
    print(f"Results have been saved to {output_file}")


In [34]:
root_directory = "2311_tex"
output_excel = "affiliations_gemini_1000.xlsx"


results = process_tex_files(root_directory, max_files=1000, max_workers=20)
save_to_excel(results, output_excel)


Processing .tex files:  61%|██████▏   | 614/1000 [00:24<00:15, 25.47it/s]

✅ Total processing time: 24.35 seconds
Results have been saved to affiliations_gemini_1000.xlsx





In [24]:
import pandas as pd

df = pd.read_excel("affiliations_test.xlsx")
df

Unnamed: 0,File ID,Institution
0,2311.12617,
1,2311.06812,"1. The Chinese University of Hong Kong, Shenzhen"
2,2311.06812,2. Tsinghua University
3,2311.06812,3. Simon Fraser University
4,2311.14992,1. China University of Petroleum (East China)
...,...,...
1136,2311.09042,1. University of West Bohemia
1137,2311.09042,2. Kitasato University
1138,2311.09042,3. Ishinomaki Senshu University
1139,2311.09042,4. Yokohama National University


In [46]:
import multiprocessing
max_workers = min(32, multiprocessing.cpu_count() + 4)
print(max_workers)


8


In [24]:
import pandas as pd

def analyze_paper_extraction(file_path):
    df = pd.read_excel(file_path)
    num_papers = df["File ID"].nunique()
    num_nulls = df["Institution"].isna().sum()
    null_percentage = (num_nulls / len(df)) * 100
    print(f"Total Papers Extracted: {num_papers}")
    print(f"Total Null Entries: {num_nulls}")
    print(f"Null Percentage: {null_percentage:.2f}%")
    
    return num_papers, num_nulls, null_percentage

In [35]:
file_path = "affiliations_gemini_1000.xlsx"  
analyze_paper_extraction(file_path)

Total Papers Extracted: 1000
Total Null Entries: 129
Null Percentage: 5.17%


(1000, 129, 5.168269230769231)