In [4]:
"""
This code processes text files containing academic papers:
1. Reads multiple txt files from a specified directory
2. Removes unwanted content like page numbers and leading numbers
3. Detects and prints similar/duplicate lines within each file using difflib
4. Handles text cleaning and preprocessing
"""

import difflib
import os

def read_file_to_list(file_path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Each line has its leading and trailing whitespace (including the
    newline) stripped; blank lines are kept as empty strings.
    """
    stripped_lines = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            stripped_lines.append(raw_line.strip())
    return stripped_lines

# Specify folder path - using relative path for portability
folder_path = "./data/mof_papers"

# Collect every .txt file below folder_path (recursively).
# os.walk yields directory entries in an arbitrary, filesystem-dependent
# order, so the paths are sorted: this keeps the "first 10 files" slice used
# further down reproducible between runs and machines.
txt_files = sorted(
    os.path.join(root, file)
    for root, dirs, files in os.walk(folder_path)
    for file in files
    if file.endswith(".txt")
)

# Content to be removed: boilerplate header/footer strings that appear
# verbatim inside the extracted paper text.
unwanted_content = [
    'Z. Anorg. Allg. Chem. 0000, 0–0 © 0000 WILEY-VCH Verlag GmbH & Co.',
    'Journal of Inorganic and General Chemistry Zeitschrift für anorganische und allgemeine Chemie www.zaac.wiley-vch.de ARTICLE',
    'I. Gil de Muro, M. Insausti, L. Lezama, J. L. Pizarro, M. I. Arriortua, T. Rojo FULL PAPER'
]

def remove_unwanted_content(unwanted_content, lines):
    """Strip known boilerplate fragments out of every line.

    Args:
        unwanted_content: iterable of substrings to delete.
        lines: list of text lines.

    Returns:
        A new list with one entry per input line, where every occurrence of
        each unwanted substring has been removed and the remainder has been
        whitespace-stripped.
    """
    def _scrub(text):
        # Fragments are removed one after another, in the order given.
        for fragment in unwanted_content:
            text = text.replace(fragment, "")
        return text.strip()

    return [_scrub(text) for text in lines]

def find_similar_lines(lines, threshold=0.9):
    """Print and return pairs of lines whose leading sentences are near-duplicates.

    The "leading sentence" of a line is the text before its first '.',
    whitespace-stripped. For each line, the first later line whose leading
    sentence has a difflib similarity ratio above ``threshold`` is reported
    together with it.

    Args:
        lines: list of text lines.
        threshold: similarity ratio that must be exceeded for two leading
            sentences to count as similar (default 0.9).

    Returns:
        A list of ``[line, similar_line]`` pairs, in the order they were
        printed. (Previously the accumulator was cleared after printing, so
        the function always returned an empty list.)
    """
    # Compute each leading sentence once instead of once per comparison.
    sentences = [text.split(".")[0].strip() for text in lines]
    similar_pairs = []

    for i in range(len(lines) - 1):
        for j in range(i + 1, len(lines)):
            ratio = difflib.SequenceMatcher(None, sentences[i], sentences[j]).ratio()
            if ratio > threshold:
                pair = [lines[i], lines[j]]
                print(pair)
                similar_pairs.append(pair)
                # Only the first match per line is reported, so the remaining
                # comparisons for this line would be wasted work.
                break

    return similar_pairs

# Process first 10 files: read, drop page markers and leading numbers,
# then report near-duplicate lines.
# NOTE: remove_page_numbers and remove_leading_numbers are not defined in this
# cell; they must already exist in the kernel when this loop runs.
for file_path in txt_files[:10]:
    lines = remove_leading_numbers(remove_page_numbers(read_file_to_list(file_path)))
    similar_lines = find_similar_lines(lines)

['This is a test sentence.', 'This is a test sentence.']
['Eur. J. Inorg. Chem. 1999,9352943 \uf6d9 WILEY-VCH Verlag GmbH, D-69451 Weinheim, 1999 143421948/99/060620935 $ 17.501.50/0 935', 'Eur. J. Inorg. Chem. 1999, 9352943938']
['I. Gil de Muro, M. Insausti, L. Lezama, J. L. Pizarro, M. I. Arriortua, T. Rojo FULL PAPER Table 2. Selected bond lengths [A ˚ ] and angles [°] for the Results and Discussion [SrCo(C 3 H 2 O 4 ) 2 (H 2 O) 5 ] ·2H 2 O compound; symmetry transformations used to generate equivalent atoms: (1): 2x 1 1, y 2 1/2, The synthesis of the complexes was performed as de2z 1 1/2; (2): x, 2y 1 1/2, z 1 1/2; (3): 2x, 2y, 2z 1 1; (4): scribed in the Experimental Section. Crystal structure 2x 1 1, 2y, 2z 1 1; (5): x, 2y 1 1/2, z 2 1/2; (6): 2x 1 1, y 1 1/2, 2z 1 1/2 analyses were performed for the [SrCo(C 3 H 2 O 4 ) 2 (H 2 O) 7 ] complex. Crystal data and details of the structure determiSr2O(8) 1 2.583(4) O(2)2C(1) 1.249(7) nation are summarized in Table 1. Bond lengths and 

In [9]:
"""
This script is designed to clean and preprocess academic paper text files. It includes functions to:
1. Remove references and page numbers
2. Remove leading numbers and unwanted content
3. Detect and remove repetitive sentences
4. Clean up text formatting

The script helps prepare text data for further analysis by removing common artifacts found in academic papers.
"""

import re

def read_file_to_list(file_path):
    """Return the lines of a UTF-8 text file, each stripped of surrounding whitespace."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return [raw_line.strip() for raw_line in handle]

def remove_references(lines):
    """Cut the text off at the detected start of the reference list.

    Detection runs in two stages:

    1. Primary: the first line mentioning "Reference"/"REFERENCE"/"reference"
       that is followed by evidence of a list - either '[1]' on that line or
       the next one, or '1.' on that line with '2.' and '3.' on the two lines
       after it.
    2. Fallback (only if stage 1 found nothing; 'No reference section found'
       is printed before it runs): the last line matching a bracketed number
       or a "References"/"REFERENCES" heading is used if '[2]' and '[3]' both
       occur on it or the line after. If '[2]' is absent there, the last line
       starting with "<digits>." is used instead when '2.' and '3.' both
       occur on it or the line after.

    Returns:
        ``lines`` truncated just before the detected line, or ``lines``
        itself when nothing was detected.
    """
    bracket_or_heading = re.compile(r'\[\d+\]|\bReferences\b|\bREFERENCES\b')
    leading_number = re.compile(r'^\d+\.')
    total = len(lines)

    def _here_or_next(marker, idx):
        # True when `marker` occurs on line idx or on the line right after it.
        return marker in lines[idx] or (idx + 1 < total and marker in lines[idx + 1])

    cut_at = None

    # Stage 1: a line that names the reference section and starts a list.
    for idx, text in enumerate(lines):
        if not any(word in text for word in ('Reference', 'REFERENCE', 'reference')):
            continue
        if _here_or_next('[1]', idx):
            cut_at = idx
            break
        if '1.' in text and idx + 2 < total and '2.' in lines[idx + 1] and '3.' in lines[idx + 2]:
            cut_at = idx
            break

    # Stage 2: fallback heuristics based on the last citation-looking line.
    if cut_at is None:
        print('No reference section found')
        candidates = [idx for idx, text in enumerate(lines) if bracket_or_heading.search(text)]
        if candidates:
            last_candidate = candidates[-1]
            if _here_or_next('[2]', last_candidate):
                if _here_or_next('[3]', last_candidate):
                    cut_at = last_candidate
            else:
                numbered = [idx for idx, text in enumerate(lines) if leading_number.search(text)]
                if numbered:
                    last_numbered = numbered[-1]
                    if _here_or_next('2.', last_numbered) and _here_or_next('3.', last_numbered):
                        cut_at = last_numbered

    if cut_at is None:
        return lines
    return lines[:cut_at]

def remove_page_numbers(lines):
    """Drop every line containing a page marker such as '-----page 3-----'.

    A marker is one or more dashes, the text 'page ', one or more digits and
    one or more dashes; it may appear anywhere in the line.
    """
    page_marker = re.compile(r'-+page \d+-+')
    kept = []
    for text in lines:
        if page_marker.search(text) is None:
            kept.append(text)
    return kept

def remove_leading_numbers(lines):
    """Remove a run of digits (and any whitespace after it) from the start of each line."""
    leading_digits = re.compile(r'^\d+\s*')
    cleaned = []
    for text in lines:
        cleaned.append(leading_digits.sub('', text))
    return cleaned

def remove_repetitive_sentences(lines, threshold=0.8):
    """Remove leading sentences that repeat (almost) verbatim across lines.

    The "leading sentence" of a line is the text before its first '.',
    whitespace-stripped. Whenever the leading sentences of two lines have a
    difflib similarity ratio above ``threshold``, that leading sentence is
    removed from both lines (the '.' that followed it is left in place).

    Uses ``difflib``, which this cell does not import itself; it must already
    be imported in the notebook session.

    Args:
        lines: list of text lines. The list is not modified.
        threshold: similarity ratio that must be exceeded for two leading
            sentences to count as repeats (default 0.8).

    Returns:
        ``(cleaned_lines, example_removed)`` where ``cleaned_lines`` is a new
        list and ``example_removed`` is the first line that had a sentence
        removed, or ``None`` if nothing was removed.
    """
    # Work on a copy so the caller's list is left untouched.
    cleaned = list(lines)
    example_removed = None

    for i in range(len(cleaned) - 1):
        line = cleaned[i]
        sentence = line.split(".")[0].strip()
        if not sentence:
            # Nothing to remove; blank sentences would otherwise "match" each
            # other with ratio 1.0 and cost a quadratic number of comparisons.
            continue

        for j in range(i + 1, len(cleaned)):
            next_line = cleaned[j]
            next_sentence = next_line.split(".")[0].strip()
            if not next_sentence:
                continue

            similarity_ratio = difflib.SequenceMatcher(None, sentence, next_sentence).ratio()

            if similarity_ratio > threshold:
                if not example_removed:
                    example_removed = line
                # count=1: delete only the leading sentence itself. Without it,
                # every later occurrence of the same text in the line is also
                # deleted (a one-letter sentence such as "I" would strip every
                # "I" from the line).
                cleaned[i] = line.replace(sentence, "", 1).strip()
                cleaned[j] = next_line.replace(next_sentence, "", 1).strip()

    return cleaned, example_removed

# Process the file
input_file = 'input/papers/sample.txt'  # Generic file path
output_file = 'output/cleaned_text.txt'
lines = read_file_to_list(input_file)

# Cleaning process
lines = remove_references(lines)
lines = remove_page_numbers(lines)
lines = remove_leading_numbers(lines)
lines, example_removed = remove_repetitive_sentences(lines)

# Save results.
# The output directory is created first: open(..., 'w') does not create
# missing parent directories and would fail after all the cleaning work.
import os
os.makedirs(os.path.dirname(output_file), exist_ok=True)
with open(output_file, 'w', encoding='utf-8') as file:
    file.writelines(line + '\n' for line in lines)

In [4]:
"""
This script processes text files from academic papers for further analysis. It includes functions to:
1. Read text files and split them into manageable chunks
2. Create sliding windows of text with controlled token counts and overlaps
3. Count tokens using spaCy NLP
4. Process large documents while maintaining context between chunks

The sliding window approach ensures that text chunks are neither too large nor too small,
with configurable overlap to maintain context across boundaries.
"""

import spacy
nlp = spacy.load("en_core_web_sm")

def read_file_to_list(file_path):
    """Load a UTF-8 text file as a list of whitespace-stripped lines (blank lines kept)."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return list(map(str.strip, handle))

def count_tokens(text):
    """Return the number of spaCy tokens in ``text``.

    Only the tokenizer is run (``nlp.make_doc``) instead of the full
    pipeline: the token count is fixed by tokenization, and this function is
    called once per line (and more) by the windowing code, so skipping the
    remaining pipeline components avoids a large amount of unused work.

    NOTE(review): assumes the loaded pipeline has no component that merges or
    splits tokens after tokenization - confirm if custom components are added.
    """
    return len(nlp.make_doc(text))

def create_sliding_windows(lines, max_tokens_per_window, overlap_range, token_counter=None):
    """Group lines into consecutive windows that share trailing context.

    Lines are appended to the current window until adding the next line would
    push it past ``max_tokens_per_window``. The window is then emitted and the
    next one is seeded with the "overlap": the most recent lines of the
    window that was just closed.

    Args:
        lines: list of text lines, in document order.
        max_tokens_per_window: token budget of a window. A window can still
            exceed it when a single line, or the carried-over overlap plus one
            line, is larger than the budget.
        overlap_range: ``(min_tokens, max_tokens)`` target size of the
            overlap. Oldest lines are dropped while the overlap is above the
            maximum, but not if that would take it below the minimum. A line
            that alone exceeds the maximum is never carried over.
        token_counter: optional callable ``str -> int``. Defaults to the
            module-level ``count_tokens``.

    Returns:
        A list of windows, each a non-empty list of lines.
    """
    if token_counter is None:
        token_counter = count_tokens
    min_overlap, max_overlap = overlap_range

    windows = []
    current_window = []
    current_tokens = 0
    # (line, token_count) pairs; counts are cached so trimming the overlap
    # never has to re-tokenize a line.
    overlap = []
    overlap_tokens = 0

    for line in lines:
        line_tokens = token_counter(line)

        # Close the current window when the next line does not fit. An empty
        # window is never emitted (this used to happen when the very first
        # line alone exceeded the budget).
        if current_window and current_tokens + line_tokens > max_tokens_per_window:
            windows.append(current_window)
            current_window = [text for text, _ in overlap]
            current_tokens = overlap_tokens
            overlap = []
            overlap_tokens = 0

        # Add current line to window
        current_window.append(line)
        current_tokens += line_tokens

        # Update overlap tracking
        overlap.append((line, line_tokens))
        overlap_tokens += line_tokens

        # Trim the overlap from the oldest end. The minimum is honoured by
        # refusing to trim below it - lines are never duplicated to pad the
        # overlap, which previously leaked repeated lines into the next window.
        while overlap:
            oldest_tokens = overlap[0][1]
            oversized = oldest_tokens > max_overlap
            can_shrink = (overlap_tokens > max_overlap
                          and overlap_tokens - oldest_tokens >= min_overlap)
            if not (oversized or can_shrink):
                break
            overlap.pop(0)
            overlap_tokens -= oldest_tokens

    # Add final window
    if current_window:
        windows.append(current_window)

    return windows

# Example usage: window configuration
max_tokens_per_window = 7000
overlap_range = (400, 800)

# Load the paper and split it into overlapping windows
file_path = "data/papers/example_paper.txt"  # Generic file path
lines = read_file_to_list(file_path)
windows = create_sliding_windows(lines, max_tokens_per_window, overlap_range)

# Print each window followed by its total token count
for i, window in enumerate(windows):
    tokens_in_window = sum(map(count_tokens, window))
    print(f"Window {i+1}:")
    print(window)
    print(f"Tokens in window: {tokens_in_window}")
    print('---')

Window 1:
['', '---------------page 0---------------', 'Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen stor

In [5]:
windows

[['',
  '---------------page 0---------------',
  'Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage in covalent organic frameworks Multi-scale theoretical investigation of hydrogen storage i