# `erfindungen_extract_data.ipynb` 

This file extracts the data from the "Erfindungen" overview of patents until 1852 in the Austro-Hungarian empire. The .pdf already contains implicit text data; however, OpenAI Chat Completions cannot handle this very well. Hence, we opted to extract the text manually using the `tabula` software. 

See [here](https://tabula.technology/) for how to install tabula. It is a rule-based approach that relies on either row finding or column finding. The base file that I use was produced with column finding. This yields a really nice basic DataFrame, but it misses the last observations on the page. 

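Hence, I augment it with additional extractions that recover the missing observations (see "Appending the dataset with the missing elements" below). 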

The program `geeqie`

```
sudo apt-get install geeqie
```
helps to find the precise coordinates. See [here](https://askubuntu.com/questions/298877/which-image-viewer-is-able-to-show-coordinates) (View > Pixel Info). 


## Clean Data

- First, we clean the basic dataset of Column Headers and sort the data. 
- Then, we find out which numbers are missing from the list (set difference). We also manually append the last number in the document, 5833. 

In [125]:
from pathlib import Path
import pandas as pd
import numpy as np

data = (
    pd.read_csv("../../data/patent_data/interim_patent_data/tabula-erfindungsprivilegien_without_first_page_basic.csv")
    .iloc[:, 1:13]
)

# Drop the rows that merely repeat the column header inside the data.
# `.copy()` detaches the filtered frame from the original so that the column
# assignment below is not a chained assignment on a slice.
data = data[data['PrivRegNr'] != "PrivRegNr"].copy()
data['PrivRegNr'] = pd.to_numeric(data['PrivRegNr'])
data = data.sort_values('PrivRegNr')

In [126]:
# Registry numbers that made it into the base extraction.
present_elements = set(data['PrivRegNr'])

# Every number that should exist: 1..5815 plus the manually added last one.
all_elements = set(range(1, 5816)) | {5833}

# Numbers that are expected but absent from the base extraction.
missing_elements = all_elements - present_elements

# Appending the dataset with the missing elements 

Write a function that takes `missing_elements` as input, parses each `.csv`, merges (per column) the row of the missing number with the row below it, and then coalesces the 2 columns next to "Schlagworte" if one of those columns is empty. Finally, it extracts the rows (including column names) for the `missing_elements`. 


In [195]:
def list_files_in_directory(path):
    """
    Return the paths of all regular files directly inside `path`.

    Sub-directories are skipped and not descended into. The paths are
    returned as strings in sorted order, because `Path.iterdir()` yields
    entries in arbitrary order and callers build their output in the order
    given here.
    """
    return sorted(str(entry) for entry in Path(path).iterdir() if entry.is_file())
def coalesce_columns(df, col1, col2):
    """
    Return a single Series built from two columns of `df`.

    Values come from `col1`; wherever `col1` is missing, the value of
    `col2` in the same row is used instead. `df` itself is not modified.
    """
    preferred = df[col1]
    fallback = df[col2]
    return preferred.combine_first(fallback)

def merge_missing_rows(df):
    """
    Collapse the first two rows of `df` into a one-row DataFrame.

    Each value is taken from the first row; where the first row has a
    missing value, the value of the second row is used instead.
    """
    top_row, row_below = df.iloc[0], df.iloc[1]
    combined = top_row.combine_first(row_below)

    # Passing the Series inside a list yields a DataFrame with a single row.
    return pd.DataFrame([combined])

def is_column_empty(df, column_idx):
    """
    Check whether the column at position `column_idx` holds no data.

    A column counts as empty when every value is either NaN or the empty
    string. The check is read-only: `df` is not modified. (An in-place
    `replace` on an `iloc` selection is a chained assignment, which is not
    guaranteed to reach `df` and must not be relied upon.)
    """
    column = df.iloc[:, column_idx]
    return (column.isna() | column.eq("")).all()

def process_csv(file_path, missing_elements):
    """
    Collect the rows for `missing_elements` from every CSV in `file_path`.

    For each file, the first row whose 'PrivRegNr' is in `missing_elements`
    is taken together with the row directly below it (when there is one).
    The two rows are merged column-wise, the two columns to the right of the
    'Schlagworte' column are coalesced into 'Ort' when one of them is empty,
    and the resulting single row is collected. Files that contain none of
    the missing numbers contribute nothing.

    Parameters
    ----------
    file_path : directory holding the CSV files to scan.
    missing_elements : collection of 'PrivRegNr' values to look for.

    Returns
    -------
    A DataFrame with one row per file that contained a missing number
    (an empty DataFrame if no file did). Each file path is printed as it is
    processed.
    """
    collected = []
    for file in list_files_in_directory(file_path):
        print(file)
        # Load the CSV file; reset_index keeps the original row number as a column.
        df = pd.read_csv(file).reset_index()
        df['PrivRegNr'] = pd.to_numeric(df['PrivRegNr'])

        matches = df.index[df['PrivRegNr'].isin(missing_elements)]
        if matches.empty:
            # Nothing to recover from this file. Without this guard the whole
            # file would fall through and its first two rows would be merged
            # and appended as a bogus observation.
            continue

        # Keep the first matching row and, if it exists, the row below it.
        first_idx = matches[0]
        keep = [first_idx]
        if first_idx + 1 in df.index:
            keep.append(first_idx + 1)
        df = df.loc[keep].copy()

        # Merge the rows of the retained df
        if df.shape[0] > 1:
            df = merge_missing_rows(df)

        # Locate the 'Schlagworte' column in *this* file (not in the global
        # base dataset, whose column positions differ).
        if "Schlagworte" in df.columns:
            col_idx = df.columns.get_loc("Schlagworte")
        else:
            candidates = np.flatnonzero(df.columns.str.contains('Schlag', na=False))
            col_idx = candidates[0] if candidates.size else None

        # Coalesce the two columns next to 'Schlagworte' if one of them is
        # empty; skipped when those two columns do not exist.
        if col_idx is not None and col_idx + 2 < df.shape[1]:
            first_empty = is_column_empty(df, col_idx + 1)
            second_empty = is_column_empty(df, col_idx + 2)
            if first_empty or second_empty:
                df['Ort'] = coalesce_columns(df, df.columns[col_idx + 1], df.columns[col_idx + 2])

        collected.append(df)

    if not collected:
        return pd.DataFrame()
    # A single concat avoids repeatedly copying the growing result.
    return pd.concat(collected, ignore_index=True)

In [200]:
#data.loc[[1,2]]
# Recover the missing observations from the augment CSVs, then compare the
# shape of the result with the number of missing registry numbers.
missing_obs = process_csv(
    file_path="../../data/patent_data/interim_patent_data/augment",
    missing_elements=missing_elements,
)
print(missing_obs.shape, len(missing_elements), sep="\n")
#data[data.columns.str.contains("Schlagworte", case=False)]


../../data/patent_data/interim_patent_data/augment/tabula-erfindungsprivilegien_without_first_page-62.csv
Index(['index', 'PrivRegNr', 'Name1', 'Vorname1', 'Name2', 'Vorname2', 'Name3',
       'Vorname3', 'Titel', 'Schlagworte', 'Unnamed: 9', 'Ort', 'Jahr',
       'Einreichdatum'],
      dtype='object')
../../data/patent_data/interim_patent_data/augment/tabula-erfindungsprivilegien_without_first_page-99.csv
Index(['index', 'PrivRegNr', 'Name1', 'Vorname1', 'Name2', 'Vorname2', 'Name3',
       'Vorname3', 'Titel', 'Schlagworte', 'Unnamed: 9', 'Ort', 'Jahr',
       'Einreichdatum'],
      dtype='object')
../../data/patent_data/interim_patent_data/augment/tabula-erfindungsprivilegien_without_first_page-143.csv
Index(['index', 'PrivRegNr', 'Name1', 'Vorname1', 'Name2', 'Vorname2', 'Name3',
       'Vorname3', 'Titel', 'Schlagworte', 'Ort', 'Jahr', 'Einreichdatum'],
      dtype='object')
../../data/patent_data/interim_patent_data/augment/tabula-erfindungsprivilegien_without_first_page-178.cs

## Extract using `fitz`

Manually extract the already OCR'ed text using the `fitz` (PyMuPDF) library. 

In [7]:
import fitz  # PyMuPDF

def extract_text(pdf_file, export_file_name, x1, x2, y1, y2):
    """
    Extract the text inside one rectangle from every page of a PDF.

    Each page of `pdf_file` is cropped to the rectangle
    `fitz.Rect(x1, y1, x2, y2)` and its text is extracted; the texts of all
    pages are concatenated in page order.

    Parameters
    ----------
    pdf_file : path of the PDF to read.
    export_file_name : output path without extension; the text is written
        to `<export_file_name>.txt`.
    x1, x2 : horizontal boundaries of the cropping rectangle.
    y1, y2 : vertical boundaries of the cropping rectangle.

    Returns
    -------
    The concatenated text as a single string.
    """
    # The same rectangle applies to every page, so build it once.
    crop_rect = fitz.Rect(x1, y1, x2, y2)

    page_texts = []
    # The context manager closes the document even if a page fails.
    with fitz.open(pdf_file) as doc:
        for page in doc:
            # Crop the page, then extract the text of the cropped area.
            page.set_cropbox(crop_rect)
            page_texts.append(page.get_text("text"))
    extracted_text = "".join(page_texts)

    # Save the extracted text; explicit UTF-8 keeps non-ASCII characters
    # intact regardless of the platform's default encoding.
    with open(f"{export_file_name}.txt", "w", encoding="utf-8") as f:
        f.write(extracted_text)

    return extracted_text

In [10]:
# Cropping rectangle for the place-name column.
x1, x2 = 682, 725  # left / right boundary of the cropping rectangle
y1, y2 = 62, 540   # vertical boundaries of the cropping rectangle

# Path of the PDF file to read
pdf_file = '../../data/patent_data/raw_patent_data/erfindungsprivilegien_without_first_page.pdf'

places = extract_text(
    pdf_file,
    "place_names",
    x1=x1,
    x2=x2,
    y1=y1,
    y2=y2,
)

In [12]:
# Move the horizontal boundaries to the year column; y1 / y2 are reused.
x1, x2 = 723, 749

year = extract_text(
    pdf_file,
    "year",
    x1=x1,
    x2=x2,
    y1=y1,
    y2=y2,
)