In [1]:
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
import os
import re
import numpy as np
#import openai

In [2]:
# Helper function
def clean_rest_string(input_str):
    # Define a regex pattern to remove the hyphen "—" and the newline character "\n"
    pattern = r'[—\n\.]'
    # Use the re.sub() function to replace the matched pattern with an empty string
    output_str = re.sub(pattern, ' ', input_str)
    output_str = re.sub(r'\s+', ' ', output_str)
    return(output_str)

In [4]:
# Convert to JPG from source
images = convert_from_path('../data/PARIS_1867_Austria.pdf')

for count, image in enumerate(images):
    image.save(f'../data/Paris1867_Austria_{count}.jpg', 'JPEG')

In [5]:
# Define the folder path where the image files are located
folder_path = '../data'

# Initialize an empty dictionary to store the text for each page
output_dict = {}

# Loop through all the .jpg files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".jpg") and "Paris1867_Austria" in filename:
        # Create the full file path
        file_path = os.path.join(folder_path, filename)
        
        # Extract text from the image using pytesseract
        text = pytesseract.image_to_string(Image.open(file_path), lang='fra')
        
        # Add the text to the dictionary with the filename as the key
        output_dict[filename] = text

In [6]:
# Extract numeric part of keys and convert to integers
numeric_keys = [int(key.split('_')[-1].split('.')[0]) for key in output_dict.keys()]

# Pair numeric keys with original keys
key_value_pairs = list(zip(output_dict.keys(), output_dict.values()))
sorted_pairs = sorted(key_value_pairs, key=lambda x: int(x[0].split('_')[-1].split('.')[0]))

# Reconstruct ordered dictionary
ordered_dict = {pair[0]: pair[1] for pair in sorted_pairs}


In [7]:
# Initialize an empty list to store the split text and sources
split_text_list = []
source_list = []

# Iterate through the elements in the output_dict
for filename, text in ordered_dict.items():
    # Split the text by '\n\n' and add the resulting elements to the list
    split_entries = text.split('\n\n')
    
    # Add each split entry to the split_text_list and its corresponding source to the source_list
    for entry in split_entries:
        split_text_list.append(entry)
        source_list.append(filename)

# Create a pandas DataFrame with 'raw_text' and 'source' columns
df = pd.DataFrame({'raw_text': split_text_list, 'source': source_list})


In [8]:
# Assuming you have your DataFrame df as described
# Create a list to store the filtered data
filtered_data = []

# Initialize a variable to track the 'class' value
current_class = None

# Iterate through the rows of the original DataFrame
for index, row in df.iterrows():
    raw_text = row['raw_text']
    source = row['source']
    # Check if the row contains the word "CLASSE"
    if "CLASSE" in raw_text:
        current_class = raw_text
    else:
        # If the row does not contain "CLASSE," add it to the list
        filtered_data.append({'raw_text': raw_text, 'class': current_class, 'source':source})

# Create a DataFrame from the list of dictionaries
filtered_df = pd.DataFrame(filtered_data)

In [9]:
filtered_df

Unnamed: 0,raw_text,class,source
0,PRODUITS D'IMPRIMERIE ET DE LIBRAIRIC. 2],,Paris1867_Austria_0.jpg
1,EMPIRE D’AUTRICHE,,Paris1867_Austria_0.jpg
2,"1. Beek (maison Alfred Hülder), à\nVienne. — L...",CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,Paris1867_Austria_0.jpg
3,"3 Braumäller (Guillaume), à\nVienne. — Livres ...",CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,Paris1867_Austria_0.jpg
4,"3. Caumo (Antoine), à Rovéredo\n(Tyrol). — Epr...",CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,Paris1867_Austria_0.jpg
...,...,...,...
3473,"7. Laporzynski (Romuald), à Neu-\nmarkt (Galli...",CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Paris1867_Austria_90.jpg
3474,"8. Lay (Félix), à Essegg (Esclavo-\nne — Tapis...",CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Paris1867_Austria_90.jpg
3475,". Milj vie (Stojan), du 4er ré-\nen de la fron...",CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Paris1867_Austria_90.jpg
3476,"10. Tatur (Jean), à Zakopane (Galli-\ncie! Bra...",CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Paris1867_Austria_90.jpg


In [10]:
merged_text = []  # List to store merged text
merged_text_source = [] # List to store the class
merged_text_class = [] # List to store the source picture

for index, row in filtered_df.iterrows():
    starts_with_digit_or_percent = any(char.isdigit() or char == '%' for char in row['raw_text'][:3])
    
    if not starts_with_digit_or_percent and index > 0:
        # Merge the current row's text into the previous row
        merged_text[-1] += ' ' + row['raw_text']
        merged_text_source[-1] = row['source']
        merged_text_class[-1] = row['class']
    else:
        # Append the current row's text as a new entry
        merged_text.append(row['raw_text'])
        merged_text_source.append(row['source'])
        merged_text_class.append(row['class'])

In [11]:
df = pd.DataFrame({'raw_text':merged_text, 'source':merged_text_source, 'class':merged_text_class})

In [12]:
df['raw_text'] = (df['raw_text'].str.replace(r'[^\w\s()&]', ' ').
                                   str.replace(r'\s+', ' ').
                                   str.replace(r'-', '').
                                   str.replace('\n', '').
                                   str.replace('', '').
                                   str.strip())

In [13]:
# This can be ignored in further attempts
def process_lines(lines):
    for line in lines:
        count = 0  # Counter for sequences of consecutive digits
        for match in re.finditer(r'\d+', line):
            count += 1
            if count == 2:
                split_index = match.start()  # Get the index of the second sequence of digits
                yield line[:split_index]  # Yield the part before the second sequence of digits
                yield from process_lines([line[split_index:]])  # Recursively process the remaining part
                break
        else:
            yield line  # If no split occurs, yield the original line
            

def lookup_source(test_strings, dataframe, text_column, source_column):
    sources = []
    for i, text in enumerate(test_strings):
        # Find all rows where 'raw_text' column matches the text from the test object
        rows = dataframe[dataframe[text_column].str.contains(text, regex=False)]
        
        if not rows.empty:
            # Calculate the absolute difference between indices and the current iteration number
            diff = np.abs(rows.index.values - i)
            closest_row_idx = np.argmin(diff)  # Find the index of the closest row number to the current iteration number
            closest_row = rows.iloc[closest_row_idx]  # Get the closest row
            sources.append(closest_row[source_column])  # Append the 'source' value from the closest row
        else:
            sources.append(None)  # Append None if no match is found
    
    return sources


In [14]:
lines_result = list(process_lines(df['raw_text']))
# Call the function with your test object, DataFrame, and columns
sources_result = lookup_source(lines_result, df, 'raw_text', 'source')
class_result = lookup_source(lines_result, df, 'raw_text', 'class')

processed_data = pd.DataFrame({'text':lines_result, 'source':sources_result, 'class':class_result})

In [19]:
def extract_location(text):
    # Pattern to match word after "à" followed by space and potentially words in brackets
    pattern_a_accent = r"(?<=à\s)(\w+)(?:\s*\((.*?)\))?"
    
    # Pattern to match word after accent aigu without space and potentially words in brackets
    pattern_accent_aigu_no_space = r"(?<=à)(\w+)(?:\s*\((.*?)\))?|\Z"

    # Pattern to match word after "de" followed by space and potentially words in brackets
    pattern_de = r"(?<=de\s)(\w+)(?:\s*\((.*?)\))?"

    # Pattern to match word after "a" followed by space and potentially words in brackets
    pattern_a = r"(?<=a\s)(\w+)(?:\s*\((.*?)\))?"

    # First, try to match word after "à" followed by space and potentially words in brackets
    match = re.search(pattern_a_accent, text)
    if match:
        return match.group(1), match.group(2)
    
    # If nothing is found, try to match word after accent aigu without space and potentially words in brackets
    match = re.search(pattern_accent_aigu_no_space, text)
    if match:
        return match.group(1), match.group(2)

    # If not found, try to match word after "de" followed by space and potentially words in brackets
    match = re.search(pattern_de, text)
    if match:
        return match.group(1), match.group(2)

    # If still not found, try to match word after "a" followed by space and potentially words in brackets
    match = re.search(pattern_a, text)
    if match:
        return match.group(1), match.group(2)


    # If no matches found, return None
    return None, None

In [20]:
# Parse the location:
processed_data['location'] = processed_data['text'].apply(lambda x: extract_location(x)[0])

In [21]:
# Export the final resulting file to .csv
# And aftewards continue with name_matching.ipynb
processed_data.to_csv('../data/1867_austria.csv')

In [22]:
processed_data

Unnamed: 0,text,source,class,location
0,PRODUITS D'IMPRIMERIE ET DE LIBRAIRIC. 2] EMPI...,Paris1867_Austria_0.jpg,,
1,"1. Beek (maison Alfred Hülder), àVienne. — Liv...",Paris1867_Austria_0.jpg,CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,Vienne
2,"3 Braumäller (Guillaume), àVienne. — Livres de...",Paris1867_Austria_0.jpg,CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,Vienne
3,"3. Caumo (Antoine), à Rovéredo(Tyrol). — Epreu...",Paris1867_Austria_0.jpg,CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,Rovéredo
4,4. Corporation des libraireset des marchands d...,Paris1867_Austria_0.jpg,CLASSE 6 — PRODUITS D'IMPRIMERIE\n: ET DE LIBR...,vienne
...,...,...,...,...
3464,"6. Krzcptowski (Joseph), à Koscieiisko (Gallic...",Paris1867_Austria_90.jpg,CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Koscieiisko
3465,"7. Laporzynski (Romuald), à Neumarkt (Gallicie...",Paris1867_Austria_90.jpg,CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Neumarkt
3466,"8. Lay (Félix), à Essegg (Esclavone — Tapis, f...",Paris1867_Austria_90.jpg,CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,Essegg
3467,4er réen de la frontière de Lika:— Couverture ...,Paris1867_Austria_90.jpg,CLASSE 94. — PRODUITS DR TOUTE\nSORTE FABRIQUÉ...,
