In [2]:
from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import pandas as pd
import os
import re
import numpy as np
#import openai

In [2]:
# Helper function
def clean_rest_string(input_str):
    """Replace separators with spaces and squeeze whitespace.

    Every em dash ("—"), newline and period in ``input_str`` is turned into a
    single space; afterwards each run of whitespace is collapsed into one
    space. Leading/trailing whitespace is collapsed but not stripped.

    Parameters
    ----------
    input_str : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    without_separators = re.sub(r'[—\n\.]', ' ', input_str)
    return re.sub(r'\s+', ' ', without_separators)

In [5]:
# Render every page of the scanned catalogue PDF and store it as a numbered JPEG
# next to the source file.
# NOTE(review): this cell reads/writes the *1900* files, while the OCR cell
# below only picks up files containing "Paris_1878_Italy" — confirm which
# catalogue this notebook is meant to process.
pdf_pages = convert_from_path('../../data/primary_sources/PARIS_1900_Italy.pdf')

for count, page in enumerate(pdf_pages):
    page.save(f'../../data/primary_sources/Paris_1900_Italy_{count}.jpg', 'JPEG')

In [29]:
# Define the folder path where the image files are located
folder_path = '../../data/primary_sources'

# OCR text of each page, keyed by the image filename
output_dict = {}

# Loop through all the .jpg files in the folder
for filename in os.listdir(folder_path):
    # Only page images whose name contains "Paris_1878_Italy" are processed
    if filename.endswith(".jpg") and "Paris_1878_Italy" in filename:
        file_path = os.path.join(folder_path, filename)

        # Open the image in a context manager so its file handle is released
        # as soon as OCR (French language model) has finished
        with Image.open(file_path) as page_image:
            text = pytesseract.image_to_string(page_image, lang='fra')

        output_dict[filename] = text

In [30]:
def _page_number(filename):
    """Return the integer between the last '_' and the following '.' of ``filename``.

    Example: 'Paris_1878_Italy_12.jpg' -> 12.
    """
    return int(filename.split('_')[-1].split('.')[0])


# Rebuild the OCR dictionary with its pages in numeric order (0, 1, 2, ... rather
# than the arbitrary order in which the files were listed).
ordered_dict = dict(sorted(output_dict.items(), key=lambda item: _page_number(item[0])))


In [31]:
# Cut every page's OCR text at blank lines ('\n\n') and remember, for each
# resulting fragment, which page image it came from.
entry_source_pairs = [
    (paragraph, page_file)
    for page_file, page_text in ordered_dict.items()
    for paragraph in page_text.split('\n\n')
]
split_text_list = [paragraph for paragraph, _ in entry_source_pairs]
source_list = [page_file for _, page_file in entry_source_pairs]

# One row per fragment: the text and the image file it was read from
df = pd.DataFrame({'raw_text': split_text_list, 'source': source_list})


In [32]:
# Walk through the fragments in order. A fragment containing "CLASSE" is taken
# as a class heading: it is dropped from the data and instead attached, as the
# 'class' value, to every following fragment until the next heading appears.
filtered_data = []
current_class = None  # stays None for fragments preceding the first heading

for raw_text, source in zip(df['raw_text'], df['source']):
    if "CLASSE" in raw_text:
        current_class = raw_text
        continue
    filtered_data.append({'raw_text': raw_text, 'class': current_class, 'source': source})

# Create a DataFrame from the list of dictionaries
filtered_df = pd.DataFrame(filtered_data)

In [33]:
# Inspect the non-heading fragments together with their carried-forward class heading
filtered_df

Unnamed: 0,raw_text,class,source
0,ITALIE.,,Paris_1878_Italy_0.jpg
1,GROUPE II.,,Paris_1878_Italy_0.jpg
2,ÉDUCATION ET ENSEIGNEMENT.\nMATÉRIEL ET PROCÉD...,,Paris_1878_Italy_0.jpg
3,"- Amati (Chevalier À.), à Stradella (Pavie). —...",CLASSE 6.\nÉducation de l'enfant ; enseignemen...,Paris_1878_Italy_0.jpg
4,"Baragioln (E.),à Côme. — Exercices de grammair...",CLASSE 6.\nÉducation de l'enfant ; enseignemen...,Paris_1878_Italy_0.jpg
...,...,...,...
1487,"Direction de l'Agriculture, à Rome.— Collectio...",CLASSE 89.\nGraines et plantes d’essences fore...,Paris_1878_Italy_84.jpg
1488,Société d'aceclimatation et d'agriculture en S...,CLASSE 89.\nGraines et plantes d’essences fore...,Paris_1878_Italy_84.jpg
1489,"Æroubetzkoy (Prince P.),à Intra (Novare). — (C...",CLASSE 89.\nGraines et plantes d’essences fore...,Paris_1878_Italy_84.jpg
1490,4 ES\nrss,CLASSE 90.\nPlantes de serre.\nÉeole\nserre.,Paris_1878_Italy_84.jpg


By now, we have OCR'ed all the data. We suspect that some of the line splits produced so far are erroneous, so we now correct them on the basis of numbers: a fragment that has no digit (or '%' sign) among its first three characters is merged into the preceding fragment. This reduces the number of lines to something closer to the actual number of entries.

In [34]:
merged_text = []         # text of each merged entry
merged_text_source = []  # source picture of each merged entry
merged_text_class = []   # class heading of each merged entry

for index, row in filtered_df.iterrows():
    fragment = row['raw_text']
    # A fragment opens a new entry when a digit or '%' occurs within its first
    # three characters; the very first row always opens one.
    opens_new_entry = any(ch == '%' or ch.isdigit() for ch in fragment[:3])

    if opens_new_entry or index <= 0:
        merged_text.append(fragment)
        merged_text_source.append(row['source'])
        merged_text_class.append(row['class'])
        continue

    # Continuation: glue the fragment onto the previous entry.
    # NOTE(review): the entry's source/class are overwritten with those of the
    # continuation fragment — confirm this is intended rather than keeping the
    # values of the fragment that opened the entry.
    merged_text[-1] = merged_text[-1] + ' ' + fragment
    merged_text_source[-1] = row['source']
    merged_text_class[-1] = row['class']

In [35]:
# Collect the merged entries (text, source image, class heading) in one frame
df = pd.DataFrame(
    {
        'raw_text': merged_text,
        'source': merged_text_source,
        'class': merged_text_class,
    }
)

In what follows, we simply rid the text of some annoying symbols without compromising much of the structure. Afterwards, in `process_lines`, we cut the lines up again (increasing their number), once more on the basis of numbers, because we felt that some observations were erroneously clustered in a single line. The result is a neat vector of raw strings, which should be easy to process. That is the purpose of everything that follows.

In [36]:
# Normalise the merged OCR text:
#   1. every character that is not a word character, whitespace, a parenthesis
#      or '&' becomes a space (this already removes hyphens and dashes);
#   2. each run of whitespace (spaces, newlines, ...) collapses to one space;
#   3. leading/trailing whitespace is trimmed.
# regex=True is passed explicitly: from pandas 2.0 on, Series.str.replace treats
# the pattern as a literal string by default, so without it steps 1 and 2 would
# silently do nothing. The former literal replacements of '-', '\n' and '' are
# dropped because none of them can match once steps 1 and 2 have run.
df['raw_text'] = (
    df['raw_text']
    .str.replace(r'[^\w\s()&]', ' ', regex=True)
    .str.replace(r'\s+', ' ', regex=True)
    .str.strip()
)

In [37]:
# This can be ignored in further attempts
def process_lines(lines):
    """Split each line so that every piece holds at most one number.

    A line is cut immediately before its second, third, ... run of consecutive
    digits; the first run never triggers a cut, so any text preceding it stays
    in the first piece. Lines with fewer than two digit runs are yielded
    unchanged. Joining the pieces of a line reproduces the line exactly.

    The implementation is iterative, so a line containing a very large number
    of digit runs cannot exhaust Python's recursion limit.

    Parameters
    ----------
    lines : iterable of str
        Lines to split.

    Yields
    ------
    str
        The pieces, in their original order.
    """
    for line in lines:
        piece_start = 0
        digit_runs = re.finditer(r'\d+', line)
        next(digit_runs, None)  # the first number stays with the leading text
        for run in digit_runs:
            yield line[piece_start:run.start()]
            piece_start = run.start()
        yield line[piece_start:]
            

def lookup_source(test_strings, dataframe, text_column, source_column):
    """Look up, for every string, a value from the row whose text contains it.

    For the string at position ``i`` all rows of ``dataframe`` whose
    ``text_column`` contains the string (plain substring match, no regex) are
    candidates. Among them the row whose index label is numerically closest to
    ``i`` is chosen and its ``source_column`` value is returned.

    Parameters
    ----------
    test_strings : iterable of str
        Strings to look up.
    dataframe : pandas.DataFrame
        Frame to search; its index labels are compared numerically with the
        position of each string.
    text_column : str
        Column searched for the substring.
    source_column : str
        Column whose value is returned for the chosen row.

    Returns
    -------
    list
        One value per input string; ``None`` where no row contains the string.
    """
    def _closest_value(position, needle):
        hits = dataframe[dataframe[text_column].str.contains(needle, regex=False)]
        if hits.empty:
            return None
        distances = np.abs(hits.index.values - position)
        return hits.iloc[np.argmin(distances)][source_column]

    return [_closest_value(position, needle) for position, needle in enumerate(test_strings)]


In [38]:
# Re-split the cleaned entries so that each line carries at most one number
lines_result = [*process_lines(df['raw_text'])]

# For every split line, recover the source image and the class heading from
# the row of df whose text contains it
sources_result, class_result = [
    lookup_source(lines_result, df, 'raw_text', column) for column in ('source', 'class')
]

processed_data = pd.DataFrame(
    {
        'text': lines_result,
        'source': sources_result,
        'class': class_result,
    }
)

In [39]:
def extract_location(text):
    """Return the word that follows the first location preposition in ``text``.

    The patterns are tried in order and the first one that matches anywhere in
    the text wins:

    1. the word "à" followed by one whitespace character and a word;
    2. a word beginning with "à" (presumably "à" fused to the place name
       by OCR — verify against the data);
    3. the word "de" followed by one whitespace character and a word;
    4. the word "a" followed by one whitespace character and a word
       (presumably "à" whose accent was lost — verify against the data).

    Each preposition is anchored with a word boundary so that it is not
    matched at the end or in the middle of a longer word: "Società anonima"
    no longer yields "anonima" and "Grande maison" no longer yields "maison".

    Parameters
    ----------
    text : str
        The entry text to search.

    Returns
    -------
    str or None
        The captured word, or ``None`` if no pattern matches or ``text`` is
        not a string (e.g. a missing value).
    """
    if not isinstance(text, str):
        return None

    location_patterns = (
        r"\bà\s(\w+)",
        r"\bà(\w+)",
        r"\bde\s(\w+)",
        r"\ba\s(\w+)",
    )
    for pattern in location_patterns:
        match = re.search(pattern, text)
        if match:
            return match.group(1)

    # No preposition found
    return None

In [40]:
# Derive a 'location' column by running extract_location on every text line
processed_data['location'] = processed_data['text'].apply(extract_location)

In [41]:
# Export the final resulting file to .csv
# and afterwards continue with name_matching.ipynb
output_csv = '../../data/1878_italy.csv'
processed_data.to_csv(output_csv)