In [124]:
#!pip install google-cloud-vision  # One-time setup; no need to run every time.


In [125]:
import os
import io
from google.cloud import vision_v1 as vision
# Point the Google client library at the service-account JSON key.
# NOTE(review): this is an absolute, machine-specific path; the same assignment
# is repeated in a later cell of this notebook.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"c:\Users\aktne\OneDrive\Desktop\Work\wasson-notebooks.json"

In [156]:
import os
import io
from google.cloud import vision_v1 as vision
# Imports the Google Cloud Vision client library (the OCR service used below).
from google.cloud.vision_v1 import types


os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = r"c:\Users\aktne\OneDrive\Desktop\Work\wasson-notebooks.json"
# Personal service-account credentials for Google Cloud Vision (anyone using this notebook will need their own JSON key).
# The key path must be set before the client is initialized.

def detect_text_from_pdf(pdf_path):
    """Extract the text of each page of a PDF with the Google Cloud Vision API.

    The file is read as raw bytes and sent inline to the synchronous
    ``batch_annotate_files`` endpoint using the ``TEXT_DETECTION`` feature and
    an English language hint. The text of every page that comes back without
    an error is printed and collected.

    Args:
        pdf_path: Path to the PDF file to process.

    Returns:
        A list with one string per successfully processed page, in the order
        the API returned them. Pages for which the API reports an error are
        reported on stdout and skipped. An empty list is returned when the
        file is not found or when the API call raises.
    """
    # Read the file before creating the client so that a bad path is reported
    # without needing credentials.
    try:
        with open(pdf_path, 'rb') as pdf_file:  # raw bytes, as required for inline content
            content = pdf_file.read()
    except FileNotFoundError:
        print(f"Error: The file path '{pdf_path}' was not found.")
        return []  # same "no pages" value as the API-failure path below

    # The credentials environment variable must already be set at this point.
    client = vision.ImageAnnotatorClient()

    # Bundle the PDF bytes, the requested feature and the language hint into
    # a single file-annotation request.
    request = types.AnnotateFileRequest(
        input_config=types.InputConfig(content=content, mime_type='application/pdf'),
        features=[types.Feature(type_=vision.Feature.Type.TEXT_DETECTION)],
        image_context=types.ImageContext(language_hints=['en']),
    )
    # NOTE(review): the synchronous file endpoint is believed to annotate only
    # the first few pages (5) of a file per request -- confirm against the API
    # reference; a longer PDF would lose its trailing pages without any error.

    print("Sending PDF to Google Cloud Vision API...")
    try:
        response = client.batch_annotate_files(requests=[request])
    except Exception as e:
        # Best effort: report the failure and hand back "no pages" instead of
        # stopping the notebook.
        print(f"An error occurred during API call: {e}")
        return []

    page_texts = []
    for response_file in response.responses:
        # Page numbers are positional (1-based) within each file response.
        for page_number, page in enumerate(response_file.responses, start=1):
            if page.error.message:
                # Report the failing page and move on to the next one.
                print(f'Error on page {page_number}: {page.error.message}')
                continue

            page_text = page.full_text_annotation.text
            page_texts.append(page_text)
            print(f'--- Page {page_number} Text ---')
            print(page_text)
            print("-" * 50)

    return page_texts



In [157]:
import re #Imports the regular expressions module

def correct_ocr_text(text: str) -> str:
    """Apply heuristic fixes for common OCR letter/digit confusions to one page.

    The corrections run in this order:

    1. ``l``, ``L`` or ``|`` standing directly in front of a unit symbol
       (b, g, m, oz, ft, in, A, V, W, Hz, s, F, J, K; matched
       case-insensitively) and preceded by a character that is not a letter
       or digit is replaced by ``1``, e.g. ``" lg"`` -> ``" 1g"``.
    2. ``o``/``O`` between two digits, or between a digit and whitespace, is
       replaced by ``0``.
    3. A ``9`` that follows a digit and is followed by whitespace and one of
       the words mass/weight/length/volume is replaced by ``g``,
       e.g. ``"1709 mass"`` -> ``"170g mass"``.
    4. Every run of whitespace (including newlines) is collapsed to a single
       space and the result is stripped.

    Args:
        text: Raw OCR text of a single page.

    Returns:
        The corrected text on a single line.
    """
    # Step 1. The replacement keeps the leading character (group 1) and the
    # unit (group 3) and swaps only the misread letter for "1". The previous
    # replacement r'1\2' dropped both of them and kept the "l".
    # NOTE(review): because of IGNORECASE this also rewrites short words such
    # as "la"/"lb" when they stand alone -- confirm that is acceptable.
    units_to_fix = r'(\s|[^a-z0-9])([l|L])(b|g|m|oz|ft|in|A|V|W|Hz|s|F|J|K)\b'
    corrected_text = re.sub(units_to_fix, r'\g<1>1\g<3>', text, flags=re.IGNORECASE)

    # Step 2. Letter O misread for zero next to digits.
    o_to_zero_pattern = r'(?<=\d)[oO](?=\d)|(?<=\s)[oO](?=\d)|(?<=\d)[oO](?=\s)'
    corrected_text = re.sub(o_to_zero_pattern, '0', corrected_text)

    # Step 3. Trailing "9" misread for "g" in front of a quantity word.
    unit_context_words = r'(mass|weight|length|volume)'
    pattern_9_to_g_context = r'(\d)9(\s+' + unit_context_words + r')'
    corrected_text = re.sub(pattern_9_to_g_context, r'\1g\2', corrected_text, flags=re.IGNORECASE)

    # Step 4. Collapse all whitespace (tabs and newlines included).
    corrected_text = re.sub(r'\s+', ' ', corrected_text).strip()

    return corrected_text

# Marker placed between consecutive pages in the assembled document.
PAGE_SEPARATOR = "\n\n---PAGE BREAK---\n\n"

def process_all_pages(all_raw_pages: list[str]) -> str:
    """Correct every raw page and stitch the results into one document.

    Each page is passed through ``correct_ocr_text`` and prefixed with a
    ``[PAGE n]`` header (n counts from 1 in list order). Pages are joined
    with ``PAGE_SEPARATOR`` and the document ends with a single newline.

    Args:
        all_raw_pages: Raw OCR text, one string per page.

    Returns:
        The assembled document, or an empty string (after printing a notice)
        when there are no pages.
    """
    # Nothing to do for an empty or missing page list.
    if not all_raw_pages:
        print("No pages to process.")
        return ""

    # One "[PAGE n]\n<corrected text>" section per input page, in order.
    sections = [
        f"[PAGE {number}]\n" + correct_ocr_text(raw_page)
        for number, raw_page in enumerate(all_raw_pages, start=1)
    ]

    # The separator goes between pages only; the last page is closed by a newline.
    return PAGE_SEPARATOR.join(sections) + "\n"



In [158]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\A-Al_page-0001-0005.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page


Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Abakan
Octahedritz No 1
(668)
Abakan
Oct.
be
rehcard
1.1 ± 0.3 mm
sch. etc
>1000°C.
weakly
i crcniy..
•
Matrix poly X α2; taenite clear
Curous
unique.?.
little schreib (tmy
all
Shadowy ppter.
post-heating Comosion
ones
disolved ? )
Small fices have already
vanished.
2/8 indistinct & thomy
plessite dissway up.
'
surrounding α
-darker, more
hatched. - C effect ?
dong
g.b & planes
trag gray pale
?
Cop
comosu
--------------------------------------------------
--- Page 2 Text ---
Abee, metal clast 3,3,02,
fran Derch Sears
Ja INAA.
Jyly/angrist 1986
1.124 grammes
one
fragment to be mounted fu
NBA
! Silicate / Carbide inclusions! pal. Thin section
RUB
--------------------------------------------------
--- Page 3 Text ---
Cuña IN 1235.
• sep. 87:
The polished and etched surface is 24x2/cm, Kamarite's
bandwidth is about 0.5-0.6mm.
of Kamicites.
Taenite
Occurs at rim
Schreibersites occurs
in Kamicite.
ta-
kam.
-sch
Cohenite could be f

In [None]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# Open with "w" so that re-running this cell replaces the file; "a" would
# append a second copy of the same pages on every run. The write is skipped
# when there is nothing to save, so a failed OCR run cannot blank out an
# earlier good result. UTF-8 keeps non-ASCII characters from the OCR intact.
if final_corrected_text:
    with open("A-Al_page-0001-0005.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Abakan Octahedritz No 1 (668) Abakan Oct. be rehcard 1.1 ± 0.3 mm sch. etc >1000°C. weakly i crcniy.. • Matrix poly X α2; taenite clear Curous unique.?. little schreib (tmy all Shadowy ppter. post-heating Comosion ones disolved ? ) Small fices have already vanished. 2/8 indistinct & thomy plessite dissway up. ' surrounding α -darker, more hatched. - C effect ? dong g.b & planes trag gray pale ? Cop comosu

---PAGE BREAK---

[PAGE 2]
Abee, metal clast 3,3,02, fran Derch Sears Ja INAA. Jyly/angrist 1986 1.124 grammes one fragment to be mounted fu NBA ! Silicate / Carbide inclusions! pal. Thin section RUB

---PAGE BREAK---

[PAGE 3]
Cuña IN 1235. • sep. 87: The polished and etched surface is 24x2/cm, Kamarite's bandwidth is about 0.5-0.6mm. of Kamicites. Taenite Occurs at rim Schreibersites occurs in Kamicite. ta- kam. -sch Cohenite could be found ? within Kanowite. (Fig 1) sch. Figi cohemite.

---PAGE BREAK---

[PAGE 4]
Aggie Cruck (Alaska, USA) 22. IV. 66, USNM. 8x12cm very reg

In [130]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\A-Al_page-0006-0010.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
27 II88
DJM
Agua Blanca
A few large schreibersite crystals occur
inside the kanacite bands. Bandwidth
is roughly 1.0-1.3 mm.
--------------------------------------------------
--- Page 2 Text ---
CANYON DIABLO / CAP
C
Ahumada (Chihuahua, Mexico)
50
30 68. 8.89 sample from ASV, Pallasile. Kam areas.
π
(not really bands) outlined by schreib, all of which
all of wik- black.
shows evidence
even-
8
guile
Olivine is mostly rounded with some flattick
volume in this small section.
bad an in Mount Vernon.
~ 60%
sides. It occupies.
Ixidation is servais, but not quite
as
--------------------------------------------------
--- Page 3 Text ---
Ainsworth
group IIAB
15 May 86.
IN 129.
The polished and etched surface is 15mm by 5mm. Under
Ar microscope Only minerale that can be observed is Komacite,
with Neumann bands appearing in it
--------------------------------------------------
--- Page 4 Text ---
Ainsworth (Nebraska, USA)
was etched
o

In [131]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("A-Al_page-0006-0010.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
27 II88 DJM Agua Blanca A few large schreibersite crystals occur inside the kanacite bands. Bandwidth is roughly 1.0-1.3 mm.

---PAGE BREAK---

[PAGE 2]
CANYON DIABLO / CAP C Ahumada (Chihuahua, Mexico) 50 30 68. 8.89 sample from ASV, Pallasile. Kam areas. π (not really bands) outlined by schreib, all of which all of wik- black. shows evidence even- 8 guile Olivine is mostly rounded with some flattick volume in this small section. bad an in Mount Vernon. ~ 60% sides. It occupies. Ixidation is servais, but not quite as

---PAGE BREAK---

[PAGE 3]
Ainsworth group IIAB 15 May 86. IN 129. The polished and etched surface is 15mm by 5mm. Under Ar microscope Only minerale that can be observed is Komacite, with Neumann bands appearing in it

---PAGE BREAK---

[PAGE 4]
Ainsworth (Nebraska, USA) was etched on to 1 Aug 66. Four prices weighing 4g from USNM. One this slab Two large faces (~ 1.5 x 2 cm each) & side. The meteorite in odd. The of kamarite appears to be all one single crystal

In [132]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\A-Al_page-0011-0015.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Albion (Washington, USA)
Strongly reluated IUH structure,
Bandwidth
~ 0.28 mm,
No visible inclusions
Taenile lamellar are now
of.
3 Jan 93
mm
10mm
17 mm
taenile, the result of strappresented by "dots" of
heating
Our comporitional data are essentially identical
to those for Gibeon, which is widely distributed. To
my knowledge Gibeon does not show this reheated
structure so there is reason to support the ASU
view that the iron is a
new = independent metiorite
--------------------------------------------------
--- Page 2 Text ---
Aldama (LC 1240)
21 May 97
12
m³
6.5m
mm
Regular Widmanstätten pattern
Bandwidth 0.4-1.2 (uncorrected)
Plessite: net type
u
Taenite: all lamellae" appear as stringers of 5-10 μ particles
or irregular masses of taenite (reheating)
Schreiberite not observable
:
Thermal history: has been reheated, structures are
modified
severely
are se
--------------------------------------------------
--- Page 3 Text --

In [133]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("A-Al_page-0011-0015.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Albion (Washington, USA) Strongly reluated IUH structure, Bandwidth ~ 0.28 mm, No visible inclusions Taenile lamellar are now of. 3 Jan 93 mm 10mm 17 mm taenile, the result of strappresented by "dots" of heating Our comporitional data are essentially identical to those for Gibeon, which is widely distributed. To my knowledge Gibeon does not show this reheated structure so there is reason to support the ASU view that the iron is a new = independent metiorite

---PAGE BREAK---

[PAGE 2]
Aldama (LC 1240) 21 May 97 12 m³ 6.5m mm Regular Widmanstätten pattern Bandwidth 0.4-1.2 (uncorrected) Plessite: net type u Taenite: all lamellae" appear as stringers of 5-10 μ particles or irregular masses of taenite (reheating) Schreiberite not observable : Thermal history: has been reheated, structures are modified severely are se

---PAGE BREAK---

[PAGE 3]
GHANIM 2 Feb. 89= IN 1303 The polished and etched surface is 13x5mm, We can See This Al-Glaming sample is the only unweathered piece in t

In [134]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\A-Al_page-0016-0020.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
ALHA 77230, 4
July 7, 1980
2.6 gram
from Ant. Met. Working Grotip via SI.
Pal. section % (there is a lot of them at NMNH)
Thick section 77250,5
aloan.
A few schrei bersite vers alay there mins a
vens alay there wins a terrestrial carrosion attach
tatal P.
is gairy or morderate
MICRO. large 1-3 m 2 grains, eslay Nemmen bents. ~ 20%
recystallisations; these new grains (Smaller 0, (-0,3 mm) also have
mantra! (two schocks events with an intervening rexx-event!).
1.1
approx
have Men -
--------------------------------------------------
--- Page 2 Text ---
ALHA
3.5
77283.5
fram And. Met. W. Grung. NMNI SI.
gram.
Thich Section 77283,4
✓ on loan
large schreibeixtes 2x 15 mm ! and smaller veins, very high buth P.
MICRO: bug-
grans
BW ~ 9.8mm
2V
little farmita (count) lev Mi
diaplex
Saine Neumann bands developed.
Some incipient "flame" / "faces" reagstallization in d. Rhyfedites
Severely deformed (shattered) Schreiber sits, with plasti

In [135]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("A-Al_page-0016-0020.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
ALHA 77230, 4 July 7, 1980 2.6 gram from Ant. Met. Working Grotip via SI. Pal. section % (there is a lot of them at NMNH) Thick section 77250,5 aloan. A few schrei bersite vers alay there mins a vens alay there wins a terrestrial carrosion attach tatal P. is gairy or morderate MICRO. large 1-3 m 2 grains, eslay Nemmen bents. ~ 20% recystallisations; these new grains (Smaller 0, (-0,3 mm) also have mantra! (two schocks events with an intervening rexx-event!). 1.1 approx have Men -

---PAGE BREAK---

[PAGE 2]
ALHA 3.5 77283.5 fram And. Met. W. Grung. NMNI SI. gram. Thich Section 77283,4 ✓ on loan large schreibeixtes 2x 15 mm ! and smaller veins, very high buth P. MICRO: bug- grans BW ~ 9.8mm 2V little farmita (count) lev Mi diaplex Saine Neumann bands developed. Some incipient "flame" / "faces" reagstallization in d. Rhyfedites Severely deformed (shattered) Schreiber sits, with plastic deformation of Surrounding α, Some terrestrial carrosion alay schrichesite (pos. IA crack

---

In [136]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
# NOTE(review): the file name suggests this chunk spans six pages (0021-0026)
# rather than five -- confirm that every page comes back from the API.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\A-Al_page-0021-0026.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
27 II 88 DJM
ALLAN HILLS 481014
0.22±0.03.
Bandurdth 0.1 0.2 mm. Kanacite
bands surround elongated groups of
carbideschreibersite crystals in many places.
Fusion crust rims the entire curved surface.
bands are Fez C (JTW) or
In cel of lamellae.
Fe₂
Roy Clarke (1984) calls this an ataxite, He observed
only schreibersite, no other inclusions.
--------------------------------------------------
--- Page 2 Text ---
ALLAN HILLS
(AGH 84165,2)
7 Aug.
13mm.
IN 1191
-
TAB
1.1mm.
upwes
Some kamacites
The polished and ethed Longth and width of surface is 23mm and
The bandwith of kamacite is 200 200
exhibite recrystalalization texture and with Neumann (Fig 1), Taenite
Occurs at the rim of or within kamicites. (Figz) Plessite can be found.
Schreibersite can be observed at the rim of kamicite and plessite thig
mean bow. 0,9±0.1mm
Neumann
talnite
kam.
Kam.
sch.
·plessite.
Taenite
Fig 1.
Fig 2
Clarke, Autarc
Newsletter 9 (1) 1986
gives "livi

In [137]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("A-Al_page-0021-0026.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
27 II 88 DJM ALLAN HILLS 481014 0.22±0.03. Bandurdth 0.1 0.2 mm. Kanacite bands surround elongated groups of carbideschreibersite crystals in many places. Fusion crust rims the entire curved surface. bands are Fez C (JTW) or In cel of lamellae. Fe₂ Roy Clarke (1984) calls this an ataxite, He observed only schreibersite, no other inclusions.

---PAGE BREAK---

[PAGE 2]
ALLAN HILLS (AGH 84165,2) 7 Aug. 13mm. IN 1191 - TAB 1.1mm. upwes Some kamacites The polished and ethed Longth and width of surface is 23mm and The bandwith of kamacite is 200 200 exhibite recrystalalization texture and with Neumann (Fig 1), Taenite Occurs at the rim of or within kamicites. (Figz) Plessite can be found. Schreibersite can be observed at the rim of kamicite and plessite thig mean bow. 0,9±0.1mm Neumann talnite kam. Kam. sch. ·plessite. Taenite Fig 1. Fig 2 Clarke, Autarc Newsletter 9 (1) 1986 gives "livitative "bw of 1 mm, Sample shows shope

---PAGE BREAK---

[PAGE 3]
Allan Hills ACH 84233,2 IN 12

In [138]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\An-Ar_page-0001-0005.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Anderson (Ohio, USA)
30 VIL 68
ms. Broke
Harward, entire 1709 mass
of
From
small fragment. Rounded olivine, in many sized spheres
from 6.3 to 1.0 cm,
some "coalexed." to
Ve a border between coalesced nodules indicating schreib, o
differently oriented crystal structures
oriented crystal structures, or botte Thin
deposits 2 schreib seem to ourround olivine. The
~ 50%
olivine only appears to occupy love packed. One
section and is definitely
and is definitely of area in this
metallic area in
shows some surface
not
~ 1,5 an across.
across. Small
oxidation, but not se pragment
too
too
--------------------------------------------------
--- Page 2 Text ---
Angelica
mather
2
9570 USNMA27, there hand with :00, 5 ~1 mm; this pase
#2177, 4.8
4.89
Area 2 cm² polished and etched (30 sec ) ;
regular Wid mom pattern, No.
this prace contains
inclusion of Froilite and Cohenite (3) (see picture), and
small hole without oxidation products; most

In [139]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("An-Ar_page-0001-0005.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Anderson (Ohio, USA) 30 VIL 68 ms. Broke Harward, entire 170g mass of From small fragment. Rounded olivine, in many sized spheres from 6.3 to 1.0 cm, some "coalexed." to Ve a border between coalesced nodules indicating schreib, o differently oriented crystal structures oriented crystal structures, or botte Thin deposits 2 schreib seem to ourround olivine. The ~ 50% olivine only appears to occupy love packed. One section and is definitely and is definitely of area in this metallic area in shows some surface not ~ 1,5 an across. across. Small oxidation, but not se pragment too too

---PAGE BREAK---

[PAGE 2]
Angelica mather 2 9570 USNMA27, there hand with :00, 5 ~1 mm; this pase #2177, 4.8 4.89 Area 2 cm² polished and etched (30 sec ) ; regular Wid mom pattern, No. this prace contains inclusion of Froilite and Cohenite (3) (see picture), and small hole without oxidation products; most of the plessitie shows mine - Widman, structure There are some cracks in the piece; mapare slig

In [140]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\An-Ar_page-0006-0010.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Ang's makssalik
IN 2111
14 Jan 2012-JTW
Rusly slab ~ 3 mm thick ; total mass 62.0g.
Coarsest octahedrete; ham. bandwidth ~5 mm.
Much corrosion at ham. gracic boundaries but
interiors seem to be polid metal,
Metal seems to have a
froily appearance,
probably the result of a deep itch. We did not
st prepare a new surface.
I was
for bed not able to find
get
any
inclusions
--------------------------------------------------
--- Page 2 Text ---
7
IN 1360
be
Anyujokij
Small sample from KMAN, Moscow 5 Seems to be a
hexahedral 5.
hexahedrile; abundant Neumann lives, no taemite.
Curious "whitish" mottled area in lower left - may
rebeating effect. Under high power
many tiny (10 μm x 100 μm) schaboites in
annd even incipient melling? (domashed areas.
but extensive resorption
some sort
some
car
Ree
the unreheated area,
soled 60 x 100 μme in size)
clear
Cle
JR
3.4.90
--------------------------------------------------
--- Page 3 Text ---
An

In [141]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("An-Ar_page-0006-0010.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Ang's makssalik IN 2111 14 Jan 2012-JTW Rusly slab ~ 3 mm thick ; total mass 62.0g. Coarsest octahedrete; ham. bandwidth ~5 mm. Much corrosion at ham. gracic boundaries but interiors seem to be polid metal, Metal seems to have a froily appearance, probably the result of a deep itch. We did not st prepare a new surface. I was for bed not able to find get any inclusions

---PAGE BREAK---

[PAGE 2]
7 IN 1360 be Anyujokij Small sample from KMAN, Moscow 5 Seems to be a hexahedral 5. hexahedrile; abundant Neumann lives, no taemite. Curious "whitish" mottled area in lower left - may rebeating effect. Under high power many tiny (10 μm x 100 μm) schaboites in annd even incipient melling? (domashed areas. but extensive resorption some sort some car Ree the unreheated area, soled 60 x 100 μme in size) clear Cle JR 3.4.90

---PAGE BREAK---

[PAGE 3]
Antofagasta (Autofagasta, Chile) 13. Aug. 71. Two pieces seems to be Two pieces from NM NH 1207. Both have had olivine removed. Pallasite to 

In [142]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\An-Ar_page-0011-0015.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Apoala
big 2cm diameter
big
18. IV. 66, USNM, Seautiful specimen, one
11.
troidite nodule, many long thick schreibersite inclusions.
bands
vary greatly in width, from ~ 0 2 to ~ 0.8 mm. One
or 2 schreibersite inclusions have different orcentation
than others.
11 cm
~0.2
Fes.
72cm
13. Dec. 65,
✓ Apoala - prends
smate compleo from Leonard Coll ham band width approwe
to be about 1.3 prime, otel the object
the object is Om not ofte
wall rich
A
--------------------------------------------------
--- Page 2 Text ---
Apoala- pseudo
Sample in Leonard collection labelled Apeala has a very
indistinct Om structure, quite different from USNM and CNHM
camples Apoala card.
width
-
see
appears
13. Dec. 65. samall sample (~ sog) from Leonard Coll, Kam hand
to be about 1.5 mm, thus the object is Dm, not lof.
There are numerous small rust-filled cracks, The Widmannslätten
indistinct, and it appears that the sample has been releated.
structure 

In [143]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("An-Ar_page-0011-0015.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Apoala big 2cm diameter big 18. IV. 66, USNM, Seautiful specimen, one 11. troidite nodule, many long thick schreibersite inclusions. bands vary greatly in width, from ~ 0 2 to ~ 0.8 mm. One or 2 schreibersite inclusions have different orcentation than others. 11 cm ~0.2 Fes. 72cm 13. Dec. 65, ✓ Apoala - prends smate compleo from Leonard Coll ham band width approwe to be about 1.3 prime, otel the object the object is Om not ofte wall rich A

---PAGE BREAK---

[PAGE 2]
Apoala- pseudo Sample in Leonard collection labelled Apeala has a very indistinct Om structure, quite different from USNM and CNHM camples Apoala card. width - see appears 13. Dec. 65. samall sample (~ sog) from Leonard Coll, Kam hand to be about 1.5 mm, thus the object is Dm, not lof. There are numerous small rust-filled cracks, The Widmannslätten indistinct, and it appears that the sample has been releated. structure is is very samue While in Washington, I noted that Mixteca, which is from the general region of 

In [144]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\An-Ar_page-0016-0020.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Arvorezinha
This iron is from Rucolollo. She wondered if
it would be Campo and the only evidence that it
bigh Are content
is nol is a be
wwwww
1) ann
Polishing (rough) and elching shows much evidence of oxidation, some
L-shaped (inclunous (bar Thicknesses ~ 0,1 men, bare lengths 0,2 to 3 mm
not
The
of linear
I am
sure what these are. The seem to shining to be schreit
sample is badly curroded and the only evidence regarding bandwidt
is that prear outlined by corrosion have dimensions of ~2
~ 2 mm. But the
balk of the ca. /cm² sample shows no samacite boundaries, I
speculate that it may have suffered reheating
2
Portaps these " inclusions" are tactiles, One Triangular
Occupy ~ 1-2%
of the surface
area could be plessile. The
--------------------------------------------------
--- Page 2 Text ---
Arlington (Minnesota, USA)
Pol'd+
Etched 300ec nital.
kam with
16 π 68. Small, 2.49 sample from AMNH.
Curious structure Sample consists

In [145]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("An-Ar_page-0016-0020.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Arvorezinha This iron is from Rucolollo. She wondered if it would be Campo and the only evidence that it bigh Are content is nol is a be wwwww 1) ann Polishing (rough) and elching shows much evidence of oxidation, some L-shaped (inclunous (bar Thicknesses ~ 0,1 men, bare lengths 0,2 to 3 mm not The of linear I am sure what these are. The seem to shining to be schreit sample is badly curroded and the only evidence regarding bandwidt is that prear outlined by corrosion have dimensions of ~2 ~ 2 mm. But the balk of the ca. /cm² sample shows no samacite boundaries, I speculate that it may have suffered reheating 2 Portaps these " inclusions" are tactiles, One Triangular Occupy ~ 1-2% of the surface area could be plessile. The

---PAGE BREAK---

[PAGE 2]
Arlington (Minnesota, USA) Pol'd+ Etched 300ec nital. kam with 16 π 68. Small, 2.49 sample from AMNH. Curious structure Sample consists almost entirely of variable band widths. kam bands are parallel without suggesting a from. land

In [146]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\As-Av_page-0001-0005.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Asarca Mexicana
9170
plains total
USNM
,
no mumber yet
2
Drea
718
Polished and etched (45 mee) Two adjacent
~ 4 cm²; average band width of slightly swollen Widman pattern is ~ 1 mm;
some with minor Widma structure; come
several types of plessite : coase crystalline, fime couple.,
Schreibersite ( or Cohenite?) in clusions.
oxidation
i
one thin crack cents
containing
Schreiber-ite
; alight edge is
1.1 £0.2
--------------------------------------------------
--- Page 2 Text ---
Ashfork (Arizona, USA)
AMNH. Polished and
clear structure
20 Aug 1967. Remnant (~ 5g) & sides col. 1.5cm².
"Lebed cover vital
kam
are distinct
as
reheating visible an mottled hamacite, but plessite is
still very clear, as is schreib at grain boundaries. Ram
bands ~ 1.7-7.9 mm Og. Phobdites in
are some Neumann bands. Relatively fresh specimen No colessile,
graphite, trailite recognizable. Schreibe along grain boundaries.
by fresh
fine, taemile between boun

In [147]:
# `text` is the list of page strings produced by the preceding OCR cell.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# "w" replaces the file on re-run ("a" would append a duplicate copy); the
# write is skipped when there is nothing to save so a failed run cannot blank
# out an earlier result.
if final_corrected_text:
    with open("As-Av_page-0001-0005.txt", "w", encoding="utf-8") as f:
        f.write(final_corrected_text)

[PAGE 1]
Asarca Mexicana 9170 plains total USNM , no mumber yet 2 Drea 718 Polished and etched (45 mee) Two adjacent ~ 4 cm²; average band width of slightly swollen Widman pattern is ~ 1 mm; some with minor Widma structure; come several types of plessite : coase crystalline, fime couple., Schreibersite ( or Cohenite?) in clusions. oxidation i one thin crack cents containing Schreiber-ite ; alight edge is 1.1 £0.2

---PAGE BREAK---

[PAGE 2]
Ashfork (Arizona, USA) AMNH. Polished and clear structure 20 Aug 1967. Remnant (~ 5g) & sides col. 1.5cm². "Lebed cover vital kam are distinct as reheating visible an mottled hamacite, but plessite is still very clear, as is schreib at grain boundaries. Ram bands ~ 1.7-7.9 mm Og. Phobdites in are some Neumann bands. Relatively fresh specimen No colessile, graphite, trailite recognizable. Schreibe along grain boundaries. by fresh fine, taemile between boundaries is discontinuous.

---PAGE BREAK---

[PAGE 3]
3 the specimen in swalking kamaste ~ is swa

In [148]:
# OCR the PDF chunk at the path below. `text` holds the value returned by
# detect_text_from_pdf (the per-page strings; falsy on failure) and is read by
# the next cell.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\As-Av_page-0006-0010.pdf"
text = detect_text_from_pdf(pdf_to_process)
print(text)  # raw returned value; the function has already printed each page

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Avec L
27
Italy)
to
Avée Je
returned
21 Jeb 67. Sample from Vienna, wt ~ 4g; Prior - Hey state that are is
a hexahedrile fall. This is a weathered octahedrite, seemingly too
weathered to be a fall. Polished and etched 60 sec vital.
Prominent num. lives in kam. Plessite some fine + dark, some banded, ~
Numerous smell inclusions, in kam or cross
grain boundaries: I orignally thought these were cobenite, but they
bronze like trailite, I simply don't know for sure. One small
2 mm schreib nodule exposed on crust edge. sample weathered on
"crust" side cond also pitted on polished surface. Sample will
not be lacquered because of pits in "troilite", which could not be
cleaned once filled with lacquer. Kam bund 1.4-1.8mm
5% of
Og
area.
--------------------------------------------------
--- Page 2 Text ---
Avoca
(662)
From 6 cm of surface.
mod ockahodnite
sht rounded & lamelia
b.- 1/mm
sch
3
fow Rh' back lamellae
prodating of 8-2
+ gb

In [149]:
# Post-process the OCR result held in `text` (set by the preceding
# detect_text_from_pdf cell) and display it for a quick visual check.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# Open with mode "w" rather than "a": this cell writes the complete text for
# this PDF batch, so appending would add a duplicate copy of every page each
# time the cell is re-run. Overwriting keeps the cell idempotent.
with open("As-Av_page-0006-0010.txt", "w", encoding="utf-8") as f:
    f.write(final_corrected_text)

[PAGE 1]
Avec L 27 Italy) to Avée Je returned 21 Jeb 67. Sample from Vienna, wt ~ 4g; Prior - Hey state that are is a hexahedrile fall. This is a weathered octahedrite, seemingly too weathered to be a fall. Polished and etched 60 sec vital. Prominent num. lives in kam. Plessite some fine + dark, some banded, ~ Numerous smell inclusions, in kam or cross grain boundaries: I orignally thought these were cobenite, but they bronze like trailite, I simply don't know for sure. One small 2 mm schreib nodule exposed on crust edge. sample weathered on "crust" side cond also pitted on polished surface. Sample will not be lacquered because of pits in "troilite", which could not be cleaned once filled with lacquer. Kam bund 1.4-1.8mm 5% of Og area.

---PAGE BREAK---

[PAGE 2]
Avoca (662) From 6 cm of surface. mod ockahodnite sht rounded & lamelia b.- 1/mm sch 3 fow Rh' back lamellae prodating of 8-2 + gb schroib. ~> 8½/ 2 Ni Ta-b No comosion visible much plessite - forme unresolved uariety + fine c

In [150]:
# Absolute path of the scanned PDF batch to OCR in this step.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\As-Av_page-0011-0015.pdf"
# detect_text_from_pdf (defined near the top of the notebook) prints an error
# and returns None when the path does not exist, so check the printed output
# before running the next cell, which passes `text` to process_all_pages.
text = detect_text_from_pdf(pdf_to_process)
print(text)

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
ASM 009 = NWA 4708 from Birdwell
JTW
21 Feb 07
This is a medium actahedrite with
remarkably for inclusions. My guess is
that it is low-N: ITAB.
~ 0.85 ± 0.15.
Buen
thích
The kam bandwidth is
I see one round troildte with a draineter
D.9 wewe and another roughly rectangular
R0.4 X NO.8 mm.
There is no beat attered
zove.
is a
we small (0.2x0.8)
Wealtering is minor to sugligible. associates
with lan Fe's that has a similar area (~0.15x1,0mm)
(on edge of)
015
thick
--------------------------------------------------
--- Page 2 Text ---
ASM 005 - NWA 47004
=
219
end mass
Med. Oct, band width 0.70 ±0.10. Bands are
short, swollen, but no inclusions visible in the
centers, I see no Fes and no schreibersite that is
+
N
1.5 cm
Thicks
clearly identifiable. The sample in crossed by swall cracks showing
oxidation (very minor) at edges. Pleasite, mainly dark gray or finely
banded, is abundant, ~ 30 to 35%
zove
35 of the
the area
No heat at

In [151]:
# Post-process the OCR result held in `text` (set by the preceding
# detect_text_from_pdf cell) and display it for a quick visual check.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# Open with mode "w" rather than "a": this cell writes the complete text for
# this PDF batch, so appending would add a duplicate copy of every page each
# time the cell is re-run. Overwriting keeps the cell idempotent.
with open("As-Av_page-0011-0015.txt", "w", encoding="utf-8") as f:
    f.write(final_corrected_text)

[PAGE 1]
ASM 009 = NWA 4708 from Birdwell JTW 21 Feb 07 This is a medium actahedrite with remarkably for inclusions. My guess is that it is low-N: ITAB. ~ 0.85 ± 0.15. Buen thích The kam bandwidth is I see one round troildte with a draineter D.9 wewe and another roughly rectangular R0.4 X NO.8 mm. There is no beat attered zove. is a we small (0.2x0.8) Wealtering is minor to sugligible. associates with lan Fe's that has a similar area (~0.15x1,0mm) (on edge of) 015 thick

---PAGE BREAK---

[PAGE 2]
ASM 005 - NWA 47004 = 219 end mass Med. Oct, band width 0.70 ±0.10. Bands are short, swollen, but no inclusions visible in the centers, I see no Fes and no schreibersite that is + N 1.5 cm Thicks clearly identifiable. The sample in crossed by swall cracks showing oxidation (very minor) at edges. Pleasite, mainly dark gray or finely banded, is abundant, ~ 30 to 35% zove 35 of the the area No heat attered

---PAGE BREAK---

[PAGE 3]
ASM 003 372 NWA 4702. 19 Feb 2007 - a for INAA This sample is 

In [152]:
# Absolute path of the scanned PDF batch to OCR in this step.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\Ba_page-0001-0005.pdf"
# detect_text_from_pdf (defined near the top of the notebook) prints an error
# and returns None when the path does not exist, so check the printed output
# before running the next cell, which passes `text` to process_all_pages.
text = detect_text_from_pdf(pdf_to_process)
print(text)

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Babbi's Mill (Troosts Iron) (Tennessee, USA).
76 Sep 66 That slab, I face
entire mass
is a
alry fine, dark
very
in vital. No obructured are 2x 215 cm² pol & etched 60 ser
plessite, which has an indistinct spottiness on a scale of perhaps
1.02 mm. No inclusions to be seen.
similar to S. Byron.
weathering out to crust, Very
1
--------------------------------------------------
--- Page 2 Text ---
Barranca Blanca (Atacama, Chile)
2.5cm²
6 Sep 66. 13g sample from Bril. Mas.
adjacent side
no Wid. pattern visible, but etch has revealed kam & teen very
nicely. the kam occurs as large creptals, Typical dimensions
The taenite cor very reflective plessite!)
irregular regions of typical dimensions 0.5
3 cm², and
Etched 60 sec in nital. There is
the order
z
q
occurs as
thes not
1am.
taenite borders.
The "bordess
"
the
kam cryptals are generally cracks stretching from one taen.
region & another. These are invariable slightly bridized and 

In [153]:
# Post-process the OCR result held in `text` (set by the preceding
# detect_text_from_pdf cell) and display it for a quick visual check.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# Open with mode "w" rather than "a": this cell writes the complete text for
# this PDF batch, so appending would add a duplicate copy of every page each
# time the cell is re-run. Overwriting keeps the cell idempotent.
with open("Ba_page-0001-0005.txt", "w", encoding="utf-8") as f:
    f.write(final_corrected_text)

[PAGE 1]
Babbi's Mill (Troosts Iron) (Tennessee, USA). 76 Sep 66 That slab, I face entire mass is a alry fine, dark very in vital. No obructured are 2x 215 cm² pol & etched 60 ser plessite, which has an indistinct spottiness on a scale of perhaps 1.02 mm. No inclusions to be seen. similar to S. Byron. weathering out to crust, Very 1

---PAGE BREAK---

[PAGE 2]
Barranca Blanca (Atacama, Chile) 2.5cm² 6 Sep 66. 13g sample from Bril. Mas. adjacent side no Wid. pattern visible, but etch has revealed kam & teen very nicely. the kam occurs as large creptals, Typical dimensions The taenite cor very reflective plessite!) irregular regions of typical dimensions 0.5 3 cm², and Etched 60 sec in nital. There is the order z q occurs as thes not 1am. taenite borders. The "bordess " the kam cryptals are generally cracks stretching from one taen. region & another. These are invariable slightly bridized and I can't determine whether they might also have schreib. in them. In many cases the teenite regio

In [154]:
# Absolute path of the scanned PDF batch to OCR in this step.
pdf_to_process = r"C:\Users\aktne\OneDrive\Desktop\Work\Ba_page-0006-0012.pdf"
# detect_text_from_pdf (defined near the top of the notebook) prints an error
# and returns None when the path does not exist, so check the printed output
# before running the next cell, which passes `text` to process_all_pages.
text = detect_text_from_pdf(pdf_to_process)
print(text)

Sending PDF to Google Cloud Vision API...
--- Page 1 Text ---
Bagdad (Arizona, U.S.A.)
17. II. 7. Iron with intend and clated face has area
zone
around edge.
5 x 7.5 cm.
fasion
Very interesting act structure, Kam bands
range
in size from. 0.8 -0.5 (0m) but also on down & 20.05
mm, as plessite
is banded with bands decreasing in width from
edge to center. No inclusions are observed. Oxidation is
negligible. Does it resemble Madoc?
1 VT67. 119 sample from ASU, 4 flat faces, ~ 5cm²
-5 cm², pol'd +
etched 60 sec. Structure slightly coarser (determined on absolutely
oriented bands!) 0.6-1.0 mm, still Om. Many irregular lines
spaced about each 0.5 mm cross ham- probably indicates shock or
reheating Large amounts of plessite, almost entirely banded, - wilth
but tend to be relatively constant in a given field - 55±10% of area
is plessite. Light oxidation near crust and along single marrow crack.
small (0.1mm () schreib, no other inclusions, Prob.
very
Rare,
very
--------------------------------

In [155]:
# Post-process the OCR result held in `text` (set by the preceding
# detect_text_from_pdf cell) and display it for a quick visual check.
final_corrected_text = process_all_pages(text)
print(final_corrected_text)

# Open with mode "w" rather than "a": this cell writes the complete text for
# this PDF batch, so appending would add a duplicate copy of every page each
# time the cell is re-run. Overwriting keeps the cell idempotent.
with open("Ba_page-0006-0012.txt", "w", encoding="utf-8") as f:
    f.write(final_corrected_text)

[PAGE 1]
Bagdad (Arizona, U.S.A.) 17. II. 7. Iron with intend and clated face has area zone around edge. 5 x 7.5 cm. fasion Very interesting act structure, Kam bands range in size from. 0.8 -0.5 (0m) but also on down & 20.05 mm, as plessite is banded with bands decreasing in width from edge to center. No inclusions are observed. Oxidation is negligible. Does it resemble Madoc? 1 VT67. 119 sample from ASU, 4 flat faces, ~ 5cm² -5 cm², pol'd + etched 60 sec. Structure slightly coarser (determined on absolutely oriented bands!) 0.6-1.0 mm, still Om. Many irregular lines spaced about each 0.5 mm cross ham- probably indicates shock or reheating Large amounts of plessite, almost entirely banded, - wilth but tend to be relatively constant in a given field - 55±10% of area is plessite. Light oxidation near crust and along single marrow crack. small (0.1mm () schreib, no other inclusions, Prob. very Rare, very

---PAGE BREAK---

[PAGE 2]
Balfour Downs (W. Australia, Australia) mm 11 Nov. 66 Irr