In [1]:
pip install pytesseract --target=/Users/astghik.kostanyan/Desktop/capstone
pip install google-cloud-vision


Collecting pytesseract
  Using cached pytesseract-0.3.10-py3-none-any.whl.metadata (11 kB)
Collecting packaging>=21.3 (from pytesseract)
  Using cached packaging-24.0-py3-none-any.whl.metadata (3.2 kB)
Collecting Pillow>=8.0.0 (from pytesseract)
  Using cached pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl.metadata (9.2 kB)
Using cached pytesseract-0.3.10-py3-none-any.whl (14 kB)
Using cached packaging-24.0-py3-none-any.whl (53 kB)
Using cached pillow-10.3.0-cp310-cp310-macosx_10_10_x86_64.whl (3.5 MB)
Installing collected packages: Pillow, packaging, pytesseract
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spyder 5.4.1 requires pyqt5<5.16, which is not installed.
spyder 5.4.1 requires pyqtwebengine<5.16, which is not installed.
pytorchyolo 1.8.0 requires Pillow<10.0.0,>=9.1.0, but you have pillow 10.3.0 which is incompatible.
chainlit 1.0.200 requir

In [7]:
import os
import io
import pandas as pd
from google.cloud import vision
from google.oauth2 import service_account
import openpyxl  # Required by pandas for Excel writing

def authenticate_with_service_account(key_path):
    """Return a Vision ``ImageAnnotatorClient`` authenticated from a key file.

    Args:
        key_path: Path to the service-account JSON key file.

    Returns:
        A ``vision.ImageAnnotatorClient`` built with the credentials loaded
        from ``key_path``.
    """
    creds = service_account.Credentials.from_service_account_file(key_path)
    return vision.ImageAnnotatorClient(credentials=creds)

def _iter_symbols(document):
    """Yield every symbol of a full-text annotation in reading order."""
    for page in document.pages:
        for block in page.blocks:
            for paragraph in block.paragraphs:
                for word in paragraph.words:
                    yield from word.symbols


def detect_handwritten_text(image_path, client, visually_similar_groups,
                            confidence_threshold=0.7, language_hints=('en', 'el')):
    """Run document text detection on one image and collect per-symbol confidences.

    Args:
        image_path: Path of the image file to send to the Vision API.
        client: Object exposing ``document_text_detection(image=..., image_context=...)``.
        visually_similar_groups: Mapping of group label -> collection of
            characters considered visually confusable with each other.
        confidence_threshold: Symbols whose confidence is strictly below this
            value get their look-alike characters looked up. Defaults to 0.7.
        language_hints: Language codes passed to ``vision.ImageContext``.
            Defaults to English and Greek.

    Returns:
        ``(full_text, detailed_confidences)`` where ``full_text`` is
        ``full_text_annotation.text`` and ``detailed_confidences`` is a list
        with one dict per symbol: ``image_name``, ``character``,
        ``confidence`` and ``similar_chars``. ``similar_chars`` maps each
        look-alike character to the confidence it had the last time it was
        seen earlier in this image, or is ``None`` when nothing was found.

    Raises:
        RuntimeError: If the API response carries an error message.
    """
    with open(image_path, 'rb') as image_file:
        content = image_file.read()

    image = vision.Image(content=content)
    image_context = vision.ImageContext(language_hints=list(language_hints))
    response = client.document_text_detection(image=image, image_context=image_context)

    # A failed request comes back as a response with `error.message` set and
    # an empty annotation; fail loudly instead of recording empty text.
    if response.error.message:
        raise RuntimeError(
            f"Vision API error for {image_path}: {response.error.message}"
        )

    document = response.full_text_annotation
    full_text = document.text
    image_name = os.path.basename(image_path)

    # Most recent confidence observed for each character in this image.
    character_confidences = {}
    detailed_confidences = []

    for symbol in _iter_symbols(document):
        char = symbol.text
        confidence = symbol.confidence
        character_confidences[char] = confidence

        similar_chars_confidences = {}
        if confidence < confidence_threshold:
            for similar_chars in visually_similar_groups.values():
                if char not in similar_chars:
                    continue
                for similar_char in similar_chars:
                    if similar_char == char:
                        continue
                    # Only look-alikes already seen in this image can be reported.
                    seen_confidence = character_confidences.get(similar_char)
                    if seen_confidence is not None:
                        similar_chars_confidences[similar_char] = seen_confidence

        detailed_confidences.append({
            'image_name': image_name,
            'character': char,
            'confidence': confidence,
            'similar_chars': similar_chars_confidences or None,
        })

    return full_text, detailed_confidences


def process_folder(folder_path, key_path, output_text_file, output_excel_file):
    """OCR every image in a folder and write the text and confidence reports.

    Args:
        folder_path: Directory scanned (non-recursively) for ``.png``,
            ``.jpg`` and ``.jpeg`` files (case-insensitive).
        key_path: Service-account JSON key file used to build the client.
        output_text_file: Text file that receives one
            ``"<filename>: <full text>"`` entry per image.
        output_excel_file: Excel file that receives one row per detected
            symbol (``image_name``, ``character``, ``confidence``,
            ``similar_chars``).
    """
    client = authenticate_with_service_account(key_path)

    # Pairs of characters that are easy to confuse in handwriting. Every entry
    # must be a single character, because it is compared against one OCR symbol.
    visually_similar_groups = {
        'Group 1': {'(', 'c'},
        'Group 2': {'c', '<'},
        'Group 3': {'c', 'e'},
        'Group 4': {'e', 'l'},
        'Group 5': {'e', 'o'},
        'Group 6': {'o', '0'},
        'Group 7': {'=', 'z'},
        'Group 8': {'z', 'r'},
        'Group 9': {'r', '2'},
        'Group 10': {'z', '2'},
        'Group 11': {'v', 'r'},
        'Group 12': {'r', 'n'},
        'Group 13': {'t', '+'},
        'Group 14': {'y', 'g'},
        # Was the literal string 'alpha', which can never equal a single
        # recognised symbol; the Greek letter itself is what the OCR emits.
        'Group 15': {'2', 'α'},
        'Group 16': {'j', 'i'},
        'Group 17': {'i', ';'},
        'Group 18': {'j', ';'},
        'Group 19': {'{', '('},
        'Group 20': {'O', 'D'},
    }
    image_extensions = ('.png', '.jpg', '.jpeg')
    report_columns = ['image_name', 'character', 'confidence', 'similar_chars']
    all_confidences = []

    # Explicit UTF-8: the recognised text may contain Greek characters, which
    # a platform-default encoding is not guaranteed to be able to write.
    with open(output_text_file, 'w', encoding='utf-8') as text_file:
        # Sorted so both reports come out in the same order on every run.
        for filename in sorted(os.listdir(folder_path)):
            if not filename.lower().endswith(image_extensions):
                continue
            image_path = os.path.join(folder_path, filename)
            full_text, image_confidences = detect_handwritten_text(
                image_path, client, visually_similar_groups
            )
            text_file.write(f"{filename}: {full_text}\n")
            all_confidences.extend(image_confidences)

    # Naming the columns keeps the header row even when no image was found.
    df = pd.DataFrame(all_confidences, columns=report_columns)
    df.to_excel(output_excel_file, index=False)

# Example usage: run the full pipeline with paths relative to the notebook's working directory.
folder_path = 'processed'  # Directory scanned for .png/.jpg/.jpeg images
key_path = 'capstone-420319-c5b354e85ad0.json'  # Service-account JSON key file handed to authenticate_with_service_account
output_text_file = 'full_texts.txt'  # Receives one "<filename>: <full text>" entry per image
output_excel_file = 'character_confidences.xlsx'  # Receives the per-character confidence table

# Sends every matching image in folder_path to the Vision API and writes both output files.
process_folder(folder_path, key_path, output_text_file, output_excel_file)


## pytesseract

In [3]:
# import pytesseract
# from PIL import Image
# import cv2

# # Path to your image file within the Capstone folder
# image_path = '/Users/astghik.kostanyan/Desktop/Capstone/unnamed2_page_3_cropped.jpg'

# # Load the image
# image = Image.open(image_path)

# # Perform OCR
# text = pytesseract.image_to_string(image, lang='eng')

# # Get confidence scores
# confidences = []
# boxes = pytesseract.image_to_boxes(image)
# for b in boxes.splitlines():
#     b = b.split(' ')
#     character = b[0]
#     confidence = float(b[-1])
#     confidences.append((character, confidence))

# # Print extracted text
# print("Extracted Text:", text)

# # Print confidence scores for each character
# print("Confidence Scores:")
# for character, confidence in confidences:
#     print(f"Character: {character}, Confidence: {confidence}")


## EasyOCR

In [4]:
# import easyocr

# reader = easyocr.Reader(['en'])
# result = reader.readtext(image_path)

# # Print each character along with its confidence score
# print("Character    Confidence Score")
# for text_result in result:
#     text = text_result[1]
#     confidence = text_result[2]
#     for character in text:
#         print(f"{character:<12} {confidence}")

## Google Vision

In [5]:
# import io
# from google.cloud import vision
# from google.cloud.vision_v1 import types



# # Authenticate with your API key
# client = vision.ImageAnnotatorClient.from_service_account_json('capstone-420319-c5b354e85ad0.json')

# # Read the image file
# with io.open(image_path, 'rb') as image_file:
#     content = image_file.read()

# # Create an image object
# image = vision.Image(content=content)

# # Perform handwritten text detection
# response = client.text_detection(image=image)

# # Extract text and confidence scores
# texts = response.text_annotations
# extracted_text = texts[0].description

# # Print the extracted text and confidence scores
# print(extracted_text)



In [6]:
# import io
# import json
# from google.cloud import vision
# # from google.cloud.vision_v1 import enums
# from google.oauth2 import service_account

# def authenticate_with_service_account(key_path):
#     credentials = service_account.Credentials.from_service_account_file(key_path)
#     client = vision.ImageAnnotatorClient(credentials=credentials)
#     return client

# def detect_handwritten_text(image_path, client, language_hint=None):
#     with io.open(image_path, 'rb') as image_file:
#         content = image_file.read()

#     image = vision.Image(content=content)
#     image_context = vision.ImageContext(language_hints=[language_hint]) if language_hint else None
#     response = client.document_text_detection(image=image, image_context=image_context)
#     document = response.full_text_annotation

#     text = ''
#     confidences = []

#     for page in document.pages:
#         for block in page.blocks:
#             for paragraph in block.paragraphs:
#                 for word in paragraph.words:
#                     for symbol in word.symbols:
#                         text += symbol.text
#                         confidences.append(symbol.confidence)

#     return text, confidences

# # Path to your JSON key file
# key_path = 'capstone-420319-c5b354e85ad0.json'

# # Authenticate with service account
# client = authenticate_with_service_account(key_path)

# # Language hint (e.g., 'en' for English, 'fr' for French)
# language_hint = 'english'

# # Detect handwritten text and get confidences
# text, confidences = detect_handwritten_text(image_path, client, language_hint)

# # Print confidence scores for each character
# print_confidence_scores(text, confidences)


## For all similar letters

In [8]:
# import io
# from google.cloud import vision
# from google.oauth2 import service_account

# def authenticate_with_service_account(key_path):
#     credentials = service_account.Credentials.from_service_account_file(key_path)
#     client = vision.ImageAnnotatorClient(credentials=credentials)
#     return client

# def detect_handwritten_text(image_path, client, visually_similar_groups, language_hint=None):
#     with io.open(image_path, 'rb') as image_file:
#         content = image_file.read()

#     image = vision.Image(content=content)
#     image_context = vision.ImageContext(language_hints=['en', 'el'])  # Include Greek in language hints
#     response = client.document_text_detection(image=image, image_context=image_context)

#     # Simplified method to extract full text
#     full_text = response.full_text_annotation.text

#     detailed_confidences = {}
#     # Process each page, block, paragraph, word, and symbol to extract confidence and similar characters
#     for page in response.full_text_annotation.pages:
#         for block in page.blocks:
#             for paragraph in block.paragraphs:
#                 for word in paragraph.words:
#                     for symbol in word.symbols:
#                         char = symbol.text
#                         confidence = symbol.confidence

#                         if char not in detailed_confidences:
#                             detailed_confidences[char] = {
#                                 'confidence': confidence,
#                                 'similar_chars_confidences': {}
#                             }

#                         # Populate similar characters' confidences for comparison
#                         for group, similar_chars in visually_similar_groups.items():
#                             if char in similar_chars:
#                                 for similar_char in similar_chars:
#                                     if similar_char != char:
#                                         if similar_char in detailed_confidences:
#                                             detailed_confidences[char]['similar_chars_confidences'][similar_char] = detailed_confidences[similar_char]['confidence']
#                                         else:
#                                             detailed_confidences[char]['similar_chars_confidences'][similar_char] = None

#                         # Print each character, confidence, and similar characters' confidences immediately
#                         print(f"Character: {char}, Confidence: {confidence}")
#                         if detailed_confidences[char]['similar_chars_confidences']:
#                             print(f"  Similar Characters Confidences: {detailed_confidences[char]['similar_chars_confidences']}")

#     return full_text, detailed_confidences





# # Print the full text at the beginning
# print(full_text)

# # Detect handwritten text and get detailed confidences
# full_text, detailed_confidences = detect_handwritten_text(image_path, client, visually_similar_groups, language_hint)



In [9]:
# import io
# from google.cloud import vision
# from google.oauth2 import service_account

# def authenticate_with_service_account(key_path):
#     credentials = service_account.Credentials.from_service_account_file(key_path)
#     client = vision.ImageAnnotatorClient(credentials=credentials)
#     return client

# def detect_handwritten_text(image_path, client, visually_similar_groups, language_hint=None):
#     with io.open(image_path, 'rb') as image_file:
#         content = image_file.read()

#     image = vision.Image(content=content)
#     image_context = vision.ImageContext(language_hints=[language_hint] if language_hint else ['en'])  # Default language hint
#     response = client.document_text_detection(image=image, image_context=image_context)

#     full_text = response.full_text_annotation.text
#     detailed_confidences = {}
#     replacement_map = {}

#     # Process each page, block, paragraph, word, and symbol
#     for page in response.full_text_annotation.pages:
#         for block in page.blocks:
#             for paragraph in block.paragraphs:
#                 for word in paragraph.words:
#                     for symbol in word.symbols:
#                         char = symbol.text
#                         confidence = symbol.confidence
#                         detailed_confidences[char] = confidence

#                         # Check and compare similar character confidences
#                         for group, similar_chars in visually_similar_groups.items():
#                             if char in similar_chars:
#                                 max_confidence = confidence
#                                 best_char = char
#                                 for similar_char in similar_chars:
#                                     if similar_char in detailed_confidences and detailed_confidences[similar_char] is not None:
#                                         if detailed_confidences[similar_char] > max_confidence:
#                                             max_confidence = detailed_confidences[similar_char]
#                                             best_char = similar_char
#                                 replacement_map[char] = best_char

#     # Replace characters in the full text based on the highest confidence characters
#     corrected_text = ''.join([replacement_map.get(char, char) for char in full_text])

#     return corrected_text, detailed_confidences


# # Detect handwritten text and get the corrected full text
# corrected_text, detailed_confidences = detect_handwritten_text(image_path, client, visually_similar_groups, language_hint)

# # Print the corrected full text
# print(corrected_text)


## With a 0.7 confidence threshold

In [11]:
# import io
# from google.cloud import vision
# from google.oauth2 import service_account

# def authenticate_with_service_account(key_path):
#     credentials = service_account.Credentials.from_service_account_file(key_path)
#     client = vision.ImageAnnotatorClient(credentials=credentials)
#     return client

# def detect_handwritten_text(image_path, client, visually_similar_groups, language_hint=None):
#     with io.open(image_path, 'rb') as image_file:
#         content = image_file.read()

#     image = vision.Image(content=content)
#     image_context = vision.ImageContext(language_hints=[language_hint]) if language_hint else None
#     response = client.document_text_detection(image=image, image_context=image_context)
#     document = response.full_text_annotation

#     full_text = document.text  # Simplified extraction of full text
#     detailed_confidences = {}

#     for page in document.pages:
#         for block in page.blocks:
#             for paragraph in block.paragraphs:
#                 for word in paragraph.words:
#                     for symbol in word.symbols:
#                         char = symbol.text
#                         confidence = symbol.confidence

#                         if char not in detailed_confidences:
#                             detailed_confidences[char] = {
#                                 'confidence': confidence,
#                                 'similar_chars_confidences': {}
#                             }

#                         # Always print the character and its confidence
#                         print(f"Character: {char}, Confidence: {confidence}")

#                         # Only check similar characters if confidence is below 0.7
#                         if confidence < 0.7:
#                             for group, similar_chars in visually_similar_groups.items():
#                                 if char in similar_chars:
#                                     for similar_char in similar_chars:
#                                         if similar_char != char:
#                                             # Check if similar character has been encountered and recorded before
#                                             if similar_char in detailed_confidences and detailed_confidences[similar_char]['confidence'] is not None:
#                                                 detailed_confidences[char]['similar_chars_confidences'][similar_char] = detailed_confidences[similar_char]['confidence']

#                             # Print similar characters' confidences if any
#                             if detailed_confidences[char]['similar_chars_confidences']:
#                                 # Filter out None values before printing
#                                 filtered_confidences = {k: v for k, v in detailed_confidences[char]['similar_chars_confidences'].items() if v is not None}
#                                 if filtered_confidences:
#                                     print(f"  Similar Characters Confidences: {filtered_confidences}")

#     return full_text, detailed_confidences

# # Example of initializing and using the function

# client = authenticate_with_service_account(key_path)


# # Detect handwritten text and get detailed confidences
# full_text, detailed_confidences = detect_handwritten_text(image_path, client, visually_similar_groups, language_hint)

# # Print the full text at the beginning
# print("Full Text:", full_text)


In [None]:
pip install docx

In [None]:
# NOTE(review): nothing visible in this notebook imports `exceptions`; this looks
# like a workaround for an import error raised by another package — confirm it is
# still needed before keeping this cell.
pip install exceptions

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt  # imported here so the cell does not rely on a later cell having run

# NOTE(review): `df` is not created at notebook scope by any visible cell
# (process_folder keeps its DataFrame local) and this cell expects a
# 'Character' column — confirm where `df` is loaded from.

# Generate word cloud from characters
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(' '.join(df['Character']))
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Handwritten Characters')
# Save before showing: once plt.show() returns, the current figure is gone
# and savefig would write a blank image.
plt.savefig('character_wordcloud.png')
plt.show()


In [None]:
import matplotlib.pyplot as plt  # imported here so the cell does not rely on a later cell having run

# NOTE(review): `df` with 'Character' and 'Confidence Score' columns is not
# created by any visible cell — confirm where it is loaded from.

# Add a column holding the length of each 'Character' entry
df['Character Length'] = df['Character'].apply(len)

# Scatter plot of confidence score against entry length
plt.scatter(df['Character Length'], df['Confidence Score'], alpha=0.5, color='skyblue')
plt.xlabel('Character Length')
plt.ylabel('Confidence Score')
plt.title('Confidence vs. Character Length')
plt.show()


In [None]:
import matplotlib.pyplot as plt

def plot_character_frequency_histogram(texts):
    """Show a bar chart of how often each character occurs across ``texts``.

    Args:
        texts: Iterable of strings; they are concatenated with ``''.join``
            before counting, so every character of every string is tallied.
    """
    # Tally characters in first-seen order
    tally = {}
    for symbol in ''.join(texts):
        tally[symbol] = tally.get(symbol, 0) + 1

    # Wide canvas so that many distinct characters fit on the x-axis
    plt.figure(figsize=(18, 8))
    plt.bar(tally.keys(), tally.values())
    plt.xlabel('Character')
    plt.ylabel('Frequency')
    plt.title('Frequency Histogram of Characters')
    plt.xticks(rotation=90)  # vertical tick labels
    plt.show()

# Plot the frequency histogram over every character in the detected texts.
# NOTE(review): `image_texts` is not defined in any visible cell of this notebook —
# presumably a list of OCR text strings; confirm where it is created.
plot_character_frequency_histogram(image_texts)


In [None]:
import matplotlib.pyplot as plt

def plot_character_frequency_histogram(texts, threshold=500, output_path='frequency_histogram.jpg'):
    """Plot and save a bar chart of the characters occurring more than ``threshold`` times.

    Args:
        texts: Iterable of strings; they are concatenated with ``''.join``
            before counting.
        threshold: Only characters whose count is strictly greater than this
            value are plotted. Defaults to 500.
        output_path: File the figure is saved to. Defaults to
            ``'frequency_histogram.jpg'``.
    """
    from collections import Counter

    # Count every character; most_common() orders by descending count and
    # keeps first-seen order between characters with equal counts.
    character_counts = Counter(''.join(texts))
    frequent = [(char, count) for char, count in character_counts.most_common()
                if count > threshold]
    characters = [char for char, _ in frequent]
    frequencies = [count for _, count in frequent]

    plt.figure(figsize=(18, 8))  # wide canvas for many distinct characters
    plt.bar(characters, frequencies)
    plt.xlabel('Character')
    plt.ylabel('Frequency')
    # Report the threshold actually used instead of a fixed "500".
    plt.title(f'Frequency Histogram of Characters (Frequency > {threshold})')
    plt.xticks(rotation=0)  # keep tick labels horizontal
    # Save before showing: once plt.show() returns, the current figure is
    # gone and savefig would write a blank image.
    plt.savefig(output_path)
    plt.show()


# Plot the histogram restricted to characters that occur more than 500 times.
# NOTE(review): `image_texts` is not defined in any visible cell of this notebook —
# presumably a list of OCR text strings; confirm where it is created.
plot_character_frequency_histogram(image_texts, threshold=500)


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt
import re

# Load the saved results, one entry per line
with open('handwriting_results.txt', 'r') as file:
    lines = file.readlines()

# Keep only the payload of each "Detected Text:" line, followed by a single space
detected_text = ''.join(
    line.split(':', 1)[1].strip() + ' '
    for line in lines
    if line.startswith('Detected Text:')
)

# Build the word cloud from the concatenated text
wordcloud = WordCloud(
    width=800,
    height=400,
    background_color='white',
).generate(detected_text)

# Render it without axes
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.show()


# Write the word cloud itself to an image file
wordcloud.to_file('word_cloud.jpg')
