In [None]:
# Mount Google Drive into the Colab filesystem at /content/gdrive so the image
# folders under /content/gdrive/MyDrive/... used below are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
!pip install pytesseract # Installing the pytesseract library, which is used for optical character recognition (OCR) in images.

Collecting pytesseract
  Downloading pytesseract-0.3.10-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.10


In [None]:
# Importing necessary libraries for text extraction and image processing
import pytesseract  # For optical character recognition (OCR) to extract text from images
from PIL import Image  # For opening and manipulating image files
import cv2  # For advanced image processing and computer vision tasks
import matplotlib.pyplot as plt  # For visualizing images and results using plots
import glob  # For finding all pathnames matching a specified pattern



# Folders (on the mounted Drive) whose images should be listed.
folder_dirs = ['/content/gdrive/MyDrive/cam07-output 0.85/false detections']

# Collect every JPEG found directly inside each folder.
# - The extension test is case-insensitive so ".jpg" files are not silently
#   skipped alongside ".JPG" ones.
# - Each folder listing is sorted because glob yields entries in arbitrary
#   filesystem order, which would make the result differ between runs.
image_paths = []
for folder_dir in folder_dirs:
    image_paths.extend(
        path
        for path in sorted(glob.iglob(f'{folder_dir}/*'))
        if path.lower().endswith('.jpg')
    )

print(image_paths[:10])  # Print the first 10 image paths
print(len(image_paths))

['/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280035.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280053.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280052.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280058.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11300497.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11300496.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12070249.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12210056.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12210055.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12230433.JPG']
395


In [None]:
!pip install easyocr # Installing the easyocr library for optical character recognition (OCR) in images.

Collecting easyocr
  Downloading easyocr-1.7.1-py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m36.2 MB/s[0m eta [36m0:00:00[0m
Collecting python-bidi (from easyocr)
  Downloading python_bidi-0.4.2-py2.py3-none-any.whl (30 kB)
Collecting pyclipper (from easyocr)
  Downloading pyclipper-1.3.0.post5-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (908 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m908.3/908.3 kB[0m [31m56.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting ninja (from easyocr)
  Downloading ninja-1.11.1.1-py2.py3-none-manylinux1_x86_64.manylinux_2_5_x86_64.whl (307 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.2/307.2 kB[0m [31m30.4 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch->easyocr)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==

In [None]:
# This script processes images to extract text related to bird detections from specified folders.
# It uses OCR to read text from images, organizes the extracted data, and saves it into a CSV file.
# The code also tracks the number of birds detected in each folder and formats the extracted text into structured data.
# Note: The folder_dir and the name of the CSV file can be changed based on your data root and personal naming preferences.
# The output of this code may require manual verification, as it sometimes extracts incorrect characters that need to be removed.

import pytesseract
from PIL import Image
import cv2
import matplotlib.pyplot as plt
import glob
import easyocr
import time
import pickle
import pandas as pd

# Folders (on the mounted Drive) whose images are processed below. Each entry
# is also passed to get_number_of_birds(), which looks for '01'..'05' in the
# path string to decide the bird count recorded for that folder's images.
folder_dirs = [
    '/content/gdrive/MyDrive/cam07-output 0.85/false detections']


def get_number_of_birds(folder_dir):
    """Return the bird count encoded in ``folder_dir``.

    The markers '01', '02', '03', '04' and '05' are tried in that order, and
    the first one that occurs anywhere in ``folder_dir`` (a plain substring
    test over the whole string, not only its last component) decides the
    result. If none of them occurs, 0 is returned.
    """
    for count in range(1, 6):
        marker = f'{count:02d}'
        if marker in folder_dir:
            return count
    return 0

# Parallel lists: image_paths[i] is an image file and number_of_birds[i] is the
# bird count derived from the folder that image came from.
image_paths = []
number_of_birds = []

for folder_dir in folder_dirs:
    num_birds = get_number_of_birds(folder_dir)
    # Case-insensitive extension test so ".jpg" files are not silently skipped;
    # sorted because glob yields entries in arbitrary filesystem order, which
    # would make the row order of the final CSV differ between runs.
    folder_images = sorted(
        path
        for path in glob.iglob(f'{folder_dir}/*')
        if path.lower().endswith('.jpg')
    )
    image_paths.extend(folder_images)
    number_of_birds.extend([num_birds] * len(folder_images))

print(image_paths[:10])
print(len(image_paths))


# This function loads an image, crops the strip at the bottom of the frame
# (the white ribbon containing the temporal data), converts that strip to
# grayscale and returns it for text extraction.

def process_image(image_path):
    """Return the grayscale bottom strip of the image at ``image_path``.

    The strip is rows 1200-1439 and columns 0-1919 of the decoded image.
    NOTE(review): these bounds presumably assume 1920x1440 frames - confirm
    against the camera's output size.

    Nothing is drawn onto the image. The previous version outlined the region
    (0, 1300)-(1920, 1440) with a 6 px rectangle before cropping; that outline
    lies inside the cropped strip, so its edges were part of the pixels handed
    to the OCR reader.

    Parameters
    ----------
    image_path : str
        Path of the image file to read.

    Returns
    -------
    numpy.ndarray
        Single-channel (grayscale) crop of the image.

    Raises
    ------
    OSError
        If the file is missing or cannot be decoded (cv2.imread returns None).
    ValueError
        If the image is too small for the crop to contain any pixels.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None instead of raising.
        raise OSError(f"Could not read image (missing or unreadable): {image_path}")

    cropped_image = image[1200:1440, 0:1920]
    if cropped_image.size == 0:
        raise ValueError(
            f"Image {image_path} has shape {image.shape[:2]}; "
            "the crop rows 1200-1440 are outside the frame"
        )

    # imread returns BGR; converting straight to gray gives the same result as
    # the former BGR -> RGB -> gray round trip.
    return cv2.cvtColor(cropped_image, cv2.COLOR_BGR2GRAY)

# ---------------------------------------------------------------------------
# OCR every image and build one CSV row per image.
#
# Fields are grouped PER IMAGE. Previously all OCR fragments were flattened
# into one list and re-chunked in fives, so a single image yielding more or
# fewer than five fields shifted every later row against its Image_Name, and
# a chunk that jumped past five elements was never closed at all.
# ---------------------------------------------------------------------------
EXPECTED_FIELDS = 5  # Bushnell, camera number, temperature, date, time
PLACEHOLDER = 'aa'   # Marks a field the OCR did not produce

texts = []  # Flat list of every detected text fragment (kept for inspection)
picture_names = []  # Image file names, one per processed image
grouped_elements = []  # One list of exactly EXPECTED_FIELDS strings per image
batch_size = 1000  # Define batch size for processing
start_time = time.time()  # Start timer to track elapsed time
reader = easyocr.Reader(['en'])  # Initialize EasyOCR reader for English

# Process images in batches
for i in range(0, len(image_paths), batch_size):
    batch_paths = image_paths[i:i + batch_size]  # Get the current batch of image paths
    for image_path in batch_paths:
        gray_image = process_image(image_path)  # Crop + grayscale the data ribbon
        result = reader.readtext(gray_image)  # Extract text using EasyOCR
        picture_names.append(image_path.split('/')[-1])  # Store the image file name

        # detection[1] is the recognised text of each detection.
        image_texts = [detection[1] for detection in result]
        texts.extend(image_texts)

        # Split this image's fragments on commas and normalise to exactly
        # EXPECTED_FIELDS entries so the row stays aligned with its image.
        fields = [part for text in image_texts for part in text.split(',')]
        if len(fields) > EXPECTED_FIELDS:
            # Too many pieces: keep everything, folding the surplus into the
            # last column so nothing is lost for manual verification.
            overflow = ' '.join(fields[EXPECTED_FIELDS - 1:])
            fields = fields[:EXPECTED_FIELDS - 1] + [overflow]
        else:
            fields = fields + [PLACEHOLDER] * (EXPECTED_FIELDS - len(fields))
        grouped_elements.append(fields)
end_time = time.time()
elapsed_time = end_time - start_time

print("Elapsed Time:", elapsed_time)

# Comma-split view of the flat fragment list (kept for inspection only).
split_texts = [text.split(',') for text in texts]

# Extract structured data from grouped elements (one entry per image)
bushnell = [group[0] for group in grouped_elements]
camera_numbers = [group[1] for group in grouped_elements]
temperatures = [group[2] for group in grouped_elements]
dates = [group[3] for group in grouped_elements]
times = [group[4] for group in grouped_elements]

print(len(bushnell))
print(len(camera_numbers))
print(len(temperatures))
print(len(dates))
print(len(times))
print(len(picture_names))
print(len(number_of_birds))

# The OCR columns and picture_names are equal in length by construction;
# number_of_birds is built alongside image_paths and must match as well.
if len(number_of_birds) != len(picture_names):
    raise ValueError(
        f"number_of_birds has {len(number_of_birds)} entries but "
        f"{len(picture_names)} images were processed; rebuild image_paths and "
        "number_of_birds before running this step"
    )

df = pd.DataFrame({
    'Image_Name': picture_names,
    'Bushnell': bushnell,
    'Camera_Numbers': camera_numbers,
    'Temperatures': temperatures,
    'Dates': dates,
    'Times': times,
    'Number_of_Birds': number_of_birds
})

print(df)
df.to_csv('CAME07-FalseDetections.csv', index=False)  # Save the DataFrame to a CSV file




['/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280035.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280053.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280052.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11280058.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11300497.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/11300496.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12070249.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12210056.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12210055.JPG', '/content/gdrive/MyDrive/cam07-output 0.85/false detections/12230433.JPG']
395
Progress: |██████████████████████████████████████████████████| 100.0% Complete



Progress: |██████████████████████████████████████████████████| 100.0% Complete

  return F.conv2d(input, weight, bias, self.stride,


Elapsed Time: 114.05581974983215
395
395
395
395
395
395
395
       Image_Name   Bushnell Camera_Numbers Temperatures       Dates  \
0    11280035.JPG  (Bushnell          Camo7    16*F -8*C  11-28-2022   
1    11280053.JPG  (Bushnell          Camo7    16*F -8C6  11-28-2022   
2    11280052.JPG  (Bushnell          Camo7    16*F -8C8  11-28-2022   
3    11280058.JPG  IBushnell          Camo7    16*F -8C8  11-28-2022   
4    11300497.JPG  (Bushnell          Camo7      21*-6C8  11-30-2022   
..            ...        ...            ...          ...         ...   
390  01270103.JPG  (Bushnell          Camo7      20*-6C8  01-27-2023   
391  01270108.JPG  (Bushnell          Camo7    20* -6*C8  01-27-2023   
392  01270105.JPG  (Bushnell          Camo7    20* -6*C8  01-27-2023   
393  01270111.JPG  (Bushnell          Camo7      21*-6C4  01-27-2023   
394  01270104.JPG  (Bushnell          Camo7     20*-6*C8  01-27-2023   

           Times  Number_of_Birds  
0      17:10 :27                0  
1 