In [None]:
! apt install tesseract-ocr
! apt install libtesseract-dev
! pip install pytesseract

In [None]:
#import packages
import pytesseract
import cv2
from google.colab.patches import cv2_imshow
import numpy as np
import pandas as pd
import re
import os
import shutil
from google.colab import drive
import datetime


In [14]:
#this will connect to your Google Drive. It will ask you to allow access
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# create a file list data frame to keep the old-name -> new-name conversions and status 
file_names_list = []


In [None]:
def unzip_src(folder):
  """
  Unzip zipped folder.

  A zipped folder is taken in and unzipped.

  Parameter
  ---------
  kind : Google Drive Folder
    Folder containing watershed photos.

  Returns
  -------
  new_src : new src for unzipped folder.
  unzipped : unzipped folder.

  """
  new_src = folder
  unzipped = None

  if folder[-4:] == '.zip':
    src_as_zip = True
    unzipped = folder[:-4]
    shutil.unpack_archive(folder, unzipped)

    parentZip = np.array(os.listdir(unzipped))
    new_src = unzipped + '/' + parentZip[0]
  
  return new_src, unzipped


def new_folder(src, dst = None):
  '''
  Create a new folder destination address for folder to be copied.

  This method's parameter is the address of folder you're going to copy, address of destination parent folder
  It will return the name of folder (to use later), full new address of copy folder 
  determines new folder address.

  Parameters
  ----------
  src : src address for folder to be copied.
  dst : destination address for folder to be copied.

  Returns
  -------
  folder_name : the new folders name.
  newAddress : the full new address of copy folder.
  '''
  if dst is None:
    dst = src
  # print(src, "Src")
  src_elements = src.split('/')
  folder_name = src_elements[-1][:2]   #!!!this should be changed if src_elements[-2][1] will not be watershed number!!
  # print(folder_name, "Folder")
  newAddress = dst + "/" + folder_name
  return folder_name, newAddress


def generate_picName(fdr_name, tStamp):
  '''
  Generate standardized picture names.

  The full source address of a pciture file and its time stamp are standardized 
  to a format delineated by water shed number and time stamp. 

  Parameters
  ----------
  fdr_name : full source address of current picture file.
  tStamp : unaltered timeStamp from bottom of the photo
      as a string.

  Returns
  -------
  new_name : new name of picture
  '''
  ws_num = fdr_name[1] #!!!this should be changed if src_elements[-2][1] will not be watershed number!!

  stamp_elements = re.split('[\n: -]', tStamp)
  date = stamp_elements[2] + stamp_elements[0] + stamp_elements[1]
  time = stamp_elements[3] + stamp_elements[4] + stamp_elements[5]

  new_name = "Hbwtr_w" + ws_num + '_' + date + '_' + time + '.JPG'
  return new_name

def extract_timeStamp(pic_address):
    '''
    Extract time Stamp from picture file.

    From the bottom right of each picture file, the time stamp is read as image using cv2. It is then converted to a string.
    text which is then checked for format and subsequently returned through match_date_format.

    Parameters
    ----------
    pic_address : full source address of current picture file.
    Returns
    -------
    match_date_format.group(0) : unaltered timeStamp from bottom of the photo as a string.
    '''
    # print(pic_address)

    img = cv2.imread(pic_address) #read as an image

    # check if the timestamp is the correct format
    date_pattern = "\d\d-\d\d-\d\d\d\d \d\d:\d\d:\d\d" # eg 12-12-2020 11:59:32

    loop = 1
    size_extension=0
    x,y,z = np.shape(img)
    x = (x//1000)*1000
    y = (y//1000)*1000
    # print(x,y,z)
    while loop>0:
      ts = img[2352 - size_extension:, 2000-size_extension:, :] #(change if sizing conventions change!)
      text = pytesseract.image_to_string(ts)
      match_date_format = re.search(date_pattern,text)
      if match_date_format:
        # found timestamp, return
        break
      ts_2 = img[x - size_extension:, x-size_extension:, :] #(change if sizing conventions change!)
      text_2= pytesseract.image_to_string(ts_2)
      match_date_format = re.search(date_pattern,text_2)
      if match_date_format:
        # found timestamp, return
        break
      size_extension+=100
      loop-=1

    if loop ==0:      
      # reached end of loop without finding correct timestamp
      print("Correct timestamp not found")

    else:
      return match_date_format.group(0)

      

def rename_images(picFolder, fdr_name, fdr_dst, file_df=None):

  '''
  Rename all files in the current folder and populate the list of old and new names.

  For each picture file in the current folder, its filename is changed and its new name is generated in another method.
  The old image is copied to its new destination.

  Parameters
  ----------
  picFolder : a folder containing pictures.
  fdr_name : full address of folder.
  fdr_dst : full destination address.
  '''

  picFiles = np.array(os.listdir(picFolder))
  old_folder = "/".join(picFolder.split("/")[-3:-1])
  for filename in picFiles:
    print(filename, "filename")
    old_name = filename
    new_name = np.nan
    status="raw"
    note = np.nan
    print("Old filename", filename) #just to track where you are
    src = picFolder + '/' + filename #old img address
    # print(src)
    filetype = filename[-4:]
    try:
      if (filetype == '.JPG') or (filetype == '.jpg'):
        tStamp = extract_timeStamp(src)
        new_name = generate_picName(fdr_name, tStamp)
        print("New filename", new_name)
        status = "renamed"
        dst = fdr_dst + '/' + filename #new img address
        dst_renamed = fdr_dst + '/' + new_name #img new address + name
        # shutil.copy(src, dst_renamed) #copies old image to new destination

        # os.rename(src, dst) #renames file in Google Drive
      else:
        print("Not image file")
        status= "error"
        note = "Not image file"
    except:
      print("Unable to process file")
      status = "error"
      note =  "Unable to process file"
    curr_list = [old_name, new_name, status, note, picFolder]
    file_names_list.append(curr_list)

In [None]:
# MAIN

from glob import glob

#collect all folder paths from newly uploaded data on folder
folder_list = glob("/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/Newly_uploaded_data/*/", recursive = True)
# collect all folder path from on deck folder
folder_list.extend(glob("/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/On_Deck/*/", recursive = True))
# extract folder_name 
folder_list = [f[:-1] for f in folder_list]
i = 0
file_df = pd.read_csv("/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/Testing destination/file_df.csv")

# for each folder rename and add them to the new destination - dst
for folder in folder_list:
  print(i,"/", len(folder_list))
  i+=1
  # destination to save labeled images
  dst = "/content/drive/MyDrive/2_Camera Trap photos/project_dir/labeled_image_files"
  save_as_zip = False

  #will unzip if necessary
  folder, unzipped = unzip_src(folder)

  # #create new destination folder
  fdr_name, fdr_dst = new_folder(folder, dst)

  if os.path.exists(fdr_dst):
    print("path already exists")
  else:
    print("new path")
    os.mkdir(fdr_dst)
  print(folder)
  print(fdr_name)
  print(fdr_dst)
  rename_images(folder, fdr_name, fdr_dst, file_df = file_df)

In [None]:
file_df.head()

In [None]:
file_df[(file_df.status=="error") ]

In [None]:
# file_list data frame
file_df = pd.DataFrame(file_names_list, columns = ["old_name", "new_name","status", "note","old_folder"])
file_df.to_csv(dst+"/"+"file_df.csv")
file_df.head()

Unnamed: 0,old_name,new_name,status,note,old_folder
0,11060002.JPG,Hbwtr_w4_20201106_115812.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
1,11070004.JPG,Hbwtr_w4_20201107_115811.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
2,11080006.JPG,Hbwtr_w4_20201108_115810.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
3,11090008.JPG,Hbwtr_w4_20201109_115809.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
4,11100010.JPG,Hbwtr_w4_20201110_115807.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...


In [None]:
# total number of image files
file_df.shape[0]

3144

In [63]:
# Load in created csv, returns file name and path if extracted timestamp is not in range

file_df = pd.read_csv("/content/drive/MyDrive/2_Camera Trap photos/project_dir/labeled_image_files/file_df.csv")
file_df = file_df[file_df["new_name"].notnull()]
for index, row in file_df.iterrows():
  #Check month range
  if int(row["new_name"][13:15]) > 12 or int(row["new_name"][13:15]) < 0:
    print(row["new_name"]+" Month not in range, check name in folder :"+row["old_folder"])
  #Check year range
  if int(row["new_name"][9:13]) > 2022 or int(row["new_name"][9:13]) < 2018:
    print(row["new_name"]+" Year not in range, check name in folder :"+row["old_folder"])
  #Check day range
  if int(row["new_name"][15:17]) > 31 or int(row["new_name"][15:17]) < 0:
    print(row["new_name"]+" Day not in range, check name in folder :"+row["old_folder"])

Hbwtr_w1_20201904_115941.JPG Month not in range, check name in folder :/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/Newly_uploaded_data/W1 GC Channel 11-7-20  thru 5-9-21
Hbwtr_w1_20204216_115928.JPG Month not in range, check name in folder :/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/Newly_uploaded_data/W1 GC Channel 11-7-20  thru 5-9-21
Hbwtr_w1_20201295_115918.JPG Day not in range, check name in folder :/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/Newly_uploaded_data/W1 GC Channel 11-7-20  thru 5-9-21
Hbwtr_w1_20910925_115335.JPG Year not in range, check name in folder :/content/drive/MyDrive/2_Camera Trap photos/COPY of data for script/Newly_uploaded_data/W1 GC Channel 5-10-21 thru 12-5-21


In [59]:
file_df[file_df["new_name"].notnull()]

Unnamed: 0.1,Unnamed: 0,old_name,new_name,status,note,old_folder
0,0,11060002.JPG,Hbwtr_w4_20201106_115812.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
1,1,11070004.JPG,Hbwtr_w4_20201107_115811.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
2,2,11080006.JPG,Hbwtr_w4_20201108_115810.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
3,3,11090008.JPG,Hbwtr_w4_20201109_115809.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
4,4,11100010.JPG,Hbwtr_w4_20201110_115807.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
...,...,...,...,...,...,...
3138,3138,05050182.JPG,Hbwtr_w2_20190505_115920.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
3139,3139,02090010.JPG,Hbwtr_w2_20190209_120413.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
3140,3140,03050057.JPG,Hbwtr_w2_20190305_115807.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...
3141,3141,02050001.JPG,Hbwtr_w2_20190205_115925.JPG,renamed,,/content/drive/MyDrive/2_Camera Trap photos/CO...


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# files with error status
file_df[file_df.status=="error"].shape
# files unable to process
fl = (file_df[file_df.note=="Unable to process file"][["old_folder", "old_name"]]).values

(29, 5)

In [None]:
# number of files with error status
file_df[(file_df.status=="error") & (file_df.note!="Not image file")].shape[0]

# These are renamed and uploaded  manually-  new name column changed and  status column changed  to "renamed manually"


18