<a href="https://colab.research.google.com/github/a-anksri/sculpture/blob/main/Dataset_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Code to Test if annotations are proper for later consolidation

## Expectations:
1. Annotations are saved in UTF-8 encoded csv files. You can download the gsheet, save as UTF-8 CSV and upload back in the same folder
2. For any annotation sheet in a folder, the corresponding images are either in
(i) A subfolder with the name "images"/"Images"/"IMAGES" (in that order of priority), or
(ii) if no such sub-folder exists, in the same folder itself.
3. The code will scan through all csv files in given root folder and its sub_folders. Multiple csv files in same folder are ok but please ensure that duplicate csv files are not kept otherwise the same image will get loaded twice.
4. All images are in .jpg, .png or .jfif format. Please tell me if you expect any other format too.
5. We expect that the csv file will have a header row. If not, the first entry will get missed (we can change this later, but I guess this is the better option).
6. You need not worry about blank entries or rows that do not relate to an image. The code will handle that.
7. Once you run the code on your folder, it will generate a final_annotation sheet and a failed sheet. Please check both. Failed sheet entries are entries in any csv file for which the corresponding image file was not found.
8. The code will merge all information entered in column 3 onwards. So any temple names etc. entered in separate columns will get merged.
9. The names of folders (from root_folder downwards) will also get added to the caption. You may give informative names to your folders/sub_folders and the same will get added to all entries.

## Imports etc. Do not Change

In [None]:
import pandas as pd
import os
import numpy as np
from PIL import Image
from operator import add

In [None]:
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive; force_remount re-mounts even if a mount already exists
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:
from google.colab import auth
import gspread
from google.auth import default
# Authenticate this Colab session with Google, then build the gspread client
# (`gc`) that gsheet_crawl uses to open annotation sheets
auth.authenticate_user()
creds, _ = default()
gc = gspread.authorize(creds)

## Enter the folders you want to use. If it is a shared folder, please add a shortcut to it in your My Drive so that Colab can access it

In [None]:
# Root folder you want to test. The crawl recursively looks at all of its sub-folders too
root_folder = '/content/drive/MyDrive/Nepal'

# Folder where the report sheets are saved (final_annotation.csv, failed_sheet.csv,
# folder_wise_sheet.csv)
destination_folder = '/content/drive/MyDrive/Nepal'

## Run all subsequent cells

In [None]:
# Global variables
# Column layout of the consolidated annotation table (used by Annotation)
column_names = ['file_name', 'web_address', 'caption']

# Collects one dict per sheet entry whose image file could not be found
failed = []

In [None]:
class Annotation:
  """Accumulates consolidated annotation rows and hands out running image ids."""

  def __init__(self):
    # Next id to hand out; get_id() returns it and then increments it
    self.id = 0
    # Consolidated table, created with the module-level column_names as its columns
    self.final_annotations = pd.DataFrame(columns = column_names)

  def get_id(self):
    """Return the current id and advance the counter by one."""
    current = self.id
    self.id += 1
    return current

  def read_id(self):
    """Return the current id without advancing the counter."""
    # Must be self.id: a bare `id` here resolves to the builtin function
    return self.id

  def reset_id(self, id = 0):
    """Set the counter to ``id`` (0 by default)."""
    self.id = id

  def append_annotations(self, df):
    """Append the rows of ``df`` to the consolidated table.

    The index of the consolidated table is renumbered from 0. An empty ``df``
    leaves the table untouched.
    """
    if df.empty:
      return

    if self.final_annotations.empty:
      # Nothing stored yet: adopt df directly, keeping the declared columns first.
      # This also avoids concatenating with an empty frame.
      existing = list(self.final_annotations.columns)
      extra = [col for col in df.columns if col not in existing]
      self.final_annotations = df.reindex(columns = existing + extra).reset_index(drop = True)
      return

    # DataFrame.append was removed in pandas 2.0; pd.concat is its replacement
    self.final_annotations = pd.concat([self.final_annotations, df], ignore_index = True)

  def fetch_row(self, i):
    """Return row ``i`` (positional) of the consolidated table."""
    return self.final_annotations.iloc[i]

  def save(self):
    """Write the consolidated table to final_annotation.csv in destination_folder."""
    path = os.path.join(destination_folder,'final_annotation.csv')
    self.final_annotations.to_csv(path, columns = column_names)

Experiments in progress from here

In [None]:

def gsheet_crawl(path, annot, name = '', test = False):
  """Recursively consolidate Google Sheets annotations found under ``path``.

  Sub-folders are crawled first. If any sub-folder (at any depth) contained a
  ``.gsheet`` file, this folder's own files are skipped. Otherwise every
  ``.gsheet`` entry in this folder is opened through the module-level gspread
  client ``gc``, its rows are matched against image files, and matched rows
  are appended to ``annot``.

  Args:
    path: Folder to crawl.
    annot: Collector providing ``get_id()`` and ``append_annotations(df)``.
    name: Caption prefix built from the folder names walked so far.
    test: When true, print the folder listing and every unmatched file name.

  Returns:
    True if a sheet was found in this folder or in any sub-folder, else False.

  Side effects:
    Appends to the module-level ``failed`` and ``folder_wise`` lists and adds
    to the module-level ``total_success`` / ``total_failed`` counters.
  """
  global total_success
  global total_failed

  # Extensions tried after the file name exactly as written in the sheet
  image_extensions = ('.jpg', '.png', '.JPG', '.PNG', '.jpeg', '.JPEG')

  # Recursively crawl all sub-folders, remembering whether any of them held a sheet
  found_below = False
  for entry in os.listdir(path):
    sub_path = os.path.join(path, entry)
    if os.path.isdir(sub_path):
      found_below = gsheet_crawl(sub_path, annot, name + ' ' + entry, test) or found_below

  #delete this if you want to search in parent folders too
  if found_below:
    return True

  print("working on " + path)
  files = os.listdir(path)
  failed_num = 0
  count = 0

  if test:
    print(files)

  # Images are expected in the first of images/Images/IMAGES that exists as a
  # sub-folder here; otherwise in this folder itself.
  image_path = path
  for candidate in ('images', 'Images', 'IMAGES'):
    candidate_path = os.path.join(path, candidate)
    if candidate in files and os.path.isdir(candidate_path):
      image_path = candidate_path
      break

  found_sheet = False
  for sheet_file in files:
    if not sheet_file.endswith('.gsheet'):
      continue
    found_sheet = True

    # Drop the 7-character '.gsheet' suffix; the remainder is handed to gc.open
    sheet_title = sheet_file[:-7]
    tmp = os.path.join(path, sheet_title)
    worksheet = gc.open(sheet_title).sheet1
    # get_all_values gives a list of rows (the header row, if any, is included as data)
    annotations = pd.DataFrame(worksheet.get_all_values())
    print("found gheet " + tmp)

    # Merge caption and other info fields (column 3 onwards) behind the folder-name prefix
    annotations['final_caption'] = str(name)
    num_columns = len(annotations.columns.values)
    if num_columns < 4:
      print('wrongly formatted annotation sheet. Less than 3 columns')
      if num_columns < 3:
        # No file-name/web-address pair to read at all: skip the sheet rather
        # than fail on the missing second column
        continue
    else:
      for col in annotations.columns.values[2:-1]:
        extra = annotations[col].map(lambda v: ' ' if pd.isna(v) else str(v))
        annotations['final_caption'] = annotations['final_caption'].astype(str) + ' ' + extra

    # Take only three columns and drop any entry for which any of them is empty/NaN
    name_col = annotations.columns.values[0]
    web_col = annotations.columns.values[1]
    annotations = annotations.loc[:, [name_col, web_col, 'final_caption']]
    annotations = annotations.replace('', np.nan)
    annotations = annotations.dropna()

    names = []
    web_adds = []
    captions = []

    # i counts the rows that survived the dropna above, not the sheet's own row numbers
    for i, (_, data) in enumerate(annotations.iterrows()):
      file_name = data[name_col].strip()
      web_add = data[web_col]
      caption = data['final_caption']

      # Check if a corresponding image file exists in image_path
      candidates = [os.path.join(image_path, file_name)]
      candidates.extend(os.path.join(image_path, file_name + ext) for ext in image_extensions)

      if not any(os.path.isfile(candidate) for candidate in candidates):
        if test:
          print(file_name)

        # Cells ending in "name"/"names" are treated as header text, not as failures
        if not file_name.lower().endswith(('name', 'names')):
          failed_num += 1
          failed.append({"path":tmp, "Sl No":i, "file_name":file_name})
        continue

      # Found entries get a fresh running id as their consolidated file name
      names.append('I{}.png'.format(annot.get_id()))
      web_adds.append(web_add)
      captions.append(caption)
      count += 1

    df = pd.DataFrame({'file_name':names,'web_address':web_adds,'caption':captions})
    annot.append_annotations(df)

  if not found_sheet:
    print("no gsheet found")
    return False

  print("Total Successful Count =  {}".format(count) )
  total_success += count
  total_failed += failed_num
  print("Failed = {}".format(failed_num))
  folder_wise.append({"Folder":name, "Success":count, "Failed":failed_num})
  return True

In [None]:
# driver code: crawl the root folder, then write the report sheets

# Fresh accumulators for this run; gsheet_crawl fills these module-level names
failed, folder_wise = [], []
total_success = total_failed = 0

dir = root_folder
annot = Annotation()
gsheet_crawl(dir, annot, test = False)

# Failed entries, split into one list per output column
paths = [entry["path"] for entry in failed]
sl_nos = [entry["Sl No"] for entry in failed]
names = [entry["file_name"] for entry in failed]

# Per-folder success/failure counts, split the same way
folders = [entry["Folder"] for entry in folder_wise]
suc = [entry["Success"] for entry in folder_wise]
fail = [entry["Failed"] for entry in folder_wise]

failed_sheet = pd.DataFrame({"path": paths, "Sl_No": sl_nos, "file_name": names})
path = os.path.join(destination_folder,'failed_sheet.csv')
failed_sheet.to_csv(path)

folder_wise_sheet = pd.DataFrame({
  "Folder": folders,
  "success": suc,
  "failed": fail,
  "Total": [ok + bad for ok, bad in zip(suc, fail)],
})
path = os.path.join(destination_folder,'folder_wise_sheet.csv')
folder_wise_sheet.to_csv(path)

annot.save()

print(total_success, total_failed)

working on /content/drive/MyDrive/Nepal/Images
no gsheet found
working on /content/drive/MyDrive/Nepal
found gheet /content/drive/MyDrive/Nepal/Data Collection_Nepal
Total Successful Count =  1690
Failed = 84
1690 84


## Fallback code

In [None]:
# Re-write the report sheets from the objects left behind by the driver cell
# (failed_sheet, folders, suc, fail, annot)
path = os.path.join(destination_folder,'failed_sheet.csv')
failed_sheet.to_csv(path)

folder_totals = [ok + bad for ok, bad in zip(suc, fail)]
folder_wise_sheet = pd.DataFrame({
  "Folder": folders,
  "success": suc,
  "failed": fail,
  "Total": folder_totals,
})
path = os.path.join(destination_folder,'folder_wise_sheet.csv')
folder_wise_sheet.to_csv(path)

annot.save()

print(total_success, total_failed)

7426 2086


In [None]:
def crawl(path, annot, name = ''):
  """Recursively consolidate CSV annotations found under ``path``.

  Sub-folders are crawled first, then every ``.csv`` file in this folder is
  read with pandas, its rows are matched against image files, and matched
  rows are appended to ``annot``.

  Args:
    path: Folder to crawl.
    annot: Collector providing ``get_id()`` and ``append_annotations(df)``.
    name: Caption prefix built from the folder names walked so far.

  Side effects:
    Appends to the module-level ``failed`` list and adds to the module-level
    ``total_success`` / ``total_failed`` counters.
  """
  global total_success
  global total_failed

  # A name that already carries one of these extensions is accepted as written
  known_extensions = ('.jpg', '.png', '.jfif', '.PNG', '.JPG', '.jpeg', '.JPEG')

  # Recursively crawl all sub-folders
  for entry in os.listdir(path):
    sub_path = os.path.join(path, entry)
    if os.path.isdir(sub_path):
      crawl(sub_path, annot, name + ' ' + entry)

  print("working on " + path)
  files = os.listdir(path)
  failed_num = 0
  count = 0

  # Images are expected in the first of images/Images/IMAGES that exists as a
  # sub-folder here; otherwise in this folder itself.
  image_path = path
  for candidate in ('images', 'Images', 'IMAGES'):
    candidate_path = os.path.join(path, candidate)
    if candidate in files and os.path.isdir(candidate_path):
      image_path = candidate_path
      break

  # NOTE(review): every .csv is treated as an annotation sheet, so report CSVs
  # saved into the crawled tree would be picked up too - confirm this is intended.
  for csv_file in files:
    if not csv_file.endswith('.csv'):
      continue
    tmp = os.path.join(path, csv_file)
    annotations = pd.read_csv(tmp)
    print("found csv file " + tmp)

    # Merge caption and other info fields (column 3 onwards) behind the folder-name prefix
    annotations['final_caption'] = name
    num_columns = len(annotations.columns.values)
    if num_columns < 4:
      print('wrongly formatted annotation sheet. Less than 3 columns')
      if num_columns < 3:
        # No file-name/web-address pair to read at all: skip the file rather
        # than fail on the missing second column
        continue
    else:
      for col in annotations.columns.values[2:-1]:
        # Stringify each cell so numeric columns can be concatenated; blanks become ' '
        extra = annotations[col].map(lambda v: ' ' if pd.isna(v) else str(v))
        annotations['final_caption'] = annotations['final_caption'].astype(str) + ' ' + extra

    # Take only three columns and drop any entry for which any of them is Na/NaN
    name_col = annotations.columns.values[0]
    web_col = annotations.columns.values[1]
    annotations = annotations.loc[:, [name_col, web_col, 'final_caption']]
    annotations = annotations.dropna()

    names = []
    web_adds = []
    captions = []

    # i counts the rows that survived the dropna above, not the CSV's own row numbers
    for i, (_, data) in enumerate(annotations.iterrows()):
      raw_name = data[name_col]
      # pandas parses purely numeric name columns as numbers; turn 12.0 back into 12
      if isinstance(raw_name, float) and raw_name.is_integer():
        raw_name = int(raw_name)
      file_name = str(raw_name).strip()
      web_add = data[web_col]
      caption = data['final_caption']

      # Check if a corresponding image file exists in image_path
      as_written = os.path.join(image_path, file_name)
      with_jpg = os.path.join(image_path, file_name + '.jpg')
      with_png = os.path.join(image_path, file_name + '.png')
      found = (
        (file_name.endswith(known_extensions) and os.path.isfile(as_written))
        or os.path.isfile(with_jpg)
        or os.path.isfile(with_png)
      )
      if not found:
        failed_num += 1
        failed.append({"path":tmp, "Sl No":i, "file_name":file_name})
        continue

      # Found entries get a fresh running id as their consolidated file name
      names.append('I{}.png'.format(annot.get_id()))
      web_adds.append(web_add)
      captions.append(caption)
      count += 1

    df = pd.DataFrame({'file_name':names,'web_address':web_adds,'caption':captions})
    annot.append_annotations(df)

  print("Total Successful Count =  {}".format(count) )
  total_success += count
  total_failed += failed_num
  print("Failed = {}".format(failed_num))

In [None]:
# driver code

# Start from clean accumulators so this cell can run on its own and does not
# add to the counts left behind by an earlier run (gsheet_crawl updates all four)
failed = []
folder_wise = []
total_success = 0
total_failed = 0

dir = root_folder

annot = Annotation()

# NOTE(review): this cell follows the CSV-based crawl() definition but calls
# gsheet_crawl - confirm which crawler is intended here
gsheet_crawl(dir, annot)

# Failed entries, split into one list per output column
paths = [entry["path"] for entry in failed]
sl_nos = [entry["Sl No"] for entry in failed]
names = [entry["file_name"] for entry in failed]

failed_sheet = pd.DataFrame({"path": paths, "Sl_No": sl_nos, "file_name": names})
path = os.path.join(destination_folder,'failed_sheet.csv')
failed_sheet.to_csv(path)


annot.save()

print(total_success, total_failed)