In [1]:
#Packages required
import distutils
from glob import glob
import numpy as np
import os
import re
import shutil
from sklearn.model_selection import train_test_split

In [2]:
from ipynb.fs.full.Data_Repair import xml_repair,json_repair,txt_repair

In [3]:
def move_given_files(source_dir,destination_dir,file_list=None,move=False):
    """Copy (or move) plain files from ``source_dir`` into ``destination_dir``.

    Parameters
    ----------
    source_dir : str
        Folder that currently holds the files. A trailing separator is optional.
    destination_dir : str
        Folder that receives the files; created (with parents) if it is missing.
    file_list : iterable of str, optional
        Bare file names (no folder components) to transfer. When ``None`` every
        regular file found directly inside ``source_dir`` is transferred.
    move : bool, default False
        ``True`` moves the files with ``shutil.move``; ``False`` copies them
        with ``shutil.copy``.

    Notes
    -----
    Only files are handled, never folders: the default listing takes the file
    entries reported by ``os.walk`` for the top level of ``source_dir`` only.
    """
    os.makedirs(destination_dir,exist_ok=True)
    # `is None` instead of `== None`: an identity test cannot be hijacked by
    # array-like arguments that overload `==`.
    if file_list is None:
        file_list=next(os.walk(source_dir))[2]
    transfer=shutil.move if move else shutil.copy
    for file_name in file_list:
        # os.path.join keeps this correct whether or not the folder arguments
        # end with a path separator (plain string concatenation did not).
        transfer(os.path.join(source_dir,file_name),
                 os.path.join(destination_dir,file_name))

In [4]:
def pairs(class_dir,image_extension_list,annotation_extension_list):
    """Split the files of ``class_dir`` into (image, annotation) pairs and leftovers.

    Two files form a pair when they share the same base name, the first has an
    extension in ``image_extension_list`` and the second has an extension in
    ``annotation_extension_list``.

    Parameters
    ----------
    class_dir : str
        Folder to inspect; only files directly inside it are considered.
    image_extension_list : container of str
        Accepted image extensions, including the leading dot (e.g. ``'.jpg'``).
    annotation_extension_list : container of str
        Accepted annotation extensions, including the leading dot.

    Returns
    -------
    (pair_list, pairless) : tuple of two lists
        ``pair_list`` is flat: ``[image, annotation, image, annotation, ...]``,
        so it always has an even length. ``pairless`` holds, in sorted order,
        every file that did not end up in a pair (no partner, or an extension
        outside both lists).

    Notes
    -----
    Images are visited in ascending name order. Each image takes the still
    unused same-name annotation that sorts last, so ``.xml`` wins over ``.txt``
    and ``.json`` when several annotations exist for one image.
    """
    file_list=sorted(next(os.walk(class_dir))[2])

    # Index annotation candidates by base name once, instead of rescanning and
    # re-sorting the whole folder for every file.
    annotations_by_stem={}
    for candidate in file_list:
        stem,extension=os.path.splitext(candidate)
        if extension in annotation_extension_list:
            annotations_by_stem.setdefault(stem,[]).append(candidate)

    pair_list=[]
    paired=set()   # same content as pair_list, kept for O(1) membership tests
    for file_name in file_list:
        if file_name in paired:
            continue
        stem,extension=os.path.splitext(file_name)
        if extension not in image_extension_list:
            continue
        # Candidates were appended in ascending order; walking them backwards
        # gives the descending preference described in the docstring.
        for annotation in reversed(annotations_by_stem.get(stem,[])):
            if annotation==file_name or annotation in paired:
                continue
            pair_list.append(file_name)
            pair_list.append(annotation)
            paired.add(file_name)
            paired.add(annotation)
            break
    return pair_list,[item for item in file_list if item not in paired]

In [5]:
def disparity(class_dir,image_exts=None,annotation_exts=None):
    """Print a short report of structural problems found in ``class_dir``.

    Checks performed (each prints one line when it fires):
      1. The number of directory entries is odd.
      2. The folder contains sub-folders.
      3. The number of image files differs from the number of annotation files.

    Parameters
    ----------
    class_dir : str
        Folder to inspect.
    image_exts : container of str, optional
        Extensions (with leading dot) counted as images. Defaults to the
        notebook-level ``image_extension_list``.
    annotation_exts : container of str, optional
        Extensions (with leading dot) counted as annotations. Defaults to the
        notebook-level ``annotation_extension_list``.

    Returns
    -------
    None. The report is written to standard output only.
    """
    # Fall back to the notebook globals so existing one-argument calls keep
    # working; passing the lists explicitly removes the hidden dependency on
    # the configuration cell having been executed first.
    if image_exts is None:
        image_exts=image_extension_list
    if annotation_exts is None:
        annotation_exts=annotation_extension_list

    #1 odd number of entries (os.listdir also reports sub-folders)
    file_list=sorted(os.listdir(class_dir))
    if(len(file_list)%2!=0):
        print(class_dir,"Disparity 1 : Uneven number of files : ",len(file_list))

    #2 folders inside
    folder_count=len(next(os.walk(class_dir))[1])
    if(folder_count>=1):
        print(class_dir,"Disparity 2 : ",folder_count," Folders inside")

    #3 2 images instead of an annotation and an image [vice-versa]
    count_image=0
    count_annotation=0
    for file in file_list:
        extension=os.path.splitext(file)[1]
        if(extension in image_exts):
            count_image+=1
        elif(extension in annotation_exts):
            count_annotation+=1
    if(count_image!=count_annotation):
        print(class_dir,"Disparity 3 : ",count_image," Images ",count_annotation," Annotations")
    #4 (not implemented) files with the same name -> only 2 and [image,annotation] pair

In [6]:
def batch_move_files(file_list, source_path, destination_path):
    """Move every entry of ``source_path`` whose base name is listed in ``file_list``.

    Parameters
    ----------
    file_list : iterable of str
        Base names WITHOUT extension. Every entry in ``source_path`` whose name
        minus its extension equals one of these is moved, so an image and its
        annotation travel together. Names with no match are ignored and
        duplicates are harmless.
    source_path : str
        Folder currently holding the files.
    destination_path : str
        Folder receiving the files; created (with parents) if it is missing,
        in line with ``move_given_files``.

    Returns
    -------
    None
    """
    # A set gives O(1) lookups and collapses duplicate requests, so no file is
    # ever moved twice (the second attempt used to fail on a missing source).
    wanted=set(file_list)
    os.makedirs(destination_path,exist_ok=True)
    # One pass over the folder listing instead of one pass per requested name.
    for entry in sorted(os.listdir(source_path)):
        if os.path.splitext(entry)[0] in wanted:
            shutil.move(os.path.join(source_path, entry),
                        os.path.join(destination_path, entry))
    return

In [7]:
def delete_small_folders(source_dir,threshold_file_count=1):
    """Remove leaf folders under ``source_dir`` that hold too few files.

    A folder is deleted (with everything in it) when it has no sub-folders and
    fewer than ``threshold_file_count`` files directly inside it.

    Parameters
    ----------
    source_dir : str
        Root of the tree to clean. The root itself is never removed.
    threshold_file_count : int, default 1
        Minimum number of files a leaf folder needs in order to be kept. The
        default of 1 removes only empty leaf folders.

    Notes
    -----
    The tree is listed once before anything is deleted, so a parent that only
    becomes a leaf during this call is kept; call the function again to remove
    such parents. Progress is printed for every visited folder.
    """
    # [1:] drops the first os.walk entry, which is source_dir itself.
    folders=list(os.walk(source_dir))[1:]
    for folder_path,subfolders,files in folders:
        # entry example: ('FOLDER/3', [], ['file'])
        print("In ",folder_path)
        # Honour the threshold argument (it used to be ignored in favour of a
        # hard-coded 1).
        if(len(files)<threshold_file_count and len(subfolders)<1):
            print("Folder getting removed : ",folder_path)
            shutil.rmtree(folder_path)

In [8]:
def rename_files(source_dir,image_extension_list,annotation_extension_list,text):
    """Rename every (image, annotation) pair to ``"<class> <n>.<ext>"``.

    Run this only after the dataset has been cleaned into its final folders:
    image and annotation are renamed together so that the shared name keeps
    telling which annotation belongs to which image.

    Parameters
    ----------
    source_dir : str
        Folder whose sub-folders are the classes. Entries that are not folders
        are skipped.
    image_extension_list : container of str
        Image extensions including the leading dot.
    annotation_extension_list : container of str
        Annotation extensions including the leading dot.
    text : str
        Temporary marker placed in front of the new names during the first
        pass and stripped in the second pass. Pick a string that does not occur
        in any real file name.

    Notes
    -----
    * Numbering starts at 1 in every class and follows ascending file-name
      order of the images.
    * An image is paired with the first same-name annotation in ascending
      order; further annotations with that name keep their names.
    * If a target name already exists and the platform reports that as
      ``FileExistsError``, a "PROBLEM PROBLEM" line is printed and that image
      is skipped. On POSIX ``os.rename`` replaces an existing destination
      silently instead.
    """
    for i in sorted(os.listdir(source_dir)):
        # The trailing "" makes join() end the path with a separator, so file
        # names can simply be appended below.
        class_dir=os.path.join(source_dir,i,"")
        if not os.path.isdir(class_dir):
            continue
        count=1  #File starting number
        file_list=sorted(os.listdir(class_dir))

        # Index the annotations by base name once per class.
        annotations_by_stem={}
        for candidate in file_list:
            stem,extension=os.path.splitext(candidate)
            if extension in annotation_extension_list:
                annotations_by_stem.setdefault(stem,[]).append(candidate)

        done=set()   # base names that have already been renamed
        for file_name in file_list:
            stem,extension=os.path.splitext(file_name)
            if stem in done or extension not in image_extension_list:
                continue
            partners=[c for c in annotations_by_stem.get(stem,[]) if c!=file_name]
            if not partners:
                continue
            # Exactly one annotation per image: continuing to scan after the
            # first match used to re-rename the (already renamed) image and
            # fail with FileNotFoundError.
            annotation=partners[0]
            new_stem=text+i+" "+str(count)
            try:
                os.rename(class_dir+file_name, class_dir+new_stem+extension)
                os.rename(class_dir+annotation, class_dir+new_stem+os.path.splitext(annotation)[1])
            except FileExistsError:
                print("PROBLEM PROBLEM\t",class_dir,file_name)
                continue
            done.add(stem)
            count+=1

        # Second pass: strip the temporary marker. Only the file name is
        # edited, so a marker that happens to occur in the folder path cannot
        # corrupt the destination.
        for file_name in sorted(os.listdir(class_dir)):
            stripped=file_name.replace(text,"")
            if stripped!=file_name:
                os.rename(class_dir+file_name, class_dir+stripped)

In [9]:
def preliminary_check_rename_classes(source_dir):
    """Report class folders whose cleaned-up name collides with an existing folder.

    The cleaned-up name is the part of the folder name after its last newline,
    with every run of non-alphanumeric characters replaced by a single space
    and the result title-cased. For every folder whose cleaned-up name differs
    from its current name AND already exists as another folder in
    ``source_dir``, the pair ``<current> <cleaned>`` is printed.

    Nothing on disk is changed and nothing is returned.
    """
    class_names=next(os.walk(source_dir))[1]
    non_alphanumeric=re.compile(r"[^a-zA-Z0-9]+")
    for current in class_names:
        last_line=current.split("\n")[-1]
        cleaned=non_alphanumeric.sub(' ',last_line).title()
        if cleaned==current or cleaned not in class_names:
            continue
        print(current,cleaned)

In [10]:
def rename_classes(source_dir,text):
    """Rename class folders to their cleaned-up, title-cased names.

    The cleaned-up name is the part of the folder name after its last newline,
    with every run of non-alphanumeric characters replaced by one space, then
    title-cased. Folders that already carry their cleaned-up name are left
    alone.

    For every other folder the files are moved in two hops: first into the
    staging folder ``source_dir + text + '/'``, then - after the old folder has
    been removed with ``os.rmdir`` - into ``source_dir + <cleaned name> + '/'``.
    If the target folder already exists it is treated as the same class and
    the files are merged into it.

    Parameters
    ----------
    source_dir : str
        Folder containing the class folders; must end with '/'.
    text : str
        Name of the temporary staging folder created inside ``source_dir``.

    Notes
    -----
    Only files are moved, so ``os.rmdir`` raises if a class folder still
    contains sub-folders. The staging folder is removed at the end if present.
    """
    staging_dir=source_dir+text+'/'
    for current in next(os.walk(source_dir))[1]:
        cleaned=re.sub(r"[^a-zA-Z0-9]+", ' ', current.split("\n")[-1]).title()
        if cleaned==current:
            # Already in its final form; nothing to do for this class.
            continue
        # Hop 1: empty the old folder into staging, then drop the old folder.
        move_given_files(source_dir+current+'/',staging_dir,None,True)
        os.rmdir(source_dir+current)
        # Hop 2: staging -> final folder (created or merged into).
        move_given_files(staging_dir,source_dir+cleaned+'/',None,True)
    try:
        os.rmdir(staging_dir)
    except FileNotFoundError:
        # No class needed renaming, so staging was never created.
        pass
    return

In [15]:
def Data_Prepare(source_dir,disparity_dir,image_extension_list,annotation_extension_list,label_folder_list,text="UNIQUECODE"):
    """Clean a class-per-folder dataset in place and quarantine everything suspicious.

    Steps, in order:
      1. For each class folder in ``source_dir``: sub-folders whose lower-cased
         name is in ``label_folder_list`` are flattened into the class folder;
         every other sub-folder is moved to ``disparity_dir/<class>/``.
      2. Files without an (image, annotation) partner, as decided by
         ``pairs``, are moved to ``disparity_dir/<class>/`` and a ``disparity``
         report is printed for the class.
      3. Class folders are renamed (``rename_classes``) and the pairs are
         renumbered (``rename_files``).
      4. ``xml_repair``, ``json_repair`` and ``txt_repair`` are run. Every file
         they report as bad is moved to the disparity folder together with all
         other files sharing its base name; every JSON/TXT file they report as
         good is moved there on its own.
         NOTE(review): moving the "good" files presumably reflects that the
         repair step has made them redundant - confirm against Data_Repair.
      5. Empty leaf folders are removed from both trees.

    Parameters
    ----------
    source_dir, disparity_dir : str
        Dataset root and quarantine root; both must end with '/'.
    image_extension_list, annotation_extension_list : container of str
        Accepted extensions including the leading dot.
    label_folder_list : container of str
        Lower-case names of sub-folders that hold annotations.
    text : str, default "UNIQUECODE"
        Temporary marker forwarded to ``rename_classes`` and ``rename_files``.

    Returns
    -------
    None
    """
    classes=next(os.walk(source_dir))[1]
    Problematic_classes=[]
    for i in classes:
        os.makedirs(disparity_dir+i,exist_ok=True)
        class_dir=source_dir+i+'/'
        Problem=[]
        #Gets list of folders in the class directory
        inner_folder_list=next(os.walk(class_dir))[1]
        for inner_folder in inner_folder_list:
            if(inner_folder.lower() not in label_folder_list):
                print(class_dir+inner_folder+"\t"+disparity_dir+i+'/'+inner_folder+"\t"+inner_folder)
                # shutil.move already handles whole folders. Its copy_function
                # is applied per FILE when it has to fall back to copying, so
                # passing shutil.copytree there breaks cross-filesystem moves.
                shutil.move(class_dir+inner_folder,disparity_dir+i+'/'+inner_folder)
            else:
                # Label folder: pull its files up into the class folder, then drop it.
                move_given_files(class_dir+inner_folder+'/',class_dir,next(os.walk(class_dir+inner_folder+'/'))[2],True)
                try:
                    shutil.rmtree(class_dir+inner_folder)
                except FileNotFoundError:
                    pass
        #Gets list of files without a pair or without the right extension in class directory
        pairless_files=pairs(class_dir,image_extension_list,annotation_extension_list)[1]
        if(len(inner_folder_list)>0 or len(pairless_files)>0):
            #Problem = [class name, folders found inside, pairless files]
            Problem.append(i)
            Problem.append(inner_folder_list)
            Problem.append(pairless_files)
            #Moving pairless files out of class directory into class of disparity directory
            move_given_files(class_dir,disparity_dir+i+'/',pairless_files,True)
            #List containing all the problems (collected for inspection; not returned)
            Problematic_classes.append(Problem)
        disparity(class_dir)
    delete_small_folders(disparity_dir,1)
    rename_classes(source_dir,text)
    rename_files(source_dir,image_extension_list,annotation_extension_list,text)
    BAD=xml_repair(source_dir)
    Bad_JSON,Good_JSON=json_repair(source_dir)
    Bad_TXT,Good_TXT=txt_repair(source_dir)
    BAD.extend(Bad_JSON)
    BAD.extend(Bad_TXT)
    # list.extend() returns None, so its result must not be assigned: build
    # the combined list explicitly.
    GOOD=list(Good_JSON)
    GOOD.extend(Good_TXT)
    for i in BAD:
        class_dir,class_name,file_name=_split_reported_path(i)
        stem=os.path.splitext(file_name)[0]
        # Tolerate a folder that has disappeared in the meantime.
        siblings=next(os.walk(class_dir),(class_dir,[],[]))[2]
        # Every other file with the same base name travels with the bad file.
        # Zero or several companions are accepted; the former single-element
        # unpacking raised ValueError in both cases.
        companions=[x for x in siblings if x!=file_name and os.path.splitext(x)[0]==stem]
        try:
            move_given_files(class_dir,disparity_dir+class_name+'/',[file_name]+companions,True)
        except FileNotFoundError:
            # The file was already moved by an earlier entry.
            continue
    for i in GOOD:
        class_dir,class_name,file_name=_split_reported_path(i)
        try:
            move_given_files(class_dir,disparity_dir+class_name+'/',[file_name],True)
        except FileNotFoundError:
            continue
    delete_small_folders(source_dir,1)
    delete_small_folders(disparity_dir,1)
    #return Problematic_classes


def _split_reported_path(path):
    """Split a '/'-separated file path into (folder ending in '/', folder name, file name)."""
    parts=path.split('/')
    return "/".join(parts[:-1])+'/',parts[-2],parts[-1]

In [12]:
#Run configuration. NOTE(review): the paths below are absolute, machine-specific
#Windows paths; adjust them before running this notebook anywhere else.
#All folder paths end with '/' because the helper functions build paths by
#plain string concatenation.
#Root directory containing the original dataset
ROOTDIR= 'D:/Downloads/Analysed/Dataset 1/'
#Names of the folders directly inside ROOTDIR (one per class).
#next() raises StopIteration here if ROOTDIR does not exist.
CLASSES=next(os.walk(ROOTDIR))[1]
#Folder that receives everything moved out of the dataset during cleaning;
#created right away if it is missing.
disparity_dir='D:/Downloads/Analysed/Disparity/'
os.makedirs(disparity_dir,exist_ok=True)
#Target folder for the cleaned dataset; it is NOT created here (the call below is disabled).
destination_dir='D:/Downloads/Analysed/DESTINATION/'
#os.makedirs(destination_dir,exist_ok=True)
#Directory to simply experiment around
#root_dir='D:/Downloads/Analysed/E2 - Copy/'
#classes=next(os.walk(root_dir))[1]
#Extension list is the list of file types available in the dataset.
#The empty string stands for files without any extension.
extension_list=['.jpg', '.xml', '.jpeg', '.png', '', '.JPG', '.json', '.webp', '.html', '.txt', '.cms', '.gif', '.jfif']
#Image extension list contains the types of images available in the dataset
#.gif removed since model doesn't read it
#Matching against this list is case-sensitive, hence both '.jpg' and '.JPG'.
image_extension_list=['.jpg', '.png', '.jpeg', '.JPG', '.webp', '.cms', '.jfif'] 
#Annotation extension list contains the types of annotations available in the dataset
annotation_extension_list=['.xml', '.txt', '.json']
#label_folder_list contains the folder names available in some classes which 
#contain the annotations of that class
#Folder names are lower-cased before being compared with this list.
#NOTE(review): 'labes' looks like a misspelling that presumably occurs in the data - confirm before "fixing" it.
label_folder_list=['labes','label','annotation','annotations','labels']
#NOTE(review): `forbidden` is not referenced by any code visible in this part of
#the notebook; presumably folder names to be skipped - confirm where it is used.
forbidden=['train','test']