<a href="https://colab.research.google.com/github/alexjochs/ECE_539_Penguins/blob/aochs%2Fdownload_data/delete_images_without_labels.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Cleaning
It looks like a bunch of the annotations are completely blank, so I made this to find blank entries in the annotations, get their corresponding image, and delete them both. This doesnt do anything about blank, but not null entries, however I havent seen any of those yet. Takes about 5m of runtime on standard colab.

In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

data_filepath = r"/content/gdrive/MyDrive/'Penguin_counting'/data_peng_watch"


Mounted at /content/gdrive


In [4]:
import os
import pandas as pd
import json
from matplotlib import image

In [24]:
class CleanData:
    """
    Class to remove data without labels and move labels to image folders
    """


    def __init__(self):
        self.json_filepath_list = []
        self.image_path_list = None
        self.image_folder_ = None
        self.data_folder = r'/content/gdrive/MyDrive/Penguin_counting/data_peng_watch'
        self.default_path = self.data_folder + r'/CompleteAnnotations_2016-07-11'

    def run(self):
        self.get_json_files_from_folder()
        print(self.json_filepath_list)
        for filepath in self.json_filepath_list:
            df = self.load_json_as_df(filepath)
            self.image_folder_ = df.iloc[1].imName.split('_')[0][:5]
            no_labels_df = self.get_empty_annotations(df)
            if no_labels_df.shape[0] != 0:
                self.set_image_path_list(no_labels_df)
                self.remove_images_without_labels()
                df.dropna()
            # copy cleaned json to folder with respective images
            with open(self.data_folder + '/' + self.image_folder_ + '/annotations.json', 'w') as json_file:
                json.dump(df.to_json(orient='records'), json_file)

    def get_json_files_from_folder(self):
        for filename in os.listdir(self.default_path):
            f = os.path.join(self.default_path, filename)
            if os.path.isfile(f):
                file_extension = os.path.splitext(f)[1]
            if file_extension == '.json':
                self.json_filepath_list.append(f)

    def load_json_as_df(self, filepath):
        with open(filepath,'r') as json_file:
            json_data = json.loads(json_file.read())

        return pd.json_normalize(json_data, record_path =['dots'])

    @staticmethod
    def get_empty_annotations(df):
        "return dataframe of missing or NaN entries in input dataframe"
        return df[df.isna().any(axis=1)]

    def set_image_path_list(self, df):
        '''
        return pandas series of image locations with no labels
        '''
        image_paths = []
        self.image_path_list = self.data_folder + '/' + self.image_folder_ + '/' + df['imName'] + '.JPG'

    def remove_images_without_labels(self):
        for image_path in self.image_path_list:
            print(image_path)
            !rm {image_path}
            try:
                image.imread(image_path)
                raise Exception('image was not deleted!')
            except:
                print(f'Image {image_path} was deleted sucessfully')


In [25]:
cd = CleanData()
cd.run()

['/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/DAMOa.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/GEORa.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/HALFb.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/HALFc.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/LOCKb.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/NEKOc.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/PETEc.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/SPIGa.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_2016-07-11/BAILa.json', '/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/CompleteAnnotations_201

[{"imName":"DAMOa2014a_000001","xy":[[[501,484],[576,565],[912,605],[1054,694],[1133,536],[1372,594],[1825,663],[1977,651],[841,548]],[[1129,509],[1386,616],[1075,703],[941,602],[583,530],[1805,675],[1974,654],[501,480],[848,553]],[[1826,671],[1955,659],[1391,622],[1058,707],[921,598],[1134,526],[847,557],[586,552],[504,487]],[[1830,652],[1391,599],[1061,691],[1129,498],[926,597],[588,536],[855,553],[510,482],[1965,650]],[[502,488],[594,564],[847,550],[950,600],[1125,512],[1058,687],[1382,608],[1825,661],[1990,657]],[[1823,665],[1386,611],[1124,512],[1039,667],[1985,650],[940,587],[578,534],[855,553],[498,478]],[[606,562],[841,552],[966,592],[1135,524],[1075,673],[1375,626],[1814,654],[1979,652],[513,477]],[[1825,656],[1940,646],[1395,629],[1061,692],[1132,518],[918,597],[835,557],[598,565],[518,482],[1328,602]],[[1054,682],[949,598],[1140,517],[1845,649],[1384,594],[584,538],[850,553],[516,479],[1983,642]],[[1823,646],[1070,687],[944,594],[598,560],[844,553],[2003,653],[1400,619],[112

In [29]:
filepath = r'/content/gdrive/MyDrive/Penguin_counting/data_peng_watch/HALFc/annotations.json'

with open(filepath,'r') as json_file:
            json_data = json.loads(json_file.read())
print(json_data)

[{"imName":"HALFc2013a_000001","xy":[[[1489,470],[1312,297],[1077,316],[743,225],[649,164],[304,491],[660,501],[470,669],[518,967],[757,748],[1171,755],[1637,48],[1030,85],[653,123],[664,355]],[[551,942],[1198,739],[811,673],[476,635],[653,488],[327,510],[716,205],[689,348],[1096,291],[1515,467],[1293,320],[633,153],[1268,54],[1044,69]],[[314,511],[186,624],[442,941],[466,692],[636,504],[793,658],[1082,796],[1111,328],[1293,335],[1520,457],[1657,47],[1082,75],[632,137],[708,234],[700,355]],[[1108,320],[1285,332],[1446,420],[1179,731],[867,694],[451,623],[661,511],[725,209],[305,501],[207,629],[569,941],[640,150],[1057,80],[1241,63],[1638,48]],[[182,644],[319,504],[706,212],[649,145],[684,344],[640,487],[483,612],[547,920],[811,676],[1216,751],[1076,331],[1300,321],[1492,439],[1020,77],[1254,55]],[[1217,733],[837,669],[545,947],[482,631],[323,497],[191,628],[621,477],[681,345],[701,209],[640,150],[1050,72],[1086,309],[1270,314],[1516,445],[1627,44],[1276,54]],[[1281,63],[1332,314],[1101