<a href="https://colab.research.google.com/github/alexjochs/ECE_539_Penguins/blob/preprocess/PenguinsPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive

# Alex
# drive.mount('/content/gdrive')
# gdrive_data_filepath = r"/content/gdrive/MyDrive/'Penguin_counting'/data_peng_watch"
# Oscar
drive.mount('/content/drive')
gdrive_data_filepath = r'/content/drive/MyDrive/Colab\ Notebooks/539\ Project/data'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
import pandas as pd
import json
import numpy as np
from tqdm import tqdm
from matplotlib import image
from matplotlib import pyplot as plt
from scipy.ndimage.filters import gaussian_filter 
import scipy
from scipy.spatial import KDTree
import h5py
import time
from PIL import Image
import glob
import math

In [None]:
MASTER_LIST = ['BAILa', 'DAMOa', 'HALFb', 'HALFc', 'LOCKb', 'MAIVb', 'MAIVc', 'NEKOa', 'NEKOb', 'NEKOc', 'PETEc', 'PETEd', 'PETEf', 'SPIGa', 'GEORa']
VM_ROOT = r'/content'
cwd = None
def get_new_batch(target=None):
    assert target is not None, f"can't get specific folder: {target} and load all data"
    tgz_name = target + '.tgz'
    folder_path = os.path.join(gdrive_data_filepath, tgz_name)
    !tar --gunzip --extract --file={folder_path} --directory {VM_ROOT}
    cwd = os.path.join(VM_ROOT, target)

def save_batch_to_drive(target=None):
    # take working files, save them back to Gdrive
    assert target is not None, f"can't get specific folder: {target} and load all data"
    tgz_name = target + '.tgz'
    folder_path = os.path.join(VM_ROOT, target)
    # Oscar
    os.chdir('/content/drive/MyDrive/Colab Notebooks/539 Project/data') 
    # Alex
    # os.chdir(gdrive_data_filepath)
    # !tar -czvf {tgz_name} -P {folder_path} # Verbose
    !tar -czf {tgz_name} -P {folder_path}

In [None]:
annotations_path = VM_ROOT + r'/CompleteAnnotations_2016-07-11'

def run_all():
    json_filepath_list = get_json_files_from_folder()
    for filepath in json_filepath_list:
        df = run(filepath_=filepath)
        save_df_as_json(df)

def run(filepath_=None, target=None):
    if target is not None:
        filepath_ = annotations_path + r'/' + target + '.json'
    data_group_name = filepath_[-10:-5]
    df = load_json_as_df(filepath_)
    df.loc[df.xy.isnull(), 'xy'] = [[]]
    # have to check if inner list has na values as well :/
    df_xy_ = df['xy']
    for index, value in df_xy_.items():
        if len(value) > 0:
            while '_NaN_' in value:
                value.remove('_NaN_')
                df_xy_.at[index] = value
            while None in value:
                value.remove(None)
                df_xy_.at[index] = value
            if len(value) == 0:
                df_xy_.at[index] = [[]]
        else:
            df_xy_.at[index] = [[]]
    # df_xy_ = to_1D(df['xy'])
    # if df_xy_.isna().sum() > 0:
    #     df_xy_na_mask = df_xy_.isna()
    #     df.loc[df_xy_na_mask, 'xy'] = [[]]
    df['xy'] = df_xy_
    return df

def save_df_as_json(df):
    data_group_filename = df['imName'].iloc[0][:5] + '.json'
    try:
        os.mkdir('/content/annotations')
    except FileExistsError as e:
        print('looks like local annotations folder already exists!')
    print(data_group_filename)
    with open(os.path.join('/content/annotations', data_group_filename), 'w') as json_file:
        json.dump(json.loads(df.to_json(orient='records')), json_file)

def get_json_files_from_folder():
    json_filepath_list = []
    for filename in os.listdir(annotations_path):
        f = os.path.join(annotations_path, filename)
        if os.path.isfile(f):
            file_extension = os.path.splitext(f)[1]
        if file_extension == '.json':
            json_filepath_list.append(f)
    return json_filepath_list

def load_json_as_df(filepath):
    with open(filepath,'r') as json_file:
        json_data = json.loads(json_file.read())
    return pd.json_normalize(json_data, record_path =['dots'])

def to_1D(series):
    return pd.Series([x for _list in series for x in _list])

In [1]:
# dots: 2d list of xy dots from users
# return longest (ie most penguins clicked) 1d list
def get_longest_dot_list(dots):
    if len(dots) == 0:
        return []
    idx = 0
    if len(dots) > 1:
        for i, x in enumerate(dots):
            if type(x[0]) != type(0):
                if len(x) >= len(dots[idx]):
                    idx = i
    return dots[idx]

# One hot encoding of user clicks
def make_sparse_mat(img_shape, dots):
    mat = np.zeros((img_shape[1], img_shape[0]))

    # Check for 1-D case TODO
    if type(dots[0]) == type(0):
        mat[dots[1], dots[0]] = 1
    else:
        for dot in dots:
            if not (dot[0] > 800 or dot[1] > 600 or dot[0] < 0 or dot[1] < 0):
                mat[dot[1], dot[0]] = 1

    return mat

def downsample_dots(dots, img_shape):
    """convert xy coords of annotations down to 600x800 img space"""
    ds_dots = []
    x_scaler = 800.0 / float(img_shape[0])
    y_scaler = 600.0 / float(img_shape[1])

    # Check for empty dots list case
    if len(dots) == 0:
        return []

    # Check for 1-D case TODO
    if type(dots[0]) == type(0):
        return [math.floor(dots[0] * x_scaler), math.floor(dots[1] * y_scaler)]

    for dot in dots:
        try:
            if dot[0] >= img_shape[0] or dot[1] >= img_shape[1] or dot[0] < 0 or dot[1] < 0: # if == could we just subtract one? i.e. [2048, 1536] - > [2047, 1535]
                # print('DOT OUT OF RANGE:', dot) TODO
                continue
            ds_dots.append([math.floor(dot[0] * x_scaler), math.floor(dot[1] * y_scaler)]) # TODO
        except:
            print('ERROR -', dot) # TODO
        
    return ds_dots

def gaussian_filter_density(gt):
    #Generates a density map using Gaussian filter transformation
    
    density = np.zeros(gt.shape, dtype=np.float32)
    
    gt_count = np.count_nonzero(gt)
    
    if gt_count == 0:
        return density

    # FInd out the K nearest neighbours using a KDTree
    
    pts = np.array(list(zip(np.nonzero(gt)[1].ravel(), np.nonzero(gt)[0].ravel())))
    leafsize = 2048
    
    # build kdtree
    tree = scipy.spatial.KDTree(pts.copy(), leafsize=leafsize)
    
    # query kdtree
    distances, locations = tree.query(pts, k=4)

        
    for i, pt in enumerate(pts):
        pt2d = np.zeros(gt.shape, dtype=np.float32)
        pt2d[pt[1],pt[0]] = 1.
        if gt_count > 1:
            sigma = (distances[i][1]+distances[i][2]+distances[i][3])*0.1
        else:
            sigma = np.average(np.array(gt.shape))/2./2. #case: 1 point
        
        #Convolve with the gaussian filter
        
        # density += scipy.ndimage.filters.gaussian_filter(pt2d, sigma, mode='constant')
        input_ = np.fft.fft2(pt2d)
        result = scipy.ndimage.fourier_gaussian(input_, sigma)
        density = np.add(density, np.fft.ifft2(result).real, casting="unsafe")
    
    return density

some ideas to improve speed:  
- load all images into memory
- downsample images to input size
- - w/ fft, down to 4 sec!
- use mutiprocessing to speed up execution
- use fft instead of gaussian filter?
- - It works! down to 35 sec per image

Cell below loads image sizes into original image data dict, which is something like {'BAILa': (2048, 1536), 'DAMOa': (2048, 1536), 'HALFb': (2048, 1536), 'HALFc': (1920, 1080), etc...

In [None]:
# TODO I think we still want this
original_img_dim = {'BAILa': (2048, 1536), 'DAMOa': (2048, 1536), 'HALFb': (2048, 1536), 'HALFc': (1920, 1080), 'LOCKb': (1920, 1080), 'MAIVb': (2048, 1536), 'MAIVc': (2048, 1536), 'NEKOa': (1920, 1080), 'NEKOb': (2048, 1536), 'NEKOc': (2048, 1536), 'PETEc': (2048, 1536), 'PETEd': (2048, 1536), 'PETEf': (2048, 1536), 'SPIGa': (1920, 1080), 'GEORa': (2048, 1536)}
# for data_split_name in working_list:
#     local_directory = os.path.join(VM_ROOT, data_split_name)
#     for filename in glob.glob(local_directory + '/*.' + 'JPG'): #assuming gif
#         im=Image.open(filename)
#         original_img_dim[data_split_name] = (im.size[0], im.size[1])
#         break

---
The greatest of for loops:  
- for every datasplit in the master list:
    - get images for datasplit, load in to ram
    - get annotations, store in a df
    - get image size for that datasplit (each split is diff)
    - for **every image in datasplit**:  
        - downsample image to (600,800)
        - downsample annotations
        - get groundtruth of image via fft
        - save downsampled image and downsampled groundtruth

In [None]:
# Get Annotations
get_new_batch('CompleteAnnotations_2016-07-11')

In [88]:
!rm -rf /content/*_gt

In [89]:
# slice working list to split workload between us
working_list = MASTER_LIST[3:8] # TODO

# MASTER LOOP
for data_split_name in working_list:
    # Get images of batch
    if not os.path.exists(os.path.join(VM_ROOT, data_split_name)):
        get_new_batch(data_split_name)
    
    # Create split dir of ground truths
    split_dir = os.path.join(VM_ROOT, data_split_name + '_gt')
    os.mkdir(split_dir)
    os.chdir(split_dir)

    # Get annotations as DataFrame
    df = run(target=data_split_name)

    # Create heatmap for each image
    for i, row in df.iterrows():
        # Get image for size to pass to downsample
        # img = Image.open(os.path.join('/content', data_split_name, row['imName']) + '.JPG')
        # TODO: actually reshape image?
        # downsample dots to new img size of 600,800
        img_dots = row['xy']
        dots = get_longest_dot_list(img_dots)
        ds_dots = downsample_dots(dots, original_img_dim[data_split_name])

        # make gt heatmap
        k_ds = make_sparse_mat((800, 600), ds_dots)
        k_ds = gaussian_filter_density(k_ds)

        # save file as an h5 type
        with h5py.File(row['imName'] + '_gt.h5', 'w') as f:
            f['density'] = k_ds

    # Save ground truth zip
    save_batch_to_drive(target=data_split_name + '_gt')
    print('Saved', data_split_name+'_gt.tgz')

  arr_value = np.asarray(value)


Saved HALFc_gt.tgz


TypeError: ignored

In [58]:
[x for x in range(10)][4:11]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# # open each image
# img = Image.open(os.path.join('/content', data_split_name, row['imName']) + '.JPG')
# img.size
# img = img.resize((800, 600), Image.ANTIALIAS)
#         # image_list.append(img) TODO