# Camera Trap to Random Forests in Python

In [1]:
import os
os.chdir("C:/Users/athellma/OneDrive - University of North Carolina at Chapel Hill/Documents/Duke University/Research/_HBEF/CameraTrapAnalysis/hbwater_cameratrap_pheno")

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from datetime import datetime as dt
import ast
import itertools
from PIL import Image
#import packages
import pytesseract
#Set tesseract location
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe" 
import glob
import cv2
import re

### Wrangle the Data

In [3]:
def extract_temperature(pic_address):
    '''
    Extract temperature from picture file.

    From the middle of each picture file, the time stamp is read as image using cv2. It is then converted to a string.
    text which is then checked for format and subsequently returned through temp_format.

    Parameters
    ----------
    pic_address : full source address of current picture file.
    Returns
    -------
    temp_format.group(0) : unaltered temperature from bottom of the photo as a string.
    '''

    img = cv2.imread(pic_address) #read as an image

    # check if the timestamp is the correct format
    temp_pattern = "\d\dF" # eg 37F3C or 30F-1C

    loop = 5
    size_extension=0
    x,y,z = np.shape(img)
    x = (x//1000)*1000

    y = (y//1000)*1000
    # print(x,y,z)
    while loop>0:
        ts = img[2300 - size_extension:, 1400-size_extension:, :] #(change start values manually if sizing conventions change!)
        text = pytesseract.image_to_string(ts)
        temp_format = re.search(temp_pattern,text)
        if temp_format:
            # found temperature, return
            break
        ts_2 = img[2430 - size_extension:, 1565-size_extension:, :] #(change start values manually if sizing conventions change!)
        text_2= pytesseract.image_to_string(ts_2)
        temp_format = re.search(temp_pattern,text_2)
        if temp_format:
        # found temperature, return
            break
        size_extension+=50
        loop-=1

    if loop ==0:      
      # reached end of loop without finding correct timestamp
        return np.nan
        # return None
    
    return temp_format.group(0)


  temp_pattern = "\d\dF" # eg 37F3C or 30F-1C


In [4]:
extract_temperature("example_data/invert_Hbwtr_w3_20201107_115913.JPG") #extract temperature doesn't work on inverted images

  temp_pattern = "\d\dF" # eg 37F3C or 30F-1C


ValueError: not enough values to unpack (expected 3, got 0)

In [8]:
def wrangle_data_old(df):
    """
    wrangle via annotated data to produce dataframe of pixel coordinates with their corresponding
    RGB values, class and temperature info
    
    """
    # extract region attributes to produce class variable
    for i in range(df.shape[0]):
        try:
            json_item = json.loads(df['region_attributes'][i])
            keys = list(json_item["attribute"].keys())
            df.loc[i,"class"] = keys[0]
        except:
            print("Not able to extract region attributes at row {}".format(i))
    
    # convert json code to python dictionary
    for i in range(df.shape[0]):
        dictionary = ast.literal_eval(df.loc[i, "region_shape_attributes"])
        for k,v in dictionary.items():
            df.loc[i,k ] = v
        
    # change to int type
    for col in ["x", "y", "width","height"]:
        df[col] = df[col].astype('Int64')
        
    # dictionary of filename data
    filename_data = {
        'system': [],
        'watershed': [],
        'date': [],
        'pic_id': [],
    }

    # loop through filename string to add filename data
    for i in range(len(df['filename'])): #used to be file name now invert 
        # split contents by underscore
        file_items = df['filename_inverted'][i].split('_')

        # add system column for 'hbwtr'
        filename_data['system'].append(file_items[1])
        # add watershed olumn, 'w1', 'w2', etc.
        filename_data['watershed'].append(file_items[2])
        # add date column
        filename_data['date'].append(
                                    # modify integer date to date format, MM/DD/YYYY
                                    dt.strptime(file_items[3], '%Y%m%d').strftime('%m/%d/%Y')
                                    )
        # add picture id number
        filename_data['pic_id'].append(file_items[4])

    # create columns of this data in dataframe
    for variable in filename_data.keys():
        df[variable] = filename_data[variable]
        

    # make list to reorder columns
    new_cols = ['filename', 'region_count', 'region_id', 'class', 'name', 'x','y', 'width', 'height',  'system','watershed', 'date', 'pic_id']

    df = df[new_cols]
    
    # create image dictionary and temperature dictionary
    image_dict = {}
    temp_dict = {}
    def my_func(row):
        if row["filename"] in image_dict:
            return
        path =  "example_data/"+row["filename"] # sample path to example data folder with all images
        # note to Audrey: can you change this to read the images recursively (e.g. find file regardless of path)
        img= np.asarray(Image.open(path)) #note to Audrey: read in raw images, not inverted images?
        image_dict[row["filename"]] = img
        temp_dict[row["filename"]] = extract_temperature(path)
    _ = df.apply(lambda c: my_func(c),axis=1)
    
    
    # Assign image pixel values for each row by extracting RGB using x,y coordinates
    
    # drop na values in x and y column
    df = df.dropna(subset= ["x","y"])
    # save new data points to this list
    li = []
    # loop over all rows and poopulate pixel coordinates
    for ind, row in df.iterrows():
        y_range = (row.y, row.y+row.height)
        x_range = (row.x, row.x+row.width)
        # print(y_range, x_range)
        # calculate the coordinates range in the x and y axis
        range_list =  [range(row.x, row.x+row.width), range(row.y,row.y+row.height)]
        # set product of x_range and y_range
        combination_list = list(itertools.product(*range_list))
        width, height = 1,1
        # append new row entry for every pixel locations within y_range and x_range
        for new_x, new_y in combination_list:       
            li.append([row["filename"],  new_x, new_y, width, height, row["class"]])
      
    pixels_df = pd.DataFrame(li, columns=['filename', 'x', 'y', 'width', 'height', 'class'])
    # print(pixels_df.head())
    
    # function to assign pixel RGB values using image dictionary
    def assign_pixels(row):
        return image_dict[row["filename"]][row.y:row.y+1, row.x:row.x+1].flatten()
    pixels_df["RGB"] = pixels_df.apply(lambda row: assign_pixels(row),axis=1)

    # create color channels
    pixels_df["R"] = pixels_df.apply(lambda row: np.int64(row["RGB"][0]),axis=1)
    pixels_df["G"] = pixels_df.apply(lambda row: np.int64(row["RGB"][1]),axis=1)
    pixels_df["B"] = pixels_df.apply(lambda row: np.int64(row["RGB"][2]),axis=1)
    # create temperature column
    pixels_df["temperature"] = pixels_df.apply(lambda row: temp_dict[row["filename"]][:2])
    # change date to pandas datetime
    pixels_df["date"] = pd.to_datetime(pixels_df["date"])

    df = pixels_df[["x","y","R","G","B","temperature", "class"]]

    # drop all missing values
    df = df.fillna(value=np.nan)
    df = df.dropna()
    # remove duplicate RGB
    df = df.drop_duplicates()
    return df



In [15]:
def wrangle_data(df, folder):
# """
# wrangle via annotated data to produce dataframe of pixel coordinates with their corresponding
# RGB values, class and temperature info
# """
# testing 
# input data: folder of images in example data/ 
#df = csv_1
# here change the name of the files if relevant: 
    df = df.rename(columns = {"filename":"filename_inverted"})
    df["filename"] = df["filename_inverted"].str.replace("invert_", "")

    # extract region attributes to produce class variable

    #here add in create column functionality 

    for i in range(df.shape[0]):
        try:
            json_item = json.loads(df['region_attributes'][i])
            keys = list(json_item["attribute"].keys())
            df.loc[i,"class"] = keys[0]
        except:
            print("Not able to extract region attributes at row {}".format(i))
        
    # convert json code to python dictionary
    for i in range(df.shape[0]):
        dictionary = ast.literal_eval(df.loc[i, "region_shape_attributes"])
        for k,v in dictionary.items():
            df.loc[i,k ] = v
            
    # change to int type
    for col in ["x", "y", "width","height"]:
        df[col] = df[col].astype('Int64')

        
    # dictionary of filename data
    filename_data = {
        'system': [],
        'watershed': [],
        'date': [],
        'time':[],
        'pic_id': [],
    }

    # loop through filename string to add filename data
    for i in range(len(df['filename'])): #used to be file name now invert 
        # split contents by underscore
        file_items = df['filename_inverted'][i].split('_')

        # add system column for 'hbwtr'
        filename_data['system'].append(file_items[1])
        # add watershed olumn, 'w1', 'w2', etc.
        filename_data['watershed'].append(file_items[2])
        # add date column
        filename_data['date'].append(
                                    # modify integer date to date format, MM/DD/YYYY
                                    dt.strptime(file_items[3], '%Y%m%d').strftime('%m/%d/%Y')
                                    )
        filename_data['time'].append(
                                    #modify interger to time format
                                    dt.strptime(file_items[4][:-4],'%H%M%S').strftime('%H:%M:%S')
                                    )
        # add picture id number
        filename_data['pic_id'].append(file_items[4])


    # create columns of this data in dataframe
    for variable in filename_data.keys():
        df[variable] = filename_data[variable]
        

    # make list to reorder columns
    new_cols = ['filename_inverted', 'filename', 'region_count', 'region_id', 'class', 'name', 'x','y', 'width', 'height',  'system','watershed', 'date','time','pic_id']

    df = df[new_cols]

    #create image dictionary and temperature dictionary
    image_dict = {}
    temp_dict = {}

    def my_func(row):
        if row["filename"] in image_dict:
            return
        path =  folder +row["filename"] # sample path to example data folder with all images
        # note to Audrey: can you change this to read the images recursively (e.g. find file regardless of path)
        img= np.asarray(Image.open(path)) #note to Audrey: read in raw images, not inverted images?
        image_dict[row["filename"]] = img
        temp_dict[row["filename"]] = extract_temperature(path)
    _= df.apply(lambda c: my_func(c), axis =1)

    print(temp_dict)   
    #print(temp_dict.get("Hbwtr_w1_20190102_115959.JPG"))
    #print(temp_dict["Hbwtr_w1_20190102_115959.JPG"]) #this also gives the tmperature 
        # # Assign image pixel values for each row by extracting RGB using x,y coordinates
        
    # # drop na values in x and y column
    df = df.dropna(subset= ["x","y"])
    # save new data points to this list
    li = []
    # loop over all rows and poopulate pixel coordinates
    for ind, row in df.iterrows():
        y_range = (row.y, row.y+row.height)
        x_range = (row.x, row.x+row.width)
        # print(y_range, x_range)
        # calculate the coordinates range in the x and y axis
        range_list =  [range(row.x, row.x+row.width), range(row.y,row.y+row.height)]
        # set product of x_range and y_range
        combination_list = list(itertools.product(*range_list))
        width, height = 1,1
        # append new row entry for every pixel locations within y_range and x_range
        for new_x, new_y in combination_list:       
            li.append([row["filename"],  new_x, new_y, width, height, row["class"]])
        
    pixels_df = pd.DataFrame(li, columns=['filename', 'x', 'y', 'width', 'height', 'class'])
    #print(pixels_df.head())
        
    # function to assign pixel RGB values using image dictionary
    def assign_pixels(row):
        return image_dict[row["filename"]][row.y:row.y+1, row.x:row.x+1].flatten()
    pixels_df["RGB"] = pixels_df.apply(lambda row: assign_pixels(row),axis=1)

    # create color channels
    pixels_df["R"] = pixels_df.apply(lambda row: np.int64(row["RGB"][0]),axis=1)
    pixels_df["G"] = pixels_df.apply(lambda row: np.int64(row["RGB"][1]),axis=1)
    pixels_df["B"] = pixels_df.apply(lambda row: np.int64(row["RGB"][2]),axis=1)
    # # create temperature column
    # pixels_df["temperature"] = pixels_df.apply(lambda row: temp_dict[row["filename"]][:2])
    #print(row["filename"])
    pixels_df["temperature"] = pixels_df.apply(lambda row: temp_dict[row["filename"]], axis = 1) #added axis = 1

    file_items_df = df[["filename", "watershed", "date", "time"]].drop_duplicates()

    pixels_df = pixels_df.merge(file_items_df, on="filename")

    df = pixels_df[["x","y","R","G","B","date", "time", "filename", "watershed", "temperature","class"]]

    # drop all missing values
    df = df.fillna(value=np.nan)
    df = df.dropna() #this removes missing values, maybe change this to just drop na on anything that is not temperature 
    # remove duplicate RGB
    df = df.drop_duplicates()

    return df

In [6]:
# read via annotated image pixels data
# we will use three data sets( from watershed 3 and watershed 6)
#csv_1 = pd.read_csv('data/training_data/input_data/hbwater_w3_bottom_1 1 20-3 5 20_csv.csv') #this file name was swapped out for wrangle
#csv_1 = pd.read_csv('data/training_data/input_data/hbwtr_w1_2019_bottom_TEST.csv')
csv_1 = pd.read_csv('data/training_data/input_data/hbwtr_w1_2019_bottom.csv')
csv_2 = pd.read_csv("data/training_data/input_data/hb2_w6_2019_top_csv.csv")
csv_3 = pd.read_csv("data/training_data/input_data/hbwtr_w6_oct2018dec2018_bottom_csv.csv")
csv_4 = pd.read_csv("data/training_data/input_data/via_project_w1_csv.csv")

csv_4.head()

Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes
0,invert_Hbwtr_w1_20201001_120348.jpg,461442,"{""attribute"":{}}",22,0,"{""name"":""rect"",""x"":2934,""y"":2677,""width"":86,""h...","{""attribute"":{""leaf_fall"":true}}"
1,invert_Hbwtr_w1_20201001_120348.jpg,461442,"{""attribute"":{}}",22,1,"{""name"":""rect"",""x"":3114,""y"":2612,""width"":83,""h...","{""attribute"":{""leaf_fall"":true}}"
2,invert_Hbwtr_w1_20201001_120348.jpg,461442,"{""attribute"":{}}",22,2,"{""name"":""rect"",""x"":3238,""y"":2612,""width"":65,""h...","{""attribute"":{""leaf_fall"":true}}"
3,invert_Hbwtr_w1_20201001_120348.jpg,461442,"{""attribute"":{}}",22,3,"{""name"":""rect"",""x"":2712,""y"":2928,""width"":92,""h...","{""attribute"":{""leaf_fall"":true}}"
4,invert_Hbwtr_w1_20201001_120348.jpg,461442,"{""attribute"":{}}",22,4,"{""name"":""rect"",""x"":2148,""y"":2984,""width"":33,""h...","{""attribute"":{""leaf_green"":true}}"


In [14]:
# a note before running
csv_look = csv_4.copy()
print("the following non-inverted images: ", csv_look["filename"].iloc[1], " & ", csv_4["filename"].iloc[-1], " must be in a folder, usually example_data/")

the following dates:  invert_Hbwtr_w1_20201001_120348.jpg  &  invert_Hbwtr_w1_20211031_115957.JPG  must be in example data


In [20]:
# main function
# csv_1 = wrangle_data(df = csv_1, folder = "example_data/") #takes 10 mins 
# csv_2 = wrangle_data(csv_2)
# csv_3 = wrangle_data(csv_3)
csv_4 = wrangle_data(csv_4, "data/munged/W1/")

# wrangle and save dataset
# csv_1.to_csv("data/training_data/derived/pixels_df_1.csv", index=False)
# csv_2.to_csv("data/training_data/derived/pixels_df_2.csv", index=False)
# csv_3.to_csv("data/training_data/derived/pixels_df_3.csv", index=False)
csv_4.to_csv("data/training_data/derived/pixels_df_4.csv", index = False)

Not able to extract region attributes at row 59
Not able to extract region attributes at row 138
Not able to extract region attributes at row 197
Not able to extract region attributes at row 251
Not able to extract region attributes at row 328
Not able to extract region attributes at row 426
Not able to extract region attributes at row 479
Not able to extract region attributes at row 502
Not able to extract region attributes at row 563
Not able to extract region attributes at row 643
Not able to extract region attributes at row 644
Not able to extract region attributes at row 645
Not able to extract region attributes at row 646
Not able to extract region attributes at row 647
Not able to extract region attributes at row 648
Not able to extract region attributes at row 649
Not able to extract region attributes at row 650
Not able to extract region attributes at row 651
Not able to extract region attributes at row 652
Not able to extract region attributes at row 653
Not able to extract r

In [22]:
csv_4.head()

Unnamed: 0,x,y,R,G,B,date,time,filename,watershed,temperature,class
0,2934,2677,57,17,0,10/01/2020,12:03:48,Hbwtr_w1_20201001_120348.jpg,w1,66F,leaf_fall
1,2934,2678,40,4,0,10/01/2020,12:03:48,Hbwtr_w1_20201001_120348.jpg,w1,66F,leaf_fall
2,2934,2679,29,0,0,10/01/2020,12:03:48,Hbwtr_w1_20201001_120348.jpg,w1,66F,leaf_fall
3,2934,2680,25,0,0,10/01/2020,12:03:48,Hbwtr_w1_20201001_120348.jpg,w1,66F,leaf_fall
4,2934,2681,24,0,0,10/01/2020,12:03:48,Hbwtr_w1_20201001_120348.jpg,w1,66F,leaf_fall
