# Camera Trap to Random Forests in Python

In [None]:
! pip install tesseract
! pip install tesseract-ocr
! pip install libtesseract-dev
! pip install pytesseract

In [17]:
import pandas as pd
import numpy as np
import seaborn as sns
import json
from datetime import datetime as dt
import ast
import itertools
from PIL import Image
#import packages
import pytesseract
# Set tesseract location.
# The default below is the standard Windows install path; set the TESSERACT_CMD
# environment variable to point somewhere else. The override is only applied
# when the file actually exists, so on machines where Tesseract is already on
# PATH pytesseract keeps its own default command.
import os
_tesseract_cmd = os.environ.get("TESSERACT_CMD", r"C:\Program Files\Tesseract-OCR\tesseract.exe")
if os.path.isfile(_tesseract_cmd):
    pytesseract.pytesseract.tesseract_cmd = _tesseract_cmd
import glob
import cv2
import numpy as np
import pandas as pd
import re

### Wrangle the Data

In [None]:
def extract_temperature(pic_address, crop_origins=((2300, 1400), (2430, 1565)),
                        max_attempts=5, growth_step=50):
    '''
    Extract the Fahrenheit temperature stamp from a picture file.

    The picture is read with cv2 and a crop reaching from a (row, column)
    origin to the bottom-right corner is passed to Tesseract OCR. Every
    attempt tries each origin in ``crop_origins`` in order; when none of them
    yields a match, all crops are enlarged by ``growth_step`` pixels towards
    the top-left and OCR is tried again, for at most ``max_attempts`` attempts.

    Parameters
    ----------
    pic_address : full source address of current picture file.
    crop_origins : sequence of (row, column) pixel positions where the crops
        start. Change these if the sizing conventions of the pictures change.
    max_attempts : number of progressively larger crops tried per origin.
    growth_step : number of pixels each crop origin moves up and left between
        attempts.

    Returns
    -------
    str or float
        The first OCR substring made of two digits followed by "F"
        (e.g. "37F"), unaltered, or np.nan when the picture cannot be read or
        no such substring is found.

    Notes
    -----
    The pattern matches exactly two digits, so a minus sign is not captured
    and a three-digit reading yields only its last two digits.
    '''
    img = cv2.imread(pic_address)  # read as an image
    if img is None:
        # cv2.imread signals a missing/unreadable file with None instead of raising
        return np.nan

    # eg 37F3C or 30F-1C
    temp_pattern = re.compile(r"\d\dF")

    for attempt in range(max_attempts):
        size_extension = attempt * growth_step
        for row_start, col_start in crop_origins:
            # clamp at 0 so a large extension can never turn into a negative
            # (wrap-around) slice start
            top = max(row_start - size_extension, 0)
            left = max(col_start - size_extension, 0)
            ts = img[top:, left:, :]
            if ts.size == 0:
                # picture is smaller than this crop origin: nothing to OCR
                continue
            text = pytesseract.image_to_string(ts)
            temp_format = temp_pattern.search(text)
            if temp_format:
                # found temperature, return
                return temp_format.group(0)

    # reached end of loop without finding a temperature stamp
    return np.nan


In [6]:
def _first_region_class(raw_attributes):
    """Return the first attribute name in a ``region_attributes`` JSON string, or None if there is none."""
    try:
        return list(json.loads(raw_attributes)["attribute"].keys())[0]
    except (TypeError, ValueError, KeyError, IndexError, AttributeError):
        # not a string, invalid JSON, no "attribute" entry, or an empty attribute set
        return None


def _rectangle_bounds(raw_shape):
    """Return (x, y, width, height) as ints from a ``region_shape_attributes`` string, or None if any is missing."""
    shape = ast.literal_eval(raw_shape)
    try:
        return tuple(int(shape[key]) for key in ("x", "y", "width", "height"))
    except (KeyError, TypeError, ValueError):
        return None


def wrangle_data(df, image_dir="example_data"):
    """
    wrangle via annotated data to produce dataframe of pixel coordinates with their corresponding
    RGB values, class and temperature info

    Every annotated rectangle is expanded into one row per pixel it covers.
    The pixel's colour is read from the picture named in ``filename`` and the
    temperature comes from ``extract_temperature`` for that picture.

    Parameters
    ----------
    df : pandas.DataFrame
        Annotation table with the columns ``filename``, ``region_attributes``
        (JSON string whose "attribute" object holds the class name as its
        first key) and ``region_shape_attributes`` (dictionary string with
        ``x``, ``y``, ``width`` and ``height``). Presumably a VIA (VGG Image
        Annotator) CSV export -- confirm against the data source. The frame is
        not modified.
    image_dir : str
        Folder that contains the pictures named in ``filename``.

    Returns
    -------
    pandas.DataFrame
        Columns ``x``, ``y``, ``R``, ``G``, ``B`` (int64), ``temperature``
        (first two characters of the OCR result, as a string) and ``class``,
        with duplicate rows removed. The index is not reset after duplicates
        are dropped.

    Notes
    -----
    * Rows without a class print a message and are left out, as are rows that
      are not complete rectangles and pictures whose temperature could not be
      read.
    * Rectangles are clipped to the picture, so annotations that extend past
      the border contribute only their in-picture pixels.
    * Pictures are indexed as (row, column, channel) and must have at least
      three channels.
    * Metadata encoded in the file name (system, watershed, date, picture id)
      is not part of the returned frame and is therefore not parsed.
    """
    output_columns = ["x", "y", "R", "G", "B", "temperature", "class"]

    # one decoded picture / one OCR run per file, however many regions it has
    image_dict = {}
    temp_dict = {}
    frames = []

    rows = zip(df["filename"], df["region_attributes"], df["region_shape_attributes"])
    for i, (filename, raw_attributes, raw_shape) in enumerate(rows):
        # extract region attributes to produce class variable
        region_class = _first_region_class(raw_attributes)
        if region_class is None:
            print("Not able to extract region attributes at row {}".format(i))
            continue

        bounds = _rectangle_bounds(raw_shape)
        if bounds is None:
            continue
        x, y, width, height = bounds

        if filename not in image_dict:
            path = "{}/{}".format(image_dir, filename)
            with Image.open(path) as picture:
                image_dict[filename] = np.asarray(picture)
            temp_dict[filename] = extract_temperature(path)
        image = image_dict[filename]

        temperature = temp_dict[filename]
        if not isinstance(temperature, str):
            # extract_temperature returns NaN when no temperature was found
            continue

        # clip the rectangle to the picture
        picture_height, picture_width = image.shape[:2]
        xs = np.arange(max(x, 0), min(x + width, picture_width), dtype=np.int64)
        ys = np.arange(max(y, 0), min(y + height, picture_height), dtype=np.int64)
        if len(xs) == 0 or len(ys) == 0:
            continue

        # every (x, y) combination inside the rectangle, x varying slowest
        pixel_x = np.repeat(xs, len(ys))
        pixel_y = np.tile(ys, len(xs))
        rgb = image[pixel_y, pixel_x]

        frames.append(pd.DataFrame({
            "x": pixel_x,
            "y": pixel_y,
            "R": rgb[:, 0].astype(np.int64),
            "G": rgb[:, 1].astype(np.int64),
            "B": rgb[:, 2].astype(np.int64),
            "temperature": temperature[:2],
            "class": region_class,
        }))

    if not frames:
        return pd.DataFrame(columns=output_columns)

    pixels_df = pd.concat(frames, ignore_index=True)[output_columns]
    # drop all missing values, then remove duplicate rows
    return pixels_df.dropna().drop_duplicates()



Bring in the CSV data. The same annotations are also available as JSON, which may actually be the most intuitive and efficient structure to unpack in the long run.

In [4]:
# Read the annotated image-pixel data.
# Three data sets are used (from watershed 3 and watershed 6).
# Forward slashes are used in every path: they work on Windows, Linux and
# macOS alike, whereas a backslash is an escape character inside a normal
# Python string and is not a path separator outside Windows.
csv_1 = pd.read_csv('example_data/hbwater_w3_bottom_1_1_20-3_5_20_csv.csv')
csv_2 = pd.read_csv("example_data/hb2_w6_2019_top_csv.csv")
csv_3 = pd.read_csv("example_data/hbwtr_w6_oct2018dec2018_bottom_csv.csv")
csv_1.head()

Unnamed: 0,filename,file_size,file_attributes,region_count,region_id,region_shape_attributes,region_attributes
0,invert_Hbwtr_w3_20200101_120451.JPG,180286,"{""attribute"":{}}",20,0,"{""name"":""rect"",""x"":199,""y"":2028,""width"":46,""he...","{""attribute"":{""open_water_dark"":true}}"
1,invert_Hbwtr_w3_20200101_120451.JPG,180286,"{""attribute"":{}}",20,1,"{""name"":""rect"",""x"":192,""y"":2125,""width"":33,""he...","{""attribute"":{""open_water_dark"":true}}"
2,invert_Hbwtr_w3_20200101_120451.JPG,180286,"{""attribute"":{}}",20,2,"{""name"":""rect"",""x"":396,""y"":2022,""width"":38,""he...","{""attribute"":{""open_water_dark"":true}}"
3,invert_Hbwtr_w3_20200101_120451.JPG,180286,"{""attribute"":{}}",20,3,"{""name"":""rect"",""x"":488,""y"":2114,""width"":45,""he...","{""attribute"":{""open_water_dark"":true}}"
4,invert_Hbwtr_w3_20200101_120451.JPG,180286,"{""attribute"":{}}",20,4,"{""name"":""rect"",""x"":670,""y"":2079,""width"":43,""he...","{""attribute"":{""open_water_dark"":true}}"


In [None]:
# Turn each annotation table into its per-pixel table.
# NOTE: every name is rebound to the wrangled result, so this cell must be
# run exactly once after the CSV files have been (re)loaded.
csv_1 = wrangle_data(csv_1)
csv_2 = wrangle_data(csv_2)
csv_3 = wrangle_data(csv_3)
# Write each wrangled table to its own CSV file, without the index column.
for wrangled, out_name in (
    (csv_1, "pixels_df_1.csv"),
    (csv_2, "pixels_df_2.csv"),
    (csv_3, "pixels_df_3.csv"),
):
    wrangled.to_csv(out_name, index=False)