In [1]:
import pandas as pd
import pytesseract
import glob
import cv2
import numpy as np

In [2]:
import sys  
# Put the relative directory '../code/' at the front of the module search
# path. Presumably this is where the project-local modules imported by the
# following cells live -- verify against the repository layout.
sys.path.insert(0, '../code/')

In [3]:
from image_processing import prepare_composition, compose_functions, resize_img, e4_all_funcs
from texts import words_water, words_food
from utils import (
    process_text,
    merge_dicts,
)

In [4]:
from e2 import e2_funcs

In [5]:
from legacy import do_all, apply_ocr_e3

In [6]:
def e1(image, words_to_consider):
    """Experiment 1: OCR the untouched image and score the recognised terms.

    Parameters
    ----------
    image
        Image handed directly to ``pytesseract.image_to_string``.
    words_to_consider
        Collection of reference words; its length is the score denominator.

    Returns
    -------
    float or int
        Number of distinct terms returned by ``process_text`` divided by
        ``len(words_to_consider)``; ``0`` when the text cannot be processed
        (including an empty ``words_to_consider``).
    """
    txt = pytesseract.image_to_string(image, lang="por")
    try:
        # Only the first element of the pair is used for scoring.
        terms, _ = process_text(txt, words_to_consider)
        return len(set(terms)) / len(words_to_consider)
    except Exception:
        # Best-effort scoring: an ordinary failure counts as "nothing found".
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        return 0

In [7]:
def e2(image, words_to_consider):
    """Experiment 2: OCR the image after the ``e2_funcs`` transformation.

    Parameters
    ----------
    image
        Input image; it is passed through ``e2_funcs`` before OCR.
    words_to_consider
        Collection of reference words; its length is the score denominator.

    Returns
    -------
    float or int
        Number of distinct terms returned by ``process_text`` divided by
        ``len(words_to_consider)``; ``0`` when the text cannot be processed
        (including an empty ``words_to_consider``).
    """
    transformed_img = e2_funcs(image)
    txt = pytesseract.image_to_string(transformed_img, lang="por")
    try:
        # Only the first element of the pair is used for scoring.
        terms, _ = process_text(txt, words_to_consider)
        return len(set(terms)) / len(words_to_consider)
    except Exception:
        # Best-effort scoring: an ordinary failure counts as "nothing found".
        # `Exception` (not a bare except) lets Ctrl-C / SystemExit propagate.
        return 0
    
    

In [8]:
def e3(image, words_to_consider):
    """Experiment 3: OCR via the legacy ``do_all`` / ``apply_ocr_e3`` pipeline.

    Parameters
    ----------
    image
        Input image; passed to ``do_all`` and, together with the ``merged``
        result, to ``apply_ocr_e3``.
    words_to_consider
        Collection of reference words; its length is the score denominator.

    Returns
    -------
    float or int
        Number of distinct terms returned by ``process_text`` divided by
        ``len(words_to_consider)``; ``0`` when the text cannot be processed
        (including an empty ``words_to_consider``).
    """
    # Only the second element (`merged`) of the triple is needed here.
    _, merged, _ = do_all(image)
    text = apply_ocr_e3(image, merged)
    try:
        # Score the text produced above. The previous version referenced an
        # undefined name here, so the NameError was swallowed and the
        # function always returned 0.
        terms, _ = process_text(text, words_to_consider)
        return len(set(terms)) / len(words_to_consider)
    except Exception:
        # Best-effort scoring: an ordinary failure counts as "nothing found".
        return 0

In [9]:
def e4(image, words_to_consider):
    """Experiment 4: OCR the image at several scales after ``e4_all_funcs``.

    For every scale factor the image is resized, passed through
    ``e4_all_funcs`` and OCR'd; the terms found at all scales are pooled.

    Parameters
    ----------
    image
        Input image to resize and process.
    words_to_consider
        Collection of reference words; its length is the score denominator.

    Returns
    -------
    float or int
        Number of distinct pooled terms divided by
        ``len(words_to_consider)``; ``0`` if resizing, preprocessing, OCR or
        the final division fails.
    """
    resize_options = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
    # A plain set replaces the row-by-row DataFrame.append accumulation:
    # that method no longer exists in pandas >= 2.0, and only the union of
    # terms is needed anyway.
    found_terms = set()
    try:
        for scale_factor in resize_options:
            # Use the `image` argument; the previous version read a global
            # named `img` and therefore ignored what the caller passed in.
            resized = resize_img(image, scale_factor)
            composed_image = e4_all_funcs(resized)
            txt = pytesseract.image_to_string(composed_image, lang="por")
            try:
                terms, _ = process_text(txt, words_to_consider)
                found_terms.update(terms)
            except Exception:
                # A scale whose text cannot be processed is skipped, not fatal.
                print(f"Unable to process {scale_factor}")
        return len(found_terms) / len(words_to_consider)
    except Exception:
        # Best-effort scoring; `Exception` lets Ctrl-C / SystemExit propagate.
        return 0

In [10]:
def e5(image, words_to_consider):
    """Experiment 5: OCR the image at several scales without preprocessing.

    For every scale factor the image is resized and OCR'd directly; the
    terms found at all scales are pooled.

    Parameters
    ----------
    image
        Input image to resize and process.
    words_to_consider
        Collection of reference words; its length is the score denominator.

    Returns
    -------
    float or int
        Number of distinct pooled terms divided by
        ``len(words_to_consider)``; ``0`` if resizing, OCR or the final
        division fails.
    """
    resize_options = [0.1, 0.3, 0.5, 0.7, 0.9, 1]
    # A plain set replaces the row-by-row DataFrame.append accumulation:
    # that method no longer exists in pandas >= 2.0, and only the union of
    # terms is needed anyway.
    found_terms = set()
    try:
        for scale_factor in resize_options:
            # Use the `image` argument; the previous version read a global
            # named `img` and therefore ignored what the caller passed in.
            resized = resize_img(image, scale_factor)
            txt = pytesseract.image_to_string(resized, lang="por")
            try:
                terms, _ = process_text(txt, words_to_consider)
                found_terms.update(terms)
            except Exception:
                # A scale whose text cannot be processed is skipped, not fatal.
                print(f"Unable to process {scale_factor}")
        return len(found_terms) / len(words_to_consider)
    except Exception:
        # Best-effort scoring; `Exception` lets Ctrl-C / SystemExit propagate.
        return 0

In [11]:
# Run every experiment on every sample image and collect one record per
# (sample set, image, method) combination.
result_columns = ["sample_set", "sample_name", "method", "acc"]
records = []
for sample_set in sorted(glob.glob("../images/sample_sets/*")):
    sample_name_to_show = sample_set.split("/")[-1]
    for filename in sorted(glob.glob(f"{sample_set}/*")):
        name_to_show = filename.split("/")[-1].split(".jpg")[0]
        img = cv2.imread(filename, cv2.IMREAD_GRAYSCALE)
        if img is None:
            # cv2.imread signals an unreadable / non-image file by returning
            # None instead of raising; skip it rather than crash inside OCR.
            print(f"Skipping {filename}: could not be read as an image")
            continue
        # The reference vocabulary is chosen from the file name.
        is_water = "water" in name_to_show
        words_to_consider = words_water if is_water else words_food
        words_c = "W" if is_water else "F"
        print("========START===========")
        print(f"Doing exp for {sample_name_to_show}/{name_to_show}. W = {words_c}")
        for exp_func, exp_name in [(e1, "e1"), (e2, "e2"), (e4, "e4"), (e5, "e5")]:
            acc = exp_func(img, words_to_consider)
            records.append(
                {
                    "sample_set": sample_name_to_show,
                    "sample_name": name_to_show,
                    "method": exp_name,
                    "acc": acc,
                }
            )
        print("=========END==========")
# Build the frame once from the collected records: DataFrame.append was
# removed in pandas 2.0 and copying the frame on every row is quadratic.
final_results = pd.DataFrame(records, columns=result_columns)
final_results
        
        

Doing exp for 1/water_1. W = W
Unable to process 0.1
Unable to process 0.3
Unable to process 0.1
Doing exp for 1/water_2. W = W
Unable to process 0.1
Unable to process 0.3
Unable to process 0.1
Doing exp for 1/water_3. W = W
Unable to process 0.1
Unable to process 0.3
Unable to process 0.1
Doing exp for 1/water_4. W = W
Unable to process 0.1
Unable to process 0.3
Unable to process 0.5
Unable to process 0.7
Unable to process 0.9
Unable to process 1
Unable to process 0.1
Unable to process 0.3
Unable to process 0.5
Unable to process 1
Doing exp for 1/water_5. W = W
Unable to process 0.1
Doing exp for 3/water_1. W = W
Doing exp for 3/water_2. W = W
Unable to process 0.1
Doing exp for 3/water_3. W = W
Unable to process 0.1
Unable to process 0.5
Unable to process 0.1
Doing exp for 3/water_4. W = W
Doing exp for 3/water_5. W = W
Doing exp for 4/food_1. W = F
Doing exp for 4/food_2. W = F
Doing exp for 4/food_3. W = F
Unable to process 0.9
Unable to process 1
Doing exp for 4/water_1. W = W
Doi

Unnamed: 0,sample_set,sample_name,method,acc
0,1,water_1,e1,0.0
1,1,water_1,e2,0.0
2,1,water_1,e4,0.0
3,1,water_1,e5,0.0
4,1,water_2,e1,0.181818
5,1,water_2,e2,0.0
6,1,water_2,e4,0.272727
7,1,water_2,e5,0.636364
8,1,water_3,e1,0.272727
9,1,water_3,e2,0.0


In [12]:
# Accuracy summary (min / max / mean) for each sample set and method.
(
    final_results
    .groupby(["sample_set", "method"])["acc"]
    .agg(["min", "max", "mean"])
)

Unnamed: 0_level_0,Unnamed: 1_level_0,min,max,mean
sample_set,method,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,e1,0.0,0.272727,0.127273
1,e2,0.0,0.0,0.0
1,e4,0.0,0.545455,0.254545
1,e5,0.0,0.636364,0.290909
3,e1,0.0,0.272727,0.072727
3,e2,0.0,0.363636,0.072727
3,e4,0.090909,1.0,0.527273
3,e5,0.090909,1.0,0.563636
4,e1,0.0,1.0,0.304545
4,e2,0.0,0.75,0.297727


In [13]:
# Rows of sample set '4' restricted to the e4 and e5 methods.
(
    final_results
    .query("sample_set == '4'")
    .query("method in ['e4', 'e5']")
)

Unnamed: 0,sample_set,sample_name,method,acc
42,4,food_1,e4,1.0
43,4,food_1,e5,1.0
46,4,food_2,e4,1.0
47,4,food_2,e5,1.0
50,4,food_3,e4,0.875
51,4,food_3,e5,0.875
54,4,water_1,e4,1.0
55,4,water_1,e5,1.0
58,4,water_2,e4,0.818182
59,4,water_2,e5,0.909091
