In [1]:
import pickle

class TrieNode:
    """A node in the trie structure"""

    def __init__(self, char):
        # the character stored in this node
        self.char = char

        # whether this can be the end of a word
        self.is_end = False

        # a counter indicating how many times a word is inserted
        # (if this node's is_end is True)
        self.counter = 0

        # a dictionary of child nodes
        # keys are characters, values are nodes
        self.children = {}
        
        self.productids = []


class Trie(object):
    """The trie object"""

    def __init__(self):
        self.root = TrieNode("")
    
    def insert(self, words):
        node = self.root
        word = words[0]
        productid = words[1]
        # Loop through each character in the word
        # Check if there is no child containing the character, create a new child for the current node
        for char in word:
            if char in node.children:
                node = node.children[char]
            else:
                # If a character is not found,
                # create a new node in the trie
                new_node = TrieNode(char)
                node.children[char] = new_node
                node = new_node
        
        # Mark the end of a word
        node.is_end = True
        node.productids.append(productid)
        # Increment the counter to indicate that we see this word once more
        node.counter += 1
        
    def dfs(self, node, prefix):
        if node.is_end:
            self.output.append((prefix + node.char, node.counter))
            self.product_result.extend(node.productids)
        
        for child in node.children.values():
            self.dfs(child, prefix + node.char)
            
    def dfs1(self, node, prefix):
        if node.is_end:
            self.output.append((prefix + node.char, node.counter))
            self.product_result.extend(node.productids)

        
    def query(self, x):
        # Use a variable within the class to keep all possible outputs
        # As there can be more than one word with such prefix
        self.output = []
        self.product_result = []
        
        node = self.root
        length = 0 
        
        # Check if the prefix is in the trie
        for char in x:
            length = length + 1
            if char in node.children:
                node = node.children[char]
            else:
                # cannot found the prefix, return empty list
                if length>=4:
                    self.dfs1(node, x[:-1])
                    return sorted(self.output, key=lambda x: x[1], reverse=True), self.product_result
                    #return [], []
                else:
                    return [], []
        
        self.dfs(node, x[:-1])

        # Sort the results in reverse order and return
        return sorted(self.output, key=lambda x: x[1], reverse=True), self.product_result




## Loading the trie structure and database file

In [2]:
import os
import collections
from fuzzywuzzy import fuzz


import cv2
import numpy as np
import time 
import time

import sys
import re




file1 = open('database.pkl', 'rb')
database = pickle.load(file1)

In [3]:
# open a file, where you stored the pickled data
file = open('./trie_milestone2_revised.pkl', 'rb')
import pickle
# dump information to that file
data = pickle.load(file)
data_salt = pickle.load(open('trie_milestone2_salt_revised.pkl', 'rb'))
data_mfs = pickle.load(open('trie_milestone2_mfs_revised.pkl', 'rb'))



In [4]:
def paragraph_process(text):
    words = text#.split()
    final_text1 = []
    final_words = []
    dict_word = {}
    dict_product = {}
    dict_product_count = {}
    for word in words:
        if word == '' or not re.search('[a-zA-Z0-9]', word):
            continue  
        word = word.lower()
        word = word.rstrip('.,-')
        word = word.lstrip('.,-')
            
        if word.isnumeric() or len(word)==1:
            continue
        if not word in dict_word.keys():
            dict_word[word] = 1
            

            if word in ['tablets', 'tablet', 'daily' ,  'and' , 'capsules' ,'contain', 'contains', 'syrup', 'syrups']:
                continue
            xls1, temp = data.query(word)

            dict1 ={}
                
            if len(temp) != 0:
                final_words.append(word)
                for elem in temp:
                    if not elem in dict1.keys():
                        final_text1.append(elem)
                        dict1[elem]=1
                        if not elem in dict_product.keys():
                            dict_product[elem]=[word]
                            dict_product_count[elem]=(1/len(temp)) 
                        else:
                            dict_product[elem].append(word)
                            dict_product_count[elem] = dict_product_count[elem] + 1/len(temp)

    return final_text1, dict_product, dict_product_count, final_words 


def paragraph_process_salt_mfs(text):
    words = text#.split()
    final_text1 = []
    final_words = []
    dict_word = {}
    dict_product = {}
    dict_product_count = {}
    for word in words:
        if word.isnumeric():
            continue
        if not word in dict_word.keys():
            dict_word[word] = 1
            word = word.lower()
            word = word.rstrip('.,-')
            word = word.lstrip('.,-')

            if word in ['tablets', 'tablet', 'daily' ,  'and' , 'capsules' ,'contain', 'contains', 'syrup', 'syrups']:
                continue
            xls1, temp = data_salt.query(word)
            xls2, temp1 = data_mfs.query(word)
            temp.extend(temp1)
            
            dict1 ={}
                
            if len(temp) != 0:
                final_words.append(word)
                for elem in temp:
                    if not elem in dict1.keys():
                        final_text1.append(elem)
                        dict1[elem]=1
                        if not elem in dict_product.keys():
                            dict_product[elem]=[word]
                            dict_product_count[elem]=(1/len(temp)) 
                        else:
                            dict_product[elem].append(word)
                            dict_product_count[elem] = dict_product_count[elem] + 1/len(temp)

    return final_text1, dict_product, dict_product_count, final_words 
        


In [5]:
def preprocess_text(result):
    words = re.split('\W+', result)
    final_words = []
    final_words1 = []
    for elem in words:
        if len(elem) >2:
            final_words.append(elem)
            #final_words1.append(elem)
            if elem[-1] == 's':
                final_words.append(elem[:-1])
    words_new = result.split()
    for elem in words_new:
        #if len(elem) >3:
        #print("elem", elem)
        count1 = 0
        count2 = 0
        for ie in elem:
            if(ie.islower()):
                count1=count1+1
            elif(ie.isupper()):
                count2=count2+1
        if count1 + count2 !=0:        
            if len(elem) <=3 or count2/(count1+count2) > 0.3:
                final_words1.append(elem)
            else:
                final_words1.extend(re.findall('[A-Z][^A-Z]*', elem))
                final_words.extend(re.findall('[A-Z][^A-Z]*', elem))  ## newly added
        else:
            final_words1.append(elem)
            
        if '-' in elem:
            final_words.append(elem.replace('-', '')) # change
            final_words.append(elem)
    return final_words, final_words1

In [6]:

def printlargest(arr, arr_size):    
    fifth = fourth = third = first = second = -sys.maxsize
    
    fifth_index = fourth_index = third_index = second_index = first_index = 0
     
    for i in range(0, arr_size):
     
        # If current element is greater
        # than first
        if (arr[i] > first):
            
            fifth = fourth
            fifth_index = fourth_index
            
            fourth = third
            fourth_index = third_index
         
            third = second
            third_index = second_index
            
            second = first
            second_index = first_index
            
            first = arr[i]
            first_index = i
         
 
        # If arr[i] is in between first
        # and second then update second
        elif (arr[i] > second):
            
            fifth = fourth
            fifth_index = fourth_index
            
            fourth = third
            fourth_index = third_index
         
            third = second
            third_index = second_index

            second = arr[i]
            second_index = i
         
        elif (arr[i] > third):
            fifth = fourth
            fifth_index = fourth_index
            
            fourth = third
            fourth_index = third_index           
            
            third = arr[i]
            third_index = i

        elif (arr[i] > fourth):
            fifth = fourth
            fifth_index = fourth_index        
            
            fourth = arr[i]
            fourth_index = i

        elif (arr[i] > fifth):
            fifth = arr[i]
            fifth_index = i
    
    top_indexes = []
    top_indexes.extend([first_index, second_index, third_index, fourth_index, fifth_index])
     
    return top_indexes


In [11]:
def find_pred(ocr_text):
    try:
        result1 = ocr_text
        result, result_2 = preprocess_text(result1)
        final_text1, dict_product, dict_product_count, final_words = paragraph_process(result)

        Str1 = ""
        for j in range(len(result_2)):
            Str1 = Str1 + " "+ result_2[j]

        temp = collections.Counter(dict_product_count)

        temp1 = temp.most_common(min(200, len(temp)))

        list11 = []
        for jj in range(len(temp1)):
            list11.append(fuzz.token_set_ratio(Str1,database[temp1[jj][0]]))

        top_indexes = printlargest(list11, len(list11))

        sort_index = top_indexes
        if list11[sort_index[0]] >=25 or list11[sort_index[1]] >=25 or list11[sort_index[2]] >=25:
            return (str(temp1[sort_index[0]][0]), list11[sort_index[0]]), (str(temp1[sort_index[1]][0]), sort_index[1]), (str(temp1[sort_index[2]][0]), sort_index[2])
        else:
            final_text1, dict_product, dict_product_count, final_words = paragraph_process_salt_mfs(result)
            temp = collections.Counter(dict_product_count)
            temp1 = temp.most_common(min(200, len(temp)))
            list11 = []
            
            for jj in range(len(temp1)):
                list11.append(fuzz.token_set_ratio(Str1,database[temp1[jj][0]]))
            top_indexes = printlargest(list11, len(list11))
            sort_index = top_indexes
            return (str(temp1[sort_index[0]][0]), list11[sort_index[0]]), (str(temp1[sort_index[1]][0]), sort_index[1]), (str(temp1[sort_index[2]][0]), sort_index[2])
            
        
    except:
        return None, None, None

In [12]:
a,b,c = find_pred(' AzoMF 60ml. R Mefenamic Acid & Paracetamol Suspension ADzo:MF SUSPENSION FOR PAEDIATRICUSE ONLY')

In [13]:
a

('a5868', 78)

In [14]:
b

('s14004', 30)

In [15]:
c

('t12', 31)