In [1]:
import cv2
import re
import numpy as np
import pytesseract
import matplotlib.pyplot as plt

# Function to Extract Text From an Image

The function below will:
- Convert the image passed to a binary image
- Remove noise by applying morphological operation (erosion & dilation) and then and resize the binary image to 2x the original size while still retaining its aspect ratio
- utilise Tesseract to do text detection/recognition and extract text from an image

In [7]:
def read_text(input_img_path: str, 
              threshold: int, 
              txt_output_file: str,
              img_output_file: str,
              save_output: bool = True):
    """
    Read text from the image file passed to the function
    
    Parameters
    ----------
    input_img_path : str
        path to the image file (including its file format)
    
    threshold : int
        The threshold value used for binary thresholding
    
    txt_output_file : str
        path to the output file
    
    img_output_file : str
        path to the image output file which contains the pre-processed binary image
        
    save_output : bool
        Whether or not the text extracted is saved to an output file (default to True)
    
    Returns
    -------
    str
        Text extracted by PyTesseract from the image
    """
    input_img = cv2.imread(input_img_path) # Read the image 

    # gray scale img
    gray = cv2.cvtColor(input_img, cv2.COLOR_BGR2GRAY)

    # Thresholding to produce a binary image
    ret, bin_img = cv2.threshold(gray, threshold, 255, cv2.THRESH_BINARY)

    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (2,1))
    bin_img = cv2.morphologyEx(bin_img, cv2.MORPH_OPEN, kernel, iterations=1)

    ## Apply median blur
    image_blur = cv2.GaussianBlur(bin_img, (3,3), cv2.BORDER_DEFAULT) 

    # lets increase the size of our image/ increasing the size by 0.5 times
    resized_img = cv2.resize(image_blur, None, fx = 2.0, fy = 2.0 , interpolation = cv2.INTER_CUBIC)

    resized_img = cv2.dilate(resized_img,kernel, iterations =2)

    final_bin_img = cv2.erode(resized_img,kernel, iterations =2)
    text = pytesseract.image_to_string(final_bin_img, lang='eng')
    
    if save_output:
        # save the final binary image
        cv2.imwrite(img_output_file,final_bin_img)
        
        # save the text extracted from the final binary image
        with open(txt_output_file,'w') as output_file:
            output_file.write(text)
    else:
        return (final_bin_img,text)

# Extract Text from Sample MacBook Images

The biggest issue with Tesseract is finding the right `threshold` value. In the examples below, we can see that the ability of Tesseract to detect and recognise text in binary images are very much dependent upon the threshold value used in the image pre-processing stage. 

In [8]:
read_text(input_img_path='./input_images/macbook_image_1.jpeg',
          threshold=200,
          txt_output_file='./text_extract/macbook_image_1.txt',
          img_output_file='./output_bin_img/macbook_image_1.jpg')

In [9]:
read_text(input_img_path='./input_images/macbook_image_2.jpg',
          threshold=150, # threshold must be lowered as there's more 
                         # white pixels in this image
          txt_output_file='./text_extract/macbook_image_2.txt',
          img_output_file='./output_bin_img/macbook_image_2.jpg')

In [10]:
read_text(input_img_path='./input_images/macbook_box_image.jpeg',
          threshold=120, # even lower threshold value as there's more white pixels
                         # in this image
          txt_output_file='./text_extract/macbook_box_image.txt',
          img_output_file='./output_bin_img/macbook_box_image.jpg')

In [11]:
read_text(input_img_path='./input_images/macbook_image_3.jpeg',
          threshold=100, # lower the thrsehold value even further for this image
          txt_output_file='./text_extract/macbook_image_3.txt',
          img_output_file='./output_bin_img/macbook_image_3.jpg')

# Extract Laptop Details From Text Files

In [12]:
import re
from pprint import pprint

In [13]:
def get_laptop_details(txt_file_path: str):
    """
    Attempt to get certain laptop details/specs such as
    - screen size (in inches)
    - the year the laptop was released
    - CPU speed/frequency of the laptop
    - The no. of battery cycles of the laptop
    - Memory capacity/storage size of the laptop
    
    Parameters
    ----------
    txt_file_path : str
        file path to the text file output generated by Tesseract
    
    Returns
    -------
    dict[str]
         dictionary which contains string values
    """
    with open(txt_file_path,'r') as txt_file:
        text = txt_file.read()
        
    laptop_details = {}
    laptop_details['screen_size'] = re.search('\d{2}(?=.inch)',text)
    laptop_details['year'] = re.search('\d{4}',text)
    laptop_details['cpu_freq'] = re.search('\d\.\d+ ?ghz',text,flags=re.IGNORECASE)
    laptop_details['battery_cycle'] = re.search('(?<=cycle count).*(\d)+',text,flags=re.IGNORECASE)
    if laptop_details['battery_cycle']:
        laptop_details['battery_cycle'] = re.search('\d+',laptop_details['battery_cycle'].group())

    laptop_details['memory_or_storage'] = re.search('\d+ ?(gb|tb)',text,flags=re.IGNORECASE)
    
    for key,match_obj in laptop_details.items():
        if match_obj:
            laptop_details[key] = match_obj.group()
    return laptop_details

In [14]:
macbook_box_img_detail = get_laptop_details('./text_extract/macbook_box_image.txt')
macbook_img_1_detail= get_laptop_details('./text_extract/macbook_image_1.txt')
macbook_img_2_detail= get_laptop_details('./text_extract/macbook_image_2.txt')
macbook_img_3_detail= get_laptop_details('./text_extract/macbook_image_3.txt')

print('macbook_box_image')
print('==================')
pprint(macbook_box_img_detail)
print('\n')

print('macbook_img_1')
print('==================')
pprint(macbook_img_1_detail)
print('\n')

print('macbook_img_2')
print('==================')
pprint(macbook_img_2_detail)
print('\n')

print('macbook_img_3')
print('==================')
pprint(macbook_img_3_detail)
print('\n')

macbook_box_image
{'battery_cycle': None,
 'cpu_freq': '2.6GHz',
 'memory_or_storage': '16GB',
 'screen_size': '15',
 'year': '1707'}


macbook_img_1
{'battery_cycle': '308',
 'cpu_freq': '2.6 GHz',
 'memory_or_storage': '16 GB',
 'screen_size': '15',
 'year': '2016'}


macbook_img_2
{'battery_cycle': '331',
 'cpu_freq': '2.4 GHz',
 'memory_or_storage': '32 GB',
 'screen_size': '15',
 'year': '2019'}


macbook_img_3
{'battery_cycle': None,
 'cpu_freq': '2.5 Ghz',
 'memory_or_storage': None,
 'screen_size': '13',
 'year': '2012'}




# Match with Dataset Scraped from `everymac.com`

In [17]:
import pandas as pd
import re

everymac_df = pd.read_csv('./everymac_spider_project/output_csv/macbook_pro.csv')

First of all, we need to extract only the relevant/important features from everymac dataset that can be compared with the text Tesseract extracted from the images. The way we're going to do it is to create extra columns to everymac dataset:
- `gpu_no`: indicates the exact GPU number of the laptop
- `release_year`: the laptop release year
- `cpu_freq_value`: indicates the CPU speed/frequency in decimal number
- `screen_size`: the laptop screen size (in inches)

In [18]:
def get_gpu_no(GPU_name):
    normal_gpu_no = re.search('\w?\d+\w?',GPU_name)
    if normal_gpu_no:
        return normal_gpu_no.group()
    else:
        m1_gpu_no = re.search('\d+-core',GPU_name,re.IGNORECASE)
        if m1_gpu_no:
            return m1_gpu_no.group()
    
everymac_df['gpu_no']= everymac_df['GPU'].apply(
    lambda GPU_name: get_gpu_no(GPU_name),
)
everymac_df['release_year'] = everymac_df['Release Date'].apply(
    lambda date: re.search('\d{4}',date).group()
)
everymac_df['cpu_freq_value'] = everymac_df['CPU Frequency'].apply(
    lambda cpu_frequency: re.search('\d.\d+',cpu_frequency).group()
)
everymac_df['screen_size'] = everymac_df['Laptop Name'].apply(
    lambda laptop_name: re.search('(?<=Pro )\d{2}(?=")',laptop_name,flags=re.IGNORECASE).group()
)

In [19]:
everymac_df[['release_year','cpu_freq_value','screen_size','gpu_no']]

Unnamed: 0,release_year,cpu_freq_value,screen_size,gpu_no
0,2008,2.4,15,8600M
1,2007,2.2,15,8600M
2,2007,2.4,15,8600M
3,2006,2.33,17,X1600
4,2006,1.67,15,X1600
...,...,...,...,...
133,2008,2.6,17,8600M
134,2008,2.4,15,9600M
135,2008,2.5,17,8600M
136,2008,2.6,15,8600M


Next, we're going to try to find the laptop model for each macbook detail extracted in the previous section

In [20]:
def find_laptop_model(laptop_detail: dict,everymac_dataset: pd.DataFrame):
    return (
        everymac_df.loc[
            (everymac_df['release_year'] == laptop_detail['year']) & 
            (everymac_df['cpu_freq_value'] == re.search('\d\.\d+',laptop_detail['cpu_freq']).group()) &
            (everymac_df['screen_size'] == laptop_detail['screen_size'])
        ,:]
    )

Below we can see that `find_laptop_model` function is able to find the exact MacBook model which corresonds to `macbook_image_1.jpeg`

In [21]:
find_laptop_model(macbook_img_1_detail,everymac_df)

Unnamed: 0,Laptop Name,Release Date,Order No.,Model No.,ID,Video Memory,Storage,Optical Drive,URL,CPU Cores,...,RAM Speed,GPU,GPU Type,GPU Standard VRAM,GPU Max VRAM,Display Resolution,gpu_no,release_year,cpu_freq_value,screen_size
53,"MacBook Pro 15"" ""Core i7"" 2.6 Touch/Late 2016","October 27, 2016*",MLH32LL/A*,A1707,"MacBookPro13,3",2 GB*,256 GB SSD*,None*,https://everymac.com/systems/apple/macbook_pro...,4,...,2133 MHz,Radeon Pro 450,GDDR5,2 GB*,4 GB**,2880x1800,450,2016,2.6,15


Unfortunatately that's not always the case as we can see below where the function `find_laptop_model` is only able to narrow down the model of the laptop to two **MacBook Pro 15" 2019** variations

In [22]:
find_laptop_model(macbook_img_2_detail,everymac_df)

Unnamed: 0,Laptop Name,Release Date,Order No.,Model No.,ID,Video Memory,Storage,Optical Drive,URL,CPU Cores,...,RAM Speed,GPU,GPU Type,GPU Standard VRAM,GPU Max VRAM,Display Resolution,gpu_no,release_year,cpu_freq_value,screen_size
29,"MacBook Pro 15"" ""Core i9"" 2.4 Touch/2019","May 21, 2019",BTO/CTO,A1990,"MacBookPro15,1",4 GB*,"256, 512 GB SSD*",None*,https://everymac.com/systems/apple/macbook_pro...,8,...,2400 MHz,Radeon Pro 555X/560X*,GDDR5,4 GB*,4 GB*,2880x1800,555X,2019,2.4,15
31,"MacBook Pro 15"" ""Core i9"" 2.4 Touch/2019 Vega","May 21, 2019",BTO/CTO,A1990,"MacBookPro15,3",4 GB*,512 GB SSD*,None*,https://everymac.com/systems/apple/macbook_pro...,8,...,2400 MHz,Radeon Pro Vega 16/20*,HMB2,4 GB*,4 GB*,2880x1800,16,2019,2.4,15


The same thing happens here where the function `find_laptop_model` is only able to narrow down the details extracted from `macbook_image_3.jpeg` to two **MacBook Pro 13" 2012** model with Core i5

In [23]:
find_laptop_model(macbook_img_3_detail,everymac_df)

Unnamed: 0,Laptop Name,Release Date,Order No.,Model No.,ID,Video Memory,Storage,Optical Drive,URL,CPU Cores,...,RAM Speed,GPU,GPU Type,GPU Standard VRAM,GPU Max VRAM,Display Resolution,gpu_no,release_year,cpu_freq_value,screen_size
89,"MacBook Pro 13"" ""Core i5"" 2.5 Retina 2012","October 23, 2012",MD212LL/A*,A1425,"MacBookPro10,2",768 MB*,"128, 256 GB SSD",None*,https://everymac.com/systems/apple/macbook_pro...,2,...,1600 MHz,HD Graphics 4000,Integrated,768 MB*,768 MB*,2560x1600,4000,2012,2.5,13
98,"MacBook Pro 13"" ""Core i5"" 2.5 Mid-2012","June 11, 2012",MD101LL/A*,A1278,"MacBookPro9,2",512 MB*,500 GB HDD,"8X DL ""SuperDrive""",https://everymac.com/systems/apple/macbook_pro...,2,...,1600 MHz,HD Graphics 4000,Integrated,512 MB*,768 MB*,1280x800,4000,2012,2.5,13


We can see that it is possible to match macbook details extracted from images with the dataset scraped from everymac to assign the proper Macbook model and even their hardware specification to an e-commerce listings. This method is not perfect as it only compares matches the dataset with text extracted from images based on screen size, CPU speed and release date. 

In the future this can definitely be improved with:
- Better image pre-processing techniques such as adaptive thresholding and Otsu's Binarisation 
- Creating a K-NN model trained on everymac dataset to assign the proper laptop model to a listing