# Livestock Market Analysis

Analysis of the Livestock Market in the Caribean Region of Colombia.

### OCR With Tesseract

OCR part done with Tesseract to recognize the characters in the pdf's.

In [3]:
# Import modules for OCR
try:
    import Image
except ImportError:
    from PIL import Image
import pytesseract

# Import os for getting image filenames in img directory
from os import listdir

# Import pandas
import pandas as pd

# Import datetime
import datetime

# Import sys
import sys

In [4]:
# The cutoff represents the file where a new format starts
cutoff = "14-01-07.jpg"
# Livestock lot types
livestock_types = ["HL", "HV", "VE", "VP", "ML", "MC", "TO", "BH", "BM"]
# Categories of iron for female/male
iron_categories = ["HEMBRA DE 1a", "HEMBRA DE 2a", "MACHO DE 1a", "MACHO DE 2a", "HEMBRA DE PRIMERA", "MACHO DE PRIMERA", "HEMBRA DE SEGUNDA", "MACHO DE SEGUNDA"]
# Digits array
digits = ["0", "1", "2", "3", "4", "5", "6", "7", "8", "9"]
# Number of faulty lines
faulty_lines = 0
# Total lines
total_lines = 0

In [5]:
# Image filenames 
image_filenames = [f for f in listdir("img") if ".jpg" in f]

In [6]:
# Auction filenames list
auction_filenames = []

# Temp ary of auction filename
temp_ary = []

# Generate auction filenames list
for index, filename in enumerate(image_filenames):
    # Append filename to temp_ary
    temp_ary.append(filename)
    # If it is the end append temp_ary no matter what
    if index == len(image_filenames)-1:
        auction_filenames.append(temp_ary)
    else:
        # Select next filename root to check if it belongs to the same multipage file
        next_root_filename = "-".join(image_filenames[index+1].replace(".jpg", "").split("-")[0:3])
        # If the next filename is another page of the same file go on and keep appending to temp_ary
        if next_root_filename in filename:
            pass
        # Else apppend temp ary and empty it
        else:
            auction_filenames.append(temp_ary)
            temp_ary = []

In [7]:
# Check if it coincides with the number of pdf's records gotten from the auction webstie
if len(auction_filenames) == 458:
    print("Ente gut, alles gut!")

Ente gut, alles gut!


In [8]:
# Return if the line corresponds to the category of an iron. If so, return the name of the category
def is_iron_category(line):
    # Check if there is an iron category in the line
    has_iron_category = [category in line for category in iron_categories]
    # Business logic
    if True in has_iron_category:
        return True, iron_categories[has_iron_category.index(True)] 
    return False, None

In [9]:
# Return if the line corresponds to a data entry. If so, return the line 
def is_data_entry(line):
    # Number of digits in the line
    number_of_digits = sum([line.count(digit) for digit in digits])
    # Bussiness logic
    has_livestock_type = [livestock_type in line for livestock_type in livestock_types]
    if (number_of_digits > 10) and (True in has_livestock_type):
        # Return True, the data line, the livestock type of the data line 
        return True, line, livestock_types[has_livestock_type.index(True)]
    return False, None, None

In [25]:
# Correct common errors in data line that come out of OCR routine
def formatted_data_line(line, livestock_type, date, iron_category):
    # Make total_lines and faulty_lines accesible
    global total_lines
    global faulty_lines
    # Replace " ," by ",". Some numbers have this pattern and appear broken
    line = line.replace(" ,", ",")
    # Remove double spaces if any
    while "  " in line:
        line.replace("  ", " ")
    # Remove commas from numbers
    line = line.replace(",", "")
    # Check if age exists or not
    if line.index(livestock_type) == 0: # If age is not present
        age, numerical_data = None, line.replace("{0} ".format(livestock_type), "")
    else: # If age is present
        age, numerical_data = line.split(" {0} ".format(livestock_type))
    # Check that the correct number of columns is in place
    if len(numerical_data.split(" ")) != 6:
        # Add to the number of faulty lines encountered
        faulty_lines += 1
        return None
    # Add to the number of total lines processed
    total_lines +=1
    return (date, age, livestock_type, *map(int, numerical_data.split(" ")), iron_category)

In [11]:
# Method that pulls data from image and append it to dataframe. It returns the last iron brand category 
def append_image_to_dataframe(auction_filename, iron_category, date):
    # Make df accesible
    global df
    # Variables to hold temp dataframe information
    data = []
    columns = ["date", "age", "type", "quantity", "weight", "price_min", "price_max", "price_avg", "price_unit", "category"]
    # Locate and load the image
    img_path = "img/{0}".format(auction_filename)
    img = Image.open(img_path)
    # Apply OCR routine to image
    text = pytesseract.image_to_string(img)
    text_lines = text.split("\n")
    # Loop through each line and prepare dataframe to append
    for line in text_lines:
        # Check if line corresponds to an iron category or a data entry
        output_is_iron_category = is_iron_category(line)
        output_is_data_entry = is_data_entry(line)
        # Set the iron_category variable if the line describes an iron category
        if output_is_iron_category[0] == True: iron_category = output_is_iron_category[1]
        # Append to data to data array if line is a data entry
        if output_is_data_entry[0] == True: 
            # Construct formatted line
            formatted_line = formatted_data_line(output_is_data_entry[1], output_is_data_entry[2], date, iron_category)
            # Append line if it is not None
            if formatted_line != None: data.append(formatted_line)
    # Create temp dataframe and append it to main dataframe
    temp_df = pd.DataFrame(data=data, columns=columns)
    df = df.append(temp_df)
    # Return the last iron category found in the text
    return iron_category

In [26]:
# Dataframe
df = pd.DataFrame([])

In [27]:
# Count to limit number of files for testing
count = -1

# Loop through each auction and process the image data into the dataframe
for auction_ary in auction_filenames:
    # Get date str: YY-MM-DD
    date_str = "-".join(auction_ary[0].replace(".jpg", "").split("-")[0:3]) 
    # Construct date object
    year, month, day = map(int, date_str.split("-"))
    year = year + 2000
    date = datetime.date(year, month, day)
    ####################################
    final_count = 1
    # Break on the count file iteration
    count += 1
    if count == final_count:
        break
    ####################################
    # Category of livestock: Number of iron brands: 1st male, 2nd male, 1st female, 2nd female.
    iron_category = None
    # Auction filename
    for auction_filename in auction_ary:
        iron_category = append_image_to_dataframe("14-01-07.jpg", iron_category, date)

print("#### Error Analysis ####")
print("-------------------------------------")
print("Faulty lines: {0}".format(faulty_lines))
print("Total lines: {0}".format(total_lines))
print("=====================================")
print("Percentage of faults: {0:.2f}%".format(faulty_lines*100/total_lines))

#### Error Analysis ####
-------------------------------------
Faulty lines: 2
Total lines: 46
Percentage of faults: 4.35%


In [17]:
"HL 5 126 2,700 2,700 2,700 340,200".index("HL")

0

In [28]:
df

Unnamed: 0,date,age,type,quantity,weight,price_min,price_max,price_avg,price_unit,category
0,2013-01-08,,HL,5,126,2700,2700,2700,340200,HEMBRA DE PRIMERA
1,2013-01-08,1 1/2,HL,9,193,2700,2700,2700,520209,HEMBRA DE PRIMERA
2,2013-01-08,2 1/2,HV,16,336,2880,2880,2880,969120,HEMBRA DE PRIMERA
3,2013-01-08,2 1/4,HL,7,283,2600,2600,2600,736918,HEMBRA DE PRIMERA
4,2013-01-08,2 3/4,HV,7,383,2800,2800,2800,1071196,HEMBRA DE PRIMERA
5,2013-01-08,2 3/4,VE,5,390,2500,2500,2500,976000,HEMBRA DE PRIMERA
6,2013-01-08,3,HV,1,480,2600,2600,2600,1248000,HEMBRA DE PRIMERA
7,2013-01-08,3,VE,48,503,2400,2840,2696,1355084,HEMBRA DE PRIMERA
8,2013-01-08,3,BM,9,429,2740,2740,2740,1175761,MACHO DE PRIMERA
9,2013-01-08,3,MC,26,437,2900,3100,2988,1307526,MACHO DE PRIMERA
