# Dataframe Creation

Apply an OCR routine to the auction images to build a pandas DataFrame for later analysis.

### OCR With Tesseract

The OCR step uses Tesseract to recognize the characters in the PDFs.

In [242]:
# Import modules for OCR
try:
    import Image
except ImportError:
    from PIL import Image
import pytesseract

# Import os for getting image filenames in img directory
from os import listdir

# Import pandas
import pandas as pd

# Import datetime
import datetime

# Import sys
import sys

# Import progress bar 2
import progressbar

In [243]:
# Filename at which the newer report format begins
cutoff = "14-01-07.jpg"

# Two-letter codes identifying the livestock lot types
livestock_types = [
    "HL", "HV", "VE",
    "VP", "ML", "MC",
    "TO", "BH", "BM",
]

# Iron (brand) category headings for female/male animals, in both spellings
iron_categories = [
    "HEMBRA DE 1a",
    "HEMBRA DE 2a",
    "MACHO DE 1a",
    "MACHO DE 2a",
    "HEMBRA DE PRIMERA",
    "MACHO DE PRIMERA",
    "HEMBRA DE SEGUNDA",
    "MACHO DE SEGUNDA",
]

# The ten decimal digit characters, "0" through "9"
digits = [str(number) for number in range(10)]

# Running counters: lines rejected as faulty / lines parsed successfully
faulty_lines = 0
total_lines = 0

In [244]:
# Image filenames
def _page_sort_key(filename):
    """Return a sort key that orders dash-separated numeric parts numerically.

    The ".jpg" suffix is dropped and the rest is split on "-". Purely numeric
    parts compare as integers (so page 2 sorts before page 10); any other part
    compares as text and sorts after numeric parts.
    """
    stem = filename[:-len(".jpg")]
    return [(0, int(part), "") if part.isdecimal() else (1, 0, part)
            for part in stem.split("-")]

# os.listdir returns names in arbitrary order. The grouping of multi-page
# auctions relies on pages of the same auction being adjacent, so the list is
# sorted explicitly. Only names that actually end in ".jpg" are kept.
image_filenames = sorted(
    (f for f in listdir("img") if f.endswith(".jpg")),
    key=_page_sort_key,
)

In [245]:
def group_auction_filenames(filenames):
    """Group consecutive page filenames that belong to the same auction.

    The auction root of a filename is its first three dash-separated parts
    with ".jpg" removed (the same root the processing loop turns into a date).
    Consecutive filenames are grouped when their roots are *equal*; a substring
    test could merge unrelated files whose root merely appears inside the
    previous filename.

    Args:
        filenames: list of image filenames, with pages of one auction adjacent.

    Returns:
        A list of lists; each inner list holds the filenames of one auction in
        their input order. An empty input yields an empty list.
    """
    def root_of(filename):
        return "-".join(filename.replace(".jpg", "").split("-")[0:3])

    groups = []
    previous_root = None
    for filename in filenames:
        root = root_of(filename)
        if groups and root == previous_root:
            # Another page of the auction currently being collected
            groups[-1].append(filename)
        else:
            # First page of a new auction
            groups.append([filename])
        previous_root = root
    return groups

# Auction filenames list: one list of page filenames per auction
auction_filenames = group_auction_filenames(image_filenames)

In [246]:
# Check that the number of auctions matches the number of PDF records
# obtained from the auction website
expected_auction_count = 458
if len(auction_filenames) == expected_auction_count:
    print("Ente gut, alles gut!")
else:
    # Report the mismatch instead of staying silent, so a wrong grouping
    # is not mistaken for a passed check
    print("Expected {0} auctions but found {1}".format(expected_auction_count, len(auction_filenames)))

Ente gut, alles gut!


In [247]:
def is_iron_category(line):
    """Tell whether a text line names an iron category.

    Scans the module-level ``iron_categories`` list in order and reports the
    first entry that occurs as a substring of ``line``.

    Returns:
        ``(True, category)`` for the first matching category, otherwise
        ``(False, None)``.
    """
    for candidate in iron_categories:
        if candidate in line:
            return True, candidate
    return False, None

In [248]:
def is_data_entry(line):
    """Tell whether a text line looks like a data entry.

    A line qualifies when it contains more than 10 digit characters (counted
    with the module-level ``digits`` list) and at least one code from the
    module-level ``livestock_types`` list occurs in it as a substring.

    Returns:
        ``(True, line, livestock_type)`` with the first matching code in list
        order, otherwise ``(False, None, None)``.
    """
    # Tally every occurrence of every digit character
    digit_total = 0
    for digit in digits:
        digit_total += line.count(digit)
    # Too few digits: cannot be a data row
    if digit_total <= 10:
        return False, None, None
    # First livestock code found in the line wins
    for candidate in livestock_types:
        if candidate in line:
            return True, line, candidate
    return False, None, None

In [249]:
def formatted_data_line(line, livestock_type, date, iron_category):
    """Clean up an OCR'd data line and parse it into a dataframe row.

    The line is normalised (broken " ," separators rejoined, commas and dots
    removed from numbers, whitespace collapsed), split into an optional age
    prefix and the numeric columns that follow the livestock type, and the
    numeric columns are converted to integers.

    Updates the module-level counters: ``faulty_lines`` is incremented when
    the line does not have exactly six numeric columns, ``total_lines`` when
    it does.

    Args:
        line: raw text line that contains ``livestock_type``.
        livestock_type: livestock code found in the line.
        date: value stored in the first field of the row.
        iron_category: value stored in the last field of the row.

    Returns:
        ``(date, age, livestock_type, <six ints>, iron_category)`` where
        ``age`` is ``None`` if the line starts with the livestock type, or
        ``None`` (after printing the line) if the column count is wrong.

    Raises:
        SystemExit: if a numeric column cannot be converted to ``int``.
    """
    # Make total_lines and faulty_lines accessible
    global total_lines
    global faulty_lines
    # Replace " ," by ",". Some numbers have this pattern and appear broken
    line = line.replace(" ,", ",")
    # Remove commas and dots from numbers
    line = line.replace(",", "").replace(".", "")
    # Collapse runs of whitespace into single spaces. The result must be
    # assigned back: str.replace returns a new string, so looping on an
    # unassigned replace never terminates.
    line = " ".join(line.split())
    # Check if age exists or not
    if line.startswith(livestock_type):  # If age is not present
        age = None
        numerical_data = line.replace("{0} ".format(livestock_type), "")
    else:  # If age is present
        # partition never raises; when the type is not delimited by spaces
        # numerical_data is empty and the line is rejected below as faulty
        age, _, numerical_data = line.partition(" {0} ".format(livestock_type))
    fields = numerical_data.split(" ")
    # Check that the correct number of columns is in place
    if len(fields) != 6:
        # Add to the number of faulty lines encountered
        faulty_lines += 1
        # Print line
        print("### Error in line: ###")
        print(line)
        return None
    # Add to the number of total lines processed
    total_lines += 1
    try:
        return (date, age, livestock_type, *map(int, fields), iron_category)
    except ValueError:
        # A column that is not an integer: abort the run as before
        print("### Error: ###")
        print(line)
        sys.exit("### sys.exit ###")

In [250]:
def append_image_to_dataframe(auction_filename, iron_category, date):
    """OCR one auction page image and append its data rows to the global ``df``.

    The image is read from ``img/<auction_filename>``, converted to text with
    pytesseract and scanned line by line. A line naming an iron category
    updates the current category; a line recognised as a data entry is parsed
    with ``formatted_data_line`` and kept unless that returns ``None``.

    Args:
        auction_filename: filename of the page image inside the ``img`` folder.
        iron_category: category in effect when this page starts (carried over
            from the previous page of the same auction, or ``None``).
        date: value stored in the ``date`` column of every row of this page.

    Returns:
        The iron category in effect after the last line of the page.
    """
    # Make df accessible
    global df
    columns = ["date", "age", "type", "quantity", "weight", "price_min", "price_max", "price_avg", "price_unit", "category"]
    # Locate the image and apply the OCR routine; the context manager closes
    # the underlying file once the text has been extracted
    img_path = "img/{0}".format(auction_filename)
    with Image.open(img_path) as img:
        text = pytesseract.image_to_string(img)
    # Loop through each line and collect the rows of this page
    data = []
    for line in text.split("\n"):
        # Set the iron_category variable if the line describes an iron category
        found_category, category = is_iron_category(line)
        if found_category:
            iron_category = category
        # Parse the line if it is a data entry
        found_entry, entry_line, livestock_type = is_data_entry(line)
        if found_entry:
            formatted_line = formatted_data_line(entry_line, livestock_type, date, iron_category)
            # Faulty lines come back as None and are skipped
            if formatted_line is not None:
                data.append(formatted_line)
    # Create temp dataframe and append it to main dataframe. pd.concat is used
    # because DataFrame.append is no longer available in pandas 2.x.
    temp_df = pd.DataFrame(data=data, columns=columns)
    df = pd.concat([df, temp_df])
    # Return the last iron category found in the text
    return iron_category

In [251]:
# Main dataframe: starts empty and is rebound by append_image_to_dataframe()
# (declared there as a global) each time a page image has been processed
df = pd.DataFrame([])

In [252]:
# Initialize progress bar
bar = progressbar.ProgressBar()

# Counter for the optional debug limiter below (disabled by default)
count = -1

# Loop through each auction and process the image data into the dataframe
for auction_ary in bar(auction_filenames):
    # Get date str: YY-MM-DD (first three dash-separated parts of the first page's name)
    date_str = "-".join(auction_ary[0].replace(".jpg", "").split("-")[0:3])
    # Construct date object; the two-digit year is offset by 2000
    year, month, day = map(int, date_str.split("-"))
    year = year + 2000
    date = datetime.date(year, month, day)
    ####################################
    # Debug limiter: uncomment to stop after `final_count` auctions.
    # final_count = 1
    # count += 1
    # if count == final_count:
    #     break
    ####################################
    # Category of livestock: Number of iron brands: 1st male, 2nd male, 1st female, 2nd female.
    # Reset per auction, then carried from one page to the next.
    iron_category = None
    # Process every page image of the auction
    for auction_filename in auction_ary:
        iron_category = append_image_to_dataframe(auction_filename, iron_category, date)

print("#### Error Analysis ####")
print("-------------------------------------")
print("Faulty lines: {0}".format(faulty_lines))
# total_lines is only incremented for lines that passed the column-count check
print("Total lines: {0}".format(total_lines))
print("=====================================")
if total_lines > 0:
    print("Percentage of faults: {0:.2f}%".format(faulty_lines*100/total_lines))
else:
    # Avoid ZeroDivisionError when no line was parsed successfully
    print("Percentage of faults: n/a (no lines were parsed)")

  0% (1 of 458) |                         | Elapsed Time: 0:00:04 ETA:  0:34:46

### Error in line: ###
DEST ML 2 1 18 2200 2200 2200 259600


  1% (5 of 458) |                         | Elapsed Time: 0:00:24 ETA:  0:36:39

### Error in line: ###
XX TO 1 1 522 2500 2740 2625 1399295
### Error in line: ###
DEST HL 10 1 13 800 2350 1733 246260


  1% (8 of 458) |                         | Elapsed Time: 0:00:37 ETA:  0:35:19

### Error in line: ###
BEST ML 2 1 19 2600 2600 2600 309400


  2% (11 of 458) |                        | Elapsed Time: 0:00:48 ETA:  0:30:44

### Error in line: ###
DEST ML 26 1 16 3000 3000 3000 348692


  2% (13 of 458) |                        | Elapsed Time: 0:00:54 ETA:  0:27:11

### Error in line: ###
2 MC 1 10 263 2800 2950 2883 757289


  3% (18 of 458) |                        | Elapsed Time: 0:01:13 ETA:  0:27:01

### Error in line: ###
XX BM 29 255 2450 2450 2450 6251 72


  4% (20 of 458) |#                       | Elapsed Time: 0:01:23 ETA:  0:28:25

### Error in line: ###
DEST HL 21 1 12 2300 2400 2350 265905


  5% (25 of 458) |#                       | Elapsed Time: 0:01:44 ETA:  0:32:18

### Error in line: ###
DEST HL 28 1 18 2500 2500 2500 294643


  5% (27 of 458) |#                       | Elapsed Time: 0:01:54 ETA:  0:32:37

### Error in line: ###
DEST HL 2 1 17 2250 2250 2250 263250


  6% (30 of 458) |#                       | Elapsed Time: 0:02:07 ETA:  0:30:02

### Error in line: ###
DEST HL 1 1 109 2450 2450 2450 267718
### Error in line: ###
XX VE 1 1 323 1950 2450 2255 755280


                                                                                 6% (31 of 458) |#                       | Elapsed Time: 0:02:10 ETA:  0:30:07

### Error in line: ###
DEST HL 45 1 18 2500 2950 2738 324329


  7% (33 of 458) |#                       | Elapsed Time: 0:02:19 ETA:  0:29:48

### Error in line: ###
DEST HL 25 1 18 2350 2600 2475 292492


  7% (35 of 458) |#                       | Elapsed Time: 0:02:29 ETA:  0:29:24

### Error in line: ###
1 3/4 ML 1 17 235 2950 3350 3225 766185


  8% (37 of 458) |#                       | Elapsed Time: 0:02:38 ETA:  0:29:28

### Error in line: ###
DEST HL 4 1 10 2500 2500 2500 275000
### Error in line: ###
DEST ML 24 1 11 3300 3300 3300 367125
### Error in line: ###
DEST HL 3 1 15 2350 2350 2350 269467


  8% (41 of 458) |##                      | Elapsed Time: 0:02:57 ETA:  0:31:28

### Error in line: ###
1 3/4 HL 232 1900 1900 1900 440800


 10% (47 of 458) |##                      | Elapsed Time: 0:03:23 ETA:  0:28:44

### Error in line: ###
DEST HL 24 1 16 2600 2700 2650 307800
### Error in line: ###
DEST ML 14 1 14 3200 3500 3350 372357


                                                                                10% (48 of 458) |##                      | Elapsed Time: 0:03:30 ETA:  0:32:12

### Error in line: ###
XX TO 7 460 2980 3060 3020 13991 14


 11% (51 of 458) |##                      | Elapsed Time: 0:03:43 ETA:  0:29:38

### Error in line: ###
DEST ML 2 1 19 2800 2800 2800 333200


 11% (53 of 458) |##                      | Elapsed Time: 0:03:52 ETA:  0:29:33

### Error in line: ###
DEST HL 1 1 18 2800 2800 2800 330400


 12% (57 of 458) |##                      | Elapsed Time: 0:04:10 ETA:  0:29:46

### Error in line: ###
2 1/4 MC 1 13 295 3080 3280 3207 949044


 14% (65 of 458) |###                     | Elapsed Time: 0:04:48 ETA:  0:29:38

### Error in line: ###
DEST HL 2 1 15 2950 2950 2950 339250
### Error in line: ###
DEST HL 5 1 14 2700 2700 2700 308880
### Error in line: ###
DEST ML 3 1 18 3050 3050 3050 359900


 16% (75 of 458) |###                     | Elapsed Time: 0:05:39 ETA:  0:31:06

### Error in line: ###
DEST ML 2 1 15 3400 3600 3500 401200


                                                                                16% (76 of 458) |###                     | Elapsed Time: 0:05:42 ETA:  0:31:01

### Error in line: ###
DEST ML 24 1 14 3300 3600 3467 403058


 17% (78 of 458) |####                    | Elapsed Time: 0:05:51 ETA:  0:30:28

### Error in line: ###
2 MC 1 12 260 3350 3400 3383 879504


 17% (80 of 458) |####                    | Elapsed Time: 0:06:00 ETA:  0:29:53

### Error in line: ###
DEST ML 5 1 10 2700 3450 3033 337780


 17% (82 of 458) |####                    | Elapsed Time: 0:06:13 ETA:  0:29:40

### Error in line: ###
DEST HL 29 1 17 3000 3100 3067 358462
### Error in line: ###
DEST ML 1 1 14 3300 3300 3300 376200


 18% (84 of 458) |####                    | Elapsed Time: 0:06:22 ETA:  0:29:49

### Error in line: ###
XX VP 1 458 2440 2440 2440 11 17520


 19% (90 of 458) |####                    | Elapsed Time: 0:06:51 ETA:  0:30:06

### Error in line: ###
XX TO 2 81 1 2800 2840 2820 2288400


 20% (92 of 458) |####                    | Elapsed Time: 0:07:00 ETA:  0:27:42

### Error in line: ###
DEST ML 5 1 17 3300 3350 3325 389020


 52% (241 of 458) |############           | Elapsed Time: 0:19:31 ETA:  0:17:41

### Error in line: ###
MC 10 1750000 1750000 1750000


 61% (283 of 458) |##############         | Elapsed Time: 0:23:01 ETA:  0:14:09

### Error in line: ###
VE 10 1900000 1900000 1900000


 72% (330 of 458) |################       | Elapsed Time: 0:26:28 ETA:  0:09:21

### Error in line: ###
HV 2 2075000 2075000 2075000


 83% (383 of 458) |###################    | Elapsed Time: 0:30:18 ETA:  0:05:09

### Error in line: ###
3 VE 37 434 2940 3400 3228 1 405944
### Error in line: ###
3 HV 2 449 3640 3640 3640 1 634360


 84% (387 of 458) |###################    | Elapsed Time: 0:30:35 ETA:  0:04:52

### Error in line: ###
2 VE 1 268 1800 1 800 1 800 482400


 85% (390 of 458) |###################    | Elapsed Time: 0:30:50 ETA:  0:05:10

### Error in line: ###
3 VE 80 466 3360 3580 3445 1 608620
### Error in line: ###
2 ML 20 269 4550 4550 4550 1 223950


                                                                                85% (391 of 458) |###################    | Elapsed Time: 0:30:56 ETA:  0:05:05

### Error in line: ###
1 1/3 ML 76 237 4350 4800 4520 1071 630


 86% (394 of 458) |###################    | Elapsed Time: 0:31:08 ETA:  0:04:57

### Error in line: ###
3 HV 2 459 3640 3640 3640 1 670760
### Error in line: ###
3 VE 89 477 3000 3460 3254 1 556306


                                                                                86% (395 of 458) |###################    | Elapsed Time: 0:31:14 ETA:  0:04:53

### Error in line: ###
2 MC 25 258 3750 4450 4033 1 042267


                                                                                86% (396 of 458) |###################    | Elapsed Time: 0:31:17 ETA:  0:04:49

### Error in line: ###
3 VE 28 505 3280 3480 3367 1 702000


                                                                                86% (397 of 458) |###################    | Elapsed Time: 0:31:23 ETA:  0:04:42

### Error in line: ###
2 3/4 VE 23 384 2680 3100 2850 1 093825


 87% (400 of 458) |####################   | Elapsed Time: 0:31:35 ETA:  0:04:08

### Error in line: ###
3 HV 1 464 3560 3560 3560 1651 840
### Error in line: ###
2 ML 24 252 4900 4900 4900 1 234800


                                                                                87% (401 of 458) |####################   | Elapsed Time: 0:31:40 ETA:  0:04:24

### Error in line: ###
3 HV 2 447 3580 3580 3580 1 600260
### Error in line: ###
3 VE 28 442 2320 3340 3002 1 327248


                                                                                87% (402 of 458) |####################   | Elapsed Time: 0:31:43 ETA:  0:03:56

### Error in line: ###
3 VE 5 488 3420 3620 3535 1 723785
### Error in line: ###
3 MC 30 415 4040 4060 4047 1 679347
### Error in line: ###
3 VE 40 534 3200 3700 3502 1 874829
### Error in line: ###
3 VP 1 484 2600 2600 2600 1 258400


                                                                                87% (403 of 458) |####################   | Elapsed Time: 0:31:50 ETA:  0:04:11

### Error in line: ###
3 VE 40 447 2620 3440 3065 1 378988


                                                                                88% (404 of 458) |####################   | Elapsed Time: 0:31:53 ETA:  0:03:50

### Error in line: ###
TO 250 15000 1 200000 277137


                                                                                88% (405 of 458) |####################   | Elapsed Time: 0:31:59 ETA:  0:04:08

### Error in line: ###
3 HV 2 407 3620 3620 3620 1 473340


 89% (408 of 458) |####################   | Elapsed Time: 0:32:12 ETA:  0:03:45

### Error in line: ###
2 HV 8 278 3900 3900 3900 1 084200


 90% (413 of 458) |####################   | Elapsed Time: 0:32:34 ETA:  0:03:28

### Error in line: ###
ML 136 770000 810000 796667
### Error in line: ###
2 MC 41 270 4700 4700 4700 1 271 350


 90% (415 of 458) |####################   | Elapsed Time: 0:32:44 ETA:  0:03:14

### Error in line: ###
3 VE 6 490 3620 4020 3820 1 873400
### Error in line: ###
3 TO 2 463 4260 4260 4260 1 972380
### Error in line: ###
3 TO 1 438 4000 4000 4000 1 752000
### Error in line: ###
3 TO 2 428 4340 4340 4340 1 857520


                                                                                90% (416 of 458) |####################   | Elapsed Time: 0:32:50 ETA:  0:03:12

### Error in line: ###
3 TO 3 535 2880 4100 3640 1 988640
### Error in line: ###
2 HV 8 252 3850 4000 3925 991 175
### Error in line: ###
3 HV 3 470 3940 3940 3940 1851 800


 91% (419 of 458) |#####################  | Elapsed Time: 0:33:05 ETA:  0:03:14

### Error in line: ###
2 MC 79 262 4660 5000 4852 1 270550
### Error in line: ###
3 TO 1 694 4340 4340 4340 3011 960


                                                                                91% (420 of 458) |#####################  | Elapsed Time: 0:33:12 ETA:  0:03:09

### Error in line: ###
2 HV 4 263 3300 3550 3425 901 650


 92% (422 of 458) |#####################  | Elapsed Time: 0:33:21 ETA:  0:03:07

### Error in line: ###
2 3/4 VE 39 382 3100 3640 3327 1 269100
### Error in line: ###
3 VE 38 422 2300 3720 3331 1 409267
### Error in line: ###
3 TO 1 430 4000 4000 4000 1 720000


 92% (424 of 458) |#####################  | Elapsed Time: 0:33:30 ETA:  0:02:56

### Error in line: ###
3 HV 2 402 3900 3900 3900 1 567800


 93% (426 of 458) |#####################  | Elapsed Time: 0:33:39 ETA:  0:02:33

### Error in line: ###
3 HV 1 426 3940 3940 3940 1 678440
### Error in line: ###
3 VE 33 430 2840 3600 3384 1 456720
### Error in line: ###
3 VP 16 423 3300 3550 3400 1 439333


                                                                                93% (427 of 458) |#####################  | Elapsed Time: 0:33:42 ETA:  0:02:17

### Error in line: ###
1 HL 14 142 4200 4300 4250 601 350


                                                                                93% (428 of 458) |#####################  | Elapsed Time: 0:33:48 ETA:  0:02:22

### Error in line: ###
2 MC 54 266 4200 4900 4662 1 242550
### Error in line: ###
3 TO 5 464 3900 3900 3900 1 809600


                                                                                93% (429 of 458) |#####################  | Elapsed Time: 0:33:51 ETA:  0:02:06

### Error in line: ###
3 VE 34 479 3460 4000 3722 1 792428
### Error in line: ###
2 HV 25 272 3600 4200 3933 1071 000


                                                                                93% (430 of 458) |#####################  | Elapsed Time: 0:33:57 ETA:  0:02:11

### Error in line: ###
3 MC 28 409 3760 4380 4047 1 653600
### Error in line: ###
3 TO 2 443 3900 4000 3950 1 748700


 95% (437 of 458) |#####################  | Elapsed Time: 0:34:26 ETA:  0:01:29

### Error in line: ###
3 HV 38 417 3640 3840 3747 1561 573


                                                                                95% (438 of 458) |#####################  | Elapsed Time: 0:34:32 ETA:  0:01:31

### Error in line: ###
2 3/4 VE 42 384 1 700 3220 2973 1141053
### Error in line: ###
1 1/3 ML 71 234 4250 4950 4675 1091 200


 96% (440 of 458) |###################### | Elapsed Time: 0:34:40 ETA:  0:01:21

### Error in line: ###
3 HV 1 402 3300 3300 3300 1 326600


                                                                                96% (441 of 458) |###################### | Elapsed Time: 0:34:43 ETA:  0:01:10

### Error in line: ###
3 VE 8 467 3000 3440 3225 1 509035
### Error in line: ###
3 VE 94 462 2700 3640 3323 1 532671


                                                                                96% (442 of 458) |###################### | Elapsed Time: 0:34:49 ETA:  0:01:12

### Error in line: ###
3 HV 1 440 3580 3580 3580 1 575200


                                                                                96% (443 of 458) |###################### | Elapsed Time: 0:34:52 ETA:  0:01:01

### Error in line: ###
2 3/4 VE 14 377 2700 3200 2867 1 077333
### Error in line: ###
1 1/2 ML 1 211 150000 150000 150000 31 650000
### Error in line: ###
3 VE 49 467 3000 3540 3300 1 547568
### Error in line: ###
3 VP 2 437 3200 3200 3200 1 398400


                                                                                96% (444 of 458) |###################### | Elapsed Time: 0:34:58 ETA:  0:01:02

### Error in line: ###
3 VE 20 430 2750 3420 3022 1 299340


                                                                                97% (445 of 458) |###################### | Elapsed Time: 0:35:01 ETA:  0:00:53

### Error in line: ###
3 HV 1 402 3340 3340 3340 1 342680
### Error in line: ###
3 VE 61 464 3260 3440 3336 1 547562
### Error in line: ###
3 MC 22 469 3900 4320 4073 1 906473


 98% (449 of 458) |###################### | Elapsed Time: 0:35:19 ETA:  0:00:38

### Error in line: ###
3 HV 3 489 3840 3840 3840 1 877760
### Error in line: ###
2 MC 78 270 4440 4800 4588 1 239664


                                                                                98% (450 of 458) |###################### | Elapsed Time: 0:35:25 ETA:  0:00:36

### Error in line: ###
3 VP 5 472 3350 3400 3375 1 593500


 99% (454 of 458) |###################### | Elapsed Time: 0:35:42 ETA:  0:00:18

### Error in line: ###
2 3/4 VE 45 369 2 800 3580 3132 1 153550


100% (458 of 458) |#######################| Elapsed Time: 0:35:59 Time: 0:35:59


#### Error Analysis ####
-------------------------------------
Faulty lines: 108
Total lines: 11301
Percentage of faults: 0.96%
