In [1]:
# import libs
try:
    from PIL import Image
except ImportError:
    import Image
import cv2
import pytesseract
import os
import numpy as np
import pandas as pd
import re
from pdf2image import convert_from_bytes

In [2]:
# Some help functions 
def get_conf(page_gray):
    '''return a average confidence value of OCR result '''
    df = pytesseract.image_to_data(page_gray,output_type='data.frame')
    df.drop(df[df.conf==-1].index.values,inplace=True)
    df.reset_index()
    return df.conf.mean()

In [3]:
def deskew(image):
    '''deskew the image'''
    gray = cv2.bitwise_not(image)
    temp_arr = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    coords = np.column_stack(np.where(temp_arr > 0))
    angle = cv2.minAreaRect(coords)[-1]
    if angle < -45:
        angle = -(90 + angle)
    else:
        angle = -angle
    (h, w) = image.shape[:2]
    center = (w // 2, h // 2)
    M = cv2.getRotationMatrix2D(center, angle, 1.0)
    rotated = cv2.warpAffine(image, M, (w, h), flags=cv2.INTER_CUBIC, borderMode=cv2.BORDER_REPLICATE)
    return rotated

In [4]:
'''
Main part of OCR:
pages_df: save extracted text for each pdf file, index by page
OCR_dic : dict for saving df of each pdf, filename is the key
'''

def extract_text_with_ocr(file_list, dir_source="."):
    OCR_dic={} 
    for file in file_list:
        if file.endswith(".pdf"):
            # convert pdf into image
            print(os.path.join(dir_source + "/" + file))
            pdf_file = convert_from_bytes(open(os.path.join(dir_source + "/" + file), 'rb').read())
            # create a df to save each pdf's text
            pages_df = pd.DataFrame(columns=['conf','text'])
            for (i,page) in enumerate(pdf_file) :
                try:
                    # transfer image of pdf_file into array
                    page_arr = np.asarray(page)
                    # transfer into grayscale
                    page_arr_gray = cv2.cvtColor(page_arr,cv2.COLOR_BGR2GRAY)
                    # deskew the page
                    page_deskew = deskew(page_arr_gray)
                    # cal confidence value
                    page_conf = get_conf(page_deskew)
                    # extract string 
                    pages_df = pd.concat([pages_df, pd.DataFrame([{'conf': page_conf,'text': pytesseract.image_to_string(page_deskew)}])], ignore_index=True)
                except:
                    # if can't extract then give some notes into df
                    pages_df =  pd.concat([pages_df, pd.DataFrame([{'conf': -1,'text': 'N/A'}])], ignore_index=True)
                    continue
            # save df into a dict with filename as key        
            OCR_dic[file]=pages_df
            print('{} is done'.format(file))
    return OCR_dic

In [5]:
source_dir = "./output_pdfs"

In [6]:
from os import listdir
from os.path import isfile, join

In [7]:
files = [f for f in listdir(source_dir) if isfile(join(source_dir, f))]

In [8]:
ocr = extract_text_with_ocr(files, source_dir)

./output_pdfs/New_working doc_RFP.pdf
New_working doc_RFP.pdf is done
./output_pdfs/Reference_Jibestream - responses.pdf
Reference_Jibestream - responses.pdf is done
./output_pdfs/Jibestream responses to MemorialCare Questions.pdf
Jibestream responses to MemorialCare Questions.pdf is done
./output_pdfs/Answers to questions.pdf
Answers to questions.pdf is done
./output_pdfs/Response-RFP 956.pdf
Response-RFP 956.pdf is done
./output_pdfs/16.07Jibestream Answers_IngkaQuestionnaire.pdf
16.07Jibestream Answers_IngkaQuestionnaire.pdf is done
./output_pdfs/Siemens_AAH RFP Submission - Location Based Services 2-22-19.pdf
Siemens_AAH RFP Submission - Location Based Services 2-22-19.pdf is done
./output_pdfs/Jibestream response V1.pdf
Jibestream response V1.pdf is done
./output_pdfs/Responses to Jibestream questions 2020-02-18.pdf
Responses to Jibestream questions 2020-02-18.pdf is done
./output_pdfs/Answered-Inpixon New Supplier & Service[2] DW.pdf
Answered-Inpixon New Supplier & Service[2] DW.

In [9]:
ocr

{'New_working doc_RFP.pdf':          conf                                               text
 0   54.623063  aoc\nech © 941 AN, Fr Oo 3:28 pe aC\nCancet oo...
 1   95.797235  Executive Summary\n\nSonae Sierra is seeking p...
 2   89.485972  ABOUT JIBESTREAM\n\nJibestream is the premier ...
 3   91.209774  Jibestream’s Notable Clients\n\nWestfield\n\n|...
 4   86.269008  Commercially Deployed Examples (Retail)\n\nOne...
 5   95.028154  Scope of Work\n\nFRONTEND REQUIREMENTS\n\nREQ ...
 6   88.418675  REQ 1.7 Search per store, tag, brand, category...
 7   91.118370  [7] Centrepoint ]\n\n@e—9 Ko\n\nAbercrombie .....
 8   91.275464  Turn by Turn indoor navigation is fully compat...
 9   94.685586  See image below from our AR demo:\n\nREQ 1.12 ...
 10  90.912575  OAKBROOK CENTER\n\n‘A GOP PROPERTY\n\nSTORES &...
 11  86.866949  Jibestream allows you to add any object to the...
 12  87.050246  REQ 2.3 Temporary changes to the floor plan: F...
 13  84.035828  Map Team > Dovercourt > Map Edito

In [10]:
ocr["New_working doc_RFP.pdf"]

Unnamed: 0,conf,text
0,54.623063,"aoc\nech © 941 AN, Fr Oo 3:28 pe aC\nCancet oo..."
1,95.797235,Executive Summary\n\nSonae Sierra is seeking p...
2,89.485972,ABOUT JIBESTREAM\n\nJibestream is the premier ...
3,91.209774,Jibestream’s Notable Clients\n\nWestfield\n\n|...
4,86.269008,Commercially Deployed Examples (Retail)\n\nOne...
5,95.028154,Scope of Work\n\nFRONTEND REQUIREMENTS\n\nREQ ...
6,88.418675,"REQ 1.7 Search per store, tag, brand, category..."
7,91.11837,[7] Centrepoint ]\n\n@e—9 Ko\n\nAbercrombie .....
8,91.275464,Turn by Turn indoor navigation is fully compat...
9,94.685586,See image below from our AR demo:\n\nREQ 1.12 ...


In [11]:
list(ocr.keys())

['New_working doc_RFP.pdf',
 'Reference_Jibestream - responses.pdf',
 'Jibestream responses to MemorialCare Questions.pdf',
 'Answers to questions.pdf',
 'Response-RFP 956.pdf',
 '16.07Jibestream Answers_IngkaQuestionnaire.pdf',
 'Siemens_AAH RFP Submission - Location Based Services 2-22-19.pdf',
 'Jibestream response V1.pdf',
 'Responses to Jibestream questions 2020-02-18.pdf',
 'Answered-Inpixon New Supplier & Service[2] DW.pdf']

In [12]:
content = ""

In [13]:
for key in ocr.keys():
    for index, row in ocr[key].iterrows():
        if row['conf'] > 80:
            content += row['text']

In [14]:
parse = content.replace("\n", " ")

In [15]:
#open text file
text_file = open("./train.pdf", "w")

In [16]:
#write string to file
text_file.write(parse)
#close file
text_file.close()