In [None]:
!sudo apt install tesseract-ocr
!sudo apt-get install tesseract-ocr-kor
!pip install pytesseract==0.3.9

[sudo] password for cvlabserver: 

In [None]:
import cv2
import re
import pandas as pd
import numpy as np
import pytesseract
from pytesseract import Output
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

### Data Load

In [None]:
# Read the train and test tables from the working directory, in that order.
train_df, test_df = (
    pd.read_csv(csv_path) for csv_path in ('./train.csv', './test.csv')
)

### Metric

In [None]:
def get_accuracy(answer_df, predict_df):
    """Score predicted text against reference text with ``accuracy_score``.

    Rows are compared by position; only the ``text`` column of each frame is read.

    Args:
        answer_df: DataFrame whose ``text`` column holds the reference strings.
        predict_df: DataFrame whose ``text`` column holds the predicted strings.

    Returns:
        The value returned by ``accuracy_score(reference, predicted)``.
    """
    expected_texts = answer_df['text'].values
    predicted_texts = predict_df['text'].values
    return accuracy_score(expected_texts, predicted_texts)

### PyTesseract Model

In [None]:
class PyTesseract:
    """Small wrapper around ``pytesseract`` that reads text from image files.

    Instance attributes (set in ``__init__``):
        lang: Tesseract language code passed to every OCR call.
        config: Extra Tesseract flags passed to every OCR call.
    """

    # Punctuation/symbols stripped from OCR output.
    # Written as a raw string so the escapes are regex escapes, not Python string escapes:
    # `\\` puts a literal backslash in the set (the earlier non-raw form collapsed it into
    # an escape for the following quote character, so backslashes were never removed).
    _PUNCTUATION = re.compile(r"""[-=+,#/?:^$.@*"※~&%ㆍ!』\\‘|()\[\]<>`'…》]""")

    def __init__(self, lang='kor', config=''):
        """Store the OCR settings.

        Args:
            lang: Tesseract language code (default ``'kor'``).
            config: Additional Tesseract command-line flags, e.g. a page
                segmentation mode. The empty default leaves Tesseract's
                behaviour unchanged.
        """
        self.lang = lang
        self.config = config

    def load_image(self, img_path):
        """Read an image file and return it as an RGB array.

        Args:
            img_path: Path of the image file to read.

        Returns:
            The image with channels reordered from OpenCV's BGR to RGB.

        Raises:
            FileNotFoundError: If OpenCV could not read the file.
        """
        img = cv2.imread(img_path)
        if img is None:
            # cv2.imread reports failure by returning None rather than raising;
            # fail here with the offending path instead of inside cvtColor.
            raise FileNotFoundError(
                f'Could not read image (missing or unreadable file): {img_path}'
            )
        return cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

    def text_preprocessing(self, text):
        """Normalise raw OCR output.

        Newlines are removed (lines are joined without a separator), the
        characters in ``_PUNCTUATION`` are deleted, and surrounding whitespace
        is trimmed.

        Args:
            text: Raw string returned by the OCR engine.

        Returns:
            The cleaned string.
        """
        text = text.replace('\n', '')
        text = self._PUNCTUATION.sub('', text)
        return text.strip()

    def prediction(self, img_path_list):
        """Run OCR on every image path and return the cleaned texts.

        Args:
            img_path_list: Iterable of image file paths.

        Returns:
            A list of cleaned strings, one per path, in input order.

        Raises:
            FileNotFoundError: Propagated from ``load_image`` for unreadable files.
        """
        preds = []
        for img_path in tqdm(img_path_list):
            img = self.load_image(img_path)
            raw_text = pytesseract.image_to_string(
                img, lang=self.lang, config=self.config
            )
            preds.append(self.text_preprocessing(raw_text))
        print('Done.')
        return preds

### Define PyTesseract

In [None]:
# Build the OCR wrapper; 'kor' is also the constructor's default language.
tesseract_model = PyTesseract(lang='kor')

### Validation (=Train dataset)

In [None]:
# OCR every training image; predictions come back in the same order as the paths.
train_img_paths = train_df['img_path'].values
train_predicts = tesseract_model.prediction(train_img_paths)

In [None]:
# New frame with the same rows as train_df, but `text` replaced by the OCR output.
train_predict_df = train_df.assign(text=train_predicts)
train_accuracy = get_accuracy(train_df, train_predict_df)
print('Train Accuracy : ', train_accuracy)

In [None]:
# Edge detection
## Set this path to match your own environment.
canny_image_path = "./train/train_00001.png"
image_gray = cv2.imread(canny_image_path, cv2.IMREAD_GRAYSCALE)
if image_gray is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError(f"Could not read image: {canny_image_path}")

# Thresholds sit a fixed fraction (sigma) below and above the median intensity,
# clamped to the 0..255 range.
sigma = 0.33
median_intensity = np.median(image_gray)
lower_threshold = int(max(0, (1.0 - sigma) * median_intensity))
upper_threshold = int(min(255, (1.0 + sigma) * median_intensity))

In [None]:
# Run Canny on the grayscale image and display the resulting edge map.
image_canny = cv2.Canny(image_gray, lower_threshold, upper_threshold)

canny_fig, canny_ax = plt.subplots()
canny_ax.imshow(image_canny, cmap='gray')
plt.show()

In [None]:
# Background removal (image binarization)
binarize_image_path = './train/train_00117.png'
image_grey = cv2.imread(binarize_image_path, cv2.IMREAD_GRAYSCALE)
if image_grey is None:
    # cv2.imread returns None instead of raising; stop here rather than in a later cell.
    raise FileNotFoundError(f'Could not read image: {binarize_image_path}')

In [None]:
# Apply adaptive thresholding
max_output_value = 255    # maximum output pixel intensity
neighborhood_size = 99    # passed to OpenCV as blockSize
subtract_from_mean = 10   # passed to OpenCV as C
image_binarized = cv2.adaptiveThreshold(
    src=image_grey,
    maxValue=max_output_value,
    adaptiveMethod=cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
    thresholdType=cv2.THRESH_BINARY,
    blockSize=neighborhood_size,
    C=subtract_from_mean,
)


In [None]:
# Display the binarized image.
binarized_fig, binarized_ax = plt.subplots()
binarized_ax.imshow(image_binarized, cmap='gray')
plt.show()

In [None]:
# Submission
# Run OCR on the test images in this cell: no other cell creates `test_predicts`,
# so without this line the assignment below fails with NameError on a fresh kernel.
# NOTE(review): assumes test.csv has an `img_path` column like train.csv and that
# sample_submission.csv lists rows in the same order as test.csv — confirm.
test_predicts = tesseract_model.prediction(test_df['img_path'].values)

submit = pd.read_csv('./sample_submission.csv')
submit['text'] = test_predicts
submit.to_csv('./submit.csv', index=False, encoding="utf-8-sig")