In [1]:
import cv2
import re
import pandas as pd
import numpy as np
import easyocr
from tqdm.auto import tqdm
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [29]:
train_df

Unnamed: 0,img_path,text
0,./train/train_00001.png,골목미용실
1,./train/train_00002.png,한성부동산
2,./train/train_00003.png,홍라운지
3,./train/train_00004.png,모단걸응접실
4,./train/train_00005.png,씨앗양식
...,...,...
12154,./train/train_12161.png,넬슨
12155,./train/train_12162.png,앤드루 램버트 지음
12156,./train/train_12163.png,박아람 옮김
12157,./train/train_12164.png,판결의


In [30]:
test_df

Unnamed: 0,img_path
0,./test/test_00001.png
1,./test/test_00002.png
2,./test/test_00003.png
3,./test/test_00004.png
4,./test/test_00005.png
...,...
3664,./test/test_03669.png
3665,./test/test_03670.png
3666,./test/test_03671.png
3667,./test/test_03672.png


In [31]:
def get_accuracy(answer_df, predict_df):
    return accuracy_score(answer_df['text'].values, predict_df['text'].values)

In [43]:
class EasyOCR:
    def __init__(self, lang='kor'):
        self.lang = lang
    
    def load_image(self, img_path):
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        return img

    def text_preprocessing(self, text):
        result = ''
        for word in text:
            word = word.replace('\n', '')
            word = re.sub('[-=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'…》]','', word)
            word = word.strip()
            result += word
        return result
    
    def prediction(self, img_path_list):
        preds = []
        for img_path in tqdm(img_path_list):
            img = self.load_image(img_path)
            reader = easyocr.Reader(['ko', 'en'], gpu = False, verbose = False)
            text = reader.readtext(img, detail = 0)
            text = self.text_preprocessing(text)
            preds.append(text)
        print('Done.')
        return preds

In [44]:
easyocr_model = EasyOCR()

In [45]:
train_predicts = easyocr_model.prediction(train_df['img_path'].values)

  0%|          | 0/12159 [00:00<?, ?it/s]

Done.


In [46]:
train_predicts

['골목미용실',
 '굶패도동설',
 '용라 운지',
 '모단결접실',
 '빼앗일식',
 '하정',
 '미소한의원',
 '화원',
 '가구',
 '백화 점',
 'D7위기',
 '오무큼 V식',
 '1가츠반사',
 '저온숙성 수제가춧전문점',
 '상산 치 과',
 '유항오리',
 '방수물',
 '하우뵈',
 '좌작나무',
 '김강',
 '포장',
 '줄기',
 '조 헤어산',
 '까끄미 조아',
 '영웬',
 '테익이운신',
 '헬',
 '부동산OI',
 '물',
 '선우약 국',
 '및양',
 '돼지국밥',
 '세신',
 '전기',
 '피노키오',
 '유경하어',
 '이진해어',
 '철거 전문',
 '성주자원',
 '상상 필럽',
 '스러리카페',
 '방',
 '그레',
 '마트',
 '바로',
 '머리',
 '연',
 '줄',
 '운가네',
 '떠리',
 '국구',
 '물리즈 아카데미',
 ';원즈줄인지인',
 '월 친공구 절',
 '똑동',
 '회원',
 '주',
 '처문 식물',
 '유동',
 '노래방',
 '전설의그집',
 '나주옥',
 '최고등급',
 '드래곤보물',
 '카페트아야',
 '토발숙 녀복',
 '오렌지',
 '동 창 산 업',
 '심 클 렉 선',
 '더',
 'DooO선',
 '내용에악몽',
 '한강정밀',
 '수노',
 '훈 오때 스텔',
 '바이린',
 '취미늄음악학원',
 '달름제',
 '노레업',
 '숙약',
 '호습희망',
 '공인충개사',
 '바른야기반찬',
 '친방학원',
 '화인마취동증의학과',
 '호정한문교실',
 '물이언z인',
 '상승인데리어',
 '구쪽X',
 '핸디맨트터',
 '옷',
 '만 드논 집',
 '헤',
 '원',
 '빌',
 '민 물 장어',
 '패밀리',
 '꼬마 김밥',
 '신불당점',
 '티니',
 '타이마사지',
 '김라전예어',
 '벽창호',
 '팀성',
 '분 통 산',
 '글로벌여빵',
 '장생',
 '공부방',
 '임름픽',
 'f기솟',
 '스나이움',
 '상소관',
 '팀다',
 '

In [48]:
train_predict_df = train_df.copy()
train_predict_df['text'] = train_predicts
print('Train Accuracy : ', get_accuracy(train_df, train_predict_df))

Train Accuracy :  0.4598239986841023


In [50]:
test_predicts = easyocr_model.prediction(test_df['img_path'].values)

  0%|          | 0/3669 [00:00<?, ?it/s]

Done.


In [51]:
submit = pd.read_csv('./sample_submission.csv')
submit['text'] = test_predicts
submit.to_csv('./easyocr01.csv', index=False, encoding="utf-8-sig")