## Import Library

In [1]:
import os
import json
import pandas as pd

### Set pandas option

In [2]:
pd.set_option('display.max_columns', 100)
pd.set_option('display.max.colwidth', 100)

In [3]:
os.getcwd()

'/mnt/d/sjeon/BoostCamp_AI_Tech/main_course/Project/workspace/ocr_pj/sumin'

### Set Path

In [4]:
WORK_DIR = "/mnt/d/sjeon/BoostCamp_AI_Tech/main_course/Project/workspace/local"

DATA_DIR_HH = WORK_DIR + '/sample_data_HH'
DATA_DIR_NY = WORK_DIR + '/sample_data_NY'

info_path_hh = DATA_DIR_HH + '/info.json'
info_path_ny = DATA_DIR_NY + '/info.json'

In [5]:
def read_json(path: str) -> dict:
    with open(path, 'r', encoding = 'utf-8') as f:
        info = json.load(f)
    return info

In [6]:
info_hh = read_json(info_path_hh)
info_ny = read_json(info_path_ny)

---

## Category Settings
> 분류할 Category List   
추가 변경 가능성 존재  
참고 문서 : [Notion Page - Text Category](https://www.notion.so/jeongsu-823/OCR-Project-6a9215cf38df4bc1b3ce337d745b3a4c#55b18cfe810643d7bbab76b3cfa1fa94)
- '0': 'UNKNOWN', '1': 'name', '2': 'phone', '3': 'email', '4': 'position', '5': 'company', '6': 'department', '7': 'address', '8': 'site', '9': 'account', '10': 'wise'

In [7]:
CATEGORIES = info_hh['categories']
CATEGORIES

{'0': 'UNKNOWN',
 '1': 'name',
 '2': 'phone',
 '3': 'email',
 '4': 'position',
 '5': 'company',
 '6': 'department',
 '7': 'address',
 '8': 'site',
 '9': 'account',
 '10': 'wise'}

---

## Feature Engineering

### API Features Info
> API의 output features
- 현재는 OCR API output 과 generated Data Info 양식을 맞춰둔 상태

In [8]:
print(f"API Features Info: {list(info_hh['annotations'][0].keys())}")
print(f"API Features Info: {list(info_hh['annotations'][0]['ocr']['word'][0].keys())}")

API Features Info: ['image_id', 'ocr']
API Features Info: ['category_id', 'points', 'orientation', 'text']


In [81]:
features_origin = ['file_name', 'category_id', 'points', 'point_1', 'point_2', 'point_3', 'point_4', 'orientation', 'text']
features_origin

['file_name',
 'category_id',
 'points',
 'point_1',
 'point_2',
 'point_3',
 'point_4',
 'orientation',
 'text']

In [104]:
# json to DataFrame 
def convert_to_dataframe(info: dict, features: list=features_origin) -> pd.DataFrame:
    image_len = len(info['images'])
    df_namecard = pd.DataFrame(columns = features)

    for idx in range(image_len):
        # image_id = info['annotations'][idx]['image_id']
        file_name = info['images'][idx]['file']
        words = info['annotations'][idx]['ocr']['word']
        temp_dict = {}
        
        for word in words:
            category_id = word['category_id']
            points = word['points']
            point_1, point_2, point_3, point_4 = points
            # print(points)
            # print(point_1, point_2, point_3, point_4)
            # break
            orientation = word['orientation']
            text = word['text']
            temp_dict = {
                'file_name' : file_name,
                'category_id' : category_id,
                'points' : [points],
                'point_1' : point_1,
                'point_2' : point_2,
                'point_3' : point_3,
                'point_4' : point_4,
                'orientation' : orientation,
                'text' : text
            }
            print(df_namecard)
            df_namecard = pd.concat([df_namecard, pd.DataFrame(temp_dict)])
    
    return df_namecard

In [105]:
df_namecard_hh_origin = convert_to_dataframe(info_hh)
df_namecard_ny_origin = convert_to_dataframe(info_ny)

Empty DataFrame
Columns: [file_name, category_id, points, point_1, point_2, point_3, point_4, orientation, text]
Index: []


ValueError: All arrays must be of the same length

In [91]:
df_namecard_origin = pd.concat([df_namecard_hh_origin, df_namecard_ny_origin])
df_namecard_origin.head()

Unnamed: 0,file_name,category_id,points,point_1,point_2,point_3,point_4,orientation,text
0,0000.png,1,"[[81, 224.0], [294, 224.0], [294, 276.0], [81, 276.0]]",,,,,Horizontal,정윤서
0,0000.png,4,"[[339.0, 233.0], [372.0, 233.0], [372.0, 276.0], [339.0, 276.0]]",,,,,Horizontal,SI
0,0000.png,6,"[[303.0, 176.0], [372.0, 176.0], [372.0, 202.0], [303.0, 202.0]]",,,,,Horizontal,지원팀
0,0000.png,5,"[[446, 65], [801, 65], [801, 146], [446, 146]]",,,,,Horizontal,지니하우스
0,0000.png,10,"[[83, 41], [801, 41], [801, 65], [83, 65]]",,,,,Horizontal,만약 우리가 할 수 있는 일을 모두 한다면 우리들은 우리자신에 깜짝 놀랄 것이다.


## Feature Engineering
> 학습을 위한 Feature Engineering  
- Points 기반 features : width, height, ratio, area  
- Text 기반 features : text, include_AT_SIGN, phone_type_text, is_alpha, is_alnum, text_length

In [39]:
# Points based Features
features_by_points = ['point_1', 'point_2', 'point_3', 'point_4', 'points', 'width', 'height', 'ratio(h/w)', 'area']

# Text based Features
features_by_text = ['orientation', 'text', 'include_AT_SIGN', 'phone_type_text', 'is_alpha', 'is_alnum', 'text_length']

# All Features
features = features_by_points + features_by_text

# label
label = ['category_id']

In [40]:
features

['point_1',
 'point_2',
 'point_3',
 'point_4',
 'width',
 'height',
 'ratio(h/w)',
 'area',
 'orientation',
 'text',
 'include_AT_SIGN',
 'phone_type_text',
 'is_alpha',
 'is_alnum',
 'text_length']

### Feature Engineering function

- FeatureEngineering class 로 만들 예정

In [15]:
# width (가로 길이) : 시계 방향 기준 첫 번째 좌표와 두 번째 좌표의 x 값 차이

def calculate_width(points: list) -> float:
    point_1, point_2, point_3, point_4 = points
    width = abs(point_2[0] - point_1[0])
    
    return width

In [16]:
# height (세로 길이) : 시계 방향 기준 첫 번째 좌표와 네 번째 좌표의 y 값 차이

# Calculate Height
def calculate_height(points: list) -> float:
    point_1, point_2, point_3, point_4 = points
    height = abs(point_1[1] - point_4[1])
    
    return height

In [17]:
# ratio(h/w) (가로 길이에 대한 세로 길이의 비율) : 세로 길이 / 가로 길이

def calculate_ratio(width: float, height: float) -> float:
    ratio = height / width
    
    return ratio

In [18]:
# include_AT_SIGN : text 안에 '@' 이 포함된 경우 1, 아닌 경우 0

def check_include_at_sign(text: str) -> int:
    '''
        Check '@' is included
    '''
    if '@' in text:
        return 1
    else:
        return 0

In [19]:
# phone_type_text : 숫자 or '.' or '+' or '(' or ')' or '-' or ' ' 만 포함된 경우 1, 아닌 경우 0

def check_phone_type_text(text: str) -> int:
    '''
        Verify it is phone type text
    '''
    phone_type_char = '0123456789.+()- '
    
    for c in text:
        if c not in phone_type_char:
            return 0
    return 1

In [20]:
# is_alpha : Text 구성이 알파벳 또는 한글로만 이루어진 경우

def check_is_alpha(text: str) -> int:
    if text.isalpha():
        return 1
    else:
        return 0

In [21]:
# is_alnum : 알파벳 또는 한글 또는 숫자로만 이루어진 경우

def check_is_alnum(text: str) -> int:
    if text.isalnum():
        return 1
    else:
        return 0

In [22]:
# text_length : Text 의 길이

def calculate_text_length(text: str) -> int:
    text_length = len(text)
    
    return text_length

In [41]:
width = df_namecard_origin['points'].transform(calculate_width)
height = df_namecard_origin['points'].transform(calculate_height)
text = df_namecard_origin['text']

### Feature Engineering Output 생성
> df_namcard: DataFrame -> tab_data: np.array

In [42]:
df_namecard = pd.DataFrame()

In [43]:
# features by points

df_namecard['width'] = width
df_namecard['height'] = height
df_namecard['ratio(h/w)'] = height / width
df_namecard['area'] = height * width

In [44]:
# features by text

df_namecard['include_AT_SIGN'] = text.transform(check_include_at_sign)
df_namecard['phone_type_text'] = text.transform(check_phone_type_text)
df_namecard['is_alpha'] = text.transform(check_is_alpha)
df_namecard['is_alnum'] = text.transform(check_is_alnum)
df_namecard['text_length'] = text.transform(calculate_text_length)

In [45]:
# label

df_namecard['category_id'] = df_namecard_origin['category_id']

In [46]:
df_namecard.tail()

Unnamed: 0,width,height,ratio(h/w),area,include_AT_SIGN,phone_type_text,is_alpha,is_alnum,text_length,category_id
0,75.0,26.0,0.346667,1950.0,0,0,1,1,3,1
0,130.0,18.0,0.138462,2340.0,0,1,0,0,12,2
0,266.0,23.0,0.086466,6118.0,1,0,0,0,23,3
0,190.0,24.0,0.126316,4560.0,0,0,0,0,18,8
0,89.0,19.0,0.213483,1691.0,0,1,0,0,8,2


In [69]:
tab_data = df_namecard.to_numpy()

In [70]:
tab_data

array([[213.0, 52.0, 0.24413145539906103, ..., 1, 3, 1],
       [33.0, 43.0, 1.303030303030303, ..., 1, 2, 4],
       [69.0, 26.0, 0.37681159420289856, ..., 1, 3, 6],
       ...,
       [266.0, 23.0, 0.08646616541353383, ..., 0, 23, 3],
       [190.0, 24.0, 0.12631578947368421, ..., 0, 18, 8],
       [89.0, 19.0, 0.21348314606741572, ..., 0, 8, 2]], dtype=object)