In [4]:
# Load the train / dev / test CSV splits as pandas DataFrames
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

# Read the three dataset splits in one pass; dev.csv is bound to `valid`.
train, valid, test = (
    pd.read_csv(csv_name) for csv_name in ('train.csv', 'dev.csv', 'test.csv')
)

In [5]:
train.shape, valid.shape, test.shape

((9324, 6), (550, 6), (1100, 4))

In [6]:
train.shape[0] + valid.shape[0] + test.shape[0]

10974

In [7]:
# Stack the three splits row-wise under a fresh 0..n-1 index. Columns that a
# split does not have come out as NaN for that split's rows.
dataset = pd.concat(objs=(train, valid, test), axis=0, ignore_index=True)
dataset.shape

(10974, 6)

In [8]:
dataset.isnull().sum()

id                 0
source             0
sentence_1         0
sentence_2         0
label           1100
binary-label    1100
dtype: int64

In [9]:
dataset.head()

Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label
0,boostcamp-sts-v1-train-000,nsmc-sampled,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2,0.0
1,boostcamp-sts-v1-train-001,slack-rtt,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2,1.0
2,boostcamp-sts-v1-train-002,petition-sampled,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4,0.0
3,boostcamp-sts-v1-train-003,slack-sampled,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0,1.0
4,boostcamp-sts-v1-train-004,slack-sampled,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0,0.0


In [10]:
import transformers
from tqdm import tqdm

tokenizer = transformers.AutoTokenizer.from_pretrained('klue/roberta-base', max_length=128)

def tokenizing(self, dataframe, max_length=None):
    """Tokenize every sentence pair and return token ids plus token strings.

    Args:
        self: The tokenizer to use. Despite the name this is a plain function,
            not a method: callers pass the tokenizer as the first positional
            argument (``tokenizing(tokenizer, dataset)``). It must be callable
            on a string and provide ``convert_ids_to_tokens``.
        dataframe: Frame with ``sentence_1`` and ``sentence_2`` columns.
        max_length: Optional length forwarded to the tokenizer call, which is
            invoked with ``padding='max_length'`` and ``truncation=True``.
            ``None`` (the default) leaves the length to the tokenizer, exactly
            as before this parameter existed.

    Returns:
        A ``(data, tokens_data)`` tuple of two lists with one entry per row:
        the ``input_ids`` of the encoded pair, and the same ids converted back
        to token strings.
    """
    data = []
    tokens_data = []  # token strings, one list per row
    sentence_pairs = zip(dataframe['sentence_1'], dataframe['sentence_2'])
    for sentence_1, sentence_2 in tqdm(sentence_pairs, desc='tokenizing', total=len(dataframe)):
        # Join the two input sentences with a literal [SEP] marker before encoding.
        text = '[SEP]'.join([sentence_1, sentence_2])
        # Use the tokenizer that was passed in rather than a notebook global,
        # so the function works with whichever tokenizer the caller supplies.
        outputs = self(
            text,
            add_special_tokens=True,
            padding='max_length',
            truncation=True,
            max_length=max_length,
        )
        data.append(outputs['input_ids'])
        # Convert the ids back to token strings so [UNK] can be spotted by eye.
        tokens_data.append(self.convert_ids_to_tokens(outputs['input_ids']))
    return data, tokens_data


data, tokens = tokenizing(tokenizer, dataset)

  from .autonotebook import tqdm as notebook_tqdm
tokenizing: 100%|██████████| 10974/10974 [00:06<00:00, 1744.18it/s]


In [11]:
# make a new column 'tokens' in the dataframe
dataset['tokens'] = tokens

In [12]:
dataset.shape

(10974, 7)

In [13]:
dataset.head(10)

Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label,tokens
0,boostcamp-sts-v1-train-000,nsmc-sampled,스릴도있고 반전도 있고 여느 한국영화 쓰레기들하고는 차원이 다르네요~,"반전도 있고,사랑도 있고재미도있네요.",2.2,0.0,"[[CLS], 스릴, ##도, ##있, ##고, 반전, ##도, 있, ##고, 여느..."
1,boostcamp-sts-v1-train-001,slack-rtt,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2,1.0,"[[CLS], 앗, 제, ##가, 접근, ##권, ##한, ##이, 없, ##다고,..."
2,boostcamp-sts-v1-train-002,petition-sampled,주택청약조건 변경해주세요.,주택청약 무주택기준 변경해주세요.,2.4,0.0,"[[CLS], 주택, ##청, ##약, ##조건, 변경, ##해, ##주, ##세요..."
3,boostcamp-sts-v1-train-003,slack-sampled,입사후 처음 대면으로 만나 반가웠습니다.,화상으로만 보다가 리얼로 만나니 정말 반가웠습니다.,3.0,1.0,"[[CLS], 입사, ##후, 처음, 대면, ##으로, 만나, 반가웠, ##습, #..."
4,boostcamp-sts-v1-train-004,slack-sampled,뿌듯뿌듯 하네요!!,꼬옥 실제로 한번 뵈어요 뿌뿌뿌~!~!,0.0,0.0,"[[CLS], 뿌듯, ##뿌, ##듯, 하, ##네, ##요, !, !, [SEP]..."
5,boostcamp-sts-v1-train-005,nsmc-rtt,오마이가뜨지져스크롸이스트휏,오 마이 갓 지저스 스크론 이스트 팬,2.6,1.0,"[[CLS], [UNK], [SEP], 오, 마이, 갓, 지저, ##스, 스크, #..."
6,boostcamp-sts-v1-train-006,slack-rtt,전 암만 찍어도 까만 하늘.. ㅠㅠ,암만 찍어도 하늘은 까맣다.. ㅠㅠ,3.6,1.0,"[[CLS], 전, 암, ##만, 찍, ##어도, 까만, 하늘, ., ., ㅠㅠ, ..."
7,boostcamp-sts-v1-train-007,nsmc-sampled,이렇게 귀여운 쥐들은 처음이네요.ㅎㅎㅎ,이렇게 지겨운 공포영화는 처음..,0.6,0.0,"[[CLS], 이렇게, 귀여운, 쥐, ##들, ##은, 처음, ##이, ##네, #..."
8,boostcamp-sts-v1-train-008,petition-sampled,미세먼지 해결이 가장 시급한 문제입니다!,가장 시급한 것이 신생아실 관리입니다!!!,0.4,0.0,"[[CLS], 미세먼지, 해결, ##이, 가장, 시급, ##한, 문제, ##입니다,..."
9,boostcamp-sts-v1-train-009,petition-sampled,크림하우스 환불조치해주세요.,크림하우스 환불조치할 수 있도록해주세여,4.2,1.0,"[[CLS], 크림, ##하우스, 환불, ##조치, ##해, ##주, ##세요, ...."


In [14]:
# Collect the index labels of every row whose token list contains '[UNK]',
# then pull those rows out into their own frame.
unk_list = [
    row_label
    for row_label, row_tokens in dataset['tokens'].items()
    if '[UNK]' in row_tokens
]

unk_df = dataset.loc[unk_list]
unk_df

Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label,tokens
1,boostcamp-sts-v1-train-001,slack-rtt,앗 제가 접근권한이 없다고 뜹니다;;,"오, 액세스 권한이 없다고 합니다.",4.2,1.0,"[[CLS], 앗, 제, ##가, 접근, ##권, ##한, ##이, 없, ##다고,..."
5,boostcamp-sts-v1-train-005,nsmc-rtt,오마이가뜨지져스크롸이스트휏,오 마이 갓 지저스 스크론 이스트 팬,2.6,1.0,"[[CLS], [UNK], [SEP], 오, 마이, 갓, 지저, ##스, 스크, #..."
158,boostcamp-sts-v1-train-158,slack-rtt,처음 뵌 분들과 빠르게 친해질 수 있을 것 같은 느낌!,처음 만나는 사람들과 금방 친해질 수 있는 것 같아요!,3.4,1.0,"[[CLS], 처음, [UNK], 분, ##들, ##과, 빠르, ##게, 친해, #..."
187,boostcamp-sts-v1-train-187,slack-sampled,와아아아안전 좋아요오오,꺄오오올!!!!! 환영합니다아아아,0.0,0.0,"[[CLS], 와, ##아아, ##아, ##안전, 좋아, ##요, ##오, ##오,..."
227,boostcamp-sts-v1-train-227,nsmc-sampled,수지 목소리 너무좋아ㅜㅜㅜ,소소한재미ㅜㅋ여탯껏삶을다시돌아보게하는영화ㅜ,0.0,0.0,"[[CLS], 수지, 목소리, 너무, ##좋, ##아, ##ㅜㅜ, ##ㅜ, [SEP..."
...,...,...,...,...,...,...,...
10907,boostcamp-sts-v1-test-1033,nsmc-sampled,삼가고인의 명복을 빕니다,삼가 고인의 명복을 빕니다,,,"[[CLS], 삼가, ##고, ##인, ##의, 명복, ##을, [UNK], [SE..."
10911,boostcamp-sts-v1-test-1037,slack-sampled,담에는 오프에서 얼굴 뵐 수 있기를,제 머리가 어깨에 닿기 전에는 한번 뵐 수 있길,,,"[[CLS], 담, ##에, ##는, 오프, ##에서, 얼굴, [UNK], 수, 있..."
10934,boostcamp-sts-v1-test-1060,nsmc-sampled,정말 재밌게봤습니다 ^^,정말 재밌게봣습니다^_^,,,"[[CLS], 정말, 재밌, ##게, ##봤, ##습, ##니다, ^, ^, [SE..."
10937,boostcamp-sts-v1-test-1063,slack-rtt,항상 요맘때쯤 비가와서 아쉬웠는데 이번 봄은 벚꽃 개나리 진달래가 모두 한자리에 모...,"이맘때쯤 비가 와서 아쉬웠는데, 이번 봄, 벚꽃, 개나리, 만개한 벚꽃이 만발하는 ...",,,"[[CLS], 항상, 요, ##맘, ##때, ##쯤, 비, ##가와, ##서, 아쉬..."


In [15]:
# Dump every [UNK]-containing pair to unk.txt for manual inspection. Each record
# is "sentence_1 @ sentence_2", then the token list, then a separator line.
# The encoding is pinned to UTF-8: without it open() falls back to the platform
# default codec, which may be unable to encode the Korean text and emoji here.
with open('unk.txt', 'w', encoding='utf-8') as f:
    for s1, s2, row_tokens in zip(unk_df['sentence_1'], unk_df['sentence_2'], unk_df['tokens']):
        f.write(str(s1) + ' @ ' + str(s2) + '\n')
        f.write(str(row_tokens) + '\n')
        f.write('****************************************************************************************************' + '\n')

In [16]:
# Small hand-written fixture for trying out the preprocessing helpers.
# One row per case: hashtags / empty brackets, missing or repeated spaces,
# and misspellings.
test_df = pd.DataFrame(
    data=[
        ['#오운완 #어쩌구 #저쩌구 << 이자리에 해시태그가 있었음', '()()() () << 이자리에 빈괄호 있었음'],
        ['띄어쓰기를안해버리기', '띄어쓰기를      두번이상 해도    한번만          띄어쓰기가 된다고? '],
        ['맞춤법 틀리면 외 않되? 쓰고싶은대로쓰면돼지', '맟춥뻡 틀려버리기~'],
    ],
    columns=['sentence_1', 'sentence_2'],
)
test_df

Unnamed: 0,sentence_1,sentence_2
0,#오운완 #어쩌구 #저쩌구 << 이자리에 해시태그가 있었음,()()() () << 이자리에 빈괄호 있었음
1,띄어쓰기를안해버리기,띄어쓰기를 두번이상 해도 한번만 띄어쓰기가 된다고?
2,맞춤법 틀리면 외 않되? 쓰고싶은대로쓰면돼지,맟춥뻡 틀려버리기~


In [17]:
# import re

# def delete_etcs(dataframe):
#     for i in range(len(dataframe['sentence_1'])):
#         dataframe['sentence_1'][i] = re.sub(r'#', '', dataframe['sentence_1'][i])
#         dataframe['sentence_2'][i] = re.sub(r'#', '', dataframe['sentence_2'][i])
#         dataframe['sentence_1'][i] = re.sub(r'\(\)', '', dataframe['sentence_1'][i])
#         dataframe['sentence_2'][i] = re.sub(r'\(\)', '', dataframe['sentence_2'][i])
#     return dataframe

# delete_etcs(test_df)

In [18]:
# apply automatic word spacing to every value in the 'sentence_1' and 'sentence_2' columns
from pykospacing import Spacing

def apply_spacing(dataframe):
    """Re-space both sentence columns with PyKoSpacing.

    The frame is modified IN PLACE and the same object is returned, so pass
    ``dataframe.copy()`` if the untouched text is still needed afterwards.

    Rows are addressed through the frame's own index labels. Treating
    ``range(len(...))`` positions as labels would fail on a frame whose index
    is not 0..n-1, such as a filtered subset.

    Args:
        dataframe: Frame with string columns ``sentence_1`` and ``sentence_2``.

    Returns:
        The same ``dataframe`` object, with both columns re-spaced.
    """
    spacing = Spacing()
    for row_label in tqdm(dataframe.index, desc='apply_spacing'):
        for column in ('sentence_1', 'sentence_2'):
            dataframe.at[row_label, column] = spacing(dataframe.at[row_label, column])
    return dataframe

apply_spacing(test_df)


apply_spacing: 100%|██████████| 3/3 [00:00<00:00,  6.41it/s]


Unnamed: 0,sentence_1,sentence_2
0,#오운 완 #어쩌구 #저쩌구 << 이 자리에 해시 태그가 있었음,()()() () << 이 자리에 빈괄호 있었음
1,띄어쓰기를 안 해버리기,띄어쓰기를 두 번 이상 해도 한 번만 띄어쓰기가 된다고?
2,맞춤법 틀리면 외 않되? 쓰고 싶은 대로 쓰면 돼지,맟춥뻡 틀려버리기~


In [19]:
'''
네이버 맞춤법 api 사용
- 이모티콘 사라짐: '👌👌👌' -> ''
- 신조어 등은 안바뀜: '갬성' -> '갬성', '퐈이야' -> '퐈이야' 등


'''

import requests
import re
import json
import time
import pandas as pd
import html


def check_spell(dataframe, timeout=10):
    """Run both sentence columns through Naver's web spell checker.

    Every value of ``sentence_1`` and ``sentence_2`` is sent to the
    SpellerProxy endpoint. The returned HTML has its tags stripped and its
    entities unescaped, and the result overwrites the original value. The
    frame is modified IN PLACE and the same object is returned.

    Args:
        dataframe: Frame with string columns ``sentence_1`` and ``sentence_2``.
        timeout: Seconds each HTTP request may take before ``requests`` gives
            up. Without a timeout a single stalled connection would block the
            whole run indefinitely.

    Returns:
        The same ``dataframe`` object, with both columns spell-checked.

    Raises:
        ConnectionError: The search page used to obtain the key did not
            answer with status 200.
        ValueError: No passportKey was found in the search page, or a speller
            response contained no JSON even after refreshing the key once.
    """

    def get_passport_key():
        # Scrape the passportKey out of the Naver search result page for the
        # spell-checker query; the speller endpoint needs it on every call.
        url = "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EB%A7%9E%EC%B6%A4%EB%B2%95%EA%B2%80%EC%82%AC%EA%B8%B0"
        response = requests.get(url, timeout=timeout)

        if response.status_code != 200:
            raise ConnectionError(f"Failed to fetch the page, status code: {response.status_code}")

        match = re.search(r'passportKey=([a-zA-Z0-9-_]+)', response.text)
        if not match:
            raise ValueError("passportKey not found in the HTML response.")

        key = match.group(1)
        print(f"passportKey found: {key}")
        return key

    def _remove_tags(text):
        # Drop <br> first, then every remaining tag (including the wrapper).
        wrapped = '<content>{}</content>'.format(text).replace('<br>', '')
        return re.sub(r'<[^>]+>', '', wrapped)

    # Inner helper that performs one spell-check request.
    def _spell_check_request(text, key):
        payload = {
            'passportKey': key,
            '_callback': key,
            'q': text,
            'color_blindness': '0'
        }

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'referer': 'https://search.naver.com/',
        }

        r = requests.get(
            "https://m.search.naver.com/p/csearch/ocontent/util/SpellerProxy",
            params=payload,
            headers=headers,
            timeout=timeout,
        )

        # The body is JSON wrapped in a callback call; extract the {...} part.
        json_match = re.search(r'\{.*\}', r.text)
        if not json_match:
            raise ValueError("No JSON data found in the response.")

        data = json.loads(json_match.group(0))
        return _remove_tags(data['message']['result']['html'])

    passport_key = get_passport_key()

    def check(text):
        # `nonlocal` keeps a refreshed key for all later rows. Previously the
        # new key lived only in a local, so once the first key expired every
        # remaining request failed, refetched a key and retried.
        nonlocal passport_key
        try:
            return _spell_check_request(text, passport_key)
        except ValueError as e:
            if 'No JSON data found in the response' not in str(e):
                raise
            print("passport_key expired, fetching a new one.")
            passport_key = get_passport_key()  # fetch a fresh passport_key
            return _spell_check_request(text, passport_key)

    # Iterate over the real index labels so frames whose index is not 0..n-1
    # (e.g. filtered subsets) are processed correctly.
    for row_label in tqdm(dataframe.index, desc='check_spell'):
        for column in ('sentence_1', 'sentence_2'):
            corrected = check(dataframe.at[row_label, column])
            dataframe.at[row_label, column] = html.unescape(corrected)

    return dataframe




passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 3/3 [00:05<00:00,  1.74s/it]


Unnamed: 0,sentence_1,sentence_2
0,#오운 완 #어쩌고 #저쩌고 << 이 자리에 해시 태그가 있었음,()()() () << 이 자리에 빈 괄호 있었음
1,띄어쓰기를 안 해버리기,띄어쓰기를 두 번 이상 해도 한 번만 띄어쓰기가 된다고?
2,맞춤법 틀리면 왜 안돼? 쓰고 싶은 대로 쓰면 되지,맟춥뻡 틀려버리기~


In [20]:
# Probe inputs for the spell checker: an emoji, slang / neologisms, chat-style
# consonant runs and missing spaces, one (sentence_1, sentence_2) pair per row.
local_test = pd.DataFrame(
    data=[
        ['👌', '맞앜ㅋㅋㅋㅋ뭔데욬ㅋㅋㅋ'],
        ['갬성있네', '진ㅉ ㅏ? 정말로?'],
        ['넵 알겠습니다 ㅋ', '문재인정부'],
        ['퐈이야', '봬용'],
        ['저번에 뵌거같은데', '쭈뼛쭈뼛'],
    ],
    columns=['sentence_1', 'sentence_2'],
)

check_spell(local_test)

passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 5/5 [00:00<00:00, 18.87it/s]


Unnamed: 0,sentence_1,sentence_2
0,,맞앜ㅋㅋㅋㅋ뭔데욬ㅋㅋㅋ
1,갬성 있네,진ㅉ ㅏ? 정말로?
2,넵 알겠습니다 ㅋ,문재인 정부
3,퐈이야,봬요
4,저번에 뵌 거 같은데,쭈뼛쭈뼛


In [21]:
# use soynlp's repeat_normalize to collapse repeated characters (e.g. 'ㅋㅋㅋㅋ' -> 'ㅋㅋ')
from soynlp.normalizer import repeat_normalize as r_n

def repeat_normalize(dataframe):
    """Shorten repeated character runs in both sentence columns.

    Each value is passed through soynlp's ``repeat_normalize`` (imported as
    ``r_n``) with ``num_repeats=2``. The frame is modified IN PLACE and the
    same object is returned.

    Rows are addressed through the frame's own index labels, so frames whose
    index is not 0..n-1 (e.g. filtered subsets) are processed correctly.

    Args:
        dataframe: Frame with string columns ``sentence_1`` and ``sentence_2``.

    Returns:
        The same ``dataframe`` object, with both columns normalized.
    """
    for row_label in tqdm(dataframe.index, desc='repeat_normalize'):
        for column in ('sentence_1', 'sentence_2'):
            dataframe.at[row_label, column] = r_n(dataframe.at[row_label, column], num_repeats=2)
    return dataframe

# Probe inputs with long character repeats, one (sentence_1, sentence_2)
# pair per row.
local_test = pd.DataFrame(
    data=[
        ['맞앜ㅋㅋㅋㅋ', '맞앜ㅋㅋㅋㅋ!!!!!!!!!!!!'],
        ['와하하하하하하', '캬캬캬캬캬캬캬'],
    ],
    columns=['sentence_1', 'sentence_2'],
)

repeat_normalize(local_test)

repeat_normalize: 100%|██████████| 2/2 [00:00<00:00, 2535.09it/s]


Unnamed: 0,sentence_1,sentence_2
0,맞앜ㅋㅋ,맞앜ㅋㅋ!!!!!!!!!!!!
1,와하하,캬캬


TEST 1

In [199]:
# # copy dataset
# dataset_filter_1 = dataset.copy()
# dataset_filter_1.shape

(10974, 7)

In [200]:
# # apply all the functions
# # dataset_filter_1 = delete_etcs(dataset_filter_1)
# dataset_filter_1 = apply_spacing(dataset_filter_1)
# dataset_filter_1 = check_spell(dataset_filter_1)
# dataset_filter_1 = repeat_normalize(dataset_filter_1)

apply_spacing: 100%|██████████| 10974/10974 [09:26<00:00, 19.37it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 10974/10974 [12:05<00:00, 15.13it/s]
repeat_normalize: 100%|██████████| 10974/10974 [00:04<00:00, 2196.46it/s]


In [201]:
# data, tokens = tokenizing(tokenizer, dataset_filter_1)
# dataset_filter_1['tokens'] = tokens

# # find '[UNK]' token in dataset_filter_1['tokens'] and make a new df
# unk_list = []
# for i in range(len(dataset_filter_1['tokens'])):
#     if '[UNK]' in dataset_filter_1['tokens'][i]:
#         unk_list.append(i)
        
# unk_df = dataset_filter_1.loc[unk_list]

# # make a txt file with unk_df 'tokens' data
# with open('unk_filter_1.txt', 'w') as f:
#     for i in range(len(unk_df)):
#         f.write(str(unk_df['sentence_1'].values[i]) + ' @ ' + str(unk_df['sentence_2'].values[i]) + '\n' + str(unk_df['tokens'].values[i]) + '\n' + '****************************************************************************************************' + '\n')

tokenizing: 100%|██████████| 10974/10974 [00:06<00:00, 1809.50it/s]


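FINAL: preprocess train.csv into its preprocessed variants (train_spacing.csv, train_spell.csv, train_repeat.csv, the pairwise combinations, and train_all.csv)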

In [205]:
# # load train.csv
# train = pd.read_csv('train.csv')

# # spacing
# train_spacing = apply_spacing(train)
# train_spacing.to_csv('train_spacing.csv', index=False)

# # spell check
# train_spell = check_spell(train)
# train_spell.to_csv('train_spell.csv', index=False)

# # repeat normalize
# train_repeat = repeat_normalize(train)
# train_repeat.to_csv('train_repeat.csv', index=False)

# train_spacing_spell = apply_spacing(train)
# train_spacing_spell = check_spell(train_spacing_spell)
# train_spacing_spell.to_csv('train_spacing_spell.csv', index=False)

# train_spacing_repeat = apply_spacing(train)
# train_spacing_repeat = repeat_normalize(train_spacing_repeat)
# train_spacing_repeat.to_csv('train_spacing_repeat.csv', index=False)

# train_spell_repeat = check_spell(train)
# train_spell_repeat = repeat_normalize(train_spell_repeat)
# train_spell_repeat.to_csv('train_spell_repeat.csv', index=False)

# train_preproc_all = apply_spacing(train)
# train_preproc_all = check_spell(train_preproc_all)
# train_preproc_all = repeat_normalize(train_preproc_all)
# train_preproc_all.to_csv('train_all.csv', index=False)

apply_spacing: 100%|██████████| 9324/9324 [07:59<00:00, 19.46it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 9324/9324 [10:26<00:00, 14.87it/s]
repeat_normalize: 100%|██████████| 9324/9324 [00:03<00:00, 2450.75it/s]
apply_spacing: 100%|██████████| 9324/9324 [08:03<00:00, 19.30it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 9324/9324 [09:30<00:00, 16.33it/s]
repeat_normalize: 100%|██████████| 9324/9324 [00:03<00:00, 2458.72it/s]


In [24]:
# dev = pd.read_csv('dev.csv')

# dev_spacing = apply_spacing(dev)
# dev_spacing.to_csv('dev_spacing.csv', index=False)

# dev_spell = check_spell(dev)
# dev_spell.to_csv('dev_spell.csv', index=False)

# dev_repeat = repeat_normalize(dev)
# dev_repeat.to_csv('dev_repeat.csv', index=False)

# dev_spacing_spell = apply_spacing(dev)
# dev_spacing_spell = check_spell(dev_spacing_spell)
# dev_spacing_spell.to_csv('dev_spacing_spell.csv', index=False)

# dev_spacing_repeat = apply_spacing(dev)
# dev_spacing_repeat = repeat_normalize(dev_spacing_repeat)
# dev_spacing_repeat.to_csv('dev_spacing_repeat.csv', index=False)

# dev_spell_repeat = check_spell(dev)
# dev_spell_repeat = repeat_normalize(dev_spell_repeat)
# dev_spell_repeat.to_csv('dev_spell_repeat.csv', index=False)

# dev_preproc_all = apply_spacing(dev)
# dev_preproc_all = check_spell(dev_preproc_all)
# dev_preproc_all = repeat_normalize(dev_preproc_all)
# dev_preproc_all.to_csv('dev_all.csv', index=False)

apply_spacing: 100%|██████████| 550/550 [00:27<00:00, 19.67it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 550/550 [00:37<00:00, 14.77it/s]
repeat_normalize: 100%|██████████| 550/550 [00:00<00:00, 3496.96it/s]
apply_spacing: 100%|██████████| 550/550 [00:28<00:00, 19.54it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 550/550 [00:35<00:00, 15.38it/s]
apply_spacing: 100%|██████████| 550/550 [00:27<00:00, 19.79it/s]
repeat_normalize: 100%|██████████| 550/550 [00:00<00:00, 3578.71it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 550/550 [00:30<00:00, 18.13it/s]
repeat_normalize: 100%|██████████| 550/550 [00:00<00:00, 3504.34it/s]
apply_spacing: 100%|██████████| 550/550 [00:27<00:00, 19.69it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 550/550 [00:30<00:00, 17.98it/s]
repeat_normalize: 100%|██████████| 550/550 [00:00<00:00, 3439.11it/s]


In [25]:
# test = pd.read_csv('test.csv')

# test_spacing = apply_spacing(test)
# test_spacing.to_csv('test_spacing.csv', index=False)

# test_spell = check_spell(test)
# test_spell.to_csv('test_spell.csv', index=False)

# test_repeat = repeat_normalize(test)
# test_repeat.to_csv('test_repeat.csv', index=False)

# test_spacing_spell = apply_spacing(test)
# test_spacing_spell = check_spell(test_spacing_spell)
# test_spacing_spell.to_csv('test_spacing_spell.csv', index=False)

# test_spacing_repeat = apply_spacing(test)
# test_spacing_repeat = repeat_normalize(test_spacing_repeat)
# test_spacing_repeat.to_csv('test_spacing_repeat.csv', index=False)

# test_spell_repeat = check_spell(test)
# test_spell_repeat = repeat_normalize(test_spell_repeat)
# test_spell_repeat.to_csv('test_spell_repeat.csv', index=False)

# test_preproc_all = apply_spacing(test)
# test_preproc_all = check_spell(test_preproc_all)
# test_preproc_all = repeat_normalize(test_preproc_all)
# test_preproc_all.to_csv('test_all.csv', index=False)

apply_spacing: 100%|██████████| 1100/1100 [00:55<00:00, 19.88it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 1100/1100 [01:09<00:00, 15.93it/s]
repeat_normalize: 100%|██████████| 1100/1100 [00:00<00:00, 4957.67it/s]
apply_spacing: 100%|██████████| 1100/1100 [00:55<00:00, 19.78it/s]


passportKey found: d97f0b07208e9fd39556aee101e1a171867d6ab4


check_spell: 100%|██████████| 1100/1100 [01:04<00:00, 17.03it/s]
apply_spacing: 100%|██████████| 1100/1100 [00:56<00:00, 19.55it/s]
repeat_normalize: 100%|██████████| 1100/1100 [00:00<00:00, 4930.33it/s]


passportKey found: b7584e2dbf34edd27f75b6430787c04eb65feb52


check_spell: 100%|██████████| 1100/1100 [01:06<00:00, 16.55it/s]
repeat_normalize: 100%|██████████| 1100/1100 [00:00<00:00, 4853.54it/s]
apply_spacing: 100%|██████████| 1100/1100 [01:00<00:00, 18.07it/s]


passportKey found: b7584e2dbf34edd27f75b6430787c04eb65feb52


check_spell: 100%|██████████| 1100/1100 [01:05<00:00, 16.67it/s]
repeat_normalize: 100%|██████████| 1100/1100 [00:00<00:00, 4954.95it/s]


In [None]:
# # tokenizer: add_tokens() -> add <PERSON>, <ADDRESS> tokens (only <PERSON> is added below)
# tokenizer.add_tokens(['<PERSON>'])

In [None]:
# def normalize_slang(text):
#     # 줄임말 -> 정식 표현으로 변환
#     slang_dict = {
#         '넵': '네',
#         ' 훅 ': ' 갑자기 '
#     }

#     # 줄임말 사전 기반으로 정규화
#     for slang, formal in slang_dict.items():
#         text = text.replace(slang, formal)
    
#     return text

# # 예시 문장
# text = "넘 졸귀탱 ㅋㅋ 진짜 짱이야 ㅠㅠ"

# # 줄임말 정규화 적용
# normalized_text = normalize_slang(text)
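# print(normalized_text)  # with the two-entry slang_dict above this text is printed unchanged; the intended fully-normalized target is "너무 졸라 귀엽다 웃음 진짜 정말이야 슬픔"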