In [7]:
'''
네이버 맞춤법 api 사용
- 이모티콘 사라짐: '👌👌👌' -> ''
- 신조어 등은 안바뀜: '갬성' -> '갬성', '퐈이야' -> '퐈이야' 등


'''

import requests
import re
import json
import time
import pandas as pd
import html
from pykospacing import Spacing
from tqdm import tqdm
from soynlp.normalizer import repeat_normalize


def prep_spacing(text):
    spacing = Spacing()
    return spacing(text)


def prep_naver(text):
    def get_passport_key():
        url = "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EB%A7%9E%EC%B6%A4%EB%B2%95%EA%B2%80%EC%82%AC%EA%B8%B0"
        response = requests.get(url)

        if response.status_code == 200:
            html = response.text
            match = re.search(r'passportKey=([a-zA-Z0-9-_]+)', html)
            if match:
                passport_key = match.group(1)
                return passport_key
            else:
                raise ValueError("passportKey not found in the HTML response.")
        else:
            raise ConnectionError(f"Failed to fetch the page, status code: {response.status_code}")

    # 맞춤법 검사를 처리하는 내부 함수
    def _spell_check_request(text, passport_key):
        payload = {
            'passportKey': passport_key,
            '_callback': passport_key,
            'q': text,
            'color_blindness': '0'
        }

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'referer': 'https://search.naver.com/',
        }

        start_time = time.time()
        r = requests.get("https://m.search.naver.com/p/csearch/ocontent/util/SpellerProxy", params=payload, headers=headers)
        passed_time = time.time() - start_time

        json_match = re.search(r'\{.*\}', r.text)
        if json_match:
            json_data = json_match.group(0)
            data = json.loads(json_data)
            html = data['message']['result']['html']
            return _remove_tags(html)
        else:
            raise ValueError("No JSON data found in the response.")

    def _remove_tags(text):
        text = '<content>{}</content>'.format(text).replace('<br>','')
        result = ''.join(re.sub(r'<[^>]+>', '', text))
        return result

    def check(text, passport_key):
        try:
            return _spell_check_request(text, passport_key)
        except ValueError as e:
            if 'No JSON data found in the response' in str(e):
                print("passport_key expired, fetching a new one.")
                passport_key = get_passport_key()  # 새로운 passport_key 가져오기
                return _spell_check_request(text, passport_key)
            else:
                raise

    passport_key = get_passport_key()
    text = html.unescape(check(text, passport_key))

    return text


def prep_repeats(text):
    
    # 반복되는 감탄사나 비슷한 표현을 줄이는 함수
    def normalize_text(text):
        # 반복되는 글자(감탄사, 웃음소리 등)를 최대 2개로 줄임
        normalized_text = repeat_normalize(text, num_repeats=2)
        return normalized_text
    
    text = normalize_text(text)

    # 느낌표, 물음표, 점(.) 등의 문장 부호 반복을 최대 2개로 줄임
    text = re.sub(r'([!?.])\1{2,}', r'\1\1', text)
    
    # 한글 자음, 모음 반복 (예: ㅋㅋㅋ, ㅠㅠㅠㅠ 등)
    text = re.sub(r'([ㄱ-ㅎㅏ-ㅣ])\1{2,}', r'\1\1', text)
            
    # 이모지나 특수 문자의 반복 줄이기
    text = re.sub(r'([\U0001F600-\U0001F64F\U0001F300-\U0001F5FF\U0001F680-\U0001F6FF\U0001F700-\U0001F77F])\1{2,}', r'\1\1', text)  # 이모지
    text = re.sub(r'([#$%&*])\1{2,}', r'\1\1', text)  # 특수문자

    # 표준 공백 외 다른 공백 문자들도 처리
    text = re.sub(r'[\u00A0\u1680\u2000-\u200A\u202F\u205F\u3000]', ' ', text)
        
    # 여러 개의 공백을 하나의 공백으로 줄임
    text = re.sub(r'\s{2,}', ' ', text)
    
    return text

def preprocess(df):
    df['sentence_1'] = df['sentence_1'].apply(prep_repeats)
    df['sentence_2'] = df['sentence_2'].apply(prep_repeats)
    # df['sentence_1'] = df['sentence_1'].apply(prep_naver)
    # df['sentence_2'] = df['sentence_2'].apply(prep_naver)

    return df

In [2]:
'''
네이버 맞춤법 api 사용
- 이모티콘 사라짐: '👌👌👌' -> ''
- 신조어 등은 안바뀜: '갬성' -> '갬성', '퐈이야' -> '퐈이야' 등


'''

import requests
import re
import json
import time
import pandas as pd
import html


def check_spell(dataframe):
    def get_passport_key():
        url = "https://search.naver.com/search.naver?where=nexearch&sm=top_hty&fbm=0&ie=utf8&query=%EB%A7%9E%EC%B6%A4%EB%B2%95%EA%B2%80%EC%82%AC%EA%B8%B0"
        response = requests.get(url)

        if response.status_code == 200:
            html = response.text
            match = re.search(r'passportKey=([a-zA-Z0-9-_]+)', html)
            if match:
                passport_key = match.group(1)
                print(f"passportKey found: {passport_key}")
                return passport_key
            else:
                raise ValueError("passportKey not found in the HTML response.")
        else:
            raise ConnectionError(f"Failed to fetch the page, status code: {response.status_code}")

    # 맞춤법 검사를 처리하는 내부 함수
    def _spell_check_request(text, passport_key):
        payload = {
            'passportKey': passport_key,
            '_callback': passport_key,
            'q': text,
            'color_blindness': '0'
        }

        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/57.0.2987.133 Safari/537.36',
            'referer': 'https://search.naver.com/',
        }

        start_time = time.time()
        r = requests.get("https://m.search.naver.com/p/csearch/ocontent/util/SpellerProxy", params=payload, headers=headers)
        passed_time = time.time() - start_time

        json_match = re.search(r'\{.*\}', r.text)
        if json_match:
            json_data = json_match.group(0)
            data = json.loads(json_data)
            html = data['message']['result']['html']
            return _remove_tags(html)
        else:
            raise ValueError("No JSON data found in the response.")

    def _remove_tags(text):
        text = '<content>{}</content>'.format(text).replace('<br>','')
        result = ''.join(re.sub(r'<[^>]+>', '', text))
        return result

    def check(text, passport_key):
        try:
            return _spell_check_request(text, passport_key)
        except ValueError as e:
            if 'No JSON data found in the response' in str(e):
                print("passport_key expired, fetching a new one.")
                passport_key = get_passport_key()  # 새로운 passport_key 가져오기
                return _spell_check_request(text, passport_key)
            else:
                raise

    passport_key = get_passport_key()

    for i in tqdm(range(len(dataframe['sentence_1'])), desc='check_spell'):
        dataframe.loc[i, 'sentence_1'] = html.unescape(check(dataframe.loc[i, 'sentence_1'], passport_key))
        dataframe.loc[i, 'sentence_2'] = html.unescape(check(dataframe.loc[i, 'sentence_2'], passport_key))        

    return dataframe




In [9]:
train = pd.read_csv('train.csv')
train_preprop_v2 = preprocess(train)
train_preprop_v2 = check_spell(train_preprop_v2)
train_preprop_v2.to_csv('train_preprop_v2.csv', index=False)

passportKey found: 2caa5e5496fed85692709081d304c63cf6eaa3bc


check_spell: 100%|██████████| 9324/9324 [10:52<00:00, 14.28it/s]


In [15]:
dev = pd.read_csv('dev.csv')
dev_preprop_v2 = preprocess(dev)
dev_preprop_v2 = check_spell(dev_preprop_v2)
dev_preprop_v2.to_csv('dev_preprop_v2.csv', index=False)

passportKey found: 2caa5e5496fed85692709081d304c63cf6eaa3bc


check_spell: 100%|██████████| 550/550 [00:36<00:00, 15.09it/s]


In [8]:
dev_preprop_v2 = pd.read_csv('dev_preprop_v2.csv')
dev_preprop_v2_no_label = dev_preprop_v2.drop(columns=['label', 'binary-label'])
dev_preprop_v2_no_label.to_csv('dev_preprop_v2_no_label.csv', index=False)

In [16]:
test = pd.read_csv('test.csv')
test_preprop_v2 = preprocess(test)
test_preprop_v2 = check_spell(test_preprop_v2)
test_preprop_v2.to_csv('test_preprop_v2.csv', index=False)

passportKey found: 2caa5e5496fed85692709081d304c63cf6eaa3bc


check_spell: 100%|██████████| 1100/1100 [01:22<00:00, 13.35it/s]


In [12]:
# train_temp: train row 10개만 뽑아서 테스트
train_temp = train[20:50]
train_temp = preprocess(train_temp)
train_temp


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentence_1'] = df['sentence_1'].apply(prep_repeats)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentence_2'] = df['sentence_2'].apply(prep_repeats)


Unnamed: 0,id,source,sentence_1,sentence_2,label,binary-label
20,boostcamp-sts-v1-train-020,slack-sampled,앞머리 새로 하셨습니다. ^^,가방에 넣어 다니면서 조금씩 먹습니다. ^^,0.0,0.0
21,boostcamp-sts-v1-train-021,petition-rtt,김기덕 조재현 성폭행 철저히 수사해 주세요!,김기덕·조재현 성폭행 의혹 철저히 수사하라!,4.2,1.0
22,boostcamp-sts-v1-train-022,slack-sampled,답답할 때 보면 속이 뻥 뚫릴 것 같아요,양보단 한입 먹는 순간 고삐 풀릴 것 같아요 ㅋㅋ,0.0,0.0
23,boostcamp-sts-v1-train-023,nsmc-sampled,노래와 잘 어우러지는 영상 덕분인지 짧지만 강한 인상이 남네요..,조금 유치하지만 가볍게 볼 수는 있는 영화네요!,0.0,0.0
24,boostcamp-sts-v1-train-024,nsmc-rtt,군대 가기 전에 봤었는데 진짜 윈터스 같은 사람이 상관이면 목숨 걸고 싸워도 후회는...,입대하기 전에 봤는데 윈터스 같은 사람이 진심으로 아껴준다면 목숨을 걸고 싸워도 후...,4.2,1.0
25,boostcamp-sts-v1-train-025,petition-rtt,국민청원에 올린 글 삭제하는 청와대 뉴미디어정책실은 억울한 피해자를 죽이고 경찰에게...,"국민청원 글을 삭제하는 청와대 뉴미디어정책실은 부당한 피해자를 살해하고, 경찰에 증...",4.0,1.0
26,boostcamp-sts-v1-train-026,petition-sampled,전두환을 처벌해 주세요,이재용을 구속해 주세요,0.2,0.0
27,boostcamp-sts-v1-train-027,slack-rtt,"마지막으로 리모트 근무의 장점에 대해 이야기했는데, 시간을 효율적으로 사용할 수 있...","마지막으로 재택근무의 장점에 대해 이야기를 나누었고, 시간을 효율적으로 사용할 수 ...",4.2,1.0
28,boostcamp-sts-v1-train-028,slack-rtt,겨울산이 예쁘지만 산을 잘 못 타서 대리만족 중입니다,겨울산은 예쁜데 제가 등산을 잘 못해서 대만족입니다.,3.2,1.0
29,boostcamp-sts-v1-train-029,nsmc-rtt,한 사람의 파멸을 적나라하게 드러내 준 영화,한 사람의 파멸을 드러내는 영화,3.6,1.0


In [14]:
train_temp = check_spell(train_temp)
train_temp

passportKey found: 2caa5e5496fed85692709081d304c63cf6eaa3bc


check_spell:   0%|          | 0/30 [00:00<?, ?it/s]


KeyError: 0

In [3]:
n = 120
n = n-2
s2 = test['sentence_1'][n]

# s2 = '수많은반례을이겨냇따'

print(f'original     : {s2}')
s2_r = prep_repeats(s2)
print(f'after repeats: {s2_r}')
s2_rs = prep_spacing(s2_r)
print(f'after spacing: {s2_rs}')
s2_rsn = prep_naver(s2_rs)
print(f'after naver  : {s2_rsn}')

print()

s3 = s2
print(f'original     : {s3}')
s3_r = prep_repeats(s3)
print(f'after repeats: {s3_r}')
s3_rn = prep_naver(s3_r)
print(f'after naver  : {s3_rn}')


original     : 여운이 남는다라는 표현을 이럴때 쓰는것이죠!


NameError: name 'repeat_normalize' is not defined

In [44]:



# 예시 문장
text = "오오오오!! 아아아아 캬캬캬캬!! 너무 웃겨ㅋㅋㅋㅋ!!"
normalized_text = normalize_text(text)
print(normalized_text)

오오!! 아아 캬캬!! 너무 웃겨ㅋㅋ!!


실험

In [52]:
import transformers

tokenizer = transformers.AutoTokenizer.from_pretrained("klue/roberta-base", max_length=128)

In [136]:
from tqdm import tqdm

def tokenizing(tokenizer, dataframe):
    # 새 열을 추가할 준비
    dataframe['s1_tokens'] = None
    dataframe['s2_tokens'] = None
    
    for idx, item in tqdm(dataframe.iterrows(), desc='tokenizing', total=len(dataframe)):
        # sentence_1 토크나이징
        s1_outputs = tokenizer(
            item['sentence_1'],
            add_special_tokens=True,
            truncation=True,
            max_length=128
        )
        # sentence_2 토크나이징
        s2_outputs = tokenizer(
            item['sentence_2'],
            add_special_tokens=True,
            truncation=True,
            max_length=128
        )
        
        # 토크나이징된 결과를 각각 새 열에 저장
        dataframe.at[idx, 's1_tokens'] = tokenizer.convert_ids_to_tokens(s1_outputs['input_ids'])
        dataframe.at[idx, 's2_tokens'] = tokenizer.convert_ids_to_tokens(s2_outputs['input_ids'])
    
    return dataframe
        
# 예시로 test 데이터프레임에 대해 적용
test_tokenized = tokenizing(tokenizer, preprocess(test))


ConnectionError: Failed to fetch the page, status code: 403

In [126]:
# find [UNK] tokens in s1_tokens and s2_tokens
# make a new df 'test_unk' that contains rows with [UNK] tokens
def find_unk(df):
    count = 0
    unk_indices = []
    for idx, item in df.iterrows():
        if '[UNK]' in item['s1_tokens'] or '[UNK]' in item['s2_tokens']:
            unk_indices.append(idx)
            count += 1
    print(f'Found {count} rows with [UNK] tokens')
    return df.loc[unk_indices]

test_unk = find_unk(test_tokenized)
# test_unk.to_csv('a_unk.csv', index=False)


Found 41 rows with [UNK] tokens


In [129]:
# print 2nd row of test_unk

with open('slang.txt', 'w') as f:
    for n in range(len(test_unk)):
        if '[UNK]' in test_unk.iloc[n]['s1_tokens']:
            f.write(f'{test_unk.iloc[n]["sentence_1"]}\n')
            f.write(f'{test_unk.iloc[n]["s1_tokens"]}\n')
        if '[UNK]' in test_unk.iloc[n]['s2_tokens']:
            f.write(f'{test_unk.iloc[n]["sentence_2"]}\n')
            f.write(f'{test_unk.iloc[n]["s2_tokens"]}\n')


In [None]:
slang = {
    '여쭙도록': '물어보도록',
    '뿅간다': '멋있다',
    '있나욯ㅎㅎ': '있나요ㅎㅎ',
    '궁굼합니다': '궁금합니다',
    
    
}

In [105]:
# id to token example
id = [0, 1, 2]
token = tokenizer.convert_ids_to_tokens(id)
token


['[CLS]', '[PAD]', '[SEP]']

In [104]:
test_tokenized

In [101]:
# id to token
tokens = tokenizer.convert_ids_to_tokens(test_tokenized[0][0])
tokens

['[CLS]',
 '가상',
 '##화',
 '##폐',
 '##거래소',
 '폐쇄',
 '##하',
 '##지',
 '말',
 '##고',
 '[SEP]',
 '가상',
 '##화',
 '##폐',
 '거래소',
 '폐쇄',
 '반대',
 '##합니다',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PA