In [1]:
# data preprocessing
# nltk.download("popular")          # nltk error solving

import string

# 특수 문자와 구두점이 포함된 예제 텍스트
text = "토큰화(Tokenization)는 문자열을 작은 단위, 즉 '토큰(Token)'으로 분리하는 과정입니다.@#"

# 특수 문자 및 구두점 제거
cleaned_text = text.translate(str.maketrans('', '', string.punctuation))

print("Original Text:")
print(text)

print("\nCleaned Text after Removing Special Characters and Punctuation:")
print(cleaned_text)

Original Text:
토큰화(Tokenization)는 문자열을 작은 단위, 즉 '토큰(Token)'으로 분리하는 과정입니다.@#

Cleaned Text after Removing Special Characters and Punctuation:
토큰화Tokenization는 문자열을 작은 단위 즉 토큰Token으로 분리하는 과정입니다


In [2]:
# 대소문자가 혼합된 예제 텍스트
text = "NLP is very helpful for Everyone."

# into lower cases
lowercase_text = text.lower()

print("Original Text:")
print(text)
print("\nLowercase Text:")
print(lowercase_text)

Original Text:
NLP is very helpful for Everyone.

Lowercase Text:
nlp is very helpful for everyone.


In [4]:
# 축약형이 포함된 예시 텍스트
text = "I can't believe it's so hot today. Let's go to the park."

# 축약 및 확장 형태 사전
contraction_dict = {
    "can't": "cannot",
    "it's": "it is",
    "let's": 'let us'
}

# 텍스트를 단어로 토큰화
words = text.split()

# 축약을 확장
expanded_text = [contraction_dict[word] if word in contraction_dict else word for word in words]

expanded_text = ' '.join(expanded_text)

print("Original text:")
print(text)

print("\nExpanded Text with Contractions:")
print(expanded_text)

Original text:
I can't believe it's so hot today. Let's go to the park.

Expanded Text with Contractions:
I cannot believe it is so hot today. Let's go to the park.


In [5]:
# !pip install textblob

from textblob import TextBlob       # 철차 교정 라이브러리

# misspelled case
text = "Today is Tursday."

# Instantiate TextBlob
blob = TextBlob(text)

# error correction
corrected_text = str(blob.correct())

print("Original Text")
print(text)

print("\nCorrected Text with Spelling Mistakes Fized:")
print(corrected_text)

Original Text
Today is Tursday.

Corrected Text with Spelling Mistakes Fized:
Today is Thursday.


In [8]:
# example for tokenization
import nltk     # python packages for natural language processing
from nltk.tokenize import word_tokenize, sent_tokenize

# text for tokenization
text = "토큰화(Tokenization)는 문자열을 작은 단위, 즉 '토큰(Token)'으로 분리하는 과정입니다."


# turn text into word tokens
words = word_tokenize(text)

# turn text into sentence tokens
sentences = sent_tokenize(text)

print("Original text:")
print(text)

print("\nTokenized Words:")
print(words)

print("\nTokenized Sentences:")
print(sentences)

Original text:
토큰화(Tokenization)는 문자열을 작은 단위, 즉 '토큰(Token)'으로 분리하는 과정입니다.

Tokenized Words:
['토큰화', '(', 'Tokenization', ')', '는', '문자열을', '작은', '단위', ',', '즉', "'토큰", '(', 'Token', ')', "'으로", '분리하는', '과정입니다', '.']

Tokenized Sentences:
["토큰화(Tokenization)는 문자열을 작은 단위, 즉 '토큰(Token)'으로 분리하는 과정입니다."]
