# nltkを使った前処理

In [17]:
import nltk

In [18]:
from nltk.tokenize import word_tokenize
# Download the 'punkt' data package into the local nltk_data directory.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 単語分割

In [19]:
# Sample text with a contraction ("He's") and sentence-ending punctuation.
sentence = "He's a German Shepherd. They are some of the smartest dogs!"

In [20]:
# Tokenize into words; the contraction suffix and punctuation become separate tokens.
word_tokenize(sentence)

['He',
 "'s",
 'a',
 'German',
 'Shepherd',
 '.',
 'They',
 'are',
 'some',
 'of',
 'the',
 'smartest',
 'dogs',
 '!']

## 文ごとに区切る

In [21]:
from nltk.tokenize import sent_tokenize

In [22]:
# Two-sentence news excerpt used as input for sentence tokenization.
article = (
    "Researchers at Hokkaido University in Japan have recently developed "
    "an innovative autonomous system that can serve as a walking aid for "
    "patients in nursing facilities or hospitals. "
    "This robotic medical walker, presented in a paper published in "
    "SpringerLink's Artificial Life and Robotics journal, does not require "
    "any intervention from nurses and caregivers, as it can autonomously "
    "locate patients and approach them when they wish to go for a walk or "
    "move to a different place within their healthcare facility."
)

In [23]:
# Break the article into a list of individual sentences.
sent_tokenize(article)

['Researchers at Hokkaido University in Japan have recently developed an innovative autonomous system that can serve as a walking aid for patients in nursing facilities or hospitals.',
 "This robotic medical walker, presented in a paper published in SpringerLink's Artificial Life and Robotics journal, does not require any intervention from nurses and caregivers, as it can autonomously locate patients and approach them when they wish to go for a walk or move to a different place within their healthcare facility."]

## 短縮形の展開

例：He's → He is

In [24]:
!pip install contractions



In [25]:
import contractions

In [26]:
# Sample text containing the contractions "He's" and "didn't".
contracted_text = "He's going to shopping. Lucy didn't go with him."

In [27]:
# Split the text on whitespace (no real tokenization, so punctuation stays
# attached to words) and expand the contraction in each piece.
expanded_all = [contractions.fix(token) for token in contracted_text.split()]

In [28]:
expanded_all

['he is', 'going', 'to', 'shopping.', 'Lucy', 'did not', 'go', 'with', 'him.']

## .splitを使って分割する方法

In [29]:
# Sample text with contractions and punctuation, used for the plain str.split demo.
sentence = "He's fast The boy ran up the hill, he can't come back down!"

In [30]:
# Split on whitespace only; punctuation stays attached to the neighbouring word.
result = sentence.split()
result

["He's",
 'fast',
 'The',
 'boy',
 'ran',
 'up',
 'the',
 'hill,',
 'he',
 "can't",
 'come',
 'back',
 'down!']

In [31]:
# Two sentences separated by a period.
sentence = "He has a bachelor degree. His specialty is economics"

In [32]:
# Split on an explicit delimiter. The space after the period is kept,
# so the second piece starts with a leading space.
result = sentence.split('.')
result

['He has a bachelor degree', ' His specialty is economics']

## 正規表現を使った文章処理

In [33]:
import re

In [36]:
txt = "one-two+three#four five"
# Split on every run of word characters (\w+ = one or more word characters).
# The words themselves are dropped; what remains are the separators between
# them, plus an empty string at each end because the text starts and ends
# with a word.
# The pattern is a raw string so that \w is not treated as a string escape,
# and the list is stored directly (print() returns None) so `result` is usable
# and is displayed as the cell's value.
result = re.split(r'\w+', txt)
result

['', '-', '+', '#', ' ', '']


In [37]:
# Sample string whose parts are separated by '-' and '#'.
num = "one-two-three-#four"

In [41]:
# Split the text on any single '-', '+' or '#'. Two adjacent delimiters
# (the "-#" in the input) produce an empty string between them.
# The list is stored directly (print() returns None) so `result` is usable
# and is displayed as the cell's value.
result = re.split('[-+#]', num)
result

['one', 'two', 'three', '', 'four']


## 単語を置換する

In [42]:
substitute = "one and two and three and four and five"
# Replace every half-width space with a hyphen.
result = re.sub(pattern=' ', repl='-', string=substitute)
result

'one-and-two-and-three-and-four-and-five'