In [1]:
from nltk.tokenize import word_tokenize
sentence = "hi, how are you?"
sentence.split()

['hi,', 'how', 'are', 'you?']

In [3]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [4]:
word_tokenize(sentence)

['hi', ',', 'how', 'are', 'you', '?']

# bag of words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works!",
    "YES!!!!"
]
ctv = CountVectorizer()
ctv.fit(corpus)
corpus_transformed = ctv.transform(corpus)

In [6]:
corpus_transformed

<5x23 sparse matrix of type '<class 'numpy.int64'>'
	with 25 stored elements in Compressed Sparse Row format>

In [7]:
type(corpus_transformed)

scipy.sparse.csr.csr_matrix

In [8]:
print(corpus_transformed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [9]:
ctv.vocabulary_

{'hello': 9,
 'how': 11,
 'are': 2,
 'you': 22,
 'im': 13,
 'getting': 8,
 'bored': 4,
 'at': 3,
 'home': 10,
 'and': 1,
 'what': 19,
 'do': 7,
 'think': 17,
 'did': 6,
 'know': 14,
 'about': 0,
 'counts': 5,
 'let': 15,
 'see': 16,
 'if': 12,
 'this': 18,
 'works': 20,
 'yes': 21}

In [10]:
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)
ctv.fit(corpus)
corpus_transformed = ctv.transform(corpus)
print(ctv.vocabulary_)

{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


# 単純ベイズ分類器

※参考：『機械学習図鑑』 p.72~ 

1. 目的変数のカテゴリごとに、文にある単語が出現するか（例：「感動」）をカウントする
2. 1の結果をもとに、ある単語が出現したという条件のもとで、文がどのカテゴリに属するかの確率を計算する
3. カテゴリの生起確率と2と予測データの 1 or 0  の登場／非登場の積 p または 1-p との積を計算する

# 語幹化と見出し語化

In [12]:
nltk.download("wordnet")

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [13]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer

# 単語化（辞書の見出しにの乗っている単語にする）
lemmatizer = WordNetLemmatizer()
# 単純に語幹に
stemmer = SnowballStemmer("english")

words = ["fishing", "fishes", "fished"]

for word in words:
    print(f"word={word}")
    print(f"stemmed_word={stemmer.stem(word)}")
    print(f"lemma={lemmatizer.lemmatize(word)}")
    print("")

word=fishing
stemmed_word=fish
lemma=fishing

word=fishes
stemmed_word=fish
lemma=fish

word=fished
stemmed_word=fish
lemma=fished



# 特異値分解

In [1]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

# この例では 10000 個のデータセットまで読みこむ
corpus = pd.read_csv("../input/imdb.csv", nrows=10000)

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
corpus = corpus["review"].values
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)

In [3]:
# SVDを要素数10で
svd = decomposition.TruncatedSVD(n_components=10)
corpus_svd = svd.fit(corpus_transformed)

In [6]:
tfv.get_feature_names()

['\x10own',
 '!',
 '#',
 '$',
 '%',
 '&',
 "'",
 "''",
 "''headin",
 "''scarface",
 "''the",
 "''wallace",
 "'.",
 "'007",
 "'00s",
 "'01",
 "'02",
 "'03",
 "'04",
 "'05",
 "'06",
 "'07",
 "'10",
 "'10.5",
 "'12",
 "'15",
 "'1st",
 "'20s",
 "'20th",
 "'28",
 "'30",
 "'30s",
 "'30s-'40s",
 "'30s-ray",
 "'30s/'40s",
 "'34",
 "'40",
 "'40s",
 "'42",
 "'43",
 "'45",
 "'50",
 "'50s",
 "'50s/early",
 "'51",
 "'54-'55",
 "'59",
 "'60",
 "'60s",
 "'60s-early",
 "'61",
 "'63",
 "'66",
 "'68",
 "'69",
 "'70",
 "'70's-style",
 "'70s",
 "'70s-style",
 "'72",
 "'73",
 "'75",
 "'76",
 "'78",
 "'79",
 "'80",
 "'80's.",
 "'80s",
 "'83",
 "'84",
 "'86",
 "'88",
 "'89",
 "'90",
 "'90s",
 "'91",
 "'94",
 "'95",
 "'96",
 "'97",
 "'aankhen",
 "'aankhen'-",
 "'aavjo",
 "'abbot",
 "'abolitionist",
 "'abolitionists",
 "'about",
 "'acacia",
 "'accidentally",
 "'ace",
 "'achillies",
 "'act",
 "'acting",
 "'action",
 "'actor",
 "'actors",
 "'actress",
 "'addiction",
 "'additional",
 "'admire",
 "'adolescent",
 "

In [7]:
corpus_svd.components_[0]

array([2.12859175e-05, 6.19763303e-02, 1.48845634e-03, ...,
       4.74104096e-05, 5.72102552e-06, 3.27187387e-05])

In [5]:
# 1つ目のサンプルを選び、特徴量とSVDのスコアの辞書を作成
sample_index = 0
feature_scores = dict(
    zip(tfv.get_feature_names(), corpus_svd.components_[sample_index])
)
# スコアの降順で並び替えて、上位Nのトピックを表示
N = 5

['the', ',', '.', 'a', 'and']


In [8]:
# 複数サンプルについて
for sample_index in range(5):
    feature_scores = dict(
        zip(tfv.get_feature_names(), corpus_svd.components_[sample_index])
    )
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', ',', '.', 'a', 'and']
['br', '<', '>', '/', '-']
['i', 'movie', '!', 'it', 'was']
[',', '!', "''", '``', 'you']
['!', 'the', "''", '``', '...']


In [9]:
import string
import re

string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
re.escape(string.punctuation)

'!"\\#\\$%\\&\'\\(\\)\\*\\+,\\-\\./:;<=>\\?@\\[\\\\\\]\\^_`\\{\\|\\}\\~'

In [11]:
from common.utils import clean_text

# テキストから句読点等を削除
corpus = pd.read_csv("../input/imdb.csv", nrows=10000)
corpus.loc[:, "review"] = corpus["review"].apply(clean_text)
# 以下同様
corpus = corpus["review"].values
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)
corpus_transformed = tfv.transform(corpus)
# SVDを要素数10で
svd = decomposition.TruncatedSVD(n_components=10)
corpus_svd = svd.fit(corpus_transformed)

# 複数サンプルについて
for sample_index in range(5):
    feature_scores = dict(
        zip(tfv.get_feature_names(), corpus_svd.components_[sample_index])
    )
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', 'a', 'and', 'of', 'to']
['i', 'movie', 'it', 'was', 'this']
['the', 'was', 'i', 'were', 'of']
['her', 'was', 'she', 'i', 'he']
['br', 'to', 'they', 'he', 'show']
