## NLTK: Natural Language Toolkit

In [7]:
import nltk

In [13]:
# Word tokenization.
# NOTE: the backslash continuations sit inside the string literal, so the
# leading spaces of the continued lines are part of my_text.
my_text = "The coolest job in the next 10 years will be statisticians. \
           People think I'm joking, but who would've guessed that \
           computer engineers would've been the cooleast job of the 1990s?"

# Split the text into word and punctuation tokens; reused by the stemming and
# POS-tagging cells below.
nltk_tokens = nltk.word_tokenize(my_text)
print(nltk_tokens)

['The', 'coolest', 'job', 'in', 'the', 'next', '10', 'years', 'will', 'be', 'statisticians', '.', 'People', 'think', 'I', "'m", 'joking', ',', 'but', 'who', 'would', "'ve", 'guessed', 'that', 'computer', 'engineers', 'would', "'ve", 'been', 'the', 'cooleast', 'job', 'of', 'the', '1990s', '?']


In [14]:
# Tweet tokenization.
from nltk.tokenize import TweetTokenizer
# With strip_handles / reduce_len set, the recorded output shows the "@mate"
# handle removed and the long character runs shortened to three characters.
tt = TweetTokenizer(strip_handles=True, reduce_len=True)
tweet = '@mate: I looooooove this city!!!!!!! #love #foreverhere'
tt.tokenize(tweet)

[':', 'I', 'looove', 'this', 'city', '!', '!', '!', '#love', '#foreverhere']

In [15]:
# Stemming: reduce every token to its Lancaster stem.
# The stemmer is imported by name rather than via `from nltk.stem import *`,
# so the notebook namespace only gains the one name that is actually used.
from nltk.stem import LancasterStemmer

# `stemmer` is also used later by clean_and_stem_text.
stemmer = LancasterStemmer()
print([stemmer.stem(word) for word in nltk_tokens])

['the', 'coolest', 'job', 'in', 'the', 'next', '10', 'year', 'wil', 'be', 'stat', '.', 'peopl', 'think', 'i', "'m", 'jok', ',', 'but', 'who', 'would', "'ve", 'guess', 'that', 'comput', 'engin', 'would', "'ve", 'been', 'the', 'cooleast', 'job', 'of', 'the', '1990s', '?']


In [16]:
# Part-of-speech tagging: pair every token with its POS tag.
print(nltk.pos_tag(nltk_tokens))

[('The', 'DT'), ('coolest', 'JJS'), ('job', 'NN'), ('in', 'IN'), ('the', 'DT'), ('next', 'JJ'), ('10', 'CD'), ('years', 'NNS'), ('will', 'MD'), ('be', 'VB'), ('statisticians', 'NNS'), ('.', '.'), ('People', 'NNS'), ('think', 'VBP'), ('I', 'PRP'), ("'m", 'VBP'), ('joking', 'VBG'), (',', ','), ('but', 'CC'), ('who', 'WP'), ('would', 'MD'), ("'ve", 'VBP'), ('guessed', 'VBN'), ('that', 'IN'), ('computer', 'NN'), ('engineers', 'NNS'), ('would', 'MD'), ("'ve", 'VBP'), ('been', 'VBN'), ('the', 'DT'), ('cooleast', 'JJ'), ('job', 'NN'), ('of', 'IN'), ('the', 'DT'), ('1990s', 'CD'), ('?', '.')]


In [18]:
# Named-entity recognition.
# NOTE: the backslash continuations sit inside the string literal, so the
# leading spaces of the continued lines are part of the string.
text = "Elvis Aaron Presley was an American singer and actor. Born in \
      Tupelo, Mississippi, when Presley was 13 years old he and his \
      family relocated to Memphis, Tennessee."

# Pipeline: tokenize -> POS-tag -> chunk the tagged tokens into named entities.
chunks = nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(text)))

In [20]:
nltk.pos_tag(nltk.word_tokenize(text))

[('Elvis', 'NNP'),
 ('Aaron', 'NNP'),
 ('Presley', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('singer', 'NN'),
 ('and', 'CC'),
 ('actor', 'NN'),
 ('.', '.'),
 ('Born', 'VBN'),
 ('in', 'IN'),
 ('Tupelo', 'NNP'),
 (',', ','),
 ('Mississippi', 'NNP'),
 (',', ','),
 ('when', 'WRB'),
 ('Presley', 'NNP'),
 ('was', 'VBD'),
 ('13', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 ('he', 'PRP'),
 ('and', 'CC'),
 ('his', 'PRP$'),
 ('family', 'NN'),
 ('relocated', 'VBD'),
 ('to', 'TO'),
 ('Memphis', 'NNP'),
 (',', ','),
 ('Tennessee', 'NNP'),
 ('.', '.')]

In [21]:
print(chunks)

(S
  (PERSON Elvis/NNP)
  (PERSON Aaron/NNP Presley/NNP)
  was/VBD
  an/DT
  (GPE American/JJ)
  singer/NN
  and/CC
  actor/NN
  ./.
  Born/VBN
  in/IN
  (GPE Tupelo/NNP)
  ,/,
  (GPE Mississippi/NNP)
  ,/,
  when/WRB
  (PERSON Presley/NNP)
  was/VBD
  13/CD
  years/NNS
  old/JJ
  he/PRP
  and/CC
  his/PRP$
  family/NN
  relocated/VBD
  to/TO
  (GPE Memphis/NNP)
  ,/,
  (GPE Tennessee/NNP)
  ./.)


In [22]:
# Stop words.
# Import the constant directly. `from sklearn.feature_extraction import text`
# would rebind the name `text` -- used by the named-entity cells for the sample
# sentence -- to a module, so re-running those cells afterwards would fail.
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

# scikit-learn's built-in English stop-word set; used by clean_and_stem_text.
stop_words = ENGLISH_STOP_WORDS

In [24]:
print(stop_words)

frozenset({'call', 'may', 'bill', 'get', 'latter', 'off', 'done', 'name', 'per', 'whole', 'yours', 'one', 'from', 're', 'fire', 'co', 'hundred', 'others', 'where', 'an', 'being', 'already', 'together', 'us', 'if', 'might', 'toward', 'until', 'around', 'there', 'behind', 'both', 'find', 'after', 'and', 'out', 'eleven', 'however', 'so', 'except', 'but', 'he', 'side', 'its', 'when', 'last', 'while', 'besides', 'also', 'because', 'sixty', 'eg', 'fifteen', 'another', 'etc', 'why', 'something', 'those', 'interest', 'six', 'latterly', 'ever', 'hers', 'anything', 'front', 'before', 'i', 'seem', 'their', 'anyone', 'mostly', 'made', 'ten', 'will', 'hereby', 'thence', 'empty', 'two', 'whither', 'than', 'has', 'me', 'to', 'found', 'forty', 'throughout', 'beforehand', 'been', 'un', 'her', 'moreover', 'becomes', 'former', 'here', 'hereafter', 'often', 'describe', 'whoever', 'under', 'formerly', 'take', 'themselves', 'top', 'amount', 'thick', 'with', 'thin', 'as', 'by', 'how', 'less', 'own', 'otherwi

In [25]:
len(stop_words)

318

In [26]:
# NLTK ships its own stop-word lists, selected by language name.
from nltk.corpus import stopwords
print(stopwords.words('english'))

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

In [27]:
print(stopwords.words('german'))

['aber', 'alle', 'allem', 'allen', 'aller', 'alles', 'als', 'also', 'am', 'an', 'ander', 'andere', 'anderem', 'anderen', 'anderer', 'anderes', 'anderm', 'andern', 'anderr', 'anders', 'auch', 'auf', 'aus', 'bei', 'bin', 'bis', 'bist', 'da', 'damit', 'dann', 'der', 'den', 'des', 'dem', 'die', 'das', 'dass', 'daß', 'derselbe', 'derselben', 'denselben', 'desselben', 'demselben', 'dieselbe', 'dieselben', 'dasselbe', 'dazu', 'dein', 'deine', 'deinem', 'deinen', 'deiner', 'deines', 'denn', 'derer', 'dessen', 'dich', 'dir', 'du', 'dies', 'diese', 'diesem', 'diesen', 'dieser', 'dieses', 'doch', 'dort', 'durch', 'ein', 'eine', 'einem', 'einen', 'einer', 'eines', 'einig', 'einige', 'einigem', 'einigen', 'einiger', 'einiges', 'einmal', 'er', 'ihn', 'ihm', 'es', 'etwas', 'euer', 'eure', 'eurem', 'euren', 'eurer', 'eures', 'für', 'gegen', 'gewesen', 'hab', 'habe', 'haben', 'hat', 'hatte', 'hatten', 'hier', 'hin', 'hinter', 'ich', 'mich', 'mir', 'ihr', 'ihre', 'ihrem', 'ihren', 'ihrer', 'ihres', 'euc

In [28]:
# Text classification example.
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
import numpy as np

# Two newsgroups, so this is a two-class problem.
cat = ['sci.med', 'sci.space']
# Parts of each post handed to `remove=`; reused by the LDA section as well.
to_remove = ('headers', 'footers', 'quotes')
news_train = fetch_20newsgroups(subset='train',
                                remove=to_remove,
                                categories=cat)
news_test = fetch_20newsgroups(subset='test', remove=to_remove, categories=cat)

In [29]:
# Fit the TF-IDF vectorizer on the training posts only, then reuse the fitted
# vectorizer to transform the test posts.
tfidf = TfidfVectorizer()
X_train = tfidf.fit_transform(news_train.data)
X_test = tfidf.transform(news_test.data)
# Class labels supplied with the dataset.
y_train = news_train.target
y_test = news_test.target

In [32]:
%%time
clf = SGDClassifier(fit_intercept=False, n_jobs=-1)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('Accuracy = {}'.format(accuracy_score(y_test, y_pred)))

Accuracy = 0.8911392405063291
Wall time: 56.6 ms


In [38]:
def clean_and_stem_text(text):
    """Lower-case, tokenize, drop stop words and stem a document.

    Relies on the notebook-level `stop_words` collection and `stemmer` object.

    Args:
        text: the raw document as a string.

    Returns:
        A single string with the stemmed, non-stop-word tokens joined by spaces.
    """
    stems = []
    for word in nltk.word_tokenize(text.lower()):
        if word in stop_words:
            continue  # skip stop words before stemming
        stems.append(stemmer.stem(word))
    return ' '.join(stems)

In [39]:
# Apply the cleaning/stemming step to every training and test post.
cleaned_docs_train = [clean_and_stem_text(text) for text in news_train.data]
cleaned_docs_test = [clean_and_stem_text(text) for text in news_test.data]

In [42]:
# Repeat the experiment on the cleaned, stemmed documents.
# NOTE: this refits the shared `tfidf` and `clf` objects in place, so after
# this cell they describe the stemmed data, not the raw text.
X1_train = tfidf.fit_transform(cleaned_docs_train)
# Holds the *test* matrix; the previous name `X1_text` was a typo.
X1_test = tfidf.transform(cleaned_docs_test)
clf.fit(X1_train, y_train)
y1_pred = clf.predict(X1_test)
print('Accuracy = {}'.format(accuracy_score(y_test, y1_pred)))

Accuracy = 0.8936708860759494


## 隐含狄利克雷分布 Latent Dirichlet Allocation

In [44]:
import gensim


def tokenize(text):
    """Turn a raw document into a list of lower-case, non-stop-word tokens.

    Args:
        text: the raw document as a string.

    Returns:
        List of tokens from gensim's simple_preprocess (which, per the original
        author's note, drops words shorter than 2 or longer than 15 letters)
        with gensim's STOPWORDS removed.
    """
    stopwords_set = gensim.parsing.preprocessing.STOPWORDS
    kept = []
    for token in gensim.utils.simple_preprocess(text):
        if token in stopwords_set:
            continue  # drop stop words
        kept.append(token.lower())  # normalise to lower case
    return kept

In [46]:
len(gensim.parsing.preprocessing.STOPWORDS)

337

In [55]:
# Corpus for topic modelling: posts from three newsgroups, with the same
# header/footer/quote stripping as the classification example.
# No `subset` argument is passed here, unlike the classification cells.
text_data = fetch_20newsgroups(
    categories=['rec.autos', 'sci.med', 'talk.politics.mideast'],
    random_state=101,
    remove=to_remove)
docs = text_data.data
print(len(docs))

1752


In [60]:
print(docs[0])


Gee, what do I do?  My LDL is only 50-60. (and my HDL is only 23-25)
I must be risking something, but Is it the same risk as those with 
very high LDL?


What about exercise and a low-fat diet?  What are the long-term 
effects of this drug?



In [67]:
print(docs[9])

Back when I was building round tail light 2002s they were Bimmers.  It was
only when the (red suspendered, Reganomics generated, quiche eating) Yuppies
got into the market >-( that they became Beamers and the hood ornaments started
disappering.


In [72]:
# Tokenize every post, then build the token <-> id dictionary from the result.
processed_docs = [tokenize(doc) for doc in docs]
word_dic = gensim.corpora.Dictionary(processed_docs)  # bag-of-words-style dictionary object exposing several attributes

In [78]:
print(word_dic.num_docs)
print(word_dic.num_pos)
print(word_dic.num_nnz)

1752
177568
122067


In [79]:
len(word_dic)  # number of distinct words in the dictionary

23901

In [85]:
# Prune the vocabulary in place (re-running the cells above will show a
# smaller dictionary afterwards). Per the gensim documentation the thresholds
# count documents, not occurrences: keep tokens found in at least 10 documents
# and in no more than 20% of all documents.
word_dic.filter_extremes(no_below=10, no_above=0.2)
bow = [word_dic.doc2bow(doc) for doc in processed_docs]  # one bag-of-words list per document (1752 here)

In [86]:
len(bow)

1752

In [84]:
type(bow[0])

list

In [94]:
# Train an LDA model with three topics, matching the three newsgroups fetched
# for this section.
lda_model = gensim.models.LdaMulticore(
    bow,  # bag-of-words corpus
    num_topics=3,  # number of topics to extract
    id2word=word_dic,  # Dictionary mapping token ids back to words
    passes=16,  # number of passes over the corpus
    iterations=500,
    random_state=101)  # seed the random initialisation; same value as the data fetch

In [95]:
# Show all topics with their highest-weighted words.
lda_model.print_topics(-1)

[(0,
  '0.013*"armenian" + 0.011*"armenians" + 0.011*"said" + 0.010*"turkish" + 0.006*"jews" + 0.006*"israel" + 0.005*"turkey" + 0.005*"israeli" + 0.005*"went" + 0.005*"government"'),
 (1,
  '0.012*"edu" + 0.009*"com" + 0.008*"health" + 0.007*"medical" + 0.007*"university" + 0.007*"use" + 0.006*"new" + 0.006*"information" + 0.005*"research" + 0.005*"number"'),
 (2,
  '0.012*"car" + 0.008*"think" + 0.006*"good" + 0.006*"time" + 0.005*"israel" + 0.005*"right" + 0.004*"problem" + 0.004*"cars" + 0.004*"new" + 0.004*"ve"')]

In [96]:
# Convert an unseen sentence to bag-of-words using the same tokenizer and
# dictionary that the model was trained with.
new_doc = "I've shown the doctor my new car. He loved its big wheels!"
bow_doc = word_dic.doc2bow(tokenize(new_doc))

In [97]:
bow_doc

[(167, 1), (487, 1), (491, 1), (551, 1), (559, 1), (1627, 1), (2173, 1)]

In [117]:
# Rank the topics inferred for the new document, highest score first, and show
# each topic's five top words next to its score.
ranked_topics = sorted(lda_model[bow_doc], key=lambda pair: pair[1], reverse=True)
for topic_id, weight in ranked_topics:
    topic_words = lda_model.print_topic(topic_id, 5)
    print('Score: {}, Topic: {}'.format(weight, topic_words))

Score: 0.9090130925178528, Topic: 0.012*"car" + 0.008*"think" + 0.006*"good" + 0.006*"time" + 0.005*"israel"
Score: 0.046745315194129944, Topic: 0.012*"edu" + 0.009*"com" + 0.008*"health" + 0.007*"medical" + 0.007*"university"
Score: 0.04424162209033966, Topic: 0.013*"armenian" + 0.011*"armenians" + 0.011*"said" + 0.010*"turkish" + 0.006*"jews"


In [105]:
lda_model[bow_doc]

[(0, 0.044241548), (1, 0.046725716), (2, 0.90903276)]

In [106]:
sorted(lda_model[bow_doc], key=lambda tup: -1 * tup[1])

[(2, 0.9090123), (1, 0.04674605), (0, 0.044241637)]

In [108]:
lda_model.print_topic(2, 5)

'0.012*"car" + 0.008*"think" + 0.006*"good" + 0.006*"time" + 0.005*"israel"'

## Word2Vec

In [118]:
from gensim.models import Word2Vec
from nltk.corpus import movie_reviews
# Train word vectors on the sentences of the NLTK movie_reviews corpus.
w2v = Word2Vec(movie_reviews.sents(), workers=4)
w2v.init_sims(replace=True)  # freeze the model; it is not trained any further

In [121]:
w2v.wv.most_similar('house', topn=10)

[('wedding', 0.8668364882469177),
 ('hotel', 0.8647732138633728),
 ('apartment', 0.8610095977783203),
 ('country', 0.856749415397644),
 ('body', 0.8553558588027954),
 ('station', 0.8445186614990234),
 ('local', 0.8440505266189575),
 ('party', 0.8419867753982544),
 ('bar', 0.8363257050514221),
 ('living', 0.8347684741020203)]

In [122]:
w2v.wv.most_similar('countryside', topn=10)

[('paralyzed', 0.9465136528015137),
 ('causing', 0.9431409239768982),
 ('border', 0.9419759511947632),
 ('online', 0.9393081068992615),
 ('farmers', 0.9381197690963745),
 ('internal', 0.9377422332763672),
 ('wound', 0.9375422596931458),
 ('mining', 0.9375393986701965),
 ('union', 0.937467634677887),
 ('rig', 0.9372416734695435)]

In [123]:
w2v.wv['house']

array([-0.02427886,  0.09660256, -0.00637537, -0.15946592,  0.14721759,
       -0.1368917 , -0.12395467, -0.16306636, -0.02574619, -0.00405329,
        0.05253716, -0.05105058,  0.07331239,  0.04443988,  0.06931428,
       -0.04729541,  0.00521677,  0.0782991 ,  0.1307938 , -0.16089483,
       -0.06671596,  0.06017288, -0.08920097,  0.08610193,  0.05218294,
        0.0019553 ,  0.03128374, -0.02785585, -0.16492818,  0.15375887,
       -0.10946473,  0.14962548,  0.06548597, -0.00377436, -0.04982037,
        0.03007328,  0.18657072,  0.14839248,  0.02354456, -0.07350426,
        0.03854439, -0.02674351,  0.01844085, -0.00599654, -0.00094211,
        0.00873501,  0.00364529,  0.08921657,  0.09013582,  0.08390693,
        0.1057057 ,  0.10874041,  0.02966209, -0.02938473,  0.06717335,
       -0.06369507,  0.0848505 ,  0.09073838, -0.10945512, -0.03622925,
       -0.16745006, -0.03348833, -0.06317479,  0.1475254 ,  0.09494581,
       -0.17496276, -0.10336761,  0.24626589, -0.10652564, -0.03

In [124]:
w2v.wv['hotel']

array([-6.48700371e-02,  7.00271726e-02, -7.67491460e-02, -6.84995055e-02,
        9.55135524e-02, -1.18068002e-01, -1.72842339e-01, -1.64749339e-01,
       -5.28507009e-02, -9.40757245e-03, -4.23805369e-03, -7.75362998e-02,
        1.07264578e-01,  4.99418564e-02,  5.68794012e-02, -6.74575195e-02,
        8.57803375e-02,  1.17778189e-01,  1.87019721e-01, -1.21144824e-01,
       -1.65744275e-01,  5.42791784e-02, -4.14649732e-02,  6.22955151e-02,
        6.93899244e-02, -4.25398797e-02, -8.10289010e-02, -5.18517531e-02,
       -1.30884618e-01,  5.47503196e-02, -1.29633218e-01,  9.46059078e-02,
        5.73539883e-02,  1.78467035e-02, -8.18712637e-02,  1.13160320e-01,
        1.77643284e-01,  1.31120637e-01,  5.33079170e-02, -3.50715742e-02,
        2.15164050e-02,  2.21600458e-02, -5.59382699e-02,  4.99225147e-02,
       -4.78981389e-03, -3.33971158e-02,  2.51089353e-02, -5.58985211e-02,
        9.57650691e-02,  9.81157273e-02,  1.17004447e-01,  1.09301731e-01,
        2.22818702e-02,  

In [125]:
w2v.wv['hotel'].shape

(100,)

In [126]:
w2v.wv.most_similar(positive=['woman', 'king'], topn=5)  # similarity to several positive examples

[('lady', 0.8865196704864502),
 ('partner', 0.8856627941131592),
 ('former', 0.8688048124313354),
 ('daughter', 0.8662325143814087),
 ('jack', 0.8644030094146729)]

In [128]:
w2v.wv.most_similar(positive=['woman', 'king'], negative=['queen'],
                    topn=5)  # positive examples plus a negative example

[('man', 0.821386992931366),
 ('boy', 0.7598280906677246),
 ('child', 0.7559627294540405),
 ('father', 0.7236120700836182),
 ('girl', 0.7173840999603271)]

In [129]:
w2v.wv.most_similar(positive=['woman', 'queen'], topn=5)

[('lady', 0.9145256280899048),
 ('lawyer', 0.8811653852462769),
 ('former', 0.878139853477478),
 ('partner', 0.87492436170578),
 ('girl', 0.8732819557189941)]

In [130]:
w2v.wv.doesnt_match(['bed', 'pillow', 'cake', 'mattress'])  # the word that does not fit with the others

  vectors = vstack(self.word_vec(word, use_norm=True) for word in used_words).astype(REAL)


'cake'

In [131]:
# Both lookups go through `w2v.wv`; calling `similarity` on the model object
# itself is deprecated and was the source of the warning in the recorded output.
w2v.wv.similarity('woman', 'girl'), w2v.wv.similarity('woman', 'boy')

  """Entry point for launching an IPython kernel.


(0.9102032, 0.8507879)