In [1]:
import nltk

#  1、Sentences Segment（分句）

In [2]:
sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 

In [3]:
paragraph = "The first time I heard that song was in Hawaii on radio. I was just a kid, and loved it very much! What a fantastic song!" 

sentences = sent_tokenizer.tokenize(paragraph)

sentences

['The first time I heard that song was in Hawaii on radio.',
 'I was just a kid, and loved it very much!',
 'What a fantastic song!']

#  2、Tokenize sentences （分词）

In [4]:
from nltk.tokenize import WordPunctTokenizer 

sentence = "Are you old enough to remember Michael Jackson attending the Grammys \
            with Brooke Shields and Webster sat on his lap during the show?" 

words = WordPunctTokenizer().tokenize(sentence)  

words

['Are',
 'you',
 'old',
 'enough',
 'to',
 'remember',
 'Michael',
 'Jackson',
 'attending',
 'the',
 'Grammys',
 'with',
 'Brooke',
 'Shields',
 'and',
 'Webster',
 'sat',
 'on',
 'his',
 'lap',
 'during',
 'the',
 'show',
 '?']

In [5]:
text = 'That U.S.A. poster-print costs $12.40...' 

pattern = r"""(?x)                   # set flag to allow verbose regexps 
              (?:[A-Z]\.)+           # abbreviations, e.g. U.S.A. 
              |\d+(?:\.\d+)?%?       # numbers, incl. currency and percentages 
              |\w+(?:[-']\w+)*       # words w/ optional internal hyphens/apostrophe 
              |\.\.\.                # ellipsis 
              |(?:[.,;"'?():-_`])    # special characters with meanings 
            """  

nltk.regexp_tokenize(text, pattern)  


['That', 'U.S.A.', 'poster-print', 'costs', '12.40', '...']

#  Tokenize and tag some text:

In [6]:
sentence = """At eight o'clock on Thursday morning
... Arthur didn't feel very good."""

tokens = nltk.word_tokenize(sentence)

tokens


['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [7]:
tagged = nltk.pos_tag(tokens)
tagged

[('At', 'IN'),
 ('eight', 'CD'),
 ("o'clock", 'NN'),
 ('on', 'IN'),
 ('Thursday', 'NNP'),
 ('morning', 'NN'),
 ('Arthur', 'NNP'),
 ('did', 'VBD'),
 ("n't", 'RB'),
 ('feel', 'VB'),
 ('very', 'RB'),
 ('good', 'JJ'),
 ('.', '.')]

#  Display a parse tree:

In [8]:
entities = nltk.chunk.ne_chunk(tagged)
entities


LookupError: 

===========================================================================
NLTK was unable to find the gs file!
Use software specific configuration paramaters or set the PATH environment variable.
===========================================================================

Tree('S', [('At', 'IN'), ('eight', 'CD'), ("o'clock", 'NN'), ('on', 'IN'), ('Thursday', 'NNP'), ('morning', 'NN'), Tree('PERSON', [('Arthur', 'NNP')]), ('did', 'VBD'), ("n't", 'RB'), ('feel', 'VB'), ('very', 'RB'), ('good', 'JJ'), ('.', '.')])

In [9]:

from nltk.corpus import treebank

t = treebank.parsed_sents('wsj_0001.mrg')[0]

t.draw()

In [10]:
from nltk.book import *

*** Introductory Examples for the NLTK Book ***
Loading text1, ..., text9 and sent1, ..., sent9
Type the name of the text or sentence to view it.
Type: 'texts()' or 'sents()' to list the materials.
text1: Moby Dick by Herman Melville 1851
text2: Sense and Sensibility by Jane Austen 1811
text3: The Book of Genesis
text4: Inaugural Address Corpus
text5: Chat Corpus
text6: Monty Python and the Holy Grail
text7: Wall Street Journal
text8: Personals Corpus
text9: The Man Who Was Thursday by G . K . Chesterton 1908


In [11]:
text1

<Text: Moby Dick by Herman Melville 1851>

In [12]:
text1.concordance("monstrous")

Displaying 11 of 11 matches:
ong the former , one was of a most monstrous size . ... This came towards us , 
ON OF THE PSALMS . " Touching that monstrous bulk of the whale or ork we have r
ll over with a heathenish array of monstrous clubs and spears . Some were thick
d as you gazed , and wondered what monstrous cannibal and savage could ever hav
that has survived the flood ; most monstrous and most mountainous ! That Himmal
they might scout at Moby Dick as a monstrous fable , or still worse and more de
th of Radney .'" CHAPTER 55 Of the Monstrous Pictures of Whales . I shall ere l
ing Scenes . In connexion with the monstrous pictures of whales , I am strongly
ere to enter upon those still more monstrous stories of them which are to be fo
ght have been rummaged out of this monstrous cabinet there is no telling . But 
of Whale - Bones ; for Whales of a monstrous size are oftentimes cast up dead u


In [13]:
text1.similar("monstrous")

true contemptible christian abundant few part mean careful puzzled
mystifying passing curious loving wise doleful gamesome singular
delightfully perilous fearless


In [14]:
text2.common_contexts(["monstrous","very"])

a_pretty am_glad a_lucky is_pretty be_glad


In [15]:
text3.generate('luck')

In [16]:
text3.count('smote') / len(text3)

0.00011169689929407559

In [17]:
len(text3) / len(set(text3))

16.050197203298673

# 抽取词干 并归类

In [18]:
from pandas import DataFrame
import pandas as pd
d = ['pets insurance','pets insure','pet insurance','pet insur','pet insurance"','pet insu']
df = DataFrame(d)
df.columns = ['Words']
df

Unnamed: 0,Words
0,pets insurance
1,pets insure
2,pet insurance
3,pet insur
4,"pet insurance"""
5,pet insu


In [19]:
# 去除标点符号等特殊字符的正则表达式分词器

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize 
from nltk.stem.porter import *

stemmer = PorterStemmer()

wnl = WordNetLemmatizer()

tokenizer = nltk.RegexpTokenizer(r'w+')

df["Stemming Words"] = ""
df["Count"] = 1

j = 0
while (j <= 5):
    for word in word_tokenize(df["Words"][j]):  # 分词
        df["Stemming Words"][j] = df["Stemming Words"][j] + " " + stemmer.stem(word)  # stemming
    j=j + 1
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Words,Stemming Words,Count
0,pets insurance,pet insur,1
1,pets insure,pet insur,1
2,pet insurance,pet insur,1
3,pet insur,pet insur,1
4,"pet insurance""",pet insur '',1
5,pet insu,pet insu,1


In [20]:
wnl.lemmatize('left')

'left'

In [21]:
tokenizer.tokenize( ' pets insur ')

[]

In [22]:
uniqueWords = df.groupby(['Stemming Words'], as_index = False).sum()
uniqueWords

Unnamed: 0,Stemming Words,Count
0,pet insu,1
1,pet insur,4
2,pet insur '',1


In [23]:
# Levenshtein edit-distance 有很多不同的计算距离的方法

from nltk.metrics import edit_distance
minDistance = 0.8
distance = -1
lastWord = ""
j = 0
while (j < 1):
    lastWord = uniqueWords["Stemming Words"][j]
    distance = edit_distance(uniqueWords["Stemming Words"][j], uniqueWords["Stemming Words"][j + 1])
    if (distance > minDistance):
        uniqueWords["Stemming Words"][j] = uniqueWords["Stemming Words"][j + 1]
    j += 1
uniqueWords

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if sys.path[0] == '':


Unnamed: 0,Stemming Words,Count
0,pet insur,1
1,pet insur,4
2,pet insur '',1


In [24]:
uniqueWords = uniqueWords.groupby(['Stemming Words'], as_index = False).sum()
uniqueWords

Unnamed: 0,Stemming Words,Count
0,pet insur,5
1,pet insur '',1


# 停用词移除(Stop word removal)

In [25]:
from nltk.corpus import stopwords

stoplist = stopwords.words('english')
text = "This is just a test"
cleanwordlist = [word for word in text.split() if word not in stoplist]
print(cleanwordlist)

['This', 'test']


In [26]:
from nltk.metrics import edit_distance
print(edit_distance("rain", "rainbow"))

3


In [27]:
# 4.4 不同的解析器类型
# 4.4.1 递归下降解析器
# 4.4.2 移位-规约解析器
# 4.4.3 图表解析器
# 4.4.4 正则表达式解析器
import nltk
from nltk.chunk.regexp import *
chunk_rules = ChunkRule("<.*>+", "chunk everything")
reg_parser = RegexpParser('''
NP: {<DT>? <JJ>* <NN>*} # NP
P: {<IN>}               # Preposition
V: {<V.*>}              # Verb
PP: {<P> <NP>}          # PP -> P NP
VP: {<V> <NP|PP>*}      # VP -> V (NP|PP)*
''')
test_sent = "Mr. Obama played a big role in the Health insurance bill"
test_sent_pos = nltk.pos_tag(nltk.word_tokenize(test_sent))
paresed_out = reg_parser.parse(test_sent_pos)
print(paresed_out)

(S
  Mr./NNP
  Obama/NNP
  (VP
    (V played/VBD)
    (NP a/DT big/JJ role/NN)
    (PP (P in/IN) (NP the/DT)))
  Health/NNP
  (NP insurance/NN bill/NN))


In [28]:
# 4.5 依存性文本解析(dependency parsing, DP)
# 基于概率的投射依存性解析器(probabilistic, projective dependency parser)
from nltk.parse.stanford import StanfordParser
# https://nlp.stanford.edu/software/stanford-parser-full-2017-06-09.zip
english_parser = StanfordParser()
english_parser.raw_parse_sents(("this is the english parser test"))

LookupError: 

===========================================================================
  NLTK was unable to find stanford-parser\.jar! Set the CLASSPATH
  environment variable.

  For more information, on stanford-parser\.jar, see:
    <https://nlp.stanford.edu/software/lex-parser.shtml>
===========================================================================

In [29]:
%pwd

'C:\\zhang\\projects\\small\\tensor__cpu\\NLTK'

# 文本分类

In [30]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import csv
def preprocessing(text):
    #text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [31]:
sms = open('./Machine-Learning-with-R-datasets-master/SMSSpamCollection.txt', encoding='utf8') # check the structure of this file!
sms_data = []
sms_labels = []
csv_reader = csv.reader(sms, delimiter = '\t')
for line in csv_reader:
    # adding the sms_id
    sms_labels.append(line[0])
    # adding the cleaned text We are calling preprocessing method
    sms_data.append(preprocessing(line[1]))
sms.close()


In [32]:
# 6.3 采样操作
import sklearn
import numpy as np
trainset_size = int(round(len(sms_data)*0.70))
# i chose this threshold for 70:30 train and test split.
print('The training set size for this classifier is ' + str(trainset_size) + '\n')
x_train = np.array([''.join(el) for el in sms_data[0: trainset_size]])
y_train = np.array([el for el in sms_labels[0: trainset_size]])
x_test = np.array([''.join(el) for el in sms_data[trainset_size+1:len(sms_data)]])
y_test = np.array([el for el in sms_labels[trainset_size+1:len(sms_labels)]])
         #or el in sms_labels[trainset_size+1:len(sms_labels)]

print(x_train)
print(y_train)

from sklearn.feature_extraction.text import CountVectorizer
sms_exp = []
for line in sms_data:
    sms_exp.append(preprocessing(line))
vectorizer = CountVectorizer(min_df = 1, encoding='utf-8')
X_exp = vectorizer.fit_transform(sms_exp)
print("||".join(vectorizer.get_feature_names()))
print(X_exp.toarray())

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(min_df = 2, ngram_range=(1, 2),
                             stop_words = 'english', strip_accents = 'unicode', norm = 'l2')
X_train = vectorizer.fit_transform(x_train)
X_test = vectorizer.transform(x_test)

The training set size for this classifier is 3900

[ 'jurong point crazy.. available bugis great world buffet ... cine got amore wat ...'
 'lar ... joking wif oni ...'
 'free entry wkly comp win cup final tkts 21st may 2005 text 87121 receive entry question std txt rate apply 08452810075over18'
 ...,
 'tell call 09066358152 claim £5000 prize enter mobile personal detail prompt careful'
 "thank you 've wonderful" 'otherwise part time job na-tuition..']
['ham' 'ham' 'spam' ..., 'spam' 'ham' 'ham']
[[0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 ..., 
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]
 [0 0 0 ..., 0 0 0]]


In [33]:
# 6.3.1 朴素贝叶斯法

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

clf = MultinomialNB().fit(X_train, y_train)
y_nb_predicted = clf.predict(X_test)
print(y_nb_predicted)
print('\n confusion_matrix \n')
#cm = confusion_matrix(y_test, y_pred)
cm = confusion_matrix(y_test, y_nb_predicted)
print(cm)
print('\n Here is the classification report:')
print(classification_report(y_test, y_nb_predicted))

feature_names = vectorizer.get_feature_names()
coefs = clf.coef_
intercept = clf.intercept_
coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
n = 10
top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
for (coef_1, fn_1), (coef_2, fn_2) in top:
    print('\t%.4f\t%-15s\t\t%.4f\t%-15s' %(coef_1, fn_1, coef_2, fn_2))

['ham' 'ham' 'ham' ..., 'ham' 'ham' 'ham']

 confusion_matrix 

[[1443    0]
 [  52  176]]

 Here is the classification report:
             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1443
       spam       1.00      0.77      0.87       228

avg / total       0.97      0.97      0.97      1671

	-9.0054	15             		-5.9355	free           
	-9.0054	1hr            		-6.1671	txt            
	-9.0054	1st lovely     		-6.2624	text           
	-9.0054	2go            		-6.3697	claim          
	-9.0054	2morrow        		-6.4459	stop           
	-9.0054	2mrw           		-6.4544	mobile         
	-9.0054	2nd inning     		-6.5163	reply          
	-9.0054	2nd sm         		-6.5269	prize          
	-9.0054	30             		-6.5440	service        
	-9.0054	30 want        		-6.5837	tone           


In [34]:
# 6.3.2 决策树
from sklearn import tree
clf = tree.DecisionTreeClassifier().fit(X_train.toarray(), y_train)
y_tree_predicted = clf.predict(X_test.toarray())
print(y_tree_predicted)
print('\n Here is the classification report:')
print(classification_report(y_test, y_tree_predicted))

['ham' 'ham' 'ham' ..., 'ham' 'spam' 'ham']

 Here is the classification report:
             precision    recall  f1-score   support

        ham       0.97      0.98      0.97      1443
       spam       0.87      0.80      0.83       228

avg / total       0.95      0.96      0.95      1671



In [35]:
# 6.3.3 随机梯度下降法
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
clf = SGDClassifier(alpha = 0.0001, n_iter=50).fit(X_train, y_train)
y_pred = clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_pred))
print(' \n confusion_matrix \n')
cm = confusion_matrix(y_test, y_pred)
print(cm)


 Here is the classification report:
             precision    recall  f1-score   support

        ham       0.99      1.00      0.99      1443
       spam       0.97      0.91      0.94       228

avg / total       0.98      0.98      0.98      1671

 
 confusion_matrix 

[[1436    7]
 [  20  208]]


In [36]:
# 6.3.4 逻辑回归
# 6.3.5 支持向量机
from sklearn.svm import LinearSVC
svm_classifier = LinearSVC().fit(X_train, y_train)
y_svm_predicted = svm_classifier.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, y_svm_predicted))
cm = confusion_matrix(y_test, y_pred)
print(cm)


 Here is the classification report:
             precision    recall  f1-score   support

        ham       0.98      1.00      0.99      1443
       spam       0.97      0.89      0.93       228

avg / total       0.98      0.98      0.98      1671

[[1436    7]
 [  20  208]]


In [37]:
# 6.4 随机森林
from sklearn.ensemble import RandomForestClassifier
RF_clf = RandomForestClassifier(n_estimators=10).fit(X_train, y_train)
predicted = RF_clf.predict(X_test)
print('\n Here is the classification report:')
print(classification_report(y_test, predicted))
cm = confusion_matrix(y_test, y_pred)
print(cm)


 Here is the classification report:
             precision    recall  f1-score   support

        ham       0.97      1.00      0.98      1443
       spam       0.98      0.80      0.88       228

avg / total       0.97      0.97      0.97      1671

[[1436    7]
 [  20  208]]


In [38]:
# 6.5 文本聚类
# K 均值法
from sklearn.cluster import KMeans, MiniBatchKMeans
from collections import defaultdict
true_k = 5
km = KMeans(n_clusters = true_k, init='k-means++', max_iter=100, n_init= 1)
kmini = MiniBatchKMeans(n_clusters=true_k, init='k-means++', n_init=1, init_size=1000, batch_size=1000, verbose=2)
km_model = km.fit(X_train)
kmini_model = kmini.fit(X_train)
print("For K-mean clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(km_model.labels_):
    clustering[label].append(idx)
print("For K-mean Mini batch clustering ")
clustering = defaultdict(list)
for idx, label in enumerate(kmini_model.labels_):
    clustering[label].append(idx)

Init 1/1 with method: k-means++
Inertia for init 1/1: 963.147701
Minibatch iteration 1/400: mean batch inertia: 0.971154, ewa inertia: 0.971154 
Minibatch iteration 2/400: mean batch inertia: 0.962776, ewa inertia: 0.966859 
Minibatch iteration 3/400: mean batch inertia: 0.966800, ewa inertia: 0.966829 
Minibatch iteration 4/400: mean batch inertia: 0.974211, ewa inertia: 0.970614 
Minibatch iteration 5/400: mean batch inertia: 0.966033, ewa inertia: 0.968265 
Minibatch iteration 6/400: mean batch inertia: 0.962023, ewa inertia: 0.965065 
Minibatch iteration 7/400: mean batch inertia: 0.964243, ewa inertia: 0.964644 
Minibatch iteration 8/400: mean batch inertia: 0.968117, ewa inertia: 0.966424 
Minibatch iteration 9/400: mean batch inertia: 0.964304, ewa inertia: 0.965337 
Minibatch iteration 10/400: mean batch inertia: 0.969753, ewa inertia: 0.967601 
Minibatch iteration 11/400: mean batch inertia: 0.959598, ewa inertia: 0.963498 
Minibatch iteration 12/400: mean batch inertia: 0.960

In [40]:
# 6.6 文本中的主题建模
# https://pypi.python.org/pypi/gensim#downloads
import gensim
from gensim import corpora, models, similarities
from itertools import chain
import nltk
from nltk.corpus import stopwords
from operator import itemgetter
import re
documents = [document for document in sms_data]
stoplist = stopwords.words('english')
texts = [[word for word in document.lower().split() if word not in stoplist] for document in documents]
print(texts)


dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]
tfidf = models.TfidfModel(corpus)
corpus_tfidf = tfidf[corpus]
lsi = models.LsiModel(corpus_tfidf, id2word = dictionary, num_topics = 100)
# print(lsi.print_topics(20))
n_topics = 5
lda = models.LdaModel(corpus_tfidf, id2word = dictionary, num_topics = n_topics)
for i in range(0, n_topics):
    temp = lda.show_topic(i, 10)
    terms = []
    for term in temp:
        terms.append(str(term[0]))
    print("Top 10 terms for topic #" + str(i) + ": " + ",".join(terms))



Top 10 terms for topic #0: call,...,prize,time,claim,free,get,dinner,okie,let
Top 10 terms for topic #1: coming,...,plan,home,know,day,today,yup,call,wat
Top 10 terms for topic #2: call,later,sorry,'ll,...,home,meeting,free,got,get
Top 10 terms for topic #3: ...,come,get,wat,need,yeah,got,going,lor,home
Top 10 terms for topic #4: ...,n't,lor,come,see,get,make,want,number,stop




# Getting Started with gensim