# NLP 入门

## 正则表达式进行文本搜索

In [2]:
# Search a sentence for any of several keywords with a single regular expression.
import re

words = ['very', 'nice', 'lecture', 'day', 'moon']
# Joining the keywords with '|' yields one alternation pattern.
expression = '|'.join(words)
print(expression)
# findall returns the matched keywords in the order they occur in the sentence.
re.findall(expression, 'i attended a very nice lecture last year', flags=re.M)

very|nice|lecture|day|moon


['very', 'nice', 'lecture']

## 将文本转换为列表
### 读取一个文本文件并根据需要将它转化为一列单词或一列句子

In [4]:
text_file = 'nlp_starts.txt'
# Method-1: every whitespace-separated word becomes its own list element.
# NOTE(review): no explicit encoding is passed to open(), so decoding follows
# the platform default -- confirm this matches how the file was saved.
with open(text_file) as f:
    contents = f.read()
words = contents.split()
print(words)

['Once', 'when', 'I', 'was', 'six', 'years', 'old', 'I', 'saw', 'a', 'magnificent', 'picture', 'in', 'a', 'book,', 'called', 'True', 'Stories', 'from', 'Nature,', 'about', 'the', 'primeval', 'forest.', 'It', 'was', 'a', 'picture', 'of', 'a', 'boa', 'constrictor', 'in', 'the', 'act', 'of', 'swallowing', 'an', 'animal.', 'Here', 'is', 'a', 'copy', 'of', 'the', 'drawing.', 'In', 'the', 'book', 'it', 'said:', '“Boa', 'constrictors', 'swallow', 'their', 'prey', 'whole,', 'without', 'chewing', 'it.', 'After', 'that', 'they', 'are', 'not', 'able', 'to', 'move,', 'and', 'they', 'sleep', 'through', 'the', 'six', 'months', 'that', 'they', 'need', 'for', 'digestion.”']


In [5]:
# Method-2: Each line of the file as a separate element of the list
# (readlines() keeps the trailing newline of every line).
# The context manager closes the file handle once the lines are read.
with open(text_file, 'r') as f:
    words_ = f.readlines()
print(words_)

['Once when I was six years old I saw a magnificent picture in a book, called \n', 'True Stories from Nature, about the primeval forest. It was a picture of a boa \n', 'constrictor in the act of swallowing an animal. Here is a copy of the drawing. \n', 'In the book it said: “Boa constrictors swallow their prey whole, without \n', 'chewing it. After that they are not able to move, and they sleep through the \n', 'six months that they need for digestion.”\n', '\n']


### 文本预处理
#### 比如：将一个单词替换为另一个单词，删除或添加某些特定类型的单词等

In [6]:
# Sentiment vocabularies, defined explicitly.
positive_words = [
    'awesome', 'good', 'nice', 'super', 'fun', 'delightful', 'congrats',
]
negative_words = ['awful', 'lame', 'horrible', 'bad']

# Normalise the sentence: lower-case it, then strip exclamation marks.
sentence = (
    'John has been selected for the trial phase this time. Congrats!'
    .lower()
    .replace('!', '')
)
sentence

'john has been selected for the trial phase this time. congrats'

In [8]:
# Split on single spaces; punctuation stays attached to its word.
words = sentence.split(' ')
print(words)
# Keep the distinct words that are not in the positive vocabulary.
# The result is a set, so word order and duplicates are not preserved.
result = set(words).difference(positive_words)
print(result)

['john', 'has', 'been', 'selected', 'for', 'the', 'trial', 'phase', 'this', 'time.', 'congrats']
{'time.', 'john', 'for', 'trial', 'has', 'been', 'this', 'the', 'phase', 'selected'}


### 从网页中获取文本(urllib3 库 + BeautifulSoup)

In [29]:
# Make sure both packages (urllib3 and beautifulsoup4) are installed
import urllib3
from bs4 import BeautifulSoup

pool_object = urllib3.PoolManager()
# NOTE(review): the URL carries no scheme -- confirm urllib3 resolves it as
# intended, or spell it out (e.g. 'http://www.baidu.com').
target_url = 'www.baidu.com'
response_ = pool_object.request('GET', target_url)
# Name the parser explicitly: without it BeautifulSoup picks whichever parser
# happens to be installed (and warns), so results can differ between machines.
final_html_txt = BeautifulSoup(response_.data, 'html.parser')
print(final_html_txt)

### 移除停用词
### 停用词(stop words)是搜索引擎会忽略的常用词(例如"the")

In [16]:
import nltk
from nltk import word_tokenize

# Break the raw sentence into tokens; punctuation becomes its own token.
sentence = "This book is about Deep Learning and Natural Language Processing!"
tokens = nltk.word_tokenize(sentence)
print(tokens)

['This', 'book', 'is', 'about', 'Deep', 'Learning', 'and', 'Natural', 'Language', 'Processing', '!']


In [18]:
# nltk.download('stopwords')  # one-time download of the stop-word corpus
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
# The English stop-word list is lower-case, so compare case-insensitively;
# otherwise capitalised stop words such as 'This' slip through the filter.
new_tokens = [w for w in tokens if w.lower() not in stop_words]
new_tokens

['book', 'Deep', 'Learning', 'Natural', 'Language', 'Processing', '!']

### 计数向量化
### 计数向量化是一个SciKit-Learn库工具，它可以接受任何大量的文本，将每个独特的单词作为特征返回，并计算每个单词在文本中出现的次数

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: each string is treated as one document.
texts = ["Ramiess sings classic songs", "he listens to old pop ", "and rock music", 'and also listens to classical songs']
cv = CountVectorizer()
# Learn the vocabulary and build the document-term count matrix in one step.
cv_fit = cv.fit_transform(texts)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on old installations.
# .tolist() turns the returned array into a plain list of str for printing.
try:
    feature_names = cv.get_feature_names_out().tolist()
except AttributeError:  # scikit-learn < 1.0
    feature_names = cv.get_feature_names()
print(feature_names)
# One row per document, one column per vocabulary word.
print(cv_fit.toarray())

['also', 'and', 'classic', 'classical', 'he', 'listens', 'music', 'old', 'pop', 'ramiess', 'rock', 'sings', 'songs', 'to']
[[0 0 1 0 0 0 0 0 0 1 0 1 1 0]
 [0 0 0 0 1 1 0 1 1 0 0 0 0 1]
 [0 1 0 0 0 0 1 0 0 0 1 0 0 0]
 [1 1 0 1 0 1 0 0 0 0 0 0 1 1]]


### TF-IDF分数(Sklearn Package)
### TF(term frequency)：术语频率, 表示特定单词的计数与文档中单词总数的比率
### IDF(inverse document frequency)：逆文档频率, 指文档总数与包含特定单词的文档数量之比的对数
#### 例：文档中包含100个单词，其中“happy”出现了5次，则TF = （5/100）=0.05
#### 假设有1000万个文档，“happy”出现在其中1000个文档中，则 IDF = log10(10000000/1000) = 4
#### 则 TF-IDF = 4*0.05 = 0.2
### 类似的指标是BM25，它根据文档与查询语句的相关程度对文档进行评分。BM25根据每个文档中出现的查询词对一组文档进行排名，而不考虑这些查询词在文档内部的相互关系（例如相对位置）。

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: each string is treated as one document.
texts = [
    "Ramiess sings classic songs",
    "he listens to old pop",
    "and rock music",
    ' and also listens to classical songs',
]
vect = TfidfVectorizer()
# Fit the vocabulary and compute the TF-IDF weights in a single call.
X = vect.fit_transform(texts)
# Expand the sparse result to a dense matrix so every weight is printed.
print(X.todense())

[[0.         0.         0.52547275 0.         0.         0.
  0.         0.         0.         0.52547275 0.         0.52547275
  0.41428875 0.        ]
 [0.         0.         0.         0.         0.48546061 0.38274272
  0.         0.48546061 0.48546061 0.         0.         0.
  0.         0.38274272]
 [0.         0.48693426 0.         0.         0.         0.
  0.61761437 0.         0.         0.         0.61761437 0.
  0.         0.        ]
 [0.47212003 0.37222485 0.         0.47212003 0.         0.37222485
  0.         0.         0.         0.         0.         0.
  0.37222485 0.37222485]]


### 文本分类器(TextBlob Package)
### 文本可以被分为很多种类别，例如正面和负面

In [21]:
from textblob import TextBlob
from textblob.classifiers import NaiveBayesClassifier

# Labelled training examples as (text, label) pairs: 'pos' or 'neg'.
data = [
    ('I love my country.', 'pos'),
    ('This is an amazing place!', 'pos'),
    ('I do not like the smell of this place.', 'neg'),
    ('I do not like this restaurant', 'neg'),
    ('I am tired of hearing your nonsense.', 'neg'),
    ("I always aspire to be like him", 'pos'),
    ("It's a horrible performance.", 'neg'),
]

# Train on the labelled examples, then label an unseen sentence.
model = NaiveBayesClassifier(train_set=data)
model.classify("It's an awesome place!")

'pos'