## Importing Libraries

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

!pip install jieba
import jieba

## 1) Stopwords

In [1]:
# Download only the corpora this notebook reads (stopwords, cmudict, wordnet).
# A bare nltk.download() opens the interactive downloader, which blocks an
# unattended "Restart & Run All".
# NOTE(review): some NLTK releases may also need "omw-1.4" for WordNet lookups;
# add it here if the WordNet cells raise LookupError.
REQUIRED_NLTK_RESOURCES = ("stopwords", "cmudict", "wordnet")

# Build the full list first so every resource is attempted even if one fails;
# the cell still evaluates to True when all downloads succeed.
all([nltk.download(resource) for resource in REQUIRED_NLTK_RESOURCES])

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

In [2]:
# Fetch the English stopword list once, then show its size and its contents.
english_stopword_list = stopwords.words("english")
print(len(english_stopword_list))
print(english_stopword_list)

179
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than

In [3]:
# "not" is on the stopword list, so stopword removal strips negation --
# which can mess with sentiment analysis.
negation_is_stopword = "not" in stopwords.words("english")
if negation_is_stopword:
    print("yes")

yes


## 2) CMU Dict

In [4]:
# CMU Pronouncing Dictionary: show how many entries it has, then the entries
# themselves (each is a word paired with a list of phoneme symbols).
entries = nltk.corpus.cmudict.entries()
print(len(entries), entries, sep="\n")

133737
[('a', ['AH0']), ('a.', ['EY1']), ('a', ['EY1']), ...]


## 3) Wordnet

In [5]:

# List every WordNet synset that contains the word "motorcar".
query_word = "motorcar"
wn.synsets(query_word)

[Synset('car.n.01')]

In [6]:
# Fetch the synset by its identifier, then list the lemma names grouped under it.
car_synset = wn.synset("car.n.01")
car_synset.lemma_names()

['car', 'auto', 'automobile', 'machine', 'motorcar']

## 4) Stemming

In [7]:
# Porter stemmer: stem a single capitalised word and display the result.
stemmer_porter = PorterStemmer()

porter_stem = stemmer_porter.stem("Happiness")
porter_stem

'happi'

In [8]:
# Lancaster stemmer: stem the same word for comparison with the Porter result.
stemmer_lancaster = LancasterStemmer()

lancaster_stem = stemmer_lancaster.stem("happiness")
lancaster_stem

'happy'

In [9]:
# Snowball stemmer (multi-lingual): count the languages it supports.
snowball_language_count = len(SnowballStemmer.languages)
print(snowball_language_count)

16


In [10]:
# Build the English Snowball stemmer and stem the same sample word.
stemmer_snowball_english = SnowballStemmer("english")

snowball_stem = stemmer_snowball_english.stem("happiness")
snowball_stem

'happi'

In [11]:
# Stem a whole sentence word by word with the Lancaster stemmer, then
# re-join the stems into a single space-separated string.
sentence = "Am a quick brown fox that jumps over a lazy dog"
stemmed_words = map(stemmer_lancaster.stem, sentence.split())
text = " ".join(stemmed_words)
print(text)

am a quick brown fox that jump ov a lazy dog


In [12]:
# Two sentences with opposite sentiment; they differ only by the word "not".
text = "This is a good movie"
text2 = "This is not a good movie"

# Build the stopword collection once. Previously stopwords.words("english")
# was re-evaluated for every token; a set also makes each membership test cheap.
english_stopwords = set(stopwords.words("english"))

# The membership test is case-sensitive, so the capitalised "This" is kept
# while the lower-case "is", "a" and "not" are dropped.
text = [x for x in text.split() if x not in english_stopwords]
text2 = [x for x in text2.split() if x not in english_stopwords]

print(text)
print(text2)

['This', 'good', 'movie']
['This', 'good', 'movie']


### Removing stopwords can wreck sentiment analysis: once "not" is dropped, both sentences reduce to the same tokens

In [13]:
# WordNet lemmatizer: reduce an irregular plural to its dictionary form.
lemmatizer_wordnet = WordNetLemmatizer()

cacti_lemma = lemmatizer_wordnet.lemmatize("cacti")
print(cacti_lemma)

cactus


In [14]:
# Lemmatize "better" twice: first with an explicit part-of-speech (POS) tag
# pos="a", then with the lemmatizer's default POS, to show that the tag
# changes the result.
for lemmatize_options in ({"pos": "a"}, {}):
    print(lemmatizer_wordnet.lemmatize("better", **lemmatize_options))

good
better


### Chinese Segmentation using Jieba


In [15]:
# Segment a Chinese phrase with jieba (cut_all=True) and print the resulting
# tokens separated by single spaces.
tokens = jieba.cut("中國哲學書電子化計劃", cut_all=True)
print(*tokens, sep=" ")



You should consider upgrading via the 'C:\Users\chugh\anaconda3\python.exe -m pip install --upgrade pip' command.
Building prefix dict from the default dictionary ...
Dumping model to file cache C:\Users\chugh\AppData\Local\Temp\jieba.cache
Loading model cost 0.867 seconds.
Prefix dict has been built successfully.


中 國 哲 學 書 電 子化 計 劃
