# NLTK

#### Install NLTK

In [381]:
%%bash
pip install nltk



You are using pip version 9.0.1, however version 21.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.


#### Download models or corpora

In [109]:
%%bash
python -m nltk.downloader

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


#### Import and use

In [122]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/ana/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [123]:
import nltk

In [148]:
tweet = "RT @lOR42wsOEFcv3f: I fall too fast, crash too hard, forgive too easily and care too much... :( #amiright"

In [212]:
query = 'fast'

In [213]:
tweet.find(query)

31

#### Tokenization

In [214]:
tweet.split()

['RT',
 '@lOR42wsOEFcv3f:',
 'I',
 'fall',
 'too',
 'fast,',
 'crash',
 'too',
 'hard,',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much...',
 ':(',
 '#amiright']

In [215]:
[query in tweet.split()]

[False]

In [216]:
nltk.word_tokenize(tweet)

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [217]:
[query in nltk.word_tokenize(tweet)]

[True]

In [218]:
nltk.word_tokenize(tweet, language='spanish')

['RT',
 '@',
 'lOR42wsOEFcv3f',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':',
 '(',
 '#',
 'amiright']

In [219]:
from nltk.tokenize import RegexpTokenizer
# The pattern describes the tokens themselves: maximal runs of ASCII letters and
# digits. Everything else (punctuation, '@', '#', whitespace) separates tokens
# and is dropped from the result.
custom_tokenizer = RegexpTokenizer('[a-zA-Z0-9]+', discard_empty=True)

In [220]:
custom_tokenizer.tokenize(tweet)

['RT',
 'lOR42wsOEFcv3f',
 'I',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 'amiright']

In [286]:
from nltk.tokenize import TweetTokenizer
# strip_handles=True removes @user mentions from the token stream.
# reduce_len=True is meant to shorten exaggerated character repetitions
# (e.g. 'sooooo') -- NOTE(review): not exercised by the sample tweet; confirm
# the exact rule in the NLTK documentation.
tweet_tokenizer = TweetTokenizer(strip_handles=True, reduce_len=True)

In [287]:
tweet_tokenizer.tokenize(tweet)

['RT',
 ':',
 'I',
 'fall',
 'too',
 'fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [288]:
from nltk.tokenize import MWETokenizer
# Register the two-token expression ('too', 'fast') so that consecutive
# occurrences are merged into the single token 'too_fast'.
mwe = MWETokenizer()
mwe.add_mwe(('too', 'fast'))
# The MWE tokenizer is applied to an already-tokenized list, so the tweet is
# split with tweet_tokenizer first.
mwe.tokenize(tweet_tokenizer.tokenize(tweet))

['RT',
 ':',
 'I',
 'fall',
 'too_fast',
 ',',
 'crash',
 'too',
 'hard',
 ',',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(',
 '#amiright']

In [295]:
query = 'too_fast'
query in mwe.tokenize(tweet_tokenizer.tokenize(tweet))

True

#### Normalization

In [296]:
tweet.lower()

'rt @lor42wsoefcv3f: i fall too fast, crash too hard, forgive too easily and care too much... :( #amiright'

In [346]:
import re
import string

def normalize_tokens(tokenized_text):
    """Normalize a sequence of tokens so that texts can be compared.

    Steps, applied in order:
      1. lowercase every token;
      2. drop hashtags (tokens starting with '#');
      3. drop empty tokens and tokens that are a single punctuation character.

    Multi-character tokens made of punctuation (e.g. '...' or ':(') are kept.

    Args:
        tokenized_text: iterable of str tokens, e.g. the output of a tokenizer.

    Returns:
        list of str: the normalized tokens, in their original order.
    """
    # Use a set for exact single-character membership. Testing
    # `t in string.punctuation` directly is a substring test on a str, which
    # also discarded tokens such as '()' or '<=' merely because those characters
    # happen to sit next to each other inside string.punctuation.
    punctuation_chars = set(string.punctuation)

    # Lowercase
    tokens = [t.lower() for t in tokenized_text]
    # Remove hashtags
    tokens = [t for t in tokens if not t.startswith('#')]
    # Remove stand-alone punctuation marks (and empty tokens)
    tokens = [t for t in tokens if t and t not in punctuation_chars]
    # Optional step, disabled: keep only letters
#     tokens = [t for t in tokens if re.match('^[a-z]+$', t)]
    # Optional step, disabled: normalize characters
#     tokens = [re.sub('á', 'a', t) for t in tokens]

    return tokens

In [347]:
spanish_query = 'muy rápido'
normalize_tokens(tweet_tokenizer.tokenize(spanish_query))

['muy', 'rápido']

In [348]:
normalize_tokens(tweet_tokenizer.tokenize(tweet))

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

#### Uniform normalization principle

In [349]:
query = 'TOO fast TOO furious'
tokenized_query = tweet_tokenizer.tokenize(query)
# The query is left un-normalized here (the normalize_tokens call is commented
# out), so its uppercase 'TOO' cannot match the lowercased tokens of the tweet.
# NOTE(review): presumably intentional, to illustrate that query and document
# must go through the same normalization -- re-enable the next line to compare.
# normalized_query = normalize_tokens(tokenized_query)
normalized_query = tokenized_query
normalized_query

['TOO', 'fast', 'TOO', 'furious']

In [350]:
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
# normalized_tweet = normalize_tokens(tweet.split())
normalized_tweet

['rt',
 'i',
 'fall',
 'too',
 'fast',
 'crash',
 'too',
 'hard',
 'forgive',
 'too',
 'easily',
 'and',
 'care',
 'too',
 'much',
 '...',
 ':(']

In [351]:
common_words = set(normalized_query).intersection(normalized_tweet)
print(common_words)
print(len(common_words), "common word(s)")

{'fast'}
1 common word(s)


#### Stopwords

In [352]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/ana/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [353]:
from nltk.corpus import stopwords
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [354]:
blacklist_words = stopwords.words('english') + ['rt']

In [355]:
cleaned_tweet = [t for t in normalized_tweet if t not in blacklist_words]
print(cleaned_tweet)

['fall', 'fast', 'crash', 'hard', 'forgive', 'easily', 'care', 'much', '...', ':(']


#### Stemming / Lemmatization


In [382]:
from nltk.stem import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.stem import WordNetLemmatizer

In [387]:
stemmer = PorterStemmer()

[stemmer.stem(t) for t in cleaned_tweet]

['fall',
 'fast',
 'crash',
 'hard',
 'forgiv',
 'easili',
 'care',
 'much',
 '...',
 ':(']

In [389]:
stemmer = SnowballStemmer(language='english')

[stemmer.stem(t) for t in cleaned_tweet]

['fall',
 'fast',
 'crash',
 'hard',
 'forgiv',
 'easili',
 'care',
 'much',
 '...',
 ':(']

In [393]:
lemmatizer = WordNetLemmatizer()

[lemmatizer.lemmatize(t) for t in cleaned_tweet]

['fall',
 'fast',
 'crash',
 'hard',
 'forgive',
 'easily',
 'care',
 'much',
 '...',
 ':(']

In [412]:
tagged_tweet = nltk.pos_tag(cleaned_tweet)
print(tagged_tweet)

[('fall', 'NN'), ('fast', 'RB'), ('crash', 'JJ'), ('hard', 'JJ'), ('forgive', 'NN'), ('easily', 'RB'), ('care', 'VB'), ('much', 'JJ'), ('...', ':'), (':(', 'NN')]


In [436]:
from nltk.corpus import wordnet as wn
tag_map = {'J': wn.ADJ, 'V': wn.VERB, 'R': wn.ADV, 'N': wn.NOUN}
def get_lemmas(tokenized_text):
    """Lemmatize tokens, using their part-of-speech tags to pick the lemma.

    Every token is tagged with nltk.pos_tag. The first letter of each tag is
    looked up in tag_map to choose the WordNet part of speech handed to the
    lemmatizer; tags without a mapping fall back to noun.

    Args:
        tokenized_text: list of str tokens.

    Returns:
        list of str: one lemma per input token, in the same order.
    """
    lemmas = []
    for word, tag in nltk.pos_tag(tokenized_text):
        wordnet_pos = tag_map.get(tag[0], wn.NOUN)
        lemmas.append(lemmatizer.lemmatize(word, pos=wordnet_pos))
    return lemmas


In [443]:
query = "the fastest!"
normalized_query = normalize_tokens(tweet_tokenizer.tokenize(query))
print(normalized_query)

['the', 'fastest']


In [476]:
lemmatized_tweet = get_lemmas(normalized_tweet)
lemmatized_query = get_lemmas(normalized_query)
print(lemmatized_tweet)
print(lemmatized_query)


['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']
['the', 'fast']


In [477]:
print("Common words:", set(lemmatized_tweet).intersection(set(lemmatized_query)))

Common words: {'the', 'fast'}


#### Vocabulary

In [454]:
from collections import Counter

Counter(normalized_tweet).most_common(5)

[('the', 2), ("i'm", 1), ('faster', 1), ('fastest', 1), ('than', 1)]

In [466]:
tweet = "I am so fast, I am the fastest!"
normalized_tweet = normalize_tokens(tweet_tokenizer.tokenize(tweet))
lemmatized_tweet = get_lemmas(normalized_tweet)
print(lemmatized_tweet)

['i', 'be', 'so', 'fast', 'i', 'be', 'the', 'fast']


In [468]:
print(Counter(normalized_tweet))
print(Counter(lemmatized_tweet))

Counter({'i': 2, 'am': 2, 'fastest': 1, 'the': 1, 'so': 1, 'fast': 1})
Counter({'be': 2, 'fast': 2, 'i': 2, 'so': 1, 'the': 1})


#### Sentence segmentation

In [478]:
query = "I am too fast. I am too furious."

In [479]:
from nltk.tokenize import sent_tokenize

In [480]:
sent_tokenize(query)

['I am too fast.', 'I am too furious.']

In [481]:
# Load the Spanish Punkt sentence tokenizer from the downloaded 'punkt' data.
# NOTE(review): the resource path is hard-coded to the pickled 'PY3' layout of
# that package -- confirm it exists for the installed NLTK data; passing
# language='spanish' to sent_tokenize may be a more portable way to select it.
spanish_tokenizer = nltk.data.load('tokenizers/punkt/PY3/spanish.pickle')
spanish_query = 'Soy muy rápido! Estoy muy furioso!'
spanish_tokenizer.tokenize(spanish_query)

['Soy muy rápido!', 'Estoy muy furioso!']

In [482]:
sent_tokenize("J.K. Rowling is rich. I am not as rich as J.K.")

['J.K. Rowling is rich.', 'I am not as rich as J.K.']

In [483]:
from nltk.tokenize import PunktSentenceTokenizer
# PunktSentenceTokenizer??

#### Numericalization

In [487]:
get_lemmas(normalize_tokens(custom_tokenizer.tokenize("STAY TUNED!")))

['stay', 'tune']