In [4]:
import nltk
import nltk.corpus

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
# string
hacker_string = "A computer hacker is any skilled computer expert that uses their technical knowledge to overcome a problem. While \"hacker\" can refer to any skilled computer programmer, the term has become associated in popular culture with a \"security hacker\", someone who, with their technical knowledge, uses bugs or exploits to break into computer systems."

In [7]:
# Tokenizing
hacker_string_tokens = word_tokenize(hacker_string)
hacker_string_tokens

['A',
 'computer',
 'hacker',
 'is',
 'any',
 'skilled',
 'computer',
 'expert',
 'that',
 'uses',
 'their',
 'technical',
 'knowledge',
 'to',
 'overcome',
 'a',
 'problem',
 '.',
 'While',
 '``',
 'hacker',
 "''",
 'can',
 'refer',
 'to',
 'any',
 'skilled',
 'computer',
 'programmer',
 ',',
 'the',
 'term',
 'has',
 'become',
 'associated',
 'in',
 'popular',
 'culture',
 'with',
 'a',
 '``',
 'security',
 'hacker',
 "''",
 ',',
 'someone',
 'who',
 ',',
 'with',
 'their',
 'technical',
 'knowledge',
 ',',
 'uses',
 'bugs',
 'or',
 'exploits',
 'to',
 'break',
 'into',
 'computer',
 'systems',
 '.']

In [8]:
# Checking the type and number of tokens
type(hacker_string_tokens), len(hacker_string_tokens)

(list, 63)

In [9]:
# Frequency of tokens
from nltk.probability import FreqDist
fdist = FreqDist()

In [10]:
for i in hacker_string_tokens:
    fdist[i] = fdist[i] + 1

fdist

FreqDist({'computer': 4, ',': 4, 'hacker': 3, 'to': 3, 'any': 2, 'skilled': 2, 'uses': 2, 'their': 2, 'technical': 2, 'knowledge': 2, ...})

In [11]:
# Ten most common tokens
top_10 = fdist.most_common(10)
top_10

[('computer', 4),
 (',', 4),
 ('hacker', 3),
 ('to', 3),
 ('any', 2),
 ('skilled', 2),
 ('uses', 2),
 ('their', 2),
 ('technical', 2),
 ('knowledge', 2)]

In [12]:
# Bigrams, ngram
black_smoke = "Did you know there was a tower, Where they look out to the land, To see the people quickly passing by"

In [13]:
black_smoke_tokens = word_tokenize(black_smoke)
black_smoke_tokens

['Did',
 'you',
 'know',
 'there',
 'was',
 'a',
 'tower',
 ',',
 'Where',
 'they',
 'look',
 'out',
 'to',
 'the',
 'land',
 ',',
 'To',
 'see',
 'the',
 'people',
 'quickly',
 'passing',
 'by']

In [15]:
bigrams_list = list(nltk.bigrams(black_smoke_tokens))
bigrams_list

[('Did', 'you'),
 ('you', 'know'),
 ('know', 'there'),
 ('there', 'was'),
 ('was', 'a'),
 ('a', 'tower'),
 ('tower', ','),
 (',', 'Where'),
 ('Where', 'they'),
 ('they', 'look'),
 ('look', 'out'),
 ('out', 'to'),
 ('to', 'the'),
 ('the', 'land'),
 ('land', ','),
 (',', 'To'),
 ('To', 'see'),
 ('see', 'the'),
 ('the', 'people'),
 ('people', 'quickly'),
 ('quickly', 'passing'),
 ('passing', 'by')]

In [16]:
trigrams_list = list(nltk.trigrams(black_smoke_tokens))
trigrams_list

[('Did', 'you', 'know'),
 ('you', 'know', 'there'),
 ('know', 'there', 'was'),
 ('there', 'was', 'a'),
 ('was', 'a', 'tower'),
 ('a', 'tower', ','),
 ('tower', ',', 'Where'),
 (',', 'Where', 'they'),
 ('Where', 'they', 'look'),
 ('they', 'look', 'out'),
 ('look', 'out', 'to'),
 ('out', 'to', 'the'),
 ('to', 'the', 'land'),
 ('the', 'land', ','),
 ('land', ',', 'To'),
 (',', 'To', 'see'),
 ('To', 'see', 'the'),
 ('see', 'the', 'people'),
 ('the', 'people', 'quickly'),
 ('people', 'quickly', 'passing'),
 ('quickly', 'passing', 'by')]

In [17]:
ngrams_list = list(nltk.ngrams(black_smoke_tokens, 4))
ngrams_list

[('Did', 'you', 'know', 'there'),
 ('you', 'know', 'there', 'was'),
 ('know', 'there', 'was', 'a'),
 ('there', 'was', 'a', 'tower'),
 ('was', 'a', 'tower', ','),
 ('a', 'tower', ',', 'Where'),
 ('tower', ',', 'Where', 'they'),
 (',', 'Where', 'they', 'look'),
 ('Where', 'they', 'look', 'out'),
 ('they', 'look', 'out', 'to'),
 ('look', 'out', 'to', 'the'),
 ('out', 'to', 'the', 'land'),
 ('to', 'the', 'land', ','),
 ('the', 'land', ',', 'To'),
 ('land', ',', 'To', 'see'),
 (',', 'To', 'see', 'the'),
 ('To', 'see', 'the', 'people'),
 ('see', 'the', 'people', 'quickly'),
 ('the', 'people', 'quickly', 'passing'),
 ('people', 'quickly', 'passing', 'by')]

In [21]:
# Stemming
from nltk.stem import PorterStemmer
pst = PorterStemmer()

In [22]:
pst.stem("winning"), pst.stem("studies"), pst.stem("buying")

('win', 'studi', 'buy')

In [24]:
# Lemmatization
from nltk.stem import wordnet
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [25]:
words_to_stem = ["cats", "cacti", "geese", "dog", "Buy"]

In [26]:
for i in words_to_stem:
    print(i + " : " + lemmatizer.lemmatize(i))

cats : cat
cacti : cactus
geese : goose
dog : dog
Buy : Buy


In [27]:
# PoS
peace = "What do you mean, 'I don't believe in God'? I talk to him everyday."

In [28]:
peace_tokens = word_tokenize(peace)

In [29]:
for i in peace_tokens:
    print(nltk.pos_tag([i]))

[('What', 'WP')]
[('do', 'VB')]
[('you', 'PRP')]
[('mean', 'NN')]
[(',', ',')]
[("'", "''")]
[('I', 'PRP')]
[('do', 'VB')]
[("n't", 'RB')]
[('believe', 'VB')]
[('in', 'IN')]
[('God', 'NNP')]
[("'", "''")]
[('?', '.')]
[('I', 'PRP')]
[('talk', 'NN')]
[('to', 'TO')]
[('him', 'PRP')]
[('everyday', 'NN')]
[('.', '.')]


In [30]:
mary = "Mary had a little lamb, whom she really loved"

In [31]:
mary_tokens = word_tokenize(mary)

In [33]:
for i in mary_tokens:
    print(nltk.pos_tag([i]))

[('Mary', 'NNP')]
[('had', 'VBD')]
[('a', 'DT')]
[('little', 'JJ')]
[('lamb', 'NN')]
[(',', ',')]
[('whom', 'WP')]
[('she', 'PRP')]
[('really', 'RB')]
[('loved', 'VBN')]


In [None]:
# Names Entity Recognition
