In [2]:
# DefaultTagger is the simplest possible baseline: it assigns one fixed tag
# (here 'NN') to every token it is given.
import nltk
from nltk.tag import DefaultTagger
tagger = DefaultTagger('NN')
# Input is an already-tokenised sentence; the result is a list of (word, tag) pairs.
tagger.tag(['Hello', 'World'])

[('Hello', 'NN'), ('World', 'NN')]

In [3]:
# tag_sents() tags several pre-tokenised sentences in one call and returns one
# tagged list per sentence. Punctuation is labelled 'NN' as well, because this
# tagger applies the same tag to everything.
tagger.tag_sents([['Hello', 'world','.'], ['How', 'are', 'you', '?']])

[[('Hello', 'NN'), ('world', 'NN'), ('.', 'NN')],
 [('How', 'NN'), ('are', 'NN'), ('you', 'NN'), ('?', 'NN')]]

In [4]:
# Tokenise first, then tag: nltk.pos_tag() takes a list of tokens and returns
# (word, tag) pairs using Penn Treebank tags (CC, RB, PRP, ...).
text = nltk.word_tokenize("And now we are in the second module")
nltk.pos_tag(text)

[('And', 'CC'),
 ('now', 'RB'),
 ('we', 'PRP'),
 ('are', 'VBP'),
 ('in', 'IN'),
 ('the', 'DT'),
 ('second', 'JJ'),
 ('module', 'NN')]

In [5]:
# Acronyms are tagged inconsistently: in the output below 'NLTK' comes back as
# NN while 'NLP' comes back as NNP.
text = nltk.word_tokenize("NLTK from NLP")
nltk.pos_tag(text)

[('NLTK', 'NN'), ('from', 'IN'), ('NLP', 'NNP')]

In [6]:
# Tag glossary (CC = coordinating conjunction, NN = noun, ...): called with no
# argument, upenn_tagset() prints the description and example words for every
# tag. The 'tagsets' package must be downloaded first.
nltk.download('tagsets')
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\arnav\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


In [7]:
# Pass a tag name to print only that entry of the glossary.
nltk.help.upenn_tagset('CC')

CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet


In [8]:
# Tag a full sentence with NLTK. `tokens` is reused by a later cell.
from nltk import pos_tag
sent = "Neha is a pretty girl who studies in TISS Chembur."
tokens = nltk.word_tokenize(sent)
pos = pos_tag(tokens)
# Output is a list of (word, tag) tuples. Note that 'studies' is tagged NNS
# (plural noun) here although it is used as a verb in this sentence.
print(pos)

[('Neha', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('pretty', 'JJ'), ('girl', 'NN'), ('who', 'WP'), ('studies', 'NNS'), ('in', 'IN'), ('TISS', 'NNP'), ('Chembur', 'NNP'), ('.', '.')]


In [10]:
# Same sentence as the NLTK example, tagged with spaCy's small English pipeline.
import spacy
nlp = spacy.load("en_core_web_sm")
text = "Neha is a pretty girl who studies in TISS Chembur."
doc = nlp(text)  # `doc` is reused by the displacy cell further down

print("Original Text: ", text)
print("PoS tagging result: ")
for token in doc:
    # token.tag_ is the fine-grained tag (NNP, VBZ, ...); token.pos_ would give
    # the coarse-grained part of speech instead.
    # {...:{10}} pads each column to a width of 10 characters, and
    # spacy.explain() turns the tag into a human-readable description.
    print(f"{token.text:{10}}\t{token.tag_:{10}}\t{spacy.explain(token.tag_)}")

Original Text:  Neha is a pretty girl who studies in TISS Chembur.
PoS tagging result: 
Neha      	NNP       	noun, proper singular
is        	VBZ       	verb, 3rd person singular present
a         	DT        	determiner
pretty    	JJ        	adjective (English), other noun-modifier (Chinese)
girl      	NN        	noun, singular or mass
who       	WP        	wh-pronoun, personal
studies   	VBZ       	verb, 3rd person singular present
in        	IN        	conjunction, subordinating or preposition
TISS      	NNP       	noun, proper singular
Chembur   	NNP       	noun, proper singular
.         	.         	punctuation mark, sentence closer


In [12]:
import string

text = "this is an exemplar text for showing cleaning etc. for text."

words = nltk.word_tokenize(text)
stopwords = nltk.corpus.stopwords.words("english")

# Normalisation = bringing tokens to a uniform case; standardisation = removing
# punctuation etc.
# The token is lower-cased *before* the stop-word test: NLTK's English stop-word
# list is lowercase, so testing the raw token would let capitalised stop words
# such as "This" or "For" slip through. The length filter (> 2) also drops
# punctuation and very short tokens.
cleaned = [
    word.lower()
    for word in words
    if word.lower() not in stopwords and len(word) > 2
]

# Tag what is left after cleaning.
tagged = nltk.pos_tag(cleaned)
print(tagged)

[('exemplar', 'JJ'), ('text', 'NN'), ('showing', 'VBG'), ('cleaning', 'VBG'), ('etc', 'JJ'), ('text', 'NN')]


In [14]:
# The Brown corpus ships pre-tagged. The output shows its own tag labels
# (e.g. 'AT', 'NP-TL') rather than the Penn Treebank ones used above.
nltk.corpus.brown.tagged_words(categories = 'news')

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [15]:
import nltk   # needed by the nltk.* calls used throughout the notebook
from nltk.corpus import brown
# List the categories into which the Brown corpus is divided.
brown.categories()

['adventure',
 'belles_lettres',
 'editorial',
 'fiction',
 'government',
 'hobbies',
 'humor',
 'learned',
 'lore',
 'mystery',
 'news',
 'religion',
 'reviews',
 'romance',
 'science_fiction']

In [17]:
# Tagger model used by nltk.pos_tag(); download() returns True on success.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\arnav\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [18]:
# `tokens` comes from the earlier tokenising cell. Tags are Penn Treebank by
# default; pass tagset='universal' to get the coarse universal tagset instead.
pos_tags = nltk.pos_tag(tokens)

In [19]:
# Tokenizer models used by nltk.word_tokenize().
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arnav\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [20]:
# Named-entity chunker model used by nltk.ne_chunk().
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\arnav\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!


True

In [21]:
# Named-entity recognition pipeline: tokenise -> PoS-tag -> chunk.
sample_text = "F. Henly was born in San Francisco and works at Microsoft."
tokens = nltk.word_tokenize(sample_text)
tagged_tokens = nltk.pos_tag(tokens)


# ne_chunk() takes the PoS-tagged tokens and returns a tree in which named
# entities are grouped into labelled subtrees (PERSON, GPE, ORGANIZATION below).
entities  = nltk.ne_chunk(tagged_tokens)
print(entities)
# Chunking = extracting information from text: names, locations, organisations, etc.
# Note that 'F.' is left outside the PERSON chunk in the output below.

(S
  F./NNP
  (PERSON Henly/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE San/NNP Francisco/NNP)
  and/CC
  works/VBZ
  at/IN
  (ORGANIZATION Microsoft/NNP)
  ./.)


In [23]:
# Draws the chunk tree in a separate window rather than inline - look for it in the taskbar.
entities.draw()

In [24]:
# Side demo: print() vs pprint.pprint() on a nested structure.
import pprint

# Nested sample data: a dict holding a nested dict and a list of dicts.
data = {
    "user": {
        "name": "Alice",
        "age": 30,
        "details": {
            "city": "New York",
            "occupation": "Engineer",
            "hobbies": ["reading", "hiking", "coding"]
        }
    },
    "products": [
        {"id": 1, "name": "Laptop", "price": 1200},
        {"id": 2, "name": "Mouse", "price": 25},
        {"id": 3, "name": "Keyboard", "price": 75}
    ]
}

# print() emits the whole dict on a single line, keys in insertion order.
print("Using print():")
print(data)

# pprint() wraps and indents the nested levels; as the output shows, it also
# lists keys alphabetically ('products' before 'user').
print("\nUsing pprint():")
pprint.pprint(data)

Using print():
{'user': {'name': 'Alice', 'age': 30, 'details': {'city': 'New York', 'occupation': 'Engineer', 'hobbies': ['reading', 'hiking', 'coding']}}, 'products': [{'id': 1, 'name': 'Laptop', 'price': 1200}, {'id': 2, 'name': 'Mouse', 'price': 25}, {'id': 3, 'name': 'Keyboard', 'price': 75}]}

Using pprint():
{'products': [{'id': 1, 'name': 'Laptop', 'price': 1200},
              {'id': 2, 'name': 'Mouse', 'price': 25},
              {'id': 3, 'name': 'Keyboard', 'price': 75}],
 'user': {'age': 30,
          'details': {'city': 'New York',
                      'hobbies': ['reading', 'hiking', 'coding'],
                      'occupation': 'Engineer'},
          'name': 'Alice'}}


In [25]:
from spacy import displacy
# `doc` is the spaCy Doc built in the PoS-tagging cell above; style="ent"
# selects the named-entity visualisation.
displacy.render(doc, style = "ent")

In [None]:
#speech to text text to speech
#speech format - NER

#### Rule-based tagging - uses a set of predefined rules to assign grammatical tags to words, with rules often relying on context from surrounding words

This approach uses contextual information to assign tags to unknown or ambiguous words

Eg: if a word is preceded by a determiner and followed by a noun, it is most likely an adjective

eg. - A nice dress - here nice is the adjective

In [None]:
import nltk

def rule_based_pos_tag(sentence):
    """Tag each token of ``sentence`` using three hand-written rules.

    Rules are tried in order and the first match wins:
      1. token ends with "ing"         -> "VBG" (verb, -ing form)
      2. token is "the", "a" or "an"   -> "DT"  (determiner, case-insensitive)
      3. anything else                 -> "NN"  (default: noun)

    Args:
        sentence: Raw text; it is split into tokens with ``nltk.word_tokenize``.

    Returns:
        A list of ``(token, tag)`` tuples in sentence order.
    """
    tagged_words = []
    for word in nltk.word_tokenize(sentence):
        if word.endswith("ing"):
            tagged_words.append((word, "VBG"))  # Verb (-ing form)
        elif word.lower() in ["the", "a", "an"]:
            tagged_words.append((word, "DT"))  # Determiner
        else:
            tagged_words.append((word, "NN"))  # Default - Noun
    return tagged_words



#### Stochastic Tagging

Uses probabilistic and statistical information from the corpus of labeled text to assign a POS tag to each word in a sentence

Word frequency approach - each word is assigned the tag it carries most frequently in the training corpus.

eg. - play cricket vs watch a play - Verb vs Noun

It compares how often the word occurs with each tag in the corpus and assigns the more frequent tag

#### Tagged Sequence

It follows the n-gram approach to predict the tag of the next word, or the likelihood of a sequence of words/tokens

n-gram - uses the tags of the preceding words to estimate the probability of the next word's tag
W1 W2 W3 -- W4?
Here the context is the POS tags of the previous 3 words (strictly, an n-gram model conditions on the previous n-1 items, so this example is a 4-gram)

Suppose - W1 = N, W2 = V, W3 = Deter => NVD

Then in the corpus, it searches for this NVD sequence.

Eg. 10 times, it is a Noun, 90 times it (W4) is a Verb => Assigns it as a verb
##### Bayes Theorem is applied here

#### Laplacian Smoothing - adapting our corpus to incorporate new words (with P() = 0)

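$$P_{\text{Laplace}}(w) = \frac{C(w) + 1}{N + V}$$

where $C(w)$ is the number of times $w$ occurs in the corpus, $N$ is the total number of word tokens and $V$ is the vocabulary size.

Basically, add 1 to the count of every word (including unseen words, whose count is 0) so that no probability is 0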

In [1]:
from nltk import ngrams

text = "I love cats. Cats are my favorite animal. I have two cats."
n = 2

# Split on whitespace and materialise the 2-grams straight into a list of
# tuples. Punctuation stays attached to the words (e.g. 'cats.') because no
# tokenizer is involved here.
n_grams = list(ngrams(text.split(), n))
print(n_grams)

[('I', 'love'), ('love', 'cats.'), ('cats.', 'Cats'), ('Cats', 'are'), ('are', 'my'), ('my', 'favorite'), ('favorite', 'animal.'), ('animal.', 'I'), ('I', 'have'), ('have', 'two'), ('two', 'cats.')]


In [3]:
import nltk
# `text` is the sentence string from the previous cell. word_tokenize() splits
# punctuation into separate tokens, so unlike the whitespace split above,
# '.' shows up as its own element in the bigrams.
Tokens = nltk.word_tokenize(text)
output = list(nltk.bigrams(Tokens))
print(output)

[('I', 'love'), ('love', 'cats'), ('cats', '.'), ('.', 'Cats'), ('Cats', 'are'), ('are', 'my'), ('my', 'favorite'), ('favorite', 'animal'), ('animal', '.'), ('.', 'I'), ('I', 'have'), ('have', 'two'), ('two', 'cats'), ('cats', '.')]


In [4]:
import nltk
# Same tokens as the bigram cell (`Tokens`), now grouped three at a time.
output = list(nltk.trigrams(Tokens))
print(output)

[('I', 'love', 'cats'), ('love', 'cats', '.'), ('cats', '.', 'Cats'), ('.', 'Cats', 'are'), ('Cats', 'are', 'my'), ('are', 'my', 'favorite'), ('my', 'favorite', 'animal'), ('favorite', 'animal', '.'), ('animal', '.', 'I'), ('.', 'I', 'have'), ('I', 'have', 'two'), ('have', 'two', 'cats'), ('two', 'cats', '.')]


#### GENERAL SYNTAX FUNCTION

In [5]:
def generate_ngrams(text, wordstocombine):
    """Return every run of ``wordstocombine`` consecutive words in ``text``.

    ``text`` is split on whitespace and each n-gram is returned as a list of
    words, in order of appearance. If the text has fewer words than
    ``wordstocombine`` the result is an empty list.
    """
    tokens = text.split()
    # Number of positions at which a full window still fits.
    window_count = len(tokens) - wordstocombine + 1
    return [tokens[start:start + wordstocombine] for start in range(window_count)]

generate_ngrams("This is a good book to study", 3)

[['This', 'is', 'a'],
 ['is', 'a', 'good'],
 ['a', 'good', 'book'],
 ['good', 'book', 'to'],
 ['book', 'to', 'study']]

In [7]:
from nltk.corpus import webtext, stopwords
from nltk import bigrams, trigrams, ngrams # ngrams(seq, n) handles any n; bigrams/trigrams are imported for demonstration
from nltk.probability import FreqDist

# List the files that make up the webtext corpus.
fields = webtext.fileids()
print(fields)

['firefox.txt', 'grail.txt', 'overheard.txt', 'pirates.txt', 'singles.txt', 'wine.txt']


In [8]:
# Lower-cased tokens of the pirates.txt file.
# NOTE: this rebinds `text` (a str in earlier cells) to a list of tokens.
text = [w.lower() for w in webtext.words('pirates.txt')]

In [9]:
stop_words = set(stopwords.words("english"))
filtered_word = []
for w in text:
    # Skip stop words and short tokens (3 characters or fewer); the length
    # cut-off is optional and can be tuned to the requirements.
    if w in stop_words or len(w) <= 3:
        continue
    filtered_word.append(w)

In [10]:
# Count how often each pair of adjacent filtered words occurs and show the
# 20 most frequent pairs.
A = bigrams(filtered_word)
fdist = FreqDist(A)
fdist.most_common(20)

[(('jack', 'sparrow'), 222),
 (('elizabeth', 'swann'), 93),
 (('davy', 'jones'), 58),
 (('black', 'pearl'), 37),
 (('flying', 'dutchman'), 37),
 (('lord', 'cutler'), 35),
 (('cutler', 'beckett'), 35),
 (('pintel', 'ragetti'), 24),
 (('turner', 'jack'), 14),
 (('cannibal', 'island'), 14),
 (('sparrow', 'jack'), 13),
 (('jack', 'jack'), 13),
 (('scene', 'cannibal'), 13),
 (('port', 'royal'), 11),
 (('captain', 'jack'), 10),
 (('bamboo', 'pole'), 9),
 (('scene', 'black'), 8),
 (('cotton', 'parrot'), 8),
 (('gibbs', 'jack'), 8),
 (('sparrow', 'turner'), 8)]

In [11]:
# Same frequency count for 5-grams. Note that this rebinds `fdist` from the
# bigram cell above.
B = ngrams(filtered_word, 5)
fdist = FreqDist(B)
fdist.most_common(20)

[(('jack', 'sparrow', 'sent', 'settle', 'debt'), 3),
 (('lord', 'cutler', 'beckett', 'east', 'india'), 2),
 (('cutler', 'beckett', 'east', 'india', 'trading'), 2),
 (('beckett', 'east', 'india', 'trading', 'company'), 2),
 (('turner', 'jack', 'jack', 'sparrow', 'turner'), 2),
 (('boom', 'shoo', 'boom', 'boom', 'shoo'), 2),
 (('shoo', 'boom', 'boom', 'shoo', 'boom'), 2),
 (('scene', 'cannibal', 'island', 'throne', 'cannibal'), 2),
 (('cannibal', 'island', 'throne', 'cannibal', 'drums'), 2),
 (('island', 'throne', 'cannibal', 'drums', 'pounding'), 2),
 (('throne', 'cannibal', 'drums', 'pounding', 'jack'), 2),
 (('jack', 'sparrow', 'mister', 'gibbs', 'gibbs'), 2),
 (('sparrow', 'mister', 'gibbs', 'gibbs', 'jack'), 2),
 (('mister', 'gibbs', 'gibbs', 'jack', 'sparrow'), 2),
 (('purpose', 'turner', 'jack', 'sparrow', 'sent'), 2),
 (('turner', 'jack', 'sparrow', 'sent', 'settle'), 2),
 (('sparrow', 'sent', 'settle', 'debt', 'davy'), 2),
 (('sent', 'settle', 'debt', 'davy', 'jones'), 2),
 (('s