# some of the common NLP tasks reference notebook
__(using NLTK)__


## Text transformation tasks

In [1]:
with open("sample_text.txt") as file:
    raw_text = file.read()

In [14]:
from nltk.tokenize import word_tokenize, sent_tokenize

some_text = "hello world! Its Aman Singh from India. This is a reference notebook for some of the common Natural Language Processing (NLP) tasks."
# word tokenization
tokenized_words = word_tokenize(some_text)
print("Text:\n", some_text)
print("\ntokenized words:")
print(tokenized_words)

# sentence tokenization
tokenized_sents = sent_tokenize(some_text)
print("\ntokenized sentences:")
print(tokenized_sents)

# sentence/word tokenization with other laguages
french_text = "Ceci est 1 première phrase. Puis j'en écris une seconde. pour finir en voilà une troisième sans mettre de majuscule"
tokenized_sents = sent_tokenize(french_text, language = 'french')
print("\nFrench Text:\n", french_text)
print("\ntokenized french sentences:")
print(tokenized_sents)

tokenized_words = word_tokenize(french_text, language = 'french')
print("\ntokenized french words:")
print(tokenized_words)

Text:
 hello world! Its Aman Singh from India. This is a reference notebook for some of the common Natural Language Processing (NLP) tasks.

tokenized words:
['hello', 'world', '!', 'Its', 'Aman', 'Singh', 'from', 'India', '.', 'This', 'is', 'a', 'reference', 'notebook', 'for', 'some', 'of', 'the', 'common', 'Natural', 'Language', 'Processing', '(', 'NLP', ')', 'tasks', '.']

tokenized sentences:
['hello world!', 'Its Aman Singh from India.', 'This is a reference notebook for some of the common Natural Language Processing (NLP) tasks.']

French Text:
 Ceci est 1 première phrase. Puis j'en écris une seconde. pour finir en voilà une troisième sans mettre de majuscule

tokenized french sentences:
['Ceci est 1 première phrase.', "Puis j'en écris une seconde.", 'pour finir en voilà une troisième sans mettre de majuscule']

tokenized french words:
['Ceci', 'est', '1', 'première', 'phrase', '.', 'Puis', "j'en", 'écris', 'une', 'seconde', '.', 'pour', 'finir', 'en', 'voilà', 'une', 'troisième'

In [9]:
# nltk's wordnet package includes groups of synonyms, antonyms and also a brief definition for each
# here only examplifying synonyms and antonyms
from nltk.corpus import wordnet

synonyms, antonyms = [], []
for syn in wordnet.synsets("big"):
    for lemma in syn.lemmas():
        synonyms.append(lemma.name())
        if lemma.antonyms():
            antonyms.append(lemma.antonyms()[0].name())
print("SYNONYMS:\n",synonyms)
print("\nANTONYMS:\n",antonyms)

SYNONYMS:
 ['large', 'big', 'big', 'bad', 'big', 'big', 'big', 'large', 'prominent', 'big', 'heavy', 'boastful', 'braggart', 'bragging', 'braggy', 'big', 'cock-a-hoop', 'crowing', 'self-aggrandizing', 'self-aggrandising', 'big', 'swelled', 'vainglorious', 'adult', 'big', 'full-grown', 'fully_grown', 'grown', 'grownup', 'big', 'big', 'large', 'magnanimous', 'big', 'bighearted', 'bounteous', 'bountiful', 'freehanded', 'handsome', 'giving', 'liberal', 'openhanded', 'big', 'enceinte', 'expectant', 'gravid', 'great', 'large', 'heavy', 'with_child', 'big', 'boastfully', 'vauntingly', 'big', 'large', 'big', 'big']

ANTONYMS:
 ['small', 'little', 'small']


In [8]:
# stemming vs lemmatization
from nltk.stem import PorterStemmer, WordNetLemmatizer

# stemming
stemmer = PorterStemmer()
print("stemming:")
print(stemmer.stem("increases"))

# lemmatization
lemmatizer = WordNetLemmatizer()
print("\nlemmatization:")
print(lemmatizer.lemmatize("increases"))

stemming:
increas

lemmatization:
increase


In [13]:
# generating n-grams from tokens
from nltk.tokenize import word_tokenize
from nltk.util import ngrams

text = "In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sample of text or speech."
word_tokens = word_tokenize(text)
bigrams = ngrams(word_tokens, 2)
trigrams = ngrams(word_tokens, 3)
print("Text:\n", text)
print("\nbi-grams:\n", list(bigrams))
print("\ntri-grams:\n", list(trigrams))

Text:
 In the fields of computational linguistics and probability, an n-gram is a contiguous sequence of n items from a given sample of text or speech.

bi-grams:
 [('In', 'the'), ('the', 'fields'), ('fields', 'of'), ('of', 'computational'), ('computational', 'linguistics'), ('linguistics', 'and'), ('and', 'probability'), ('probability', ','), (',', 'an'), ('an', 'n-gram'), ('n-gram', 'is'), ('is', 'a'), ('a', 'contiguous'), ('contiguous', 'sequence'), ('sequence', 'of'), ('of', 'n'), ('n', 'items'), ('items', 'from'), ('from', 'a'), ('a', 'given'), ('given', 'sample'), ('sample', 'of'), ('of', 'text'), ('text', 'or'), ('or', 'speech'), ('speech', '.')]

tri-grams:
 [('In', 'the', 'fields'), ('the', 'fields', 'of'), ('fields', 'of', 'computational'), ('of', 'computational', 'linguistics'), ('computational', 'linguistics', 'and'), ('linguistics', 'and', 'probability'), ('and', 'probability', ','), ('probability', ',', 'an'), (',', 'an', 'n-gram'), ('an', 'n-gram', 'is'), ('n-gram', 'is'

## Information extraction tasks

In [15]:
# Part-Of-Speech (POS) tagging
from nltk.tag import pos_tag

text = "I like to go to the park with my dog"

tokens = word_tokenize(text)
tags = pos_tag(tokens)
print("POS Tags:\n", tags)

POS Tags:
 [('I', 'PRP'), ('like', 'VBP'), ('to', 'TO'), ('go', 'VB'), ('to', 'TO'), ('the', 'DT'), ('park', 'NN'), ('with', 'IN'), ('my', 'PRP$'), ('dog', 'NN')]


#### POS tags References:
---------------------------------

    CC  | Coordinating conjunction |
    CD  | Cardinal number |
    DT  | Determiner |
    EX  | Existential there |
    FW  | Foreign word |
    IN  | Preposition or subordinating conjunction |
    JJ  | Adjective |
    JJR | Adjective, comparative |
    JJS | Adjective, superlative |
    LS  | List item marker |
    MD  | Modal |
    NN  | Noun, singular or mass |
    NNS | Noun, plural |
    NNP | Proper noun, singular |
    NNPS| Proper noun, plural |
    PDT | Predeterminer |
    POS | Possessive ending |
    PRP | Personal pronoun |
    PRP$| Possessive pronoun |
    RB  | Adverb |
    RBR | Adverb, comparative |
    RBS | Adverb, superlative |
    RP  | Particle |
    SYM | Symbol |
    TO  | to |
    UH  | Interjection |
    VB  | Verb, base form |
    VBD | Verb, past tense |
    VBG | Verb, gerund or present participle |
    VBN | Verb, past participle |
    VBP | Verb, non-3rd person singular present |
    VBZ | Verb, 3rd person singular present |
    WDT | Wh-determiner |
    WP  | Wh-pronoun |
    WP$ | Possessive wh-pronoun |
    WRB | Wh-adverb |

In [16]:
# Named-Entity-Recognition (NER)
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

text = "Mark Elliot Zuckerberg (born May 14, 1984) is a co-founder of Facebook."

# tokenize sentence into words
tokens = word_tokenize(text)

# POS tagging of the tokens
tags = pos_tag(tokens)
 
# using the ner function
ner = ne_chunk(tags)

print("Named-Entity-Recognition:\n", ner)

Named-Entity-Recognition:
 (S
  (PERSON Mark/NNP)
  (PERSON Elliot/NNP Zuckerberg/NNP)
  (/(
  born/VBN
  May/NNP
  14/CD
  ,/,
  1984/CD
  )/)
  is/VBZ
  a/DT
  co-founder/NN
  of/IN
  (GPE Facebook/NNP)
  ./.)


In [18]:
#todo: make an abstractive summarizer like smmry.com
#PAPER: https://arxiv.org/pdf/1602.06023.pdf