<a href="https://colab.research.google.com/github/aditi1511/python-beginner-projects/blob/master/NLP(nltk).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import nltk
def do_downloads():
  """Download the NLTK data packages listed below, in order."""
  packages = (
    'punkt',                       # sentence tokenizer
    'averaged_perceptron_tagger',  # pos tagger
    'maxent_ne_chunker',           # NE tagger
    'words',
    'stopwords',
    'vader_lexicon',
    'wordnet',
  )
  for package in packages:
    nltk.download(package)

In [0]:
def nltk_tokenize_demo(text):
  """Print the word tokens of *text*, one list per sentence."""
  sentences = nltk.sent_tokenize(text)
  for one_sentence in sentences:
    print(nltk.word_tokenize(one_sentence))
    
# Two-sentence sample input; uncomment the call below to run the tokenizer demo on it.
demo = "This is a simple sentence. Followed by another!"
#nltk_tokenize_demo(demo)

In [0]:
def nltk_pos_demo(text):
  """Print every item returned by the POS tagger for *text*, one per line."""
  for sentence in nltk.sent_tokenize(text):
    words = nltk.word_tokenize(sentence)
    for pair in nltk.pos_tag(words):
      print(pair)

# Two-sentence sample input; uncomment the call below to run the POS-tagging demo on it.
demo = "This is a simple sentence. Followed by another!"
#nltk_pos_demo(demo)

In [0]:
def nltk_ne_demo(text):
  """Print the named-entity chunker output for *text*, one chunk per line."""
  for sentence in nltk.sent_tokenize(text):
    tagged_words = nltk.pos_tag(nltk.word_tokenize(sentence))
    chunked = nltk.ne_chunk(tagged_words)
    for node in chunked:
      print(node)

# Headline-style sample input; uncomment the call below to run the named-entity demo on it.
demo = 'San Francisco considers banning sidewalk delivery robots'
#nltk_ne_demo(demo)


In [0]:
# s1 is the plain headline; s2 extends it with the name "Aunt Polly" and a dollar amount.
s1 = 'San Francisco considers banning sidewalk delivery robots'
s2 = 'In San Francisco, Aunt Polly considers paying sidewalk delivery robots $20.00.'
#nltk_ne_demo(s2)

In [0]:
def nltk_find_people_demo(text):
  """Print every chunk of *text* that the NE chunker labels PERSON.

  Each match is printed on its own line as its tokens joined by single spaces.
  """
  for sentence in nltk.sent_tokenize(text):
    words = nltk.word_tokenize(sentence)
    for node in nltk.ne_chunk(nltk.pos_tag(words)):
      # Only nodes exposing a label can be named entities; skip the rest.
      if not hasattr(node, 'label'):
        continue
      if node.label() != 'PERSON':
        continue
      print(' '.join(leaf[0] for leaf in node))

# Sample sentence containing the name "Aunt Polly"; uncomment the call below to run the people finder on it.
s3 = 'In San Francisco, Aunt Polly considers paying sidewalk delivery robots $20.00.'
#nltk_find_people_demo(s3)

In [0]:
#nltk.download('stopwords') 
from nltk.corpus import stopwords

def nltk_stop_word_demo():
  """Print the English stop-word list from the NLTK corpus, prefixed with "nltk"."""
  print("nltk", stopwords.words('english'))

In [0]:
import collections
from nltk import ngrams

def nltk_ngram_demo(text):
  """Print the ten most common bigrams of *text*.

  The text is lower-cased and split on whitespace before the bigrams are
  formed, so punctuation stays attached to its neighbouring word.
  """
  words = text.lower().split()
  bigram_counts = collections.Counter(ngrams(words, 2))
  top_ten = bigram_counts.most_common(10)
  print(top_ten)

# Sample passage for the bigram demo; uncomment the call below to run it.
text = "We went to a clump of bushes, and Tom made everybody swear to keep the secret, and then showed them a hole in the hill, right in the thickest part of the bushes. "
#nltk_ngram_demo(text)

In [0]:
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

def nltk_sentiment_demo():
  """Print the VADER polarity scores of four sample sentences, one per line."""
  analyzer = SentimentIntensityAnalyzer()
  samples = (
    "INFO 490 is so fun.",  # most positive
    "INFO 490 is so awful.",  # most negative
    # mostly positive, neutral
    "INFO 490 is so fun that I can't wait to take the follow on course!",
    # mostly negative, a little positive too
    "INFO 490 is so awful that I am glad there's not a follow on course!",
  )
  for sample in samples:
    print(analyzer.polarity_scores(sample))

#nltk_sentiment_demo()

In [0]:
def nltk_stem_and_lemm_demo():
  """Print the Porter stem of each sample word, noting where other stemmers differ.

  For every word, the Snowball stem is appended as "(or ...)" when it differs
  from the Porter stem, and the Lancaster stem is appended the same way when
  it differs from both of the others.
  """
  porter = nltk.stem.PorterStemmer()
  snowball = nltk.stem.SnowballStemmer('english')
  lancaster = nltk.stem.lancaster.LancasterStemmer()

  samples = ["game","gaming","gamed","games","gamer","grows","fairly","nonsensical"]

  for sample in samples:
    porter_stem = porter.stem(sample)
    snowball_stem = snowball.stem(sample)
    lancaster_stem = lancaster.stem(sample)

    # Collect only the stems that add information beyond the Porter result.
    alternates = []
    if snowball_stem != porter_stem:
      alternates.append(snowball_stem)
    if lancaster_stem not in (porter_stem, snowball_stem):
      alternates.append(lancaster_stem)
    note = ''.join("(or {})".format(alt) for alt in alternates)

    print("{:11s} stems to {:s} {}".format(sample, porter_stem, note))

In [0]:
#nltk.download('wordnet')

def nltk_wordnet_demo():
  """Print what the WordNet lemmatizer returns for the word 'dogs'."""
  lemmatizer = nltk.stem.WordNetLemmatizer()
  result = lemmatizer.lemmatize('dogs')
  print(result)


In [0]:
import collections
#import nltk
# Remote plain-text file analysed below (Huckleberry Finn, judging by the path — confirm).
HUCK_URL= "https://raw.githubusercontent.com/NSF-EC/INFO490Assets/master/src/datasets/pg/huckfinn/huck.txt"

def read_remote(url, timeout=30):
    """Download *url* with an HTTP GET and return its body as UTF-8 text.

    Args:
        url: Address of the resource to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the caller indefinitely.

    Returns:
        The response body as a ``str``, decoded as UTF-8.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection failures or timeouts.
    """
    import requests
    with requests.get(url, timeout=timeout) as response:
        # Fail loudly instead of handing an error page back as the document.
        response.raise_for_status()
        # Force UTF-8 rather than relying on the encoding requests infers.
        response.encoding = 'utf-8'
        return response.text

def find_characters_nlp(text, topn):
  """Return the *topn* most frequent PERSON entities found in *text*.

  Each sentence is tokenized, POS-tagged and run through the NE chunker;
  every chunk labelled PERSON is counted under its tokens joined by spaces.

  Returns:
    A list of (name, count) tuples as produced by Counter.most_common(topn).
  """
  counts = collections.Counter()
  for sentence in nltk.sent_tokenize(text):
    words = nltk.word_tokenize(sentence)
    for node in nltk.ne_chunk(nltk.pos_tag(words)):
      if hasattr(node, 'label') and node.label() == 'PERSON':
        counts[' '.join(leaf[0] for leaf in node)] += 1
  return counts.most_common(topn)

# Driver: download the text at HUCK_URL (needs network access), then print the
# 50 most frequent PERSON entities as (name, count) pairs.
text = read_remote(HUCK_URL)
topn = find_characters_nlp(text, 50)
print(topn)

[('Jim', 336), ('Tom', 150), ('Huck', 41), ('Tom Sawyer', 37), ('Aunt Sally', 37), ('Buck', 32), ('Mary Jane', 23), ('Sid', 15), ('William', 15), ('Miss Watson', 14), ('Huck Finn', 14), ('Mars Tom', 14), ('Bill', 13), ('Harvey', 13), ('Miss Mary Jane', 13), ('Uncle Silas', 13), ('Peter', 12), ('George', 11), ('Mary', 10), ('Miss', 10), ('Bob', 10), ('Bilgewater', 10), ('Pap', 8), ('Ben Rogers', 7), ('Dey', 7), ('Cairo', 7), ('Susan', 7), ('Peter Wilks', 7), ('Aunt', 6), ('Jackson', 6), ('Le', 6), ('Dat', 6), ('Miss Sophia', 6), ('Boggs', 6), ('Miss Mary', 6), ('Phelps', 6), ('Sally', 6), ('Aunty', 6), ('Hello', 5), ('Doan', 5), ('Sarah', 5), ('Packard', 5), ('Levi Bell', 5), ('Mr. Lothrop', 5), ('Silas', 5), ('Aunt Polly', 4), ('Watson', 4), ('Jo Harper', 4), ('Old', 4), ('Goshen', 4)]
