In [1]:
def find_sentiment(sentence, pos, neg):
    """
    This function returns sentiment of sentence
    :param sentence: sentence, a string
    :param pos: set of positive words
    :param neg: set of negative words
    :retun: returns positive, negative or neutral sentiment
    """
    # "this is a sentence!" becomes:
    # ["this", "is", "a", "sentence!"]
    # note that im splitting on all whitesapces
    # if you want to split by space use .split(" ")
    sentence = sentence.split()
    
    # make sentence into a set
    sentence = set(sentence)

    # check number of common words with positive
    num_common_pos = len(sentence.intersection(pos))
    
    # check number of common words with negative
    num_common_neg = len(sentence.intersection(neg))

    # make contitions and return
    # see how return used eliminates if else
    if num_common_pos > num_common_neg:
        return "pos"
    if num_common_pos < num_common_neg:
        return "neg"
    return "neutral"

In [6]:
# tokenization: splitting a string into a list of words 
# one of the most popular is NLTK (Natural Langauage Took Kit)
from nltk.tokenize import word_tokenize

In [7]:
sentence = "hi, how are you?"


In [8]:
sentence.split()

['hi,', 'how', 'are', 'you?']

In [9]:
word_tokenize(sentence)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/insomni_.ak/nltk_data'
    - '/Users/insomni_.ak/anaconda3/nltk_data'
    - '/Users/insomni_.ak/anaconda3/share/nltk_data'
    - '/Users/insomni_.ak/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


In [10]:
# bag of words

from sklearn.feature_extraction.text import CountVectorizer

# create a corpus of sentence
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works",
    "YES!!!!"
]

# initialize CountVectorizer
ctv = CountVectorizer()

# fit the vectorizer on corpus
ctv.fit(corpus)

corpus_transformed = ctv.transform(corpus)

In [11]:
print(corpus_transformed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [12]:
print(ctv.vocabulary_)

{'hello': 9, 'how': 11, 'are': 2, 'you': 22, 'im': 13, 'getting': 8, 'bored': 4, 'at': 3, 'home': 10, 'and': 1, 'what': 19, 'do': 7, 'think': 17, 'did': 6, 'know': 14, 'about': 0, 'counts': 5, 'let': 15, 'see': 16, 'if': 12, 'this': 18, 'works': 20, 'yes': 21}


In [18]:
# Let's integrate word_tokenize from scikit-learn
# in CountVectorizer and see what happens.
%%capture corpus_transformed
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize

# create a corpus of sentences
corpus = [
    "hello, how are you?",
    "im getting bored at home. And you? What do you think?",
    "did you know about counts",
    "let's see if this works",
    "YES!!!!"
]

# initialize CountVectorizer with word_tokenize from nltk
# as the tokenizer
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None)

# fit the vectorizer on corpus
ctv.fit(corpus)

corpus_transformed = ctv.transform(corpus)
print(ctv.vocabulary_)

LookupError: 
**********************************************************************
  Resource [93mpunkt[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('punkt')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mtokenizers/punkt/PY3/english.pickle[0m

  Searched in:
    - '/Users/insomni_.ak/nltk_data'
    - '/Users/insomni_.ak/anaconda3/nltk_data'
    - '/Users/insomni_.ak/anaconda3/share/nltk_data'
    - '/Users/insomni_.ak/anaconda3/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
    - ''
**********************************************************************


NameError: name 'captured_output' is not defined