In [2]:
def find_sentiment(sentence, pos, neg):
    '''
    Label a sentence by comparing how many of its distinct words occur
    in the positive word list versus the negative word list.
    :param sentence: str, split on whitespace (matching is case-sensitive)
    :param pos: list of positive words
    :param neg: list of negative words
    :return: 'positive', 'negative' or 'neutral' (neutral on a tie)
    '''

    # distinct whitespace-separated tokens; repeated words count only once
    tokens = set(sentence.split())

    # distinct positive hits minus distinct negative hits
    balance = len(tokens.intersection(pos)) - len(tokens.intersection(neg))

    # a tie (including no hits at all) is neutral
    if balance == 0:
        return 'neutral'

    return 'positive' if balance > 0 else 'negative'

In [1]:
from sklearn.feature_extraction.text import CountVectorizer

# five short example documents
corpus = [
    'hello, how are you?',
    'im getting bored at home. And you? What do you think?',
    'did you know about counts',
    "let's see if this works!",
    'YES!!'
]

# learn the vocabulary from the corpus; fit() hands back the fitted vectorizer
ctv = CountVectorizer().fit(corpus)

# encode every document as token counts over that vocabulary
corpus_transformed = ctv.transform(corpus)

In [3]:
# each printed line is one stored entry: (document index, vocabulary index) followed by the count
print(corpus_transformed)

  (0, 2)	1
  (0, 9)	1
  (0, 11)	1
  (0, 22)	1
  (1, 1)	1
  (1, 3)	1
  (1, 4)	1
  (1, 7)	1
  (1, 8)	1
  (1, 10)	1
  (1, 13)	1
  (1, 17)	1
  (1, 19)	1
  (1, 22)	2
  (2, 0)	1
  (2, 5)	1
  (2, 6)	1
  (2, 14)	1
  (2, 22)	1
  (3, 12)	1
  (3, 15)	1
  (3, 16)	1
  (3, 18)	1
  (3, 20)	1
  (4, 21)	1


In [6]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
import nltk
# fetch the 'punkt' tokenizer data before word_tokenize is used
nltk.download('punkt')

# five short example documents
corpus = [
    'hello, how are you?',
    'im getting bored at home. And you? What do you think?',
    'did you know about counts',
    "let's see if this works!",
    'YES!!'
]

# word_tokenize replaces the default regex tokenizer, so token_pattern is
# set to None; fit() hands back the fitted vectorizer
ctv = CountVectorizer(tokenizer=word_tokenize, token_pattern=None).fit(corpus)

# encode every document as token counts over the learned vocabulary
corpus_transformed = ctv.transform(corpus)

# token -> column index mapping (punctuation marks are tokens too)
print(ctv.vocabulary_)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/billhikari/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


{'hello': 14, ',': 2, 'how': 16, 'are': 7, 'you': 27, '?': 4, 'im': 18, 'getting': 13, 'bored': 9, 'at': 8, 'home': 15, '.': 3, 'and': 6, 'what': 24, 'do': 12, 'think': 22, 'did': 11, 'know': 19, 'about': 5, 'counts': 10, 'let': 20, "'s": 1, 'see': 21, 'if': 17, 'this': 23, 'works': 25, '!': 0, 'yes': 26}


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import word_tokenize

# five short example documents
corpus = [
    'hello, how are you?',
    'im getting bored at home. And you? What do you think?',
    'did you know about counts',
    "let's see if this works!",
    'YES!!'
]

# TF-IDF over nltk word tokens; token_pattern=None because the custom
# tokenizer replaces the default regex. fit() hands back the fitted vectorizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None).fit(corpus)

# encode every document as TF-IDF weights over the learned vocabulary
corpus_transformed = tfv.transform(corpus)

# one line per stored entry: (document index, vocabulary index) and its weight
print(corpus_transformed)

  (0, 27)	0.2965698850220162
  (0, 16)	0.4428321995085722
  (0, 14)	0.4428321995085722
  (0, 7)	0.4428321995085722
  (0, 4)	0.35727423026525224
  (0, 2)	0.4428321995085722
  (1, 27)	0.35299699146792735
  (1, 24)	0.2635440111190765
  (1, 22)	0.2635440111190765
  (1, 18)	0.2635440111190765
  (1, 15)	0.2635440111190765
  (1, 13)	0.2635440111190765
  (1, 12)	0.2635440111190765
  (1, 9)	0.2635440111190765
  (1, 8)	0.2635440111190765
  (1, 6)	0.2635440111190765
  (1, 4)	0.42525129752567803
  (1, 3)	0.2635440111190765
  (2, 27)	0.31752680284846835
  (2, 19)	0.4741246485558491
  (2, 11)	0.4741246485558491
  (2, 10)	0.4741246485558491
  (2, 5)	0.4741246485558491
  (3, 25)	0.38775666010579296
  (3, 23)	0.38775666010579296
  (3, 21)	0.38775666010579296
  (3, 20)	0.38775666010579296
  (3, 17)	0.38775666010579296
  (3, 1)	0.38775666010579296
  (3, 0)	0.3128396318588854
  (4, 26)	0.52677824987419
  (4, 0)	0.8500027502658362


In [8]:
from nltk import ngrams
from nltk.tokenize import word_tokenize

# number of consecutive tokens per n-gram (3 -> trigrams)
N = 3

sentence = 'hello, how are you?'

# split the sentence into word and punctuation tokens
tokenized_sentence = word_tokenize(sentence)

# ngrams() yields tuples lazily; unpack them into a list so they can be printed
n_grams = [*ngrams(tokenized_sentence, N)]
print(n_grams)

[('hello', ',', 'how'), (',', 'how', 'are'), ('how', 'are', 'you'), ('are', 'you', '?')]


In [10]:
from nltk.stem import WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
import nltk
# fetch the WordNet data before the lemmatizer is used
nltk.download('wordnet')

stemmer = SnowballStemmer("english")
lemmatizer = WordNetLemmatizer()

words = ['fishing', 'fishes', 'fished']

# show every word next to its lemma and its stem, one item per line
for word in words:
    print(
        f'word = {word}',
        f'lemma = {lemmatizer.lemmatize(word)}',
        f'stem = {stemmer.stem(word)}',
        sep='\n',
    )

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/billhikari/nltk_data...


word = fishing
lemma = fishing
stem = fish
word = fishes
lemma = fish
stem = fish
word = fished
lemma = fished
stem = fish


In [16]:
# topic extraction
# non-negative matrix factorization (NMF)
# latent semantic analysis (LSA), which applies a truncated singular value decomposition (SVD) to the document-term matrix

import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

# create a corpus of sentences from the first 10,000 rows of the review column
corpus = pd.read_csv('../input/imdb.csv', nrows=10000)
corpus = corpus.review.values  # corpus.review.values is a numpy array. type: numpy.ndarray
# corpus = corpus.review # corpus.review is a pandas series. type: pandas.Series

# initialize TfidfVectorizer
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)

# fit the vectorizer on corpus
tfv.fit(corpus)

# transform the corpus
corpus_transformed = tfv.transform(corpus)

# initialize SVD with 10 components; the seed is fixed because the default
# solver is randomized and the extracted topics should be repeatable
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)

# fit SVD (fit() returns the fitted estimator, so corpus_svd is svd itself)
corpus_svd = svd.fit(corpus_transformed)

# vocabulary terms, in the same order as the columns of components_;
# computed once instead of on every loop iteration
feature_names = tfv.get_feature_names_out()

# number of top-scoring terms to show per component
N = 5

# NOTE: despite its name, sample_index selects an SVD component (a topic),
# not a document of the corpus
for sample_index in range(5):
    # map every term to its score in the current component
    feature_scores = dict(zip(feature_names, corpus_svd.components_[sample_index]))

    # sort the feature names based on their scores and keep the N best
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', ',', '.', 'a', 'and']
['br', '<', '>', '/', '-']
['i', 'movie', '!', 'it', 'was']
[',', '!', "''", '``', 'you']
['!', 'the', "''", '``', '...']


- Use corpus.review (pandas Series) when:

You need to perform pandas-specific operations, like data manipulation, cleaning, or aggregation.

You need to leverage the rich set of methods available in pandas for text processing, filtering, and more.

- Use corpus.review.values (NumPy array) when:

You need to pass the data to functions or libraries that require NumPy arrays.

You want to perform operations that are more efficient or simpler with NumPy arrays, such as vectorized operations or interfacing with machine learning libraries like scikit-learn.

- Purpose of transform

The transform method is crucial because it converts the text data into numerical form. Machine learning models require numerical input, and the TF-IDF representation is a commonly used method for converting text data into numerical features. Here’s why this step is important:

- Numerical Representation:

Text data cannot be directly used by machine learning algorithms that require numerical input.

The transform method converts the corpus into a sparse matrix of TF-IDF features, which can be used by these algorithms.

- Feature Extraction:

The TF-IDF (Term Frequency-Inverse Document Frequency) method provides a measure of how important a word is to a document in a corpus.

This transformation helps to highlight the important words in the corpus while reducing the impact of less important ones.

- Consistency:

The fit method builds the vocabulary and computes IDF values based on the corpus.

The transform method uses this learned information to ensure that the same vocabulary and weighting are applied consistently across all documents.

In [17]:
# to clean any text data, we can use the following function
import re
import string

def clean_text(s):
    '''
    This function cleans the text data: it removes punctuation and
    collapses every run of whitespace into a single space
    :param s: str
    :return: str without any string.punctuation characters, with words
             separated by exactly one space and no leading/trailing space
    '''

    # remove all punctuations using regex and string module
    s = re.sub(f'[{re.escape(string.punctuation)}]', '', s)

    # split by all whitespaces and join the tokens by a single space;
    # this removes all kinds of spaces including tab, newline, etc.
    # It must run AFTER the punctuation removal: deleting a free-standing
    # punctuation mark (e.g. "a , b") leaves its surrounding spaces behind
    s = ' '.join(s.split())

    return s

In [18]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn import decomposition
from sklearn.feature_extraction.text import TfidfVectorizer

# create a corpus of sentences from the first 10,000 rows and clean
# every review text with clean_text
corpus = pd.read_csv('../input/imdb.csv', nrows=10000)
corpus.loc[:, 'review'] = corpus.review.apply(clean_text)

# keep only the cleaned review texts as a numpy array
corpus = corpus.review.values

# fit TF-IDF on the cleaned corpus and transform it
tfv = TfidfVectorizer(tokenizer=word_tokenize, token_pattern=None)
tfv.fit(corpus)

corpus_transformed = tfv.transform(corpus)

# SVD with 10 components; the seed is fixed because the default solver is
# randomized and the extracted topics should be repeatable
svd = decomposition.TruncatedSVD(n_components=10, random_state=42)

# fit() returns the fitted estimator, so corpus_svd is svd itself
corpus_svd = svd.fit(corpus_transformed)

# vocabulary terms, in the same order as the columns of components_;
# computed once instead of on every loop iteration
feature_names = tfv.get_feature_names_out()

# number of top-scoring terms to show per component
N = 5

# NOTE: despite its name, sample_index selects an SVD component (a topic),
# not a document of the corpus
for sample_index in range(5):
    # map every term to its score in the current component
    feature_scores = dict(zip(feature_names, corpus_svd.components_[sample_index]))

    # print the N highest-scoring terms of this component
    print(sorted(feature_scores, key=feature_scores.get, reverse=True)[:N])

['the', 'a', 'and', 'of', 'to']
['i', 'movie', 'it', 'was', 'this']
['the', 'was', 'i', 'were', 'of']
['her', 'was', 'she', 'i', 'he']
['br', 'to', 'they', 'he', 'show']


We can also remove stopwords in the cleaning function, but this is not always a wise choice and depends a lot on the business problem.

We take all individual word vectors in a given sentence and create a normalized word vector from all word vectors of the tokens. 

This provides us with a sentence vector.

In [19]:
import numpy as np


def sentence_to_vec(s, embedding_dict, stop_words, tokenizer, embedding_dim=300):
    '''Given a sentence and other information, this function returns embedding for the whole sentence
    :param s: sentence, string (any other value is converted with str())
    :param embedding_dict: dictionary, a dictionary that has words as keys and their embeddings as values
    :param stop_words: list of stop words, if any (None means no stop words)
    :param tokenizer: a callable that takes a string and returns its tokens
    :param embedding_dim: length of the zero vector returned when no token
        of the sentence has an embedding (default 300)
    :return: numpy array, the sum of the token embeddings scaled to unit L2
        norm, or a zero vector when there is nothing to normalize
    '''

    # "if any": treat a missing stop word list as empty
    if stop_words is None:
        stop_words = ()

    # convert sentence to string, lowercase it and tokenize it
    words = tokenizer(str(s).lower())

    # remove stop words and keep only purely alphabetic tokens
    # (isalpha() also rejects tokens containing digits or punctuation)
    words = [w for w in words if w not in stop_words and w.isalpha()]

    # fetch the embedding of every token that has one
    M = [embedding_dict[w] for w in words if w in embedding_dict]

    # no usable token at all: fall back to a zero vector
    if len(M) == 0:
        return np.zeros(embedding_dim)

    # sum the embeddings over axis 0 (one vector for the whole sentence)
    v = np.array(M).sum(axis=0)

    # L2 norm of the summed vector
    norm = np.sqrt((v ** 2).sum())

    # embeddings that cancel out sum to zero; dividing by the zero norm
    # would produce NaNs, so return the zero vector instead
    if norm == 0:
        return np.zeros(v.shape)

    return v / norm