In [3]:
# Toy corpus: three short movie reviews that differ in only a few words.
documents = [
    "this movie is very good",
    "this movie is not very good",
    "this movie is very bad",
]
# Echo the corpus, one review per line.
print(*documents, sep="\n")

this movie is very good
this movie is not very good
this movie is very bad


Vocabulary construction

In [5]:
# Unique whitespace-separated tokens across all documents, in sorted order.
vocab = sorted({token for doc in documents for token in doc.split()})
vocab

['bad', 'good', 'is', 'movie', 'not', 'this', 'very']

ONE HOT ENCODING - each word is represented by a 'V' dimensional vector (V = vocabulary size): 1 at the index of that word in the vocab, 0 everywhere else

In [6]:
import numpy as np
def one_hot(word, vocab):
    """Return the one-hot vector of ``word`` with respect to ``vocab``.

    Parameters
    ----------
    word : str
        The token to encode.
    vocab : ordered iterable of str
        The vocabulary; the position of each entry defines its vector index.
        Lists, tuples and NumPy arrays are all accepted.

    Returns
    -------
    numpy.ndarray
        Float vector of length ``len(vocab)`` with 1 at the index of ``word``
        and 0 everywhere else.

    Raises
    ------
    ValueError
        If ``word`` does not occur in ``vocab``.
    """
    # Materialize as a list so that inputs without an ``.index`` method
    # (e.g. NumPy arrays) work the same way as plain lists.
    entries = list(vocab)
    try:
        position = entries.index(word)
    except ValueError:
        raise ValueError(f"{word!r} is not in the vocabulary") from None
    vector = np.zeros(len(entries))
    vector[position] = 1
    return vector
# Demo: encode a single vocabulary word.
one_hot(word="good", vocab=vocab)

array([0., 1., 0., 0., 0., 0., 0.])

BAG OF WORDS - DOCUMENT LEVEL REPRESENTATION
counting word frequencies, disregarding grammar and word order.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer
# Fit a default CountVectorizer on the corpus, then show the learned
# feature names followed by the dense document-term count matrix.
bow = CountVectorizer()
bow_matrix = bow.fit_transform(documents)
print(bow.get_feature_names_out(), bow_matrix.toarray(), sep="\n")

['bad' 'good' 'is' 'movie' 'not' 'this' 'very']
[[0 1 1 1 0 1 1]
 [0 1 1 1 1 1 1]
 [1 0 1 1 0 1 1]]


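TF-IDF : weights each word by how informative it is - its frequency in the document, scaled down for words that appear in many documents. Like bag of words, it does not capture semantic relationships or word order.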

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit a default TfidfVectorizer on the corpus, then show the learned
# feature names followed by the dense TF-IDF matrix.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(documents)
print(tfidf.get_feature_names_out(), tfidf_matrix.toarray(), sep="\n")

['bad' 'good' 'is' 'movie' 'not' 'this' 'very']
[[0.         0.54134281 0.42040099 0.42040099 0.         0.42040099
  0.42040099]
 [0.         0.44102652 0.34249643 0.34249643 0.57989687 0.34249643
  0.34249643]
 [0.64612892 0.         0.38161415 0.38161415 0.         0.38161415
  0.38161415]]


custom features

In [10]:
# Small hand-written lexicons (all lowercase) used by custom_features.
positive_words = {"good", "very"}
negative_words = {"bad", "not"}


def custom_features(text):
    """Return ``[pos, neg, length]`` hand-crafted features for ``text``.

    ``text`` is split on whitespace. ``pos`` and ``neg`` count the tokens
    found in ``positive_words`` and ``negative_words`` respectively, and
    ``length`` is the total number of tokens.

    Tokens are lowercased before the lexicon lookup, so "Good" and "good"
    count the same. This matches the default lowercasing of the
    scikit-learn vectorizers these features are combined with.
    """
    words = text.split()
    # Lowercase per token for the lookup only; the token count is untouched.
    pos = sum(w.lower() in positive_words for w in words)
    neg = sum(w.lower() in negative_words for w in words)
    return [pos, neg, len(words)]
# One [pos, neg, length] row per document, in corpus order.
custom_feature_matrix = list(map(custom_features, documents))
custom_feature_matrix



[[2, 0, 5], [2, 1, 6], [1, 1, 5]]

HYBRID FEATURES

In [11]:
import numpy as np

# Append the hand-crafted feature columns to the right of the dense
# TF-IDF columns, so each row describes one document.
hybrid_features = np.concatenate(
    (tfidf_matrix.toarray(), np.asarray(custom_feature_matrix)),
    axis=1,
)

hybrid_features

array([[0.        , 0.54134281, 0.42040099, 0.42040099, 0.        ,
        0.42040099, 0.42040099, 2.        , 0.        , 5.        ],
       [0.        , 0.44102652, 0.34249643, 0.34249643, 0.57989687,
        0.34249643, 0.34249643, 2.        , 1.        , 6.        ],
       [0.64612892, 0.        , 0.38161415, 0.38161415, 0.        ,
        0.38161415, 0.38161415, 1.        , 1.        , 5.        ]])

Bag of N grams

In [12]:
# Count unigrams and bigrams together (ngram_range=(1, 2)).
ngram = CountVectorizer(ngram_range=(1, 2))
ngram_matrix = ngram.fit_transform(documents)

# Each print emits its header line followed by the value on the next line.
print("\nBAG OF N-GRAMS - Vocabulary:", ngram.get_feature_names_out(), sep="\n")

print("\nBAG OF N-GRAMS - Matrix:", ngram_matrix.toarray(), sep="\n")


BAG OF N-GRAMS - Vocabulary:
['bad' 'good' 'is' 'is not' 'is very' 'movie' 'movie is' 'not' 'not very'
 'this' 'this movie' 'very' 'very bad' 'very good']

BAG OF N-GRAMS - Matrix:
[[0 1 1 0 1 1 1 0 0 1 1 1 0 1]
 [0 1 1 1 0 1 1 1 1 1 1 1 0 1]
 [1 0 1 0 1 1 1 0 0 1 1 1 1 0]]
