# Bag of Words

In [None]:
# imports
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Toy corpus: one short document per row.
df = pd.DataFrame({
    'Text': [
        'I love cats',
        'Cats are cute',
        'I like cats',
        'Cats are cats',
        'Cats watching cats',
        'My life my rules',
    ]
})
print(df, '\n')

# Tokenize on whitespace so one-letter words such as "I" stay in the
# vocabulary (the default token_pattern only keeps tokens of 2+ characters).
# A pattern is used instead of a lambda tokenizer: it yields the same tokens,
# avoids scikit-learn's "token_pattern will not be used" warning, and keeps
# `cv` picklable. CountVectorizer is already imported in the imports cell.
cv = CountVectorizer(token_pattern=r'\S+')
bow = cv.fit_transform(df['Text'])

                 Text
0         I love cats
1       Cats are cute
2         I like cats
3       Cats are cats
4  Cats watching cats
5    My life my rules 





In [None]:
# Show the fitted mapping from each term to its column index in the BoW matrix.
term_to_column = cv.vocabulary_
print(term_to_column)

{'i': 3, 'love': 6, 'cats': 1, 'are': 0, 'cute': 2, 'like': 5, 'watching': 9, 'my': 7, 'life': 4, 'rules': 8}


In [None]:
# Put the vocabulary terms on the first row, above the per-document counts,
# so each numbered column can be read against the term it represents.
bow_terms = cv.get_feature_names_out()
# Derive the document labels from the matrix itself so the table still
# builds when documents are added to or removed from the corpus.
doc_labels = [f'D{i}' for i in range(1, bow.shape[0] + 1)]
bow_df = pd.DataFrame(
    np.vstack([bow_terms, bow.toarray()]),
    index=pd.Index(['Vocabulary', *doc_labels], name=''),
)
bow_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
,,,,,,,,,,
Vocabulary,are,cats,cute,i,life,like,love,my,rules,watching
D1,0,1,0,1,0,0,1,0,0,0
D2,1,1,1,0,0,0,0,0,0,0
D3,0,1,0,1,0,1,0,0,0,0
D4,1,2,0,0,0,0,0,0,0,0
D5,0,2,0,0,0,0,0,0,0,1
D6,0,0,0,0,1,0,0,2,1,0


# n-grams

In [None]:
# imports
from nltk import ngrams

In [None]:
sentence = 'this is a foo bar sentences and i want to ngramize it'
n = 4  # window size: number of consecutive words in each n-gram
# Pass `n` (not a literal) so changing the window size above takes effect.
n_grams = ngrams(sentence.split(), n)
for grams in n_grams:
    print(grams)

('this', 'is', 'a', 'foo')
('is', 'a', 'foo', 'bar')
('a', 'foo', 'bar', 'sentences')
('foo', 'bar', 'sentences', 'and')
('bar', 'sentences', 'and', 'i')
('sentences', 'and', 'i', 'want')
('and', 'i', 'want', 'to')
('i', 'want', 'to', 'ngramize')
('want', 'to', 'ngramize', 'it')


# One-Hot Encoding

In [None]:
# Unique lower-cased words across the sentences, in alphabetical order.
sentences = ['I love cats', 'Cats are cute', 'I like cats']
vocabulary_list = sorted(
    {word for sentence in sentences for word in sentence.lower().split()}
)

In [None]:
# using pandas
# dtype=int keeps the table as 1/0: pandas 2.x would otherwise return
# boolean dummy columns (True/False) by default.
ohe_df = (
    pd.get_dummies(vocabulary_list, dtype=int)
    .assign(Vocabulary=vocabulary_list)  # label each row with its word
    .set_index('Vocabulary')
)
ohe_df

Unnamed: 0_level_0,are,cats,cute,i,like,love
Vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
are,1,0,0,0,0,0
cats,0,1,0,0,0,0
cute,0,0,1,0,0,0
i,0,0,0,1,0,0
like,0,0,0,0,1,0
love,0,0,0,0,0,1


In [None]:
# using scikit-learn
from sklearn.preprocessing import OneHotEncoder

# The encoder takes 2-D input (samples x features), so turn the word list
# into a single-column array: one word per row.
vocabulary_array = np.array(vocabulary_list)[:, np.newaxis]

ohe = OneHotEncoder()
ohe_data = ohe.fit_transform(vocabulary_array)

# Densify the encoded result so the full 0/1 matrix is displayed.
ohe_data.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.]])

In [None]:
# Rows follow the order of the words that were fed to the encoder. Columns
# are labelled from the encoder's own fitted categories instead of assuming
# they come out in the same order as `vocabulary_list`, so the labels stay
# correct even if the input list is not sorted.
ohe_df = pd.DataFrame(
    ohe_data.toarray(),
    columns=ohe.categories_[0],
    index=pd.Index(vocabulary_list, name='Vocabulary'),
)
ohe_df

Unnamed: 0_level_0,are,cats,cute,i,like,love
Vocabulary,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
are,1.0,0.0,0.0,0.0,0.0,0.0
cats,0.0,1.0,0.0,0.0,0.0,0.0
cute,0.0,0.0,1.0,0.0,0.0,0.0
i,0.0,0.0,0.0,1.0,0.0,0.0
like,0.0,0.0,0.0,0.0,1.0,0.0
love,0.0,0.0,0.0,0.0,0.0,1.0


# TF-IDF

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
corpus = [
    'I love cats',
    'Cats are cute',
    'I like cats',
]

# Default settings are used here, so one-letter words such as "I" are not
# kept as features (see the feature names listed further down).
tf_idf = TfidfVectorizer()
tfidf_matrix = tf_idf.fit_transform(corpus)
tfidf_matrix


<3x5 sparse matrix of type '<class 'numpy.float64'>'
	with 7 stored elements in Compressed Sparse Row format>

In [5]:
# Terms learned by the fitted TF-IDF vectorizer, in column order.
tfidf_terms = tf_idf.get_feature_names_out()
tfidf_terms

array(['are', 'cats', 'cute', 'like', 'love'], dtype=object)