# **Feature Engineering: Basic Approach**

## **1.  One-Hot Encoding**


In [None]:
import nltk
# Fetch the NLTK data collection registered under the 'all' identifier.
# NOTE(review): the cells visible in this notebook only call sent_tokenize,
# which presumably needs just the Punkt sentence-tokenizer data -- consider
# narrowing this download if no other cell needs more. TODO confirm.
nltk.download('all')

In [None]:
import nltk
from nltk.tokenize import sent_tokenize

# Toy corpus: three short sentences held in one multi-line string.
Text = """The cat is on the mat.
          My dog and cat are the best.
          The locals are playing."""

# Split into sentences, then lowercase each one and strip the full stops.
sentences = [raw.lower().replace(".", "") for raw in sent_tokenize(Text)]
print('Tokenized Sentences :', sentences)

# Build the vocabulary: every distinct word gets a 1-based id in order of
# first appearance, i.e. the vocabulary size at the moment it is added, plus one.
vocab = {}
for sentence in sentences:
    for token in sentence.split():
        # setdefault leaves already-known words untouched.
        vocab.setdefault(token, len(vocab) + 1)
count = len(vocab)  # number of distinct words, kept as a notebook-level name
print('vocabulary :', vocab)

# One Hot Encoding
def OneHotEncoder(text, vocabulary=None):
    """Return one one-hot vector for every known word of ``text``.

    Args:
        text: String that is split on whitespace into words.
        vocabulary: Optional mapping from word to its 1-based position.
            When omitted, the notebook-level ``vocab`` dictionary is used,
            so existing ``OneHotEncoder(text)`` calls keep working.

    Returns:
        A list of lists. Each inner list has ``len(vocabulary)`` entries,
        all 0 except for a single 1 at index ``vocabulary[word] - 1``.
        Words missing from the vocabulary are skipped, so the result can
        be shorter than the number of words in ``text``.
    """
    # Resolve the mapping at call time so a vocabulary rebuilt in a later
    # cell is picked up; "is None" lets an explicitly passed empty dict through.
    if vocabulary is None:
        vocabulary = vocab
    size = len(vocabulary)  # loop-invariant vector length
    onehot_encoded = []
    for word in text.split():
        # Only allocate a vector for words that will actually be emitted.
        if word in vocabulary:
            temp = [0] * size
            # Ids start at 1 while list positions start at 0.
            temp[vocabulary[word] - 1] = 1
            onehot_encoded.append(temp)
    return onehot_encoded


# Encode the first sentence of the corpus and print it next to its vectors.
first_sentence = sentences[0]
first_encoding = OneHotEncoder(first_sentence)
print('OneHotEncoded vector for sentence : "', first_sentence,
      '"is \n', first_encoding)

Tokenized Sentences : ['the cat is on the mat', 'my dog and cat are the best', 'the locals are playing']
vocabulary : {'the': 1, 'cat': 2, 'is': 3, 'on': 4, 'mat': 5, 'my': 6, 'dog': 7, 'and': 8, 'are': 9, 'best': 10, 'locals': 11, 'playing': 12}
OneHotEncoded vector for sentence : " the cat is on the mat "is 
 [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]]


## **2.  Bag Of Words (BOW)**

We use the `CountVectorizer` class to build a bag of words representation of the sentences.

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short sentences held in one multi-line string.
Text = """The cat is on the mat.
          My dog and cat are the best.
          The locals are playing."""

# Sentence-split the text, then lowercase each sentence and drop the full stops.
sentences = [raw.lower().replace(".", "") for raw in sent_tokenize(Text)]
print('Our Corpus:', sentences)

# CountVectorizer converts a collection of text documents to a matrix of
# token counts; fit_transform gives each sentence its BOW representation.
count_vect = CountVectorizer()
BOW = count_vect.fit_transform(sentences)

# Term -> index mapping learned from the corpus.
print("Our vocabulary: ", count_vect.vocabulary_)

# sorted() on the mapping yields its terms (the keys) in sorted order.
print("Our sorted vocabulary: ", sorted(count_vect.vocabulary_))

# Show the BOW row of every sentence in the corpus.
for row, sentence in enumerate(sentences):
    print(f"BoW representation for {sentence} {BOW[row].toarray()}")

# Vectorize a new text with the already-fitted vectorizer (no refit).
BOW_ = count_vect.transform(["learning NLP is good"])
print("Bow representation for 'learning NLP is good':", BOW_.toarray())


Our Corpus: ['the cat is on the mat', 'my dog and cat are the best', 'the locals are playing']
Our vocabulary:  {'the': 11, 'cat': 3, 'is': 5, 'on': 9, 'mat': 7, 'my': 8, 'dog': 4, 'and': 0, 'are': 1, 'best': 2, 'locals': 6, 'playing': 10}
Our sorted vocabulary:  ['and', 'are', 'best', 'cat', 'dog', 'is', 'locals', 'mat', 'my', 'on', 'playing', 'the']
BoW representation for the cat is on the mat [[0 0 0 1 0 1 0 1 0 1 0 2]]
BoW representation for my dog and cat are the best [[1 1 1 1 1 0 0 0 1 0 0 1]]
BoW representation for the locals are playing [[0 1 0 0 0 0 1 0 0 0 1 1]]
Bow representation for 'learning NLP is good': [[0 0 0 0 0 1 0 0 0 0 0 0]]


## **3.  Bag of n-grams**

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus: three short sentences held in one multi-line string.
Text = """The cat is on the mat.
          My dog and cat are the best.
          The locals are playing."""

# Sentence-split the text, then lowercase each sentence and drop the full stops.
sentences = [raw.lower().replace(".", "") for raw in sent_tokenize(Text)]
print('Our Corpus:', sentences)

# ngram_range=(1, 3): count unigrams, bigrams and trigrams.
count_vect = CountVectorizer(ngram_range=(1, 3))

# fit_transform gives each sentence its bag-of-n-grams representation.
BOW_nGram = count_vect.fit_transform(sentences)

# N-gram -> index mapping learned from the corpus.
print("Our vocabulary:\n", count_vect.vocabulary_)

# sorted() on the mapping yields its n-grams (the keys) in sorted order.
print("Our sorted vocabulary:\n", sorted(count_vect.vocabulary_))

# Show the bag-of-n-grams row of every sentence in the corpus.
for row, sentence in enumerate(sentences):
    print('Ngram representation for "{}" is {}'
          .format(sentence, BOW_nGram[row].toarray()))

# Vectorize a new text with the already-fitted vectorizer (no refit).
BOW_nGram_ = count_vect.transform(["dog and cat are playing together"])
print("Ngram representation for 'dog and cat are playing together' is",
      BOW_nGram_.toarray())


Our Corpus: ['the cat is on the mat', 'my dog and cat are the best', 'the locals are playing']
Our vocabulary:
 {'the': 30, 'cat': 8, 'is': 16, 'on': 26, 'mat': 22, 'the cat': 32, 'cat is': 11, 'is on': 17, 'on the': 27, 'the mat': 36, 'the cat is': 33, 'cat is on': 12, 'is on the': 18, 'on the mat': 28, 'my': 23, 'dog': 13, 'and': 0, 'are': 3, 'best': 7, 'my dog': 24, 'dog and': 14, 'and cat': 1, 'cat are': 9, 'are the': 5, 'the best': 31, 'my dog and': 25, 'dog and cat': 15, 'and cat are': 2, 'cat are the': 10, 'are the best': 6, 'locals': 19, 'playing': 29, 'the locals': 34, 'locals are': 20, 'are playing': 4, 'the locals are': 35, 'locals are playing': 21}
Our sorted vocabulary:
 ['and', 'and cat', 'and cat are', 'are', 'are playing', 'are the', 'are the best', 'best', 'cat', 'cat are', 'cat are the', 'cat is', 'cat is on', 'dog', 'dog and', 'dog and cat', 'is', 'is on', 'is on the', 'locals', 'locals are', 'locals are playing', 'mat', 'my', 'my dog', 'my dog and', 'on', 'on the', 

## **4.  TF-IDF**

In [None]:
import nltk
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: three short sentences held in one multi-line string.
Text = """The cat is on the mat.
          My dog and cat are the best.
          The locals are playing."""

# Sentence-split the text, then lowercase each sentence and drop the full stops.
sentences = [raw.lower().replace(".", "") for raw in sent_tokenize(Text)]
print('Our Corpus:', sentences)

# Fit the TF-IDF vectorizer on the corpus and vectorize it in one step.
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(sentences)

# All words in the fitted vocabulary.
print("vocabulary", tfidf.get_feature_names_out())
# IDF value of every word in the vocabulary.
print("IDF for all words in the vocabulary :\n", tfidf.idf_)

# TF-IDF row of every sentence; only the first line is preceded by a blank
# line, so it is printed on its own and the remaining rows are looped.
print('\nTFIDF representation for "{}" is \n{}'
      .format(sentences[0], tfidf_matrix[0].toarray()))
for row in range(1, len(sentences)):
    print('TFIDF representation for "{}" is \n{}'
          .format(sentences[row], tfidf_matrix[row].toarray()))

# Vectorize a new text with the already-fitted vectorizer (no refit).
matrix = tfidf.transform(["dog and cat are playing together"])
print("\nTFIDF representation for 'dog and cat are playing together' is\n",
      matrix.toarray())


Our Corpus: ['the cat is on the mat', 'my dog and cat are the best', 'the locals are playing']
vocabulary ['and' 'are' 'best' 'cat' 'dog' 'is' 'locals' 'mat' 'my' 'on' 'playing'
 'the']
IDF for all words in the vocabulary :
 [1.69314718 1.28768207 1.69314718 1.28768207 1.69314718 1.69314718
 1.69314718 1.69314718 1.69314718 1.69314718 1.69314718 1.        ]

TFIDF representation for "the cat is on the mat" is 
[[0.         0.         0.         0.34101521 0.         0.44839402
  0.         0.44839402 0.         0.44839402 0.         0.52965746]]
TFIDF representation for "my dog and cat are the best" is 
[[0.4261835  0.32412354 0.4261835  0.32412354 0.4261835  0.
  0.         0.         0.4261835  0.         0.         0.25171084]]
TFIDF representation for "the locals are playing" is 
[[0.         0.44451431 0.         0.         0.         0.
  0.5844829  0.         0.         0.         0.5844829  0.34520502]]

TFIDF representation for 'dog and cat are playing together' is
 [[0.490479