# NLP Feature Engineering

## One Hot Encoding

In [1]:
Text = "I am learning NLP"

In [2]:
Text.lower()

'i am learning nlp'

In [3]:
import pandas as pd

In [4]:
pd.get_dummies(Text.lower().split())

Unnamed: 0,am,i,learning,nlp
0,0,1,0,0
1,1,0,0,0
2,0,0,1,0
3,0,0,0,1


## Count Vectorizer

In [17]:
Text = ["I love NLP and I will learn NLP in 2month"]

In [18]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
vectorizer = CountVectorizer()

In [20]:
vectorizer.fit(Text)

CountVectorizer()

In [21]:
vector = vectorizer.transform(Text)

In [22]:
print(vectorizer.vocabulary_)

{'love': 4, 'nlp': 5, 'and': 1, 'will': 6, 'learn': 3, 'in': 2, '2month': 0}


In [23]:
print(vector)

  (0, 0)	1
  (0, 1)	1
  (0, 2)	1
  (0, 3)	1
  (0, 4)	1
  (0, 5)	2
  (0, 6)	1


In [24]:
print(vector.toarray())

[[1 1 1 1 1 2 1]]


## Setup for N-grams

In [28]:
import nltk

In [29]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Hp\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

## N-Grams

In [25]:
Text = "I am learning NLP"

In [26]:
from textblob import TextBlob

### Generating n-gram using TextBlob
#### TextBlob(Text).ngrams(n)
#### n adalah variabel untuk menentukan nilai grams

In [30]:
TextBlob(Text).ngrams(1)

[WordList(['I']), WordList(['am']), WordList(['learning']), WordList(['NLP'])]

In [31]:
TextBlob(Text).ngrams(2)

[WordList(['I', 'am']),
 WordList(['am', 'learning']),
 WordList(['learning', 'NLP'])]

In [32]:
TextBlob(Text).ngrams(3)

[WordList(['I', 'am', 'learning']), WordList(['am', 'learning', 'NLP'])]

In [33]:
TextBlob(Text).ngrams(4)

[WordList(['I', 'am', 'learning', 'NLP'])]

### From N-grams to CountVectorier

In [36]:
Text = ["I love NLP and I will learn NLP in 2month"]

In [35]:
vectorizer = CountVectorizer(ngram_range=(2,2))

#### ngram_range parameter on scikit_learn (Based on scikit_learn docs)
ngram_range: tuple (min_n, max_n), default=(1, 1)
The lower and upper boundary of the range of n-values for different word n-grams or char n-grams to be extracted. All values of n such such that min_n <= n <= max_n will be used. For example an ngram_range of (1, 1) means only unigrams, (1, 2) means unigrams and bigrams, and (2, 2) means only bigrams. Only applies if analyzer is not callable.

In [37]:
vectorizer.fit(Text)

CountVectorizer(ngram_range=(2, 2))

In [38]:
vector = vectorizer.transform(Text)

In [40]:
print(vectorizer.vocabulary_)

{'love nlp': 3, 'nlp and': 4, 'and will': 0, 'will learn': 6, 'learn nlp': 2, 'nlp in': 5, 'in 2month': 1}


In [41]:
print(vector.toarray())

[[1 1 1 1 1 1 1]]


### From N-Grams to One Hot Encoding

In [44]:
def getKey(dict):
    text_list = []
    for key in dict.keys():
        text_list.append(key)
    return text_list

In [46]:
pd.get_dummies(getKey(vectorizer.vocabulary_))

Unnamed: 0,and will,in 2month,learn nlp,love nlp,nlp and,nlp in,will learn
0,0,0,0,1,0,0,0
1,0,0,0,0,1,0,0
2,1,0,0,0,0,0,0
3,0,0,0,0,0,0,1
4,0,0,1,0,0,0,0
5,0,0,0,0,0,1,0
6,0,1,0,0,0,0,0


## Co-occurence Matrix

In [47]:
import numpy as np
import nltk
from nltk import bigrams
import itertools

In [75]:
def co_occurence_matrix(corpus):
    #make the corpus ordered
    vocab = set(corpus)
    vocab = list(vocab)
    
    #Get words and index dict
    vocab_to_index = {word: i for i, word in enumerate(vocab)}
    
    #Create the bigrams
    bi_grams = list(bigrams(corpus))
    
    #frequency distribution of bigrams ((word1, word2), num_occur)
    bigram_freq = nltk.FreqDist(bi_grams).most_common(len(bi_grams))
    #print(bigram_freq)
    
    #initialize co-occurence matrix
    #co_occur_matrix[current][previous]
    co_occur_matrix = np.zeros((len(vocab), len(vocab)))
    
    #loop through the bigrams taking the current and previous word,
    #and the number of occur of bigrams
    for bigram in bigram_freq:
        curr = bigram[0][1]
        prev = bigram[0][0]
        #print("curr: {0}, prev: {1}".format(curr, prev))
        count = bigram[1]
        pos_curr = vocab_to_index[curr]
        pos_prev = vocab_to_index[prev]
        co_occur_matrix[pos_curr][pos_prev] = count
    co_occur_matrix = np.matrix(co_occur_matrix)
    
    #return the matrix and the index
    return co_occur_matrix, vocab_to_index

### Generating co_occurance matrix

In [50]:
sentences = [['I', 'love', 'nlp'],
             ['I', 'love','to' 'learn'],
             ['nlp', 'is', 'future'],
             ['nlp', 'is', 'cool']]

In [54]:
merged = list(itertools.chain.from_iterable(sentences))
#explanation: itertools.chain.from_iterable function simply iterate each word in sentences

In [76]:
matrix, vocab_to_index = co_occurence_matrix(merged)

In [79]:
CoMatrixFinal = pd.DataFrame(matrix, index=vocab_to_index, columns=vocab_to_index)

In [80]:
print(CoMatrixFinal)

         tolearn  love  future  cool    I  nlp   is
tolearn      0.0   1.0     0.0   0.0  0.0  0.0  0.0
love         0.0   0.0     0.0   0.0  2.0  0.0  0.0
future       0.0   0.0     0.0   0.0  0.0  0.0  1.0
cool         0.0   0.0     0.0   0.0  0.0  0.0  1.0
I            0.0   0.0     0.0   0.0  0.0  1.0  0.0
nlp          1.0   1.0     1.0   0.0  0.0  0.0  0.0
is           0.0   0.0     0.0   0.0  0.0  2.0  0.0


## Hash Vectorizer

In [83]:
from sklearn.feature_extraction.text import HashingVectorizer

In [85]:
text = ["Halo dunia NLP yang keren"]

In [84]:
vectorizer = HashingVectorizer(n_features=20)

In [86]:
vector = vectorizer.transform(text)

In [89]:
print(vector.shape)

(1, 20)


In [90]:
print(vector.toarray())

[[ 0.         0.        -0.4472136  0.         0.        -0.4472136
   0.         0.4472136  0.         0.         0.         0.
   0.        -0.4472136  0.        -0.4472136  0.         0.
   0.         0.       ]]


## TF-IDF Vectorizer

In [92]:
text = ["I love nlp",
        "I love to learn",
        "nlp is future",
        "nlp is cool"]

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [94]:
vectorizer = TfidfVectorizer()

In [96]:
vector = vectorizer.fit(text)

In [97]:
print(vector.vocabulary_)

{'love': 4, 'nlp': 5, 'to': 6, 'learn': 3, 'is': 2, 'future': 1, 'cool': 0}


In [98]:
print(vector.idf_)

[1.91629073 1.91629073 1.51082562 1.91629073 1.51082562 1.22314355
 1.91629073]


In [100]:
type(vector.idf_)

numpy.ndarray

In [115]:
tfidf_matrix = pd.DataFrame(vector.idf_, index=vector.vocabulary_, columns={'value':0})

In [117]:
print(tfidf_matrix)

           value
love    1.916291
nlp     1.916291
to      1.510826
learn   1.916291
is      1.510826
future  1.223144
cool    1.916291
