# Feature Extraction for Natural Language Processing

#### Python imports

In [1]:
import nltk
import pandas as pd
from nltk.tokenize import WordPunctTokenizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load (or reload) the `watermark` IPython extension so the %watermark magic is available.
%reload_ext watermark

In [3]:
# Record the interpreter and imported-package versions this notebook was run with.
%watermark -n -v -iv

Python implementation: CPython
Python version       : 3.10.6
IPython version      : 8.7.0

pandas: 1.5.2
nltk  : 3.8.1



#### Set path to nltk data

In [4]:
# Add the local "./nltk_data" directory to NLTK's data search path.
# The path is relative, so it resolves against the notebook's working directory.
nltk.data.path.append("./nltk_data")

#### Corpus (set of text documents)

In [5]:
# Toy corpus: three short English sentences, one document per list entry.
sentences = [
    'She likes to swim.',
    'He loves to read.',
    'They like to bike and to swim.',
]
sentences

['She likes to swim.', 'He loves to read.', 'They like to bike and to swim.']

#### Word and Punctuation Tokenizer

In [6]:
# Single NLTK WordPunctTokenizer instance, reused below for normalization
# and for unigram / bigram vocabulary extraction.
tokenizer = WordPunctTokenizer()

#### Normalize the Corpus

In [7]:
# Normalize each document: tokenize it, keep only purely alphabetic tokens
# (this drops punctuation such as the trailing "."), lower-case them and
# re-join with single spaces.
# A comprehension replaces the index-based loop, so no temporaries (`i`,
# `tokens`, `words`) are left behind in the notebook namespace; `words` in
# particular is reused later as the vocabulary name.
sentences = [
    ' '.join(
        token.lower()
        for token in tokenizer.tokenize(sentence)
        if token.isalpha()
    )
    for sentence in sentences
]
sentences

['she likes to swim', 'he loves to read', 'they like to bike and to swim']

#### Extract unique Unigram words from the Corpus

In [8]:
# Collect every token of the normalized corpus, in reading order.
all_words = []
for sentence in sentences:
    all_words.extend(tokenizer.tokenize(sentence))

# De-duplicate while keeping first-occurrence order. dict.fromkeys does this
# in a single O(n) pass; the previous sorted(set(...), key=all_words.index)
# produced the same order but rescanned the list for every unique word.
words = list(dict.fromkeys(all_words))
words

['she',
 'likes',
 'to',
 'swim',
 'he',
 'loves',
 'read',
 'they',
 'like',
 'bike',
 'and']

## One-Hot Encoding

#### Create an instance of the One-Hot Encoder

In [9]:
# Binary count vectorizer over the fixed unigram vocabulary: each cell records
# presence (1) or absence (0) of a word in a document rather than its count.
# NOTE(review): the vectorizer re-tokenizes the text with its own default
# pattern; confirm it agrees with WordPunctTokenizer before reusing this on
# another corpus.
model1 = CountVectorizer(
    vocabulary=words,
    binary=True,
)

#### One-Hot Encoding as a pandas DataFrame

In [10]:
# Vectorize the corpus (one row per sentence, one column per vocabulary word)
# and densify the sparse result so it can be shown as a table.
matrix = (
    model1
    .fit_transform(sentences)
    .toarray()
)
# Label the columns with the vocabulary that was passed to the vectorizer.
one_hot_words = pd.DataFrame(columns=words, data=matrix)
one_hot_words

Unnamed: 0,she,likes,to,swim,he,loves,read,they,like,bike,and
0,1,1,1,1,0,0,0,0,0,0,0
1,0,0,1,0,1,1,1,0,0,0,0
2,0,0,1,1,0,0,0,1,1,1,1


## Bag of Words (BoW)

#### Create an instance of the BoW model (Unigram)

In [11]:
# Plain count vectorizer over the same fixed unigram vocabulary: cells hold
# the number of times each word occurs in a document (e.g. "to" twice in the
# third sentence).
model2 = CountVectorizer(vocabulary=words)

#### Bag of Words as a pandas DataFrame (Unigrams)

In [12]:
# Count word occurrences per sentence and densify the sparse result.
matrix = (
    model2
    .fit_transform(sentences)
    .toarray()
)
# One row per sentence, one column per vocabulary word.
bag_of_words = pd.DataFrame(columns=words, data=matrix)
bag_of_words

Unnamed: 0,she,likes,to,swim,he,loves,read,they,like,bike,and
0,1,1,1,1,0,0,0,0,0,0,0
1,0,0,1,0,1,1,1,0,0,0,0
2,0,0,2,1,0,0,0,1,1,1,1


#### Create an instance of the BoW model (Bigrams)

In [13]:
# Count vectorizer restricted to bigrams (n-grams of exactly two words).
# No vocabulary is supplied, so it is learned from the corpus during fitting.
model3 = CountVectorizer(ngram_range=(2,2))

#### Bag of Words as a pandas DataFrame (Bigrams)

In [14]:
# Learn the bigram vocabulary from the corpus, count occurrences per sentence
# and densify the sparse result.
matrix = (
    model3
    .fit_transform(sentences)
    .toarray()
)
# The vocabulary was learned by the model, so the column labels come from it.
bag_of_words2 = pd.DataFrame(
    columns=model3.get_feature_names_out(),
    data=matrix,
)
bag_of_words2

Unnamed: 0,and to,bike and,he loves,like to,likes to,loves to,she likes,they like,to bike,to read,to swim
0,0,0,0,0,1,0,1,0,0,0,1
1,0,0,1,0,0,1,0,0,0,1,0
2,1,1,0,1,0,0,0,1,1,0,1


## Term Frequency - Inverse Document Frequency (TF-IDF)

#### Create an instance of the TF-IDF model

In [15]:
# TF-IDF vectorizer over the fixed unigram vocabulary, so its columns line up
# with the one-hot and bag-of-words tables above.
model4 = TfidfVectorizer(vocabulary=words)

#### TF-IDF as a pandas DataFrame

In [16]:
# Fit the IDF weights on the corpus, compute the TF-IDF score of every
# vocabulary word in every sentence, and densify the sparse result.
matrix = (
    model4
    .fit_transform(sentences)
    .toarray()
)
# One row per sentence, one column per vocabulary word.
tf_idf_words = pd.DataFrame(columns=words, data=matrix)
tf_idf_words

Unnamed: 0,she,likes,to,swim,he,loves,read,they,like,bike,and
0,0.584483,0.584483,0.345205,0.444514,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.322745,0.0,0.546454,0.546454,0.546454,0.0,0.0,0.0,0.0
2,0.0,0.0,0.483296,0.311166,0.0,0.0,0.0,0.409146,0.409146,0.409146,0.409146


#### Display the IDF scores

In [17]:
# Inverse-document-frequency weight learned for each vocabulary entry, one
# value per word; words that occur in more sentences get a lower weight
# (e.g. "to", present in all three sentences, has the minimum of 1.0).
model4.idf_

array([1.69314718, 1.69314718, 1.        , 1.28768207, 1.69314718,
       1.69314718, 1.69314718, 1.69314718, 1.69314718, 1.69314718,
       1.69314718])

#### Extract unique Bigrams from the Corpus

In [18]:
# Collect every bigram (pair of adjacent tokens, joined by a space) of the
# normalized corpus, in reading order.
all_bigrams = []
for sentence in sentences:
    all_tokens = tokenizer.tokenize(sentence)
    all_bigrams.extend(' '.join(pair) for pair in nltk.bigrams(all_tokens))

# De-duplicate while keeping first-occurrence order. dict.fromkeys does this
# in a single O(n) pass; the previous sorted(set(...), key=all_bigrams.index)
# produced the same order but rescanned the list for every unique bigram.
all_bigrams = list(dict.fromkeys(all_bigrams))
all_bigrams

['she likes',
 'likes to',
 'to swim',
 'he loves',
 'loves to',
 'to read',
 'they like',
 'like to',
 'to bike',
 'bike and',
 'and to']

#### Create an instance of the TF-IDF model (Bigrams)

In [19]:
# TF-IDF vectorizer over bigrams only, restricted to the bigram vocabulary
# extracted above so the columns keep that first-occurrence order.
# NOTE(review): the vectorizer builds bigrams from its own default
# tokenization; confirm it matches the NLTK-derived vocabulary before reusing
# this on another corpus.
model5 = TfidfVectorizer(
    vocabulary=all_bigrams,
    ngram_range=(2, 2),
)

#### TF-IDF as a pandas DataFrame (Bigrams)

In [20]:
# Fit the IDF weights on the corpus, compute the TF-IDF score of every
# vocabulary bigram in every sentence, and densify the sparse result.
matrix = (
    model5
    .fit_transform(sentences)
    .toarray()
)
# One row per sentence, one column per bigram.
tf_idf_bigrams = pd.DataFrame(columns=all_bigrams, data=matrix)
tf_idf_bigrams

Unnamed: 0,she likes,likes to,to swim,he loves,loves to,to read,they like,like to,to bike,bike and,and to
0,0.622766,0.622766,0.47363,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.57735,0.57735,0.57735,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.322002,0.0,0.0,0.0,0.423394,0.423394,0.423394,0.423394,0.423394
