# NLP with Python: Text Feature Extraction

__Import Libraries__

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
# Toy corpus: three one-sentence documents that all contain the word "window".
texts = [
    'blue car and blue window',
    'black crow in the window',
    'i see my reflection in the window',
]

### Binary Encoding

In [4]:
# Vocabulary = every distinct whitespace-separated token in the corpus,
# sorted alphabetically so each word gets a stable position.
vocab = sorted({token for document in texts for token in document.split()})
print(vocab)

['and', 'black', 'blue', 'car', 'crow', 'i', 'in', 'my', 'reflection', 'see', 'the', 'window']


In [7]:
def binary_transform(text, vocabulary=None):
    """Encode ``text`` as a binary bag-of-words vector.

    Parameters
    ----------
    text : str
        Document to encode. It is split on whitespace only; no lowercasing
        or punctuation stripping is applied.
    vocabulary : iterable of str, optional
        Ordered terms that define the output positions. When omitted, the
        notebook-level ``vocab`` list is used, so existing one-argument
        calls behave as before.

    Returns
    -------
    numpy.ndarray
        1-D float array with one entry per vocabulary term: ``1.0`` where
        the term occurs in ``text`` and ``0.0`` otherwise.
    """
    if vocabulary is None:
        # Fall back to the vocabulary built earlier in the notebook.
        vocabulary = vocab
    # A set gives constant-time membership tests and ignores repeated words.
    words = set(text.split())
    # dtype=float keeps the float64 output of the original np.empty buffer.
    return np.array([term in words for term in vocabulary], dtype=float)

In [9]:
# Of the three words in this sentence only "i" is in the vocabulary, so a
# single position is set; the other words are silently ignored.
print(binary_transform('i love sekardayu'))

[0. 0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


In [19]:
from sklearn.feature_extraction.text import CountVectorizer
# binary=True records only whether a term occurs in a document, not how often.
vectorizer = CountVectorizer(binary=True)
# Learn the vocabulary from the corpus and encode the corpus in one step.
transformed = vectorizer.fit_transform(texts)
# Printing a sparse matrix lists one "(row, column)  value" entry per stored element.
print(transformed)

  (0, 2)	1
  (0, 3)	1
  (0, 0)	1
  (0, 10)	1
  (1, 10)	1
  (1, 1)	1
  (1, 4)	1
  (1, 5)	1
  (1, 9)	1
  (2, 10)	1
  (2, 5)	1
  (2, 9)	1
  (2, 8)	1
  (2, 6)	1
  (2, 7)	1


In [20]:
# Check what kind of object fit_transform returned (a SciPy sparse matrix, per the output below).
type(transformed)

scipy.sparse.csr.csr_matrix

In [21]:
# Iterating the fitted vocabulary_ mapping yields its keys (the learned terms)
# in the mapping's own order, which is not alphabetical.
print(list(vectorizer.vocabulary_))

['blue', 'car', 'and', 'window', 'black', 'crow', 'in', 'the', 'see', 'my', 'reflection']


In [22]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement (requires scikit-learn >= 1.0).
# It returns a NumPy array, so .tolist() keeps the printed output a plain list.
print(vectorizer.get_feature_names_out().tolist())

['and', 'black', 'blue', 'car', 'crow', 'in', 'my', 'reflection', 'see', 'the', 'window']


In [23]:
# Densify the sparse matrix and label each column with its term for display.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2 (the replacement requires scikit-learn >= 1.0).
pd.DataFrame(transformed.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,1,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


### Counting

In [24]:
# binary=False keeps the raw occurrence count of each term per document,
# so a word that appears twice in a sentence is recorded as 2 instead of 1.
vectorizer = CountVectorizer(binary=False)
transformed = vectorizer.fit_transform(texts)
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2 (the replacement requires scikit-learn >= 1.0).
pd.DataFrame(transformed.toarray(), columns=vectorizer.get_feature_names_out())

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,1,0,2,1,0,0,0,0,0,0,1
1,0,1,0,0,1,1,0,0,0,1,1
2,0,0,0,0,0,1,1,1,1,1,1


### TF-IDF

TF-IDF stands for term frequency–inverse document frequency. We saw that the counting approach weights words by their raw frequency, so frequently occurring words always receive higher weights. But these words are not necessarily as important as others. For example, consider one article about travel and another about politics. Both will frequently contain words like "a" and "the". But words such as "flight" and "holiday" will occur mostly in the travel article, while "parliament", "court", etc. will appear mostly in the politics article. Even though these words appear less frequently than the others, they are more important for telling the articles apart. TF-IDF scales each word's count by how rare the word is across documents, so words that occur in few documents receive more weight than words that occur in most of them. It is based on the assumption that words appearing in fewer documents are more informative.

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Fit TF-IDF on the same corpus; a term found in every document (here "window")
# ends up with the smallest non-zero weight in each row.
vec = TfidfVectorizer()
transformed = vec.fit_transform(texts)
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2 (the replacement requires scikit-learn >= 1.0).
pd.DataFrame(transformed.toarray(), columns=vec.get_feature_names_out())

Unnamed: 0,and,black,blue,car,crow,in,my,reflection,see,the,window
0,0.396875,0.0,0.793749,0.396875,0.0,0.0,0.0,0.0,0.0,0.0,0.2344
1,0.0,0.534093,0.0,0.0,0.534093,0.406192,0.0,0.0,0.0,0.406192,0.315444
2,0.0,0.0,0.0,0.0,0.0,0.358291,0.47111,0.47111,0.47111,0.358291,0.278245
