# Numerical representations of text

## Bag of words

#### Using python

In [15]:
docs = ["SUPERB, I AM IN LOVE IN THIS PHONE", "I hate this phone"]

words = [word for doc in docs for word in doc.lower().split()]

words = list(set(words))

words

In [16]:
vectors = []
for doc in docs:
    vectors.append([1 if word in doc.lower().split() else 0 for word in words])
print("vocabulary: ", words)   
print("vectors: ", vectors)

vocabulary:  ['i', 'superb,', 'in', 'hate', 'phone', 'this', 'love', 'am']
vectors:  [[1, 1, 1, 0, 1, 1, 1, 1], [1, 0, 0, 1, 1, 1, 0, 0]]


#### Using sci-kit learn

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
docs = ['SUPERB, I AM IN LOVE IN THIS PHONE', 'I hate this phone']
vectorizer = CountVectorizer()
vectorizer.fit(docs)
print('vocabulary: ', vectorizer.vocabulary_)

vocabulary:  {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}


In [21]:
vector = vectorizer.transform(docs)
print('shape: ', vector.shape)
print('vectors: ', vector.toarray())

shape:  (2, 7)
vectors:  [[1 0 2 1 1 1 1]
 [0 1 0 0 1 0 1]]


## tfidf

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
vectorizer = TfidfVectorizer()
vectorizer.fit(docs)
# summarize
print('vocabulary: ', vectorizer.vocabulary_)
print('idfs: ', vectorizer.idf_)

vocabulary:  {'superb': 5, 'am': 0, 'in': 2, 'love': 3, 'this': 6, 'phone': 4, 'hate': 1}
idfs:  [1.40546511 1.40546511 1.40546511 1.40546511 1.         1.40546511
 1.        ]


In [24]:
# encode document
vector = vectorizer.transform([docs[0]])
# summarize encoded vector
print('vectors: ', vector.toarray())

vectors:  [[0.35327777 0.         0.70655553 0.35327777 0.25136004 0.35327777
  0.25136004]]


In [25]:
docs[0]

'SUPERB, I AM IN LOVE IN THIS PHONE'